# Gather Dataset for File Create, Update & Delete

In this notebook we will gather all the information needed to do anomaly detection on the lifetime of files

In [1]:
import sys
sys.path.append('/opt/2IMS40')
sys.path.append('../')

# Import files from current directory
from search_index import search_index
from es_client import es_client

import datetime
import pandas as pd
import elasticsearch
import elasticsearch.helpers
from IPython.display import display
from tqdm.auto import tqdm

# Rebind `es_client` from the imported factory to the client object it returns;
# the import earlier in this same cell restores the factory whenever the cell is re-run.
# local=True connects to 'https://localhost:9200' with invalid certificate.
es_client = es_client(local=True)

  _transport = transport_class(


Gather all log entries with `EventID: 11` (file create/update) and `EventID: 23` (file delete).

In [2]:
# Fetch every sysmon record with EventID 11 or 23 (the file create/update and
# file delete events used below), then drop the columns that are empty in all rows.
df = search_index(
    es_client,
    "source: (sysmon.json) AND (Event.System.EventID : 11 OR Event.System.EventID : 23)",
    size=None,
    debug=True,
).dropna(axis=1, how='all')
df.shape

(325704, 31)

Start by keeping only the `Delete` events; the matching `Create` and `Update` timestamps are gathered for them in a later step.

In [3]:
# Keep only the columns needed to reconstruct the lifetime of each file.
file_create_delete = df[['Event.EventData.TargetFilename', 'Event.System.EventID', 'Event.EventData.CreationUtcTime', 'Event.EventData.UtcTime']]

# One row per delete event (EventID 23): the file name and the moment it was deleted.
# Built as a single non-inplace chain so `file_times` is an independent frame
# (no SettingWithCopyWarning when columns are added or filled in afterwards).
file_times = (
    file_create_delete.loc[
        file_create_delete['Event.System.EventID'] == 23,
        ['Event.EventData.TargetFilename', 'Event.EventData.UtcTime'],
    ]
    .reset_index(drop=True)
    .rename(columns={'Event.EventData.TargetFilename': 'TargetFilename', 'Event.EventData.UtcTime': 'DeletionTime'})
)

# 0 is the "no matching create event found" placeholder; the save step keeps only rows
# whose UpdateTime != 0. The columns are created with object dtype because the matching
# step later writes timestamp values into them row by row: writing those into an int64
# column is an incompatible-dtype assignment (deprecated in recent pandas).
file_times['CreateTime'] = pd.Series(0, index=file_times.index, dtype=object)
file_times['UpdateTime'] = pd.Series(0, index=file_times.index, dtype=object)
file_times

Unnamed: 0,TargetFilename,DeletionTime,CreateTime,UpdateTime
0,C:\Windows\ServiceProfiles\NetworkService\AppD...,2022-12-09 09:51:40.809,0,0
1,C:\ProgramData\regid.1991-06.com.microsoft\reg...,2022-12-09 09:51:40.984,0,0
2,C:\ProgramData\Microsoft\Diagnosis\DownloadedS...,2022-12-09 09:51:41.265,0,0
3,C:\ProgramData\Microsoft\Diagnosis\parse.dat,2022-12-09 09:51:41.307,0,0
4,C:\Windows\SoftwareDistribution\Download\1cd81...,2022-12-09 09:51:41.497,0,0
...,...,...,...,...
116311,C:\Users\User\AppData\Roaming\Microsoft\Window...,2022-12-23 18:39:03.490,0,0
116312,C:\Windows\SERVIC~1\LOCALS~1\AppData\Local\Tem...,2022-12-23 18:39:18.876,0,0
116313,C:\Windows\SERVIC~1\LOCALS~1\AppData\Local\Tem...,2022-12-23 18:39:18.981,0,0
116314,C:\Windows\SERVIC~1\LOCALS~1\AppData\Local\Tem...,2022-12-23 18:39:18.988,0,0


Gather all `Create` and `Update` timestamps matching with the `Delete` event

<div class="alert alert-danger">
    Be prepared to wait 30 to 90 minutes
</div>

In [4]:
# All EventID 11 rows: the candidates that provide the create/update timestamps.
only_event_11 = file_create_delete[file_create_delete['Event.System.EventID'] == 11]
unique_filenames = file_times['TargetFilename'].unique()

# Group both frames by filename ONCE (filename -> integer row positions), so the loop
# below does dictionary lookups instead of re-scanning every row for every filename.
# Rows with a missing filename are not part of any group, so they are never matched.
create_positions = only_event_11.groupby('Event.EventData.TargetFilename', sort=False).indices
deletion_positions = file_times.groupby('TargetFilename', sort=False).indices

# Snapshot what the loop reads from `file_times`, because the loop also writes to it.
deletion_times = file_times['DeletionTime'].copy()
row_labels = file_times.index

for target_filename in tqdm(unique_filenames, total=len(unique_filenames)):

    matching_create_rows = create_positions.get(target_filename)
    if matching_create_rows is None:
        # Deleted file without any EventID 11 row: its placeholders stay untouched.
        continue

    only_event_11_filename = only_event_11.iloc[matching_create_rows]
    create_event_times = only_event_11_filename['Event.EventData.UtcTime']

    for position in deletion_positions.get(target_filename, ()):

        # EventID 11 rows for this file that happened no later than this deletion.
        events = only_event_11_filename[create_event_times <= deletion_times.iloc[position]]

        if events.shape[0] > 0:
            # Last matching row in frame order.
            # NOTE(review): this is the most recent event only if `df` is in chronological order - confirm.
            last_createfile_event = events.iloc[-1]

            index = row_labels[position]
            file_times.loc[index, 'CreateTime'] = last_createfile_event['Event.EventData.CreationUtcTime']
            file_times.loc[index, 'UpdateTime'] = last_createfile_event['Event.EventData.UtcTime']


  0%|          | 0/101754 [00:00<?, ?it/s]

KeyboardInterrupt: 

Save dataframe to `file_times.csv`

In [6]:
# Persist only the deletions whose UpdateTime was filled in, i.e. is no longer the 0 placeholder.
file_times.loc[lambda frame: frame['UpdateTime'] != 0].to_csv('file_times.csv')