Gather Dataset for File Create, Update & Delete

In this notebook we gather all the information needed to perform anomaly detection on the lifetime of files.

In [1]:
import sys
sys.path.append('/opt/2IMS40')
sys.path.append('../')

# Import local helper modules (resolved via the paths appended above)
from search_index import search_index
from es_client import es_client

import datetime
import pandas as pd
import elasticsearch
import elasticsearch.helpers
from IPython.display import display
from tqdm.auto import tqdm

es_client = es_client(
    local=True # connects to 'https://localhost:9200' with invalid certificate
)
/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.14) or chardet (3.0.4) doesn't match a supported version!
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
/usr/local/lib/python3.8/dist-packages/elasticsearch/_sync/client/__init__.py:394: SecurityWarning: Connecting to 'https://localhost:9200' using TLS with verify_certs=False is insecure
  _transport = transport_class(
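
The helpers search_index and es_client come from local modules that are not shown in this notebook. A minimal hypothetical reconstruction, assuming es_client wraps elasticsearch.Elasticsearch with certificate verification disabled (consistent with the SecurityWarning above) and search_index collects every hit of a query_string query into a DataFrame via elasticsearch.helpers.scan; the real implementations in /opt/2IMS40 and the parent directory may differ:

# Hypothetical sketch of the local helper modules
def es_client(local=False):
    # local=True connects to the local node with TLS verification disabled,
    # which is what triggers the SecurityWarning above
    return elasticsearch.Elasticsearch('https://localhost:9200', verify_certs=False)

def search_index(client, query, size=None, debug=False):
    # Stream all matching hits and flatten the nested _source dicts into
    # dotted column names such as 'Event.System.EventID'
    hits = elasticsearch.helpers.scan(
        client, query={'query': {'query_string': {'query': query}}})
    return pd.json_normalize([hit['_source'] for hit in hits])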

Gather all log entries with EventID 11 (FileCreate) and EventID 23 (FileDelete)

In [2]:
# Find file create (EventID 11) and file delete (EventID 23) events
df = search_index(
    es_client,
    "source: (sysmon.json) AND (Event.System.EventID : 11 OR Event.System.EventID : 23)",
    size=None,
    debug=True,
).dropna(how='all', axis=1)
df.shape
df.shape
Out[2]:
(325704, 31)

First filter out only the delete events (EventID 23); the matching create and update timestamps are gathered afterwards

In [3]:
file_create_delete = df[['Event.EventData.TargetFilename', 'Event.System.EventID',
                         'Event.EventData.CreationUtcTime', 'Event.EventData.UtcTime']]

# Keep only the delete events and give the columns shorter names
file_times = file_create_delete[file_create_delete['Event.System.EventID'] == 23].reset_index()[['Event.EventData.TargetFilename', 'Event.EventData.UtcTime']]
file_times.rename({'Event.EventData.TargetFilename': 'TargetFilename', 'Event.EventData.UtcTime': 'DeletionTime'}, axis=1, inplace=True)

# Placeholder columns, filled in by the matching loop below
file_times['CreateTime'] = 0
file_times['UpdateTime'] = 0
file_times
Out[3]:
TargetFilename DeletionTime CreateTime UpdateTime
0 C:\Windows\ServiceProfiles\NetworkService\AppD... 2022-12-09 09:51:40.809 0 0
1 C:\ProgramData\regid.1991-06.com.microsoft\reg... 2022-12-09 09:51:40.984 0 0
2 C:\ProgramData\Microsoft\Diagnosis\DownloadedS... 2022-12-09 09:51:41.265 0 0
3 C:\ProgramData\Microsoft\Diagnosis\parse.dat 2022-12-09 09:51:41.307 0 0
4 C:\Windows\SoftwareDistribution\Download\1cd81... 2022-12-09 09:51:41.497 0 0
... ... ... ... ...
116311 C:\Users\User\AppData\Roaming\Microsoft\Window... 2022-12-23 18:39:03.490 0 0
116312 C:\Windows\SERVIC~1\LOCALS~1\AppData\Local\Tem... 2022-12-23 18:39:18.876 0 0
116313 C:\Windows\SERVIC~1\LOCALS~1\AppData\Local\Tem... 2022-12-23 18:39:18.981 0 0
116314 C:\Windows\SERVIC~1\LOCALS~1\AppData\Local\Tem... 2022-12-23 18:39:18.988 0 0
116315 C:\Users\User\AppData\Roaming\Microsoft\Window... 2022-12-23 18:39:39.656 0 0

116316 rows × 4 columns

Gather the create and update timestamps matching each delete event

Be prepared to wait 30 to 90 minutes (a faster, vectorized alternative is sketched after this cell)
In [4]:
# All create events (EventID 11); these are matched to each delete below
only_event_11 = file_create_delete[file_create_delete['Event.System.EventID'] == 11]
unique_filenames = file_times['TargetFilename'].unique()

for target_filename in tqdm(unique_filenames, total=len(unique_filenames)):

    # Create events for this specific file
    only_event_11_filename = only_event_11[only_event_11['Event.EventData.TargetFilename'] == target_filename]

    for index, row in file_times[file_times['TargetFilename'] == target_filename].iterrows():

        # Create events that occurred at or before this deletion
        events = only_event_11_filename[only_event_11_filename['Event.EventData.UtcTime'] <= row['DeletionTime']]

        if events.shape[0] > 0:
            # The most recent create event before deletion: CreationUtcTime is
            # the file's creation time, the event's UtcTime serves as the last
            # update time
            last_createfile_event = events.iloc[-1]

            file_times.loc[index, 'CreateTime'] = last_createfile_event['Event.EventData.CreationUtcTime']
            file_times.loc[index, 'UpdateTime'] = last_createfile_event['Event.EventData.UtcTime']
  0%|          | 0/101754 [00:00<?, ?it/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/tmp/ipykernel_139170/2781775442.py in <module>
      4 for target_filename in tqdm(unique_filenames, total=len(unique_filenames)):
      5 
----> 6     only_event_11_filename = only_event_11[(only_event_11['Event.EventData.TargetFilename'] == target_filename)]
      7 
      8     for index, row in file_times[file_times['TargetFilename'] == target_filename].iterrows():

/usr/local/lib/python3.8/dist-packages/pandas/core/ops/common.py in new_method(self, other)
     70         other = item_from_zerodim(other)
     71 
---> 72         return method(self, other)
     73 
     74     return new_method

/usr/local/lib/python3.8/dist-packages/pandas/core/arraylike.py in __eq__(self, other)
     40     @unpack_zerodim_and_defer("__eq__")
     41     def __eq__(self, other):
---> 42         return self._cmp_method(other, operator.eq)
     43 
     44     @unpack_zerodim_and_defer("__ne__")

/usr/local/lib/python3.8/dist-packages/pandas/core/series.py in _cmp_method(self, other, op)
   6241 
   6242         with np.errstate(all="ignore"):
-> 6243             res_values = ops.comparison_op(lvalues, rvalues, op)
   6244 
   6245         return self._construct_result(res_values, name=res_name)

/usr/local/lib/python3.8/dist-packages/pandas/core/ops/array_ops.py in comparison_op(left, right, op)
    285 
    286     elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str):
--> 287         res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
    288 
    289     else:

/usr/local/lib/python3.8/dist-packages/pandas/core/ops/array_ops.py in comp_method_OBJECT_ARRAY(op, x, y)
     73         result = libops.vec_compare(x.ravel(), y.ravel(), op)
     74     else:
---> 75         result = libops.scalar_compare(x.ravel(), y, op)
     76     return result.reshape(x.shape)
     77 

KeyboardInterrupt: 
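
The nested per-filename filtering above is quadratic in the number of events, which is why the cell runs for so long (and was interrupted here on a first attempt). As a sketch of a faster alternative, pandas.merge_asof can match every delete to the most recent create event at or before it, per filename, in a single pass; this assumes the timestamp strings convert cleanly with pd.to_datetime:

# Vectorized sketch of the matching loop using pandas.merge_asof.
# Assumption: the timestamp strings parse cleanly with pd.to_datetime.
creates = only_event_11.rename(
    columns={'Event.EventData.TargetFilename': 'TargetFilename'})
creates['EventUtcTime'] = pd.to_datetime(creates['Event.EventData.UtcTime'])

deletes = file_times.copy()
deletes['DeletionDt'] = pd.to_datetime(deletes['DeletionTime'])

# Both frames must be sorted on their time keys; direction='backward' picks
# the last create event at or before each deletion for the same filename
matched = pd.merge_asof(
    deletes.sort_values('DeletionDt'),
    creates.sort_values('EventUtcTime'),
    left_on='DeletionDt', right_on='EventUtcTime',
    by='TargetFilename', direction='backward',
)
# Note: deletes without a preceding create get NaN here instead of the
# 0 placeholder used by the loop above
matched['CreateTime'] = matched['Event.EventData.CreationUtcTime']
matched['UpdateTime'] = matched['Event.EventData.UtcTime']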

Save the matched rows (those where a create event was found, i.e. UpdateTime was filled in) to file_times.csv

In [6]:
# Keep only the rows where a preceding create event was found
file_times[file_times['UpdateTime'] != 0].to_csv('file_times.csv')
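
As a quick check of what the saved dataset supports, the file lifetime used for anomaly detection can be derived directly from these columns; a minimal sketch, assuming file_times.csv as written above:

# Minimal sketch: derive each file's lifetime from the saved timestamps
lifetimes = pd.read_csv('file_times.csv', index_col=0)
lifetimes['Lifetime'] = (pd.to_datetime(lifetimes['DeletionTime'])
                         - pd.to_datetime(lifetimes['CreateTime']))
lifetimes['Lifetime'].describe()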