import datetime
import pandas as pd
import os
import json

import elasticsearch
import elasticsearch.helpers
import urllib3

import concurrent

import sys
sys.path.append('/opt/2IMS40')

# Import files from current directory
from search_index import search_index
from es_client import es_client

## The next part is copied from (template) code for the course 2IAB0 - Data Analytics for Engineers
# next command ensures that plots appear inside the notebook
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns  # also improves the look of plots
sns.set()  # set Seaborn defaults
plt.rcParams['figure.figsize'] = 10, 5  # default hor./vert. size of plots, in inches
plt.rcParams['lines.markeredgewidth'] = 1  # to fix issue with seaborn box plots; needed after import seaborn
## End copied part

# Create the Elasticsearch client by calling the imported `es_client` callable
# with local=True.
# NOTE(review): this rebinds the name `es_client` from the imported callable to
# its return value, so re-running only this cell would call the returned object
# instead of the original callable -- re-run the import cell first.
es_client = es_client(local=True)

/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.14) or chardet (3.0.4) doesn't match a supported version!
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
/usr/local/lib/python3.8/dist-packages/elasticsearch/_sync/client/__init__.py:394: SecurityWarning: Connecting to 'https://localhost:9200' using TLS with verify_certs=False is insecure
  _transport = transport_class(


# Fetch the events from sysmon.json that carry an Event.EventData.DestinationPort
# field (the query selects on the destination port, not on process creation),
# then drop every column in which all values are missing.
df = search_index(es_client, "_exists_:Event.EventData.DestinationPort AND source: sysmon.json").dropna(how='all', axis=1)

df


# Normalise the image paths to lower case so that later comparisons against
# other lower-cased paths do not depend on how the casing was reported.
# The vectorised `.str.lower()` accessor is used instead of `.apply(str.lower)`:
# the latter raises TypeError as soon as one value is missing (NaN) or not a
# string, whereas `.str.lower()` leaves missing values missing.
df['Event.EventData.Image'] = df['Event.EventData.Image'].str.lower()


# Every distinct destination port seen in the events ...
ports = df['Event.EventData.DestinationPort'].unique()
# ... converted to integers and sorted ascending (in place) for display.
ports = ports.astype(int)
ports.sort()
ports

array([   53,    67,    80,   123,   137,   389,   443,   547,  1900,
        5353,  5355,  6606, 27017, 27018, 27019, 27021, 27022, 27023,
       27024, 27025, 27028, 27029, 27030, 27031, 27032, 27033, 27034,
       27035, 27036, 27037, 27038, 49317, 49318, 49730, 49732, 49743,
       49745, 49858, 49860, 49985, 49989, 50066, 50068, 50184, 50186,
       50398, 50399, 50907, 50908, 51408, 51410, 51534, 51809, 51811,
       52074, 52075, 52513, 52515, 53017, 53019, 53361, 53362, 54090,
       54092, 54564, 54567, 54609, 54612, 54652, 54657, 54660, 54721,
       54728, 54731, 55185, 55187, 55658, 55660, 55914, 55915, 56031,
       56032, 56124, 56242, 56521, 56756, 56757, 56994, 56996, 57225,
       57343, 57345, 57465, 57781, 57782, 57891, 57893, 57917, 57918,
       58079, 58080, 58497, 58784, 58785, 58786, 58787, 58794, 58796,
       59113, 59115, 59673, 59674, 59810, 60628, 60629, 61497, 61498,
       61649, 61651, 61896, 61898, 62813, 62815, 62843, 62845, 63000,
       63002, 64305, 64306, 64532, 64534, 65478, 65480])


# Mark each event whose destination port is below 1024, then show how many
# events fall on either side of that boundary.
df['PrivilegedPort'] = df['Event.EventData.DestinationPort'].lt(1024)
df['PrivilegedPort'].value_counts()

True     8614
False    1853
Name: PrivilegedPort, dtype: int64


# Distinct images that have at least one event on a non-privileged port.
df.loc[~df['PrivilegedPort'], 'Event.EventData.Image'].unique()

array(['c:\\windows\\system32\\svchost.exe',
       'c:\\program files (x86)\\microsoft\\edge\\application\\msedge.exe',
       '<unknown process>', 'c:\\program files (x86)\\steam\\steam.exe',
       'c:\\users\\user\\appdata\\local\\mozilla firefox\\firefox.exe',
       'c:\\program files\\mozilla firefox\\firefox.exe',
       'c:\\windows\\microsoft.net\\framework\\v4.0.30319\\regsvcs.exe'],
      dtype=object)


# Number of non-privileged-port events per protocol.
df.loc[~df['PrivilegedPort'], 'Event.EventData.Protocol'].value_counts()

udp    1746
tcp     107
Name: Event.EventData.Protocol, dtype: int64


# Distinct images with at least one TCP event on a non-privileged port:
# first keep the non-privileged rows, then narrow those down to TCP.
(
    df.loc[~df['PrivilegedPort']]
    .loc[lambda events: events['Event.EventData.Protocol'] == 'tcp', 'Event.EventData.Image']
    .unique()
)

array(['<unknown process>', 'c:\\program files (x86)\\steam\\steam.exe',
       'c:\\users\\user\\appdata\\local\\mozilla firefox\\firefox.exe',
       'c:\\program files\\mozilla firefox\\firefox.exe',
       'c:\\windows\\microsoft.net\\framework\\v4.0.30319\\regsvcs.exe'],
      dtype=object)


# Distinct non-privileged TCP destination ports, as sorted integers.
ports = (
    df.loc[~df['PrivilegedPort']]
    .loc[lambda events: events['Event.EventData.Protocol'] == 'tcp', 'Event.EventData.DestinationPort']
    .unique()
    .astype(int)
)
ports.sort()  # in-place ascending sort of the resulting array
ports

array([ 6606, 27019, 27021, 27022, 27023, 27024, 27025, 27028, 27029,
       27030, 27031, 27032, 27033, 27034, 27035, 27036, 27037, 27038,
       49730, 49732, 49743, 49745, 49858, 49860, 49985, 49989, 50066,
       50068, 51534, 54564, 54567, 54609, 54612, 54657, 54660, 54721,
       54728, 54731, 56242, 57225, 57465, 58497, 58784, 58786, 59113,
       59115, 59810, 62843, 62845])


# This code is strongly based on code used in the course 2IAB0 (Data Analytics for Engineers).
#
# Build a (destination port x image) matrix of event counts for TCP traffic to
# non-privileged ports and show it as a heatmap.

# Select the two columns of interest for the non-privileged TCP events.
# The explicit .copy() makes this an independent frame, so the column
# assignments below can never write through to `df`.
ports_and_images = df.loc[
    ~df['PrivilegedPort'] & (df['Event.EventData.Protocol'] == 'tcp'),
    ['Event.EventData.DestinationPort', 'Event.EventData.Image'],
].copy()
ports_and_images['Event.EventData.DestinationPort'] = ports_and_images['Event.EventData.DestinationPort'].astype(int)
ports_and_images.rename(
    columns={'Event.EventData.DestinationPort': 'DestinationPort', 'Event.EventData.Image': 'Image'},
    inplace=True,
)

# Helper column: counting it per group yields the number of events for each
# (port, image) pair. groupby().count() never produces NaN, so no fill is
# needed at this stage.
ports_and_images['count'] = 0
pi = ports_and_images.groupby(["DestinationPort", "Image"]).count().reset_index()

# Pivot to rows = ports, columns = images. Keyword arguments are required:
# positional arguments to DataFrame.pivot are deprecated and rejected by
# pandas 2.x. Pairs that never occur become NaN in the pivot and are set to 0.
pi_matrix = pi.pivot(index="DestinationPort", columns="Image", values="count").fillna(0)

sns.heatmap(pi_matrix.transpose(), square=True);

/tmp/ipykernel_144773/4148716046.py:13: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  pi_matrix = pi.pivot("DestinationPort", "Image", "count").fillna(0)


# Based on https://stackoverflow.com/a/34156147/2378368
# Horizontal bar per image: the number of ports (rows of pi_matrix) for which
# that image has a non-zero count.
pi_matrix.astype(bool).sum().plot.barh(
    xlabel='Number of non-privileged TCP ports opened',
    xticks=list(range(21)),
);


# Image paths of the malware processes, written as escaped Python string
# literals and lower-cased so they can be compared with the lower-cased
# Image values used elsewhere in this notebook.
malware_images = {
    image_path.lower()
    for image_path in (
        'C:\\Users\\User\\Downloads\\2ecbf5a27adc238af0b125b985ae2a8b1bc14526faea3c9e40e6c3437245d830.exe',
        'C:\\Users\\User\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\Systdeeem.exe',
        'C:\\Windows\\SysWOW64\\WindowsPowerShell\\v1.0\\powershell.exe',
        'C:\\Windows\\System32\\conhost.exe',
        'C:\\Windows\\Microsoft.NET\\Framework\\v4.0.30319\\RegSvcs.exe',
    )
}
malware_images

{'c:\\users\\user\\appdata\\roaming\\microsoft\\windows\\start menu\\programs\\startup\\systdeeem.exe',
 'c:\\users\\user\\downloads\\2ecbf5a27adc238af0b125b985ae2a8b1bc14526faea3c9e40e6c3437245d830.exe',
 'c:\\windows\\microsoft.net\\framework\\v4.0.30319\\regsvcs.exe',
 'c:\\windows\\system32\\conhost.exe',
 'c:\\windows\\syswow64\\windowspowershell\\v1.0\\powershell.exe'}


# An image is flagged when exactly one row (port) of pi_matrix is non-zero for
# it; true positives are the flagged images that are known malware images.
true_positives = len(set(pi_matrix.columns[pi_matrix.astype(bool).sum().eq(1)]) & malware_images)
true_positives

1


# False negatives: known malware images that are NOT among the flagged images
# (columns of pi_matrix with exactly one non-zero port).
false_negatives = len(malware_images - set(pi_matrix.columns[pi_matrix.astype(bool).sum().eq(1)]))
false_negatives

4


# False positives: flagged images (exactly one non-zero port in pi_matrix) that
# are not known malware images.
false_positives = len(set(pi_matrix.columns[pi_matrix.astype(bool).sum().eq(1)]) - malware_images)
false_positives

0


# True negatives: whatever remains of the distinct images in `df` after the
# other three confusion-matrix cells have been subtracted.
# NOTE(review): false_negatives is derived from malware_images, which may
# contain images that never occur in `df` -- confirm this is the intended
# population.
true_negatives = df['Event.EventData.Image'].nunique(dropna=False) - (true_positives + false_negatives + false_positives)
true_negatives

46


# Derive the usual classification metrics from the confusion-matrix counts
# (true_positives, false_positives, true_negatives, false_negatives) and print
# them with three decimals.


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    A zero denominator occurs, for example, when no image is flagged at all
    (true_positives + false_positives == 0); the metric is then undefined and
    is reported as NaN instead of aborting with ZeroDivisionError.
    """
    return numerator / denominator if denominator else float("nan")


accuracy = _ratio(
    true_positives + true_negatives,
    true_positives + false_positives + true_negatives + false_negatives,
)
precision = _ratio(true_positives, true_positives + false_positives)
recall = _ratio(true_positives, true_positives + false_negatives)
FPR = _ratio(false_positives, false_positives + true_negatives)  # false positive rate
TNR = _ratio(true_negatives, false_positives + true_negatives)  # true negative rate
# Undefined (NaN) when precision + recall == 0 or when either input is NaN.
F1_score = _ratio(2 * precision * recall, precision + recall)

print(f"Accuracy            = {accuracy:.3f}")
print(f"Precision           = {precision:.3f}")
print(f"Recall              = {recall:.3f}")
print(f"False Positive Rate = {FPR:.3f}")
print(f"True  Negative Rate = {TNR:.3f}")
print(f"F1-score            = {F1_score:.3f}")

Accuracy            = 0.922
Precision           = 1.000
Recall              = 0.200
False Positive Rate = 0.000
True  Negative Rate = 1.000
F1-score            = 0.333


# This code is strongly based on code used in the course 2IAB0 (Data Analytics for Engineers).
#
# Build a (destination port x image) matrix of event counts for ALL traffic to
# non-privileged ports and show it as a heatmap. There is no protocol filter
# here, so despite its name `pi_matrix_udp` covers every protocol in `df`.

# Select the two columns of interest for the non-privileged events.
# The explicit .copy() makes this an independent frame, so the column
# assignments below can never write through to `df`.
ports_and_images = df.loc[
    ~df['PrivilegedPort'],
    ['Event.EventData.DestinationPort', 'Event.EventData.Image'],
].copy()
ports_and_images['Event.EventData.DestinationPort'] = ports_and_images['Event.EventData.DestinationPort'].astype(int)
ports_and_images.rename(
    columns={'Event.EventData.DestinationPort': 'DestinationPort', 'Event.EventData.Image': 'Image'},
    inplace=True,
)

# Helper column: counting it per group yields the number of events for each
# (port, image) pair. groupby().count() never produces NaN, so no fill is
# needed at this stage.
ports_and_images['count'] = 0
pi = ports_and_images.groupby(["DestinationPort", "Image"]).count().reset_index()

# Pivot to rows = ports, columns = images. Keyword arguments are required:
# positional arguments to DataFrame.pivot are deprecated and rejected by
# pandas 2.x. Pairs that never occur become NaN in the pivot and are set to 0.
pi_matrix_udp = pi.pivot(index="DestinationPort", columns="Image", values="count").fillna(0)

sns.heatmap(pi_matrix_udp.transpose(), square=False);

/tmp/ipykernel_144773/3676470168.py:13: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  pi_matrix_udp = pi.pivot("DestinationPort", "Image", "count").fillna(0)


# Based on https://stackoverflow.com/a/34156147/2378368
# Horizontal bar per image: the number of ports (rows of pi_matrix_udp) for
# which that image has a non-zero count. Ticks are placed at 1 and at every
# multiple of 10 from 10 up to 70.
pi_matrix_udp.astype(bool).sum().plot.barh(
    xlabel='Number of non-privileged ports opened',
    xticks=list({1}.union(range(10, 71, 10))),
);


# An image is flagged when exactly one row (port) of pi_matrix_udp is non-zero
# for it; true positives are the flagged images that are known malware images.
true_positives = len(set(pi_matrix_udp.columns[pi_matrix_udp.astype(bool).sum().eq(1)]) & malware_images)
true_positives

1


# False negatives: known malware images that are NOT among the flagged images
# (columns of pi_matrix_udp with exactly one non-zero port).
false_negatives = len(malware_images - set(pi_matrix_udp.columns[pi_matrix_udp.astype(bool).sum().eq(1)]))
false_negatives

4


# False positives: flagged images (exactly one non-zero port in pi_matrix_udp)
# that are not known malware images.
false_positives = len(set(pi_matrix_udp.columns[pi_matrix_udp.astype(bool).sum().eq(1)]) - malware_images)
false_positives

0


# True negatives: whatever remains of the distinct images in `df` after the
# other three confusion-matrix cells have been subtracted.
# NOTE(review): false_negatives is derived from malware_images, which may
# contain images that never occur in `df` -- confirm this is the intended
# population.
true_negatives = df['Event.EventData.Image'].nunique(dropna=False) - (true_positives + false_negatives + false_positives)
true_negatives

46


# Derive the usual classification metrics from the confusion-matrix counts
# (true_positives, false_positives, true_negatives, false_negatives) and print
# them with three decimals.


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    A zero denominator occurs, for example, when no image is flagged at all
    (true_positives + false_positives == 0); the metric is then undefined and
    is reported as NaN instead of aborting with ZeroDivisionError.
    """
    return numerator / denominator if denominator else float("nan")


accuracy = _ratio(
    true_positives + true_negatives,
    true_positives + false_positives + true_negatives + false_negatives,
)
precision = _ratio(true_positives, true_positives + false_positives)
recall = _ratio(true_positives, true_positives + false_negatives)
FPR = _ratio(false_positives, false_positives + true_negatives)  # false positive rate
TNR = _ratio(true_negatives, false_positives + true_negatives)  # true negative rate
# Undefined (NaN) when precision + recall == 0 or when either input is NaN.
F1_score = _ratio(2 * precision * recall, precision + recall)

print(f"Accuracy            = {accuracy:.3f}")
print(f"Precision           = {precision:.3f}")
print(f"Recall              = {recall:.3f}")
print(f"False Positive Rate = {FPR:.3f}")
print(f"True  Negative Rate = {TNR:.3f}")
print(f"F1-score            = {F1_score:.3f}")

Accuracy            = 0.922
Precision           = 1.000
Recall              = 0.200
False Positive Rate = 0.000
True  Negative Rate = 1.000
F1-score            = 0.333

	_index	_id	sort	Event.EventData.Image	Event.EventData.ProcessGuid	Event.EventData.ProcessId	Event.EventData.RuleName	Event.EventData.User	Event.EventData.UtcTime	Event.System.Channel	...	Event.EventData.DestinationPort	Event.EventData.DestinationPortName	Event.EventData.Initiated	Event.EventData.Protocol	Event.EventData.SourceHostname	Event.EventData.SourceIp	Event.EventData.SourceIsIpv6	Event.EventData.SourcePort	Event.EventData.SourcePortName	source
0	host_events	30807	[1670579502712]	System	EDA6DF62-051C-6393-EB03-000000000000	4.0	-	NT AUTHORITY\SYSTEM	2022-12-09 09:51:39.374	Microsoft-Windows-Sysmon/Operational	...	137.0	netbios-ns	True	udp	WinDev2211Eval.home	10.0.2.15	False	137.0	netbios-ns	04_host_windows/sysmon.json
1	host_events	30808	[1670579502713]	System	EDA6DF62-051C-6393-EB03-000000000000	4.0	-	NT AUTHORITY\SYSTEM	2022-12-09 09:51:39.375	Microsoft-Windows-Sysmon/Operational	...	137.0	netbios-ns	False	udp	-	10.0.2.255	False	137.0	netbios-ns	04_host_windows/sysmon.json
2	host_events	30809	[1670579502713]	C:\Windows\System32\svchost.exe	EDA6DF62-0529-6393-2700-000000002100	1892.0	-	NT AUTHORITY\NETWORK SERVICE	2022-12-09 09:51:40.060	Microsoft-Windows-Sysmon/Operational	...	53.0	domain	True	udp	WinDev2211Eval.home	10.0.2.15	False	54568.0	-	04_host_windows/sysmon.json
3	host_events	30810	[1670579502713]	C:\Windows\System32\svchost.exe	EDA6DF62-0529-6393-2700-000000002100	1892.0	-	NT AUTHORITY\NETWORK SERVICE	2022-12-09 09:51:40.071	Microsoft-Windows-Sysmon/Operational	...	53.0	domain	True	udp	WinDev2211Eval.home	10.0.2.15	False	51053.0	-	04_host_windows/sysmon.json
4	host_events	31146	[1670579502992]	C:\Windows\System32\svchost.exe	EDA6DF62-052B-6393-5A00-000000002100	4236.0	-	NT AUTHORITY\SYSTEM	2022-12-09 09:51:40.106	Microsoft-Windows-Sysmon/Operational	...	443.0	https	True	tcp	WinDev2211Eval.home	10.0.2.15	False	49672.0	-	04_host_windows/sysmon.json
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
462	host_events	4521131	[1671820553547]	C:\Program Files (x86)\Microsoft\Edge\Applicat...	EDA6DF62-C90A-63A5-A400-000000002900	6868.0	-	WINDEV2211EVAL\User	2022-12-23 18:36:14.322	Microsoft-Windows-Sysmon/Operational	...	1900.0	ssdp	True	udp	WinDev2211Eval.home	10.0.2.15	False	63104.0	-	04_host_windows/sysmon.json
463	host_events	4521994	[1671820578574]	System	EDA6DF62-C8DE-63A5-EB03-000000000000	4.0	-	NT AUTHORITY\SYSTEM	2022-12-23 18:36:41.552	Microsoft-Windows-Sysmon/Operational	...	137.0	netbios-ns	True	udp	WinDev2211Eval.home	10.0.2.15	False	137.0	netbios-ns	04_host_windows/sysmon.json
464	host_events	4521995	[1671820578574]	System	EDA6DF62-C8DE-63A5-EB03-000000000000	4.0	-	NT AUTHORITY\SYSTEM	2022-12-23 18:36:41.553	Microsoft-Windows-Sysmon/Operational	...	137.0	netbios-ns	False	udp	-	10.0.2.255	False	137.0	netbios-ns	04_host_windows/sysmon.json
465	host_events	4524067	[1671820634409]	C:\Program Files (x86)\Microsoft\Edge\Applicat...	EDA6DF62-C90A-63A5-A400-000000002900	6868.0	-	WINDEV2211EVAL\User	2022-12-23 18:38:14.320	Microsoft-Windows-Sysmon/Operational	...	1900.0	ssdp	True	udp	WinDev2211Eval.home	10.0.2.15	False	63105.0	-	04_host_windows/sysmon.json
466	host_events	4525142	[1671820774582]	C:\Program Files (x86)\Microsoft\Edge\Applicat...	EDA6DF62-C90A-63A5-A400-000000002900	6868.0	-	WINDEV2211EVAL\User	2022-12-23 18:40:14.339	Microsoft-Windows-Sysmon/Operational	...	1900.0	ssdp	True	udp	WinDev2211Eval.home	10.0.2.15	False	63106.0	-	04_host_windows/sysmon.json

Anomaly detection: non-privileged ports¶

Computing true/false positives/negatives¶

Considering both TCP and UDP ports¶

Computing true/false positives/negatives¶