Преглед изворни кода

Fixed bugs and sample sizes.

Now filters out unnecessary packets before operating.

Fixed bug in activity classifier that failed to account for window != 1s.

Pcaps now split into a fixed sample size instead of each being one large sample.
Thomas Flucke пре 6 година
родитељ
комит
6e497e8442
2 измењених фајлова са 46 додато и 26 уклоњено
  1. 7 2
      src/feature-extractor/extractor.py
  2. 39 24
      src/feature-extractor/sample.py

+ 7 - 2
src/feature-extractor/extractor.py

@@ -14,9 +14,12 @@ def parse_args():
     parser.add_argument('-o', '--outfile', type=argparse.FileType('wb'),
                         default="features.plo", help='Where to save the " \
                         "extracted features (default: features.plo)')
+    parser.add_argument('-s', '--sample-size', type=int,
+                        default="200", help='number of packets in a sample \
+                        (default: 200)')
     parser.add_argument('-l', '--low-act-threshold', type=float, default=1,
                         help='P/s below which is considered low activity \
-                        (default: 3.0 P/s)')
+                        (default: 1.0 P/s)')
     parser.add_argument('-i', '--high-act-threshold', type=float, default=3,
                         help='P/s above which is considered high activity \
                         (default: 3.0 P/s)')
@@ -37,7 +40,9 @@ def main():
     Sample.set_activity_thresholds(args.low_act_threshold,
                                    args.high_act_threshold,
                                    args.lookback)
-    out = [Sample(*line.split(" ")) for line in args.match_file if "pcap" in line]
+    out = [sample
+           for line in args.match_file if "pcap" in line
+           for sample in Sample.make_samples(*line.split(" "), args.sample_size)]
     try:
         import cPickle as pickle
     except:

+ 39 - 24
src/feature-extractor/sample.py

@@ -1,11 +1,27 @@
 import typing
 import pyshark
 from datetime import datetime
+from typing import List
 
 class Sample:
     EPOCH = datetime(1970, 1, 1)
     TIME_FMT = '%Y-%m-%d %H:%M:%S.%f'
+    FILTER = "tcp.flags.push == 1 && tcp.dstport == 22" # len % 8 == 6
 
+    def make_samples(keylog: typing.TextIO,
+                     pcap: typing.BinaryIO,
+                     sample_size: int):
+        f = pyshark.FileCapture(pcap.strip(),
+                                display_filter=Sample.FILTER,
+                                only_summaries=True)
+        f.load_packets()
+        # Start from sample_size to skip incomplete samples
+        samples = map(lambda slic: Sample(slic, keylog), [
+            [f[i] for i in range(end-sample_size, end)]
+            for end in range(sample_size, len(f), sample_size)
+        ])
+        return samples
+    
     # Boundaries measured in packets/second
     def set_activity_thresholds(lower_bound: float, upper_bound: float,
                                 lookback: float):
@@ -16,52 +32,51 @@ class Sample:
         Sample.low_act_threshold = lower_bound
         Sample.lookback = lookback
 
-    def packet_time(p):
+    def __packet_time(p):
         return (datetime.strptime(p.time, Sample.TIME_FMT) - Sample.EPOCH) \
             .total_seconds()
         
-    def __init__(self, keylog: typing.TextIO, pcap: typing.BinaryIO):
-        self.extract_tag(keylog)
-        f = pyshark.FileCapture(pcap.strip(), only_summaries=True)
-        f.load_packets()
-        self.extract_activity_stats(f)
-        self.extract_packet_stats(f)
+    def __init__(self, packets, keylog: typing.TextIO):
+        self.__extract_tag(keylog)
+        self.__extract_activity_stats(packets)
+        self.__extract_packet_stats(packets)
 
-    def extract_tag(self, keylog: typing.TextIO):
+    def __extract_tag(self, keylog: typing.TextIO):
         import os
         dir_guided = os.path.dirname(keylog)
         self.is_guided = os.path.basename(dir_guided) == "y"
         dir_user = os.path.dirname(dir_guided)
         self.user = os.path.basename(dir_user)
 
-    def extract_activity_stats(self, keylog: typing.TextIO):
+    def __extract_activity_stats(self, packets):
         high_activity = []
         mid_activity = []
         low_activity = []
         q = []
-        for p in keylog:
-            ptime = Sample.packet_time(p)
+        for p in packets:
+            ptime = Sample.__packet_time(p)
             q.append(p)
-            if len(q) < self.low_act_threshold:
+            rate = float(len(q)) / self.lookback
+            if rate < self.low_act_threshold:
                 low_activity.append(q)
-            elif len(q) > self.high_act_threshold:
-                high_activity.append(q)
-            else:
+            elif rate < self.high_act_threshold:
                 mid_activity.append(q)
-            while Sample.packet_time(q[0]) + self.lookback < ptime:
+            else:
+                high_activity.append(q)
+            while Sample.__packet_time(q[0]) + self.lookback < ptime:
                 q = q[1:]
         self.activities = {
-            "high": Sample.count_activity_stats(high_activity),
-            "mid": Sample.count_activity_stats(mid_activity),
-            "low": Sample.count_activity_stats(low_activity)
+            "high": Sample.__count_activity_stats(high_activity),
+            "mid": Sample.__count_activity_stats(mid_activity),
+            "low": Sample.__count_activity_stats(low_activity)
         }
 
-    def count_activity_stats(arr):
+    def __count_activity_stats(arr):
         return {
             "total packets": len(arr)
         }
         
-    def extract_packet_stats(self, pcap):
-        start = Sample.packet_time(pcap[0])
-        end = Sample.packet_time(pcap[-1])
-        self.average_iat = self.average_iat = (end - start) / len(pcap)
+    def __extract_packet_stats(self, pcap):
+        start = Sample.__packet_time(pcap[0])
+        end = Sample.__packet_time(pcap[-1])
+        self.average_iat = (end - start) / len(pcap)