Procházet zdrojové kódy

Added paste statistics to feature extraction.

Thomas Flucke před 6 roky
rodič
revize
304d955632

+ 8 - 0
src/feature-extractor/extractor.py

@@ -34,6 +34,12 @@ def parse_args():
     parser.add_argument('-b', '--lookback', type=float, default=3,
                         help='Seconds of lookback to determine activity state \
                         (default: 3.0s)')
+    parser.add_argument('-s', '--small-paste-threshold', type=int, default=2,
+                        help='Number of 8B blocks to count as a small paste \
+                        (default: 2)')
+    parser.add_argument('-p', '--large-paste-threshold', type=int, default=5,
+                        help='Number of 8B blocks to count as a large paste \
+                        (default: 5)')
     parser.add_argument('-d', '--dataset', choices=["guided", "free", "both"],
                         default="uniform", help='Which dataset to pull from. \
                         (One of guided, free, or both; default: \"both\")')
@@ -51,6 +57,8 @@ def main():
     Sample.set_activity_thresholds(args.low_act_threshold,
                                    args.high_act_threshold,
                                    args.lookback)
+    Sample.set_copypaste_thresholds(args.small_paste_threshold,
+                                    args.large_paste_threshold)
     users = {}
     for line in args.match_file:
         if "pcap" in line:

+ 19 - 0
src/feature-extractor/sample.py

@@ -9,6 +9,8 @@ class Sample:
     TIME_FMT = '%Y-%m-%d %H:%M:%S.%f'
     FILTER = "tcp.flags.push == 1 && tcp.dstport == 22" # len % 8 == 6
     EPSIOLON = 0.0000000000001
+    BASE_PACKET_SIZE = 102
+    BLOCK_SIZE = 8
 
     def make_samples(keylog: typing.TextIO,
                      pcap: typing.BinaryIO,
@@ -33,6 +35,15 @@ class Sample:
         Sample.high_act_threshold = upper_bound
         Sample.low_act_threshold = lower_bound
         Sample.lookback = lookback
+    
+    # Boundaries measured in 8-byte blocks on top of the base packet size
+    def set_copypaste_thresholds(lower_bound: float, upper_bound: float):
+        assert(lower_bound < upper_bound)
+        assert(lower_bound >= 1)
+        Sample.small_paste_threshold = Sample.BLOCK_SIZE*lower_bound + \
+                                       Sample.BASE_PACKET_SIZE
+        Sample.large_paste_threshold = Sample.BLOCK_SIZE*upper_bound + \
+                                       Sample.BASE_PACKET_SIZE
 
     def __packet_time(p):
         return (datetime.strptime(p.time, Sample.TIME_FMT) - Sample.EPOCH) \
@@ -43,6 +54,7 @@ class Sample:
         self.__extract_tag(keylog)
         self.__extract_activity_stats(packets)
         self.__extract_time_stats(packets)
+        self.__extract_paste_stats(packets)
 
     def __extract_tag(self, keylog: typing.TextIO):
         import os
@@ -131,6 +143,13 @@ class Sample:
         end = Sample.__packet_time(pcap[-1])
         self["total_time"] = end - start
         self["average_iat"] = (end - start) / len(pcap)
+    
+    def __extract_paste_stats(self, pcaps):
+        self["total_pastes"] = sum(p.length > Sample.small_paste_threshold
+                                   for p in pcaps)
+        self["large_pastes"] = sum(p.length > Sample.large_paste_threshold
+                                   for p in pcaps)
+        self["small_pastes"] = self["total_pastes"] - self["large_pastes"]
 
     def __is_valid_prefix(pre):
         return pre in ["high", "mid", "low"]