Sfoglia il codice sorgente

Fixed errors.

Includes a fix for parseargs defining duplicate -p options in the extractor.

A divide-by-zero error in the paste statistics.

And tshark not being discoverable on Cal Poly systems.
Note: The tshark fix is a hack (it hard-codes the tshark path); it should really search the user's PATH.
Tom Flucke 6 anni fa
parent
commit
778b3c239a
2 file modificati con 35 aggiunte e 8 eliminazioni
  1. 8 5
      src/feature-extractor/extractor.py
  2. 27 3
      src/feature-extractor/sample.py

+ 8 - 5
src/feature-extractor/extractor.py

@@ -61,9 +61,11 @@ def main():
     Sample.set_copypaste_thresholds(args.small_paste_threshold,
                                     args.large_paste_threshold)
     users = {}
+    os.chdir(os.path.dirname(os.path.realpath(args.match_file.name)) + "/..")
     for line in args.match_file:
         if "pcap" in line:
-            samples = list(Sample.make_samples(*line.split(" "),args.sample_size))
+            text, pcap = line.split(" ")
+            samples = list(Sample.make_samples(text, pcap, args.sample_size))
             if samples and ((not samples[0].is_guided and args.dataset == "free")
                             or (samples[0].is_guided and args.dataset == "guided")
                             or args.dataset == "both"):
@@ -72,10 +74,6 @@ def main():
                     users[user].extend(samples)
                 else:
                     users[user] = samples
-    if len(users) < args.min_users:
-        print("Not enough data to build valid dataset.", file=sys.stderr)
-        os.remove(args.outfile)
-        return
     fix_point = 99999999999
     if args.fix_sample_count:
         for u in users:
@@ -84,6 +82,11 @@ def main():
     out = [sample
            for u in users if len(users[u]) >= args.min
            for sample in users[u][0:fix_point]]
+    import numpy as np
+    if len(np.unique([s.user for s in out])) < args.min_users:
+        print("Not enough data to build valid dataset.", file=sys.stderr)
+        os.remove(args.outfile.name)
+        return
     try:
         import cPickle as pickle
     except:

+ 27 - 3
src/feature-extractor/sample.py

@@ -14,8 +14,10 @@ class Sample:
     def make_samples(keylog: typing.TextIO,
                      pcap: typing.BinaryIO,
                      sample_size: int):
+        import pyshark, os
         f = pyshark.FileCapture(pcap.strip(),
                                 display_filter=Sample.FILTER,
+                                tshark_path="/usr/sbin/tshark", # TODO: os.environ['PATH']
                                 only_summaries=True)
         f.load_packets()
         # Start from sample_size to skip incomplete samples
@@ -45,11 +47,13 @@ class Sample:
                                        Sample.BASE_PACKET_SIZE
 
     def __packet_time(p):
-        return (datetime.strptime(p.time, Sample.TIME_FMT) - Sample.EPOCH) \
-            .total_seconds()
+        if p.time.isnumeric():
+            return int(p.time)
+        else:
+            return (datetime.strptime(p.time, Sample.TIME_FMT) - Sample.EPOCH) \
+                .total_seconds()
         
     def __init__(self, packets, keylog: typing.TextIO):
-        import pyshark
         self.__general = {}
         self.__extract_tag(keylog)
         self.__extract_activity_stats(packets)
@@ -74,6 +78,7 @@ class Sample:
         i = 0
         for p in packets:
             ptime = Sample.__packet_time(p)
+            p.length = int(p.length)
             p.index = i
             i += 1
             if q:
@@ -150,6 +155,25 @@ class Sample:
         self["large_pastes"] = sum(p.length > Sample.large_paste_threshold
                                    for p in pcaps)
         self["small_pastes"] = self["total_pastes"] - self["large_pastes"]
+        if self["total_pastes"] == 0:
+            self["avg_paste_size"] = 0
+        else:
+            self["avg_paste_size"] = sum(p.length for p in pcaps
+                                         if p.length > Sample.small_paste_threshold) \
+                                            / self["total_pastes"] - Sample.BASE_PACKET_SIZE
+        if self["large_pastes"] == 0:
+            self["avg_large_paste_size"] = 0
+        else:
+            self["avg_large_paste_size"] = sum(p.length for p in pcaps
+                                               if p.length > Sample.large_paste_threshold) \
+                                                  / self["large_pastes"] - Sample.BASE_PACKET_SIZE
+        if self["small_pastes"] == 0:
+            self["avg_small_paste_size"] = 0
+        else:
+            self["avg_small_paste_size"] = sum(p.length for p in pcaps
+                                               if p.length < Sample.large_paste_threshold and
+                                               p.length > Sample.small_paste_threshold) \
+                / self["small_pastes"] - Sample.BASE_PACKET_SIZE
 
     def __is_valid_prefix(pre):
         return pre in ["high", "mid", "low"]