Jelajahi Sumber

Fixed up extractor and added features to the nearestneighbor classifier

Classifier now filters out users who don't have enough samples.

Also reduces the number of samples for users who have collected extra data.
Thomas Flucke 6 tahun lalu
induk
melakukan
3c7e1a3b03

+ 0 - 17
ethan_data_processing_scripts/logs.py

@@ -1,17 +0,0 @@
-import sys
-
-
-def main():
-    file = open(sys.argv[1], "r")
-    line = file.readline()
-    x = ""
-    while line:
-        x += line[27:28]
-        line = file.readline()
-
-    print(x)
-    file.close()
-
-
-if __name__ == '__main__':
-    main()

+ 0 - 5
ethan_data_processing_scripts/pcap.py

@@ -1,5 +0,0 @@
-import pyshark
-
-shark_cap = pyshark.FileCapture('/Users/ethangoldfarb/Desktop/tomthesis/SSH-Master-Thesis/data/packets/thesis-capture-2019-05-03.0.pcap')
-for packet in shark_cap:
-    print("%s" % packet)

+ 0 - 0
ethan_data_processing_scripts/README.md → src/classifiers/README.md


+ 0 - 0
ethan_data_processing_scripts/Vector.py → src/classifiers/Vector.py


+ 0 - 0
ethan_data_processing_scripts/classifier.py → src/classifiers/classifier.py


+ 1 - 1
ethan_data_processing_scripts/nearestneighbors.py → src/classifiers/nearestneighbors.py

@@ -18,9 +18,9 @@ def main():
     except:
         import pickle
     samples = pickle.load(args.features_file)
-    features = args.feature if args.feature else DEFAULT_FEATURES
     from random import shuffle
     shuffle(samples)
+    features = args.feature if args.feature else DEFAULT_FEATURES
     data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
                          for p in samples])
     res = kNearestNeighbors(np.array(data), np.array(labels),

+ 0 - 0
ethan_data_processing_scripts/runtests.py → src/classifiers/runtests.py


+ 29 - 2
src/feature-extractor/extractor.py

@@ -17,6 +17,13 @@ def parse_args():
     parser.add_argument('-s', '--sample-size', type=int,
                         default="200", help='number of packets in a sample \
                         (default: 200)')
+    parser.add_argument('-r', '--randomize', action='store_const', default=False,
+                        const=True, help='Pre-randomize sample order')
+    parser.add_argument('-f', '--fix-sample-count', action='store_const',
+                        default=False, const=True,
+                        help='Keep all users as a fixed sample count.')
+    parser.add_argument('-m', '--min', type=int, default="5",
+                        help='Minimum number of samples per user (default: 5)')
     parser.add_argument('-l', '--low-act-threshold', type=float, default=1.0,
                         help='P/s below which is considered low activity \
                         (default: 1.0 P/s)')
@@ -41,13 +48,33 @@ def main():
     Sample.set_activity_thresholds(args.low_act_threshold,
                                    args.high_act_threshold,
                                    args.lookback)
+    users = {}
+    for line in args.match_file:
+        if "pcap" in line:
+            samples = list(Sample.make_samples(*line.split(" "), args.sample_size))
+            user = samples[0].user
+            if samples:
+                if user in users:
+                    users[user].extend(samples)
+                else:
+                    users[user] = samples
+    fix_point = 99999999999
+    if args.fix_sample_count:
+        for u in users:
+            print(len(users[u]))
+            if len(users[u]) >= args.min and len(users[u]) < fix_point:
+                fix_point = len(users[u])
+        print(fix_point)
     out = [sample
-           for line in args.match_file if "pcap" in line
-           for sample in Sample.make_samples(*line.split(" "), args.sample_size)]
+           for u in users if len(users[u]) >= args.min
+           for sample in users[u][0:fix_point]]
     try:
         import cPickle as pickle
     except:
         import pickle
+    if args.randomize:
+        from random import shuffle
+        shuffle(out)
     pickle.dump(out, args.outfile)
 
 if __name__ == '__main__':