Bladeren bron

Fixed up nearest neighbor classifier. Now working.

Thomas Flucke 6 jaren geleden
bovenliggende
commit
84acdb9932

+ 8 - 20
ethan_data_processing_scripts/Vector.py

@@ -5,15 +5,14 @@ except ImportError:
     import pickle
     import pickle
 import os
 import os
 import sys
 import sys
-import sample
 import typing
 import typing
 from typing import List
 from typing import List
 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
                 '/../src/feature-extractor')
                 '/../src/feature-extractor')
-
+import sample
 
 
 class FeatureVector:
 class FeatureVector:
-    def __init__(self, s: sample):
+    def __init__(self, s: sample, features = None):
         self.activefeatures = []
         self.activefeatures = []
         # list of key, value tuples representing values for features.
         # list of key, value tuples representing values for features.
         self.sampleInfo = s
         self.sampleInfo = s
@@ -21,33 +20,22 @@ class FeatureVector:
             self.classification = s.user
             self.classification = s.user
         else:
         else:
             self.classification = "DUMMY DATA - DO NOT USE"
             self.classification = "DUMMY DATA - DO NOT USE"
+        if features is not None:
+            self.set_features(features)
 
 
     # set which features are active using a binary list
     # set which features are active using a binary list
     # ~~ex: For 2nd, 4th and 5th features to be active, pass [0,1,0,1,1]~~
     # ~~ex: For 2nd, 4th and 5th features to be active, pass [0,1,0,1,1]~~
     # ex: ["total_time", "average_iat", "dead_time"]
     # ex: ["total_time", "average_iat", "dead_time"]
     def set_features(self, features: typing.List[str]):
     def set_features(self, features: typing.List[str]):
         self.activefeatures = [self.sampleInfo[feature] for feature in features]
         self.activefeatures = [self.sampleInfo[feature] for feature in features]
+        return self
+    
+    def get(self):
+        return self.activefeatures
 
 
     def __repr__(self):
     def __repr__(self):
         return str(self.activefeatures)
         return str(self.activefeatures)
 
 
-
-def writePickledData(filename):
-    v = FeatureVector()
-    v.features = [1, 2, 3, 4]
-    v.activefeatures = [1, 2, 3, 4]
-    vs = [v,v]
-    with open(filename, 'wb') as file:
-        pickle.dump(vs, file)
-
-
-def readPickledData(filename):
-    with open(filename, 'rb') as file:
-        x = pickle.load(file)
-        # print(x)
-    return x
-
-
 def main():
 def main():
     # fv = FeatureVector()
     # fv = FeatureVector()
     # writePickledData("test.bin")
     # writePickledData("test.bin")

+ 46 - 54
ethan_data_processing_scripts/nearestneighbors.py

@@ -4,70 +4,62 @@ from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import RandomForestClassifier
 import numpy as np
 import numpy as np
 import sys
 import sys
-from Vector import *
+from Vector import FeatureVector
 
 
+DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
 
 
 def main():
 def main():
     # a test of this method using an arbitrarily generated list of 5 vectors with
     # a test of this method using an arbitrarily generated list of 5 vectors with
     # 3 features each
     # 3 features each
     # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
     # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
-    print(len(sys.argv))
-    if len(sys.argv) != 5:
-        print("Usage: nearestneighbors.py datafile.bin classificationsfile.bin " \
-              "testdatafile.bin -(p/e)")
-        exit()
-    data = readPickledData(sys.argv[1])
-    classifcations = readPickledData(sys.argv[2])
-    testdata = readPickledData(sys.argv[3])
-    newdata, newtest = [], []
-    for d in data:
-        newdata.append(d.features)
-    for d in testdata:
-        newtest.append(d.features)
-    print(newdata)
-    print(classifcations)
-    print(newtest)
-    kNearestNeighbors(newdata, classifcations, newtest)
-    # print("Random Forest:")
-    # randomForest(newdata, classifcations, newtest)
-    # kNearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]],
-    #                    ["three", 2, 3, "5"], [[1, 1, 0], [0, 5, 5]])
+    args = parse_args()
+    try:
+        import cPickle as pickle
+    except:
+        import pickle
+    samples = pickle.load(args.features_file)
+    features = args.feature if args.feature else DEFAULT_FEATURES
+    from random import shuffle
+    shuffle(samples)
+    data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
+                         for p in samples])
+    res = kNearestNeighbors(np.array(data), np.array(labels),
+                            n=args.folds, verbose=args.verbose)
+    print("Overall Accuracy: %f" % res)
 
 
+def parse_args():
+    import argparse
+    parser = argparse.ArgumentParser(
+        description='Run a data set through a kNearestNeighbors classifier.')
+    parser.add_argument('features_file', type=argparse.FileType('rb'),
+                        help='File of extracted features.')
+    parser.add_argument('-v', '--verbose', action="count", default=0,
+                        help='Show more information')
+    parser.add_argument('-n', '--folds', type=int, default=5,
+                        help='Number of cross-validation folds (default: 5)')
+    parser.add_argument('-f', '--feature', action='append', type=str,
+                        help='Add feature to list of features to test with.')
+    return parser.parse_args()
 
 
-def kNearestNeighbors(data: list, classifications: list, test_data: list):
-    folds = KFold(n_splits=5)
+def kNearestNeighbors(data: list, labels: list, n=5, verbose=0):
+    folds = KFold(n_splits=n)
+    i = 1
+    avg = 0
     for train_index, test_index in folds.split(data):
     for train_index, test_index in folds.split(data):
+        if verbose >= 1:
+            print("Round %d:" % i)
+            i += 1
+        if verbose >= 2:
+            print("Training on: ", train_index)
         kn = KNeighborsClassifier(n_neighbors=2)
         kn = KNeighborsClassifier(n_neighbors=2)
-        kn.fit(data[train_index], classifications[train_index])
-        p = kn.predict(test_data[test_index])
-        print("Predictions, matching test_data by index: ")
-        print(test_data[test_index])
-        print(p)
-        writestr = "Predictions, matching test_data by index:\n" + str(test_data) \
-                   + "\n" + str(p)
-    # if sys.argv[4][1] == 'p':
-    #     pickle.dump((test_data, p), open("results.bin", "wb"))
-    # else:
-    #     with open("results.txt", "w+") as file:
-    #         file.write(writestr)
-
-
-def nearestNeighbors(data: list, test_data: list):
-    x = np.array(data)
-    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(x)
-    dist, indicies = nbrs.kneighbors(test_data)
-    print("Indicies:")
-    print(indicies)
-    print("Distances:")
-    print(dist)
-    return indicies, dist
-
-
-def randomForest(data: list, classifications: list, test_data: list):
-    rfc = RandomForestClassifier(n_estimators=len(data))
-    rfc.fit(data, classifications)
-    print(rfc.predict(test_data))
-
+        kn.fit(data[train_index], labels[train_index])
+        predictions = kn.predict(data[test_index])
+        correct = [a == p for a, p in zip(labels[test_index], predictions)]
+        accuracy = correct.count(True)/len(correct)
+        if verbose >= 1:
+            print(accuracy)
+        avg += accuracy
+    return avg/n
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
     main()
     main()

+ 4 - 3
src/feature-extractor/sample.py

@@ -4,7 +4,6 @@ import typing
 from typing import List
 from typing import List
 from common import window
 from common import window
 
 
-
 class Sample:
 class Sample:
     EPOCH = datetime(1970, 1, 1)
     EPOCH = datetime(1970, 1, 1)
     TIME_FMT = '%Y-%m-%d %H:%M:%S.%f'
     TIME_FMT = '%Y-%m-%d %H:%M:%S.%f'
@@ -29,7 +28,7 @@ class Sample:
     def set_activity_thresholds(lower_bound: float, upper_bound: float,
     def set_activity_thresholds(lower_bound: float, upper_bound: float,
                                 lookback: float):
                                 lookback: float):
         assert(lower_bound < upper_bound)
         assert(lower_bound < upper_bound)
-        assert(lookback <= lower_bound)
+        assert(lower_bound <= lookback)
         assert(0 < lookback)
         assert(0 < lookback)
         Sample.high_act_threshold = upper_bound
         Sample.high_act_threshold = upper_bound
         Sample.low_act_threshold = lower_bound
         Sample.low_act_threshold = lower_bound
@@ -168,11 +167,13 @@ class Sample:
             raise ValueError('Unable to access value at %s, unknown prefix.' % key)
             raise ValueError('Unable to access value at %s, unknown prefix.' % key)
 
 
     def keys(self):
     def keys(self):
-        return self.__general.keys().extend(
+        res = [k for k in self.__general.keys()]
+        res.extend(
             ["%s.%s" % (prefix, suffix)
             ["%s.%s" % (prefix, suffix)
              for prefix in self.__activities.keys()
              for prefix in self.__activities.keys()
              for suffix in self.__activities[prefix].keys()]
              for suffix in self.__activities[prefix].keys()]
         )
         )
+        return res
         
         
     def __str__(self):
     def __str__(self):
         return "Sample: {%s, high: %s, mid: %s, low: %s}" % \
         return "Sample: {%s, high: %s, mid: %s, low: %s}" % \