6 jaren geleden · 84acdb9932
--- a/ethan_data_processing_scripts/Vector.py
+++ b/ethan_data_processing_scripts/Vector.py
@@ -5,15 +5,14 @@ except ImportError:
 
															     import pickle
														
 
															 import os
														
 
															 import sys
														
 
															-import sample
														
 
															 import typing
														
 
															 from typing import List
														
 
															 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
														
 
															                 '/../src/feature-extractor')
														
 
															-
														
 
															+import sample
														
 
															 class FeatureVector:
														
 
															-    def __init__(self, s: sample):
														
 
															+    def __init__(self, s: sample, features = None):
														
 
															         self.activefeatures = []
														
 
															         # list of key, value tuples representing values for features.
														
 
															         self.sampleInfo = s
														
@@ -21,33 +20,22 @@ class FeatureVector:
 
															             self.classification = s.user
														
 
															         else:
														
 
															             self.classification = "DUMMY DATA - DO NOT USE"
														
 
															+        if features is not None:
														
 
															+            self.set_features(features)
														
 
															     # set which features are active using a list of feature names
														
 
															     # ~~ex: For 2nd, 4th and 5th features to be active, pass [0,1,0,1,1]~~
														
 
															     # ex: ["total_time", "average_iat", "dead_time"]
														
 
															     def set_features(self, features: typing.List[str]):
														
 
															         self.activefeatures = [self.sampleInfo[feature] for feature in features]
														
 
															+        return self
														
 
															+    
														
 
															+    def get(self):
														
 
															+        return self.activefeatures
														
 
															     def __repr__(self):
														
 
															         return str(self.activefeatures)
														
 
															-
														
 
															-def writePickledData(filename):
														
 
															-    v = FeatureVector()
														
 
															-    v.features = [1, 2, 3, 4]
														
 
															-    v.activefeatures = [1, 2, 3, 4]
														
 
															-    vs = [v,v]
														
 
															-    with open(filename, 'wb') as file:
														
 
															-        pickle.dump(vs, file)
														
 
															-
														
 
															-
														
 
															-def readPickledData(filename):
														
 
															-    with open(filename, 'rb') as file:
														
 
															-        x = pickle.load(file)
														
 
															-        # print(x)
														
 
															-    return x
														
 
															-
														
 
															-
														
 
															 def main():
														
 
															     # fv = FeatureVector()
														
 
															     # writePickledData("test.bin")
														
--- a/ethan_data_processing_scripts/nearestneighbors.py
+++ b/ethan_data_processing_scripts/nearestneighbors.py
@@ -4,70 +4,62 @@ from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
 
															 from sklearn.ensemble import RandomForestClassifier
														
 
															 import numpy as np
														
 
															 import sys
														
 
															-from Vector import *
														
 
															+from Vector import FeatureVector
														
 
															+DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
														
 
															 def main():
														
 
															     # a test of this method using an arbitrarily generated list of 5 vectors with
														
 
															     # 3 features each
														
 
															     # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
														
 
															-    print(len(sys.argv))
														
 
															-    if len(sys.argv) != 5:
														
 
															-        print("Usage: nearestneighbors.py datafile.bin classificationsfile.bin " \
														
 
															-              "testdatafile.bin -(p/e)")
														
 
															-        exit()
														
 
															-    data = readPickledData(sys.argv[1])
														
 
															-    classifcations = readPickledData(sys.argv[2])
														
 
															-    testdata = readPickledData(sys.argv[3])
														
 
															-    newdata, newtest = [], []
														
 
															-    for d in data:
														
 
															-        newdata.append(d.features)
														
 
															-    for d in testdata:
														
 
															-        newtest.append(d.features)
														
 
															-    print(newdata)
														
 
															-    print(classifcations)
														
 
															-    print(newtest)
														
 
															-    kNearestNeighbors(newdata, classifcations, newtest)
														
 
															-    # print("Random Forest:")
														
 
															-    # randomForest(newdata, classifcations, newtest)
														
 
															-    # kNearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]],
														
 
															-    #                    ["three", 2, 3, "5"], [[1, 1, 0], [0, 5, 5]])
														
 
															+    args = parse_args()
														
 
															+    try:
														
 
															+        import cPickle as pickle
														
 
															+    except:
														
 
															+        import pickle
														
 
															+    samples = pickle.load(args.features_file)
														
 
															+    features = args.feature if args.feature else DEFAULT_FEATURES
														
 
															+    from random import shuffle
														
 
															+    shuffle(samples)
														
 
															+    data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
														
 
															+                         for p in samples])
														
 
															+    res = kNearestNeighbors(np.array(data), np.array(labels),
														
 
															+                            n=args.folds, verbose=args.verbose)
														
 
															+    print("Overall Accuracy: %f" % res)
														
 
															+def parse_args():
														
 
															+    import argparse
														
 
															+    parser = argparse.ArgumentParser(
														
 
															+        description='Run a data set through a kNearestNeighbors classifier.')
														
 
															+    parser.add_argument('features_file', type=argparse.FileType('rb'),
														
 
															+                        help='File of extracted features.')
														
 
															+    parser.add_argument('-v', '--verbose', action="count", default=0,
														
 
															+                        help='Show more information')
														
 
															+    parser.add_argument('-n', '--folds', type=int, default=5,
														
 
															+                        help='Number of cross-validation folds (default: 5)')
														
 
															+    parser.add_argument('-f', '--feature', action='append', type=str,
														
 
															+                        help='Add feature to list of features to test with.')
														
 
															+    return parser.parse_args()
														
 
															-def kNearestNeighbors(data: list, classifications: list, test_data: list):
														
 
															-    folds = KFold(n_splits=5)
														
 
															+def kNearestNeighbors(data: list, labels: list, n=5, verbose=0):
														
 
															+    folds = KFold(n_splits=n)
														
 
															+    i = 1
														
 
															+    avg = 0
														
 
															     for train_index, test_index in folds.split(data):
														
 
															+        if verbose >= 1:
														
 
															+            print("Round %d:" % i)
														
 
															+            i += 1
														
 
															+        if verbose >= 2:
														
 
															+            print("Training on: ", train_index)
														
 
															         kn = KNeighborsClassifier(n_neighbors=2)
														
 
															-        kn.fit(data[train_index], classifications[train_index])
														
 
															-        p = kn.predict(test_data[test_index])
														
 
															-        print("Predictions, matching test_data by index: ")
														
 
															-        print(test_data[test_index])
														
 
															-        print(p)
														
 
															-        writestr = "Predictions, matching test_data by index:\n" + str(test_data) \
														
 
															-                   + "\n" + str(p)
														
 
															-    # if sys.argv[4][1] == 'p':
														
 
															-    #     pickle.dump((test_data, p), open("results.bin", "wb"))
														
 
															-    # else:
														
 
															-    #     with open("results.txt", "w+") as file:
														
 
															-    #         file.write(writestr)
														
 
															-
														
 
															-
														
 
															-def nearestNeighbors(data: list, test_data: list):
														
 
															-    x = np.array(data)
														
 
															-    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(x)
														
 
															-    dist, indicies = nbrs.kneighbors(test_data)
														
 
															-    print("Indicies:")
														
 
															-    print(indicies)
														
 
															-    print("Distances:")
														
 
															-    print(dist)
														
 
															-    return indicies, dist
														
 
															-
														
 
															-
														
 
															-def randomForest(data: list, classifications: list, test_data: list):
														
 
															-    rfc = RandomForestClassifier(n_estimators=len(data))
														
 
															-    rfc.fit(data, classifications)
														
 
															-    print(rfc.predict(test_data))
														
 
															-
														
 
															+        kn.fit(data[train_index], labels[train_index])
														
 
															+        predictions = kn.predict(data[test_index])
														
 
															+        correct = [a == p for a, p in zip(labels[test_index], predictions)]
														
 
															+        accuracy = correct.count(True)/len(correct)
														
 
															+        if verbose >= 1:
														
 
															+            print(accuracy)
														
 
															+        avg += accuracy
														
 
															+    return avg/n
														
 
															 if __name__ == '__main__':
														
 
															     main()
														
--- a/src/feature-extractor/sample.py
+++ b/src/feature-extractor/sample.py
@@ -4,7 +4,6 @@ import typing
 
															 from typing import List
														
 
															 from common import window
														
 
															-
														
 
															 class Sample:
														
 
															     EPOCH = datetime(1970, 1, 1)
														
 
															     TIME_FMT = '%Y-%m-%d %H:%M:%S.%f'
														
@@ -29,7 +28,7 @@ class Sample:
 
															     def set_activity_thresholds(lower_bound: float, upper_bound: float,
														
 
															                                 lookback: float):
														
 
															         assert(lower_bound < upper_bound)
														
 
															-        assert(lookback <= lower_bound)
														
 
															+        assert(lower_bound <= lookback)
														
 
															         assert(0 < lookback)
														
 
															         Sample.high_act_threshold = upper_bound
														
 
															         Sample.low_act_threshold = lower_bound
														
@@ -168,11 +167,13 @@ class Sample:
 
															             raise ValueError('Unable to access value at %s, unknown prefix.' % key)
														
 
															     def keys(self):
														
 
															-        return self.__general.keys().extend(
														
 
															+        res = [k for k in self.__general.keys()]
														
 
															+        res.extend(
														
 
															             ["%s.%s" % (prefix, suffix)
														
 
															              for prefix in self.__activities.keys()
														
 
															              for suffix in self.__activities[prefix].keys()]
														
 
															         )
														
 
															+        return res
														
 
															     def __str__(self):
														
 
															         return "Sample: {%s, high: %s, mid: %s, low: %s}" % \