|
|
@@ -4,70 +4,62 @@ from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
|
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
|
import numpy as np
|
|
|
import sys
|
|
|
-from Vector import *
|
|
|
+from Vector import FeatureVector
|
|
|
|
|
|
+DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
|
|
|
|
|
|
def main():
    """Load pickled feature samples, shuffle them, and report kNN accuracy.

    Command-line driven (see parse_args): reads a binary file of extracted
    feature samples, builds one numeric feature vector per sample for the
    configured feature names, and prints the mean accuracy of an n-fold
    cross-validated k-nearest-neighbors classification.
    """
    args = parse_args()
    # Python 2 compatibility: cPickle is the C-accelerated pickler there;
    # on Python 3 plain pickle already uses the C implementation.
    try:
        import cPickle as pickle
    except ImportError:  # was a bare `except:` — only ImportError is expected
        import pickle
    # SECURITY NOTE(review): unpickling a user-supplied file can execute
    # arbitrary code — only feed trusted feature files to this script.
    samples = pickle.load(args.features_file)
    features = args.feature if args.feature else DEFAULT_FEATURES
    # Shuffle so the cross-validation folds are not biased by file ordering.
    from random import shuffle
    shuffle(samples)
    # One row of numeric features plus the user label per sample.
    data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
                         for p in samples])
    res = kNearestNeighbors(np.array(data), np.array(labels),
                            n=args.folds, verbose=args.verbose)
    print("Overall Accuracy: %f" % res)
|
|
def parse_args():
    """Build and evaluate the command-line interface for this script."""
    import argparse

    cli = argparse.ArgumentParser(
        description='Run a data set through a kNearestNeighbors classifier.')
    # Required input: a binary file of extracted features.
    cli.add_argument('features_file', type=argparse.FileType('rb'),
                     help='File of extracted features.')
    # Repeatable -v flag bumps the verbosity level by one each time.
    cli.add_argument('-v', '--verbose', action="count", default=0,
                     help='Show more information')
    cli.add_argument('-n', '--folds', type=int, default=5,
                     help='Number of cross-validation folds (default: 5)')
    # -f may be given several times; occurrences are collected into a list.
    cli.add_argument('-f', '--feature', action='append', type=str,
                     help='Add feature to list of features to test with.')
    return cli.parse_args()
|
|
def kNearestNeighbors(data: np.ndarray, labels: np.ndarray, n=5, verbose=0,
                      n_neighbors=2):
    """Cross-validate a k-nearest-neighbors classifier.

    Args:
        data: 2-D array of feature rows, one row per sample (must be a
            numpy array — it is indexed with fold index arrays).
        labels: 1-D array of class labels aligned with ``data``.
        n: number of cross-validation folds.
        verbose: 0 = silent, 1 = print per-round accuracy,
            2 = additionally print the training indices of each fold.
        n_neighbors: neighbors considered by the classifier (default 2,
            matching the previously hard-coded value).

    Returns:
        Mean accuracy across the ``n`` folds.
    """
    folds = KFold(n_splits=n)
    total = 0.0
    # enumerate replaces the hand-maintained round counter, which the old
    # code only advanced inside the verbose branch.
    for round_no, (train_index, test_index) in enumerate(folds.split(data), 1):
        if verbose >= 1:
            print("Round %d:" % round_no)
        if verbose >= 2:
            print("Training on: ", train_index)
        kn = KNeighborsClassifier(n_neighbors=n_neighbors)
        kn.fit(data[train_index], labels[train_index])
        predictions = kn.predict(data[test_index])
        # Fraction of test-fold samples predicted correctly.
        correct = [a == p for a, p in zip(labels[test_index], predictions)]
        accuracy = correct.count(True) / len(correct)
        if verbose >= 1:
            print(accuracy)
        total += accuracy
    return total / n
|
|
# Script entry point: run the kNN classification experiment when this
# module is executed directly (not when imported).
if __name__ == '__main__':
    main()