Procházet zdrojové kódy

Merge branch 'master' of git.tflucke.name:tflucke/SSH-Master-Thesis

Thomas Flucke před 6 roky
rodič
revize
35b818b404

+ 14 - 1
ethan_data_processing_scripts/README.md

@@ -6,4 +6,17 @@ Data and Testdata should be arrays of arrays of features, ex:
 
 Suppose there are 3 features, each a float from 0 to 1. Data could be: [[.3, .2, .3], [.3, .3, .3], [.3, .4, .5]...]
 
-Results should be ints in an array, each result accoring to the list of features it should represent the classification of.
+Results should be ints in an array, each result being the classification of the list of features at the same index.
+
+### Current state of nearestneighbors.py
+
+Usage: nearestneighbors.py datafile.bin classificationsfile.bin testdatafile.bin -p/e
+
+If -p, a pickled tuple of the test data array and its predicted classifications is written to results.bin.
+If -e, a plain-English copy of the printout is written to results.txt.
+
+A command line utility that reads in FeatureVectors and runs a KNN classification on them.
+
+Plan for classifier.py:
+Discuss data formatting at meeting, expand utility to include choice of classification and
+make more robust in general

+ 51 - 0
ethan_data_processing_scripts/Vector.py

@@ -0,0 +1,51 @@
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+
+class FeatureVector:
+    """A list of feature values with an optional classification label.
+
+    Attributes (all set directly by callers):
+      features        -- every feature value of this vector
+      activefeatures  -- the subset of ``features`` chosen by setFeatures()
+      classification  -- label of this vector; None until one is assigned
+    """
+
+    def __init__(self):
+        self.features = []
+        self.activefeatures = []
+        self.classification = None
+
+    # set which features are active using a binary list
+    # ex: For 2nd, 4th and 5th features to be active, pass [0,1,0,1,1]
+    def setFeatures(self, binlist):
+        """Select the active features with a 0/1 mask.
+
+        ``binlist`` must have one entry per element of ``self.features``;
+        an entry equal to 1 marks the feature at that position as active.
+
+        Raises:
+            ValueError: if ``binlist`` and ``self.features`` differ in length.
+                ``self.activefeatures`` is left unchanged in that case.
+        """
+        if len(binlist) != len(self.features):
+            # A mismatched mask cannot be applied meaningfully: a longer one
+            # would index past the end of the features, a shorter one would
+            # silently ignore trailing features. Fail before touching state.
+            raise ValueError("Feature choice list must equal length of feature list")
+        self.activefeatures = [
+            feature for feature, flag in zip(self.features, binlist) if flag == 1
+        ]
+
+    def __repr__(self):
+        """Return the string form of the full feature list."""
+        return str(self.features)
+
+
+def writePickledData(filename):
+    """Pickle a small sample list of FeatureVectors to ``filename``.
+
+    The list holds the same FeatureVector object twice, with features and
+    activefeatures both set to [1, 2, 3, 4].
+    """
+    sample = FeatureVector()
+    sample.features = [1, 2, 3, 4]
+    sample.activefeatures = [1, 2, 3, 4]
+    payload = [sample, sample]
+    with open(filename, 'wb') as out:
+        pickle.dump(payload, out)
+
+
+def readPickledData(filename):
+    """Load and return the object pickled in ``filename``.
+
+    Unpickling can execute arbitrary code, so only pass files from a
+    trusted source.
+    """
+    with open(filename, 'rb') as source:
+        return pickle.load(source)
+
+
+def main():
+    """Smoke test: pickle the sample vectors to disk, then load them back."""
+    # Write and read the SAME file; reading a different name would fail
+    # because writePickledData only creates the file it is given.
+    filename = "test.bin"
+    writePickledData(filename)
+    readPickledData(filename)
+
+
+# Run main() only when this file is executed directly, not when it is imported.
+if __name__ == '__main__':
+    main()

+ 56 - 0
ethan_data_processing_scripts/nearestneighbors.py

@@ -0,0 +1,56 @@
+from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
+import numpy as np
+import sys
+from Vector import *
+
+
+def main():
+    """Command-line entry point: load pickled inputs and run the KNN classifier.
+
+    Expects exactly four arguments after the script name: the training data
+    file, the classifications file, the test data file and an output flag
+    (-p or -e). The data files are read with readPickledData; the training
+    and test files must contain iterables of objects with a ``features``
+    attribute. On bad arguments the usage text is printed and the process
+    exits with status 1.
+    """
+    # a test of this method using an arbitrarily generated list of 5 vectors with 3 features each
+    # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
+    # kNearestNeighbors reads the second character of the flag, so a flag
+    # shorter than two characters is rejected here instead of crashing later.
+    if len(sys.argv) != 5 or len(sys.argv[4]) < 2:
+        print("Usage: nearestneighbors.py datafile.bin classificationsfile.bin testdatafile.bin -(p/e)")
+        # sys.exit is always available (exit() is only a site-module helper)
+        # and a non-zero status tells the caller the invocation failed.
+        sys.exit(1)
+    data = readPickledData(sys.argv[1])
+    classifications = readPickledData(sys.argv[2])
+    testdata = readPickledData(sys.argv[3])
+    # The classifier works on plain feature lists, not on the vector objects.
+    newdata = [d.features for d in data]
+    newtest = [d.features for d in testdata]
+    print(newdata)
+    print(classifications)
+    print(newtest)
+    kNearestNeighbors(newdata, classifications, newtest)
+    # kNearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], ["three", 2, 3, "5"], [[1, 1, 0], [0, 5, 5]])
+
+
+def kNearestNeighbors(data: list, classifications: list, test_data: list):
+    """Fit a 2-nearest-neighbours classifier and predict labels for test_data.
+
+    The classifier is trained on ``data`` with labels ``classifications``.
+    The predictions are printed and also written to disk: when the second
+    character of sys.argv[4] is 'p', the tuple (test_data, predictions) is
+    pickled to results.bin; otherwise the printed text is written to
+    results.txt.
+    """
+    kn = KNeighborsClassifier(n_neighbors=2)
+    kn.fit(data, classifications)
+    p = kn.predict(test_data)
+    print("Predictions, matching test_data by index: ")
+    print(test_data)
+    print(p)
+    if sys.argv[4][1] == 'p':
+        # Use a context manager so the file is flushed and closed even if
+        # pickling fails, instead of leaking the handle.
+        with open("results.bin", "wb") as file:
+            pickle.dump((test_data, p), file)
+    else:
+        writestr = "Predictions, matching test_data by index:\n" + str(test_data) + "\n" + str(p)
+        with open("results.txt", "w+") as file:
+            file.write(writestr)
+
+
+def nearestNeighbors(data: list, test_data: list):
+    """Find the single nearest point in ``data`` for each row of ``test_data``.
+
+    Builds a ball-tree NearestNeighbors index over ``data``, queries it with
+    ``test_data``, prints the resulting indices and distances, and returns
+    them as the tuple (indices, distances).
+    """
+    points = np.array(data)
+    model = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
+    model.fit(points)
+    distances, neighbor_idx = model.kneighbors(test_data)
+    print("Indicies:")
+    print(neighbor_idx)
+    print("Distances:")
+    print(distances)
+    return neighbor_idx, distances
+
+
+# Run main() only when this file is executed directly, not when it is imported.
+if __name__ == '__main__':
+    main()