před 6 roky · 35b818b404
--- a/ethan_data_processing_scripts/README.md
+++ b/ethan_data_processing_scripts/README.md
@@ -6,4 +6,17 @@ Data and Testdata should be arrays of arrays of features, ex:
 
				 
			
 
				 Suppose there are 3 features, each a float from 0 to 1. Data could be: [[.3, .2, .3], [.3, .3, .3], [.3, .4, .5]...]
			
 
				 
			
 
				-Results should be ints in an array, each result accoring to the list of features it should represent the classification of.
			
 
				+Results should be ints in an array, each result corresponding to the list of features whose classification it represents.
			
 
				+
			
 
				+### Current state of nearestneighbors.py
			
 
				+
			
 
				+Usage: nearestneighbors.py datafile.bin classificationsfile.bin testdatafile.bin -p/e
			
 
				+
			
 
				+if -p, a pickled tuple of the test data array and its predicted classifications is written to results.bin
			
 
				+if -e, an English (plain-text) copy of the printout is written to results.txt
			
 
				+
			
 
				+A command line utility that reads in FeatureVectors and runs a KNN classification on them.
			
 
				+
			
 
				+Plan for classifier.py:
			
 
				+Discuss data formatting at meeting, expand utility to include choice of classification and
			
 
				+make more robust in general
			
--- a/ethan_data_processing_scripts/Vector.py
+++ b/ethan_data_processing_scripts/Vector.py
@@ -0,0 +1,51 @@
 
				+try:
			
 
				+    import cPickle as pickle
			
 
				+except ImportError:
			
 
				+    import pickle
			
 
				+
			
 
				+
			
 
				+class FeatureVector:
			
 
				+    def __init__(self):
			
 
				+        self.features = []
			
 
				+        self.activefeatures = []
			
 
				+        self.classification = None
			
 
				+
			
 
				+    # set which features are active using a binary list
			
 
				+    # ex: For 2nd, 4th and 5th features to be active, pass [0,1,0,1,1]
			
 
				+    def setFeatures(self, binlist):
			
 
				+        activefeatures = []
			
 
				+        if len(binlist) != len(self.features):
			
 
				+            print("Feature choice list must equal length of feature list")
			
 
				+        for b in range(len(binlist)):
			
 
				+            if binlist[b] == 1:
			
 
				+                activefeatures.append(self.features[b])
			
 
				+        self.activefeatures = activefeatures
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return str(self.features)
			
 
				+
			
 
				+
			
 
				+def writePickledData(filename):
			
 
				+    v = FeatureVector()
			
 
				+    v.features = [1, 2, 3, 4]
			
 
				+    v.activefeatures = [1, 2, 3, 4]
			
 
				+    vs = [v,v]
			
 
				+    with open(filename, 'wb') as file:
			
 
				+        pickle.dump(vs, file)
			
 
				+
			
 
				+
			
 
				+def readPickledData(filename):
			
 
				+    with open(filename, 'rb') as file:
			
 
				+        x = pickle.load(file)
			
 
				+        # print(x)
			
 
				+    return x
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    fv = FeatureVector()
			
 
				+    writePickledData("test.bin")
			
 
				+    readPickledData("test.txt")
			
 
				+
			
 
				+
			
 
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
			
--- a/ethan_data_processing_scripts/nearestneighbors.py
+++ b/ethan_data_processing_scripts/nearestneighbors.py
@@ -0,0 +1,56 @@
 
				+from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
			
 
				+import numpy as np
			
 
				+import sys
			
 
				+from Vector import *
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    # a test of this method using an arbitrarily generated list of 5 vectors with 3 features each
			
 
				+    # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
			
 
				+    print(len(sys.argv))
			
 
				+    if len(sys.argv) != 5:
			
 
				+        print("Usage: nearestneighbors.py datafile.bin classificationsfile.bin testdatafile.bin -(p/e)")
			
 
				+        exit()
			
 
				+    data = readPickledData(sys.argv[1])
			
 
				+    classifcations = readPickledData(sys.argv[2])
			
 
				+    testdata = readPickledData(sys.argv[3])
			
 
				+    newdata, newtest = [], []
			
 
				+    for d in data:
			
 
				+        newdata.append(d.features)
			
 
				+    for d in testdata:
			
 
				+        newtest.append(d.features)
			
 
				+    print(newdata)
			
 
				+    print(classifcations)
			
 
				+    print(newtest)
			
 
				+    kNearestNeighbors(newdata, classifcations, newtest)
			
 
				+    # kNearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], ["three", 2, 3, "5"], [[1, 1, 0], [0, 5, 5]])
			
 
				+
			
 
				+
			
 
				+def kNearestNeighbors(data: list, classifications: list, test_data: list):
			
 
				+    kn = KNeighborsClassifier(n_neighbors=2)
			
 
				+    kn.fit(data, classifications)
			
 
				+    p = kn.predict(test_data)
			
 
				+    print("Predictions, matching test_data by index: ")
			
 
				+    print(test_data)
			
 
				+    print(p)
			
 
				+    writestr = "Predictions, matching test_data by index:\n" + str(test_data) + "\n" + str(p)
			
 
				+    if sys.argv[4][1] == 'p':
			
 
				+        pickle.dump((test_data, p), open("results.bin", "wb"))
			
 
				+    else:
			
 
				+        with open("results.txt", "w+") as file:
			
 
				+            file.write(writestr)
			
 
				+
			
 
				+
			
 
				+def nearestNeighbors(data: list, test_data: list):
			
 
				+    x = np.array(data)
			
 
				+    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(x)
			
 
				+    dist, indicies = nbrs.kneighbors(test_data)
			
 
				+    print("Indicies:")
			
 
				+    print(indicies)
			
 
				+    print("Distances:")
			
 
				+    print(dist)
			
 
				+    return indicies, dist
			
 
				+
			
 
				+
			
 
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()