Преглед изворни кода

Added n-fold cross validation sample code.

Thomas Flucke пре 6 година
родитељ
комит
d03562d942

+ 23 - 18
ethan_data_processing_scripts/nearestneighbors.py

@@ -1,3 +1,4 @@
+#!/usr/bin/python3
 from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
 from sklearn.ensemble import RandomForestClassifier
 import numpy as np
@@ -6,11 +7,13 @@ from Vector import *
 
 
 def main():
-    # a test of this method using an arbitrarily generated list of 5 vectors with 3 features each
+    # a test of this method using an arbitrarily generated list of 5 vectors with
+    # 3 features each
     # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
     print(len(sys.argv))
     if len(sys.argv) != 5:
-        print("Usage: nearestneighbors.py datafile.bin classificationsfile.bin testdatafile.bin -(p/e)")
+        print("Usage: nearestneighbors.py datafile.bin classificationsfile.bin " \
+              "testdatafile.bin -(p/e)")
         exit()
     data = readPickledData(sys.argv[1])
     classifcations = readPickledData(sys.argv[2])
@@ -26,23 +29,25 @@ def main():
     kNearestNeighbors(newdata, classifcations, newtest)
     # print("Random Forest:")
     # randomForest(newdata, classifcations, newtest)
-    # kNearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], ["three", 2, 3, "5"], [[1, 1, 0], [0, 5, 5]])
-
-
-def kNearestNeighbors(data: list, classifications: list, test_data: list):
-    kn = KNeighborsClassifier(n_neighbors=2)
-    kn.fit(data, classifications)
-    p = kn.predict(test_data)
-    print("Predictions, matching test_data by index: ")
-    print(test_data)
-    print(p)
-    writestr = "Predictions, matching test_data by index:\n" + str(test_data) + "\n" + str(p)
-    if sys.argv[4][1] == 'p':
-        pickle.dump((test_data, p), open("results.bin", "wb"))
-    else:
-        with open("results.txt", "w+") as file:
-            file.write(writestr)
+    # kNearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]],
+    #                    ["three", 2, 3, "5"], [[1, 1, 0], [0, 5, 5]])
 
def kNearestNeighbors(data: list, classifications: list, test_data: list = None):
    """Run 5-fold cross-validated k-nearest-neighbour classification.

    Parameters:
        data: list of feature vectors (one vector per sample).
        classifications: labels aligned with `data` by index.
        test_data: accepted only for backward compatibility with older
            callers that passed a separate test set; it is ignored —
            under cross-validation the held-out folds come from `data`.

    Prints the held-out samples and their predicted labels for each fold.
    """
    # KFold lives in sklearn.model_selection; the commit used it without
    # importing it. Import locally so this function is self-contained.
    from sklearn.model_selection import KFold

    # Convert to arrays: fancy indexing with the fold index arrays
    # (data[train_index]) raises TypeError on plain Python lists.
    data = np.asarray(data)
    classifications = np.asarray(classifications)

    folds = KFold(n_splits=5)
    for train_index, test_index in folds.split(data):
        kn = KNeighborsClassifier(n_neighbors=2)
        kn.fit(data[train_index], classifications[train_index])
        # Predict the held-out fold drawn from `data` (the committed code
        # referenced an undefined `test_data` here).
        p = kn.predict(data[test_index])
        print("Predictions, matching test_data by index: ")
        print(data[test_index])
        print(p)
    # Result persistence (pickle/text output) was disabled in this commit;
    # left out rather than kept as dead commented code.
 
 def nearestNeighbors(data: list, test_data: list):
     x = np.array(data)

+ 3 - 5
ethan_data_processing_scripts/runtests.py

@@ -1,3 +1,4 @@
+#!/usr/bin/python3
 import sys
 import Vector
 import sample
@@ -12,15 +13,12 @@ def main():
     sampleList = Vector.readPickledData(sys.argv[1])
     featureList = []
     for s in sampleList:
-        featureList.append(Vector.SampleToFeatureVector(s))
+        featureList.append(Vector.FeatureVector(s))
     activeFeatureStrings = []
     for i in range(2, len(sys.argv)):
         activeFeatureStrings.append(sys.argv[i])
     for f in featureList:
-        temp = []
-        for s in activeFeatureStrings:
-            temp.append(f.sampleInfo[s])
-        f.activefeatures = temp
+        f.set_features(activeFeatureStrings)
     # perform classification on f here
     nearestneighbors.kNearestNeighbors()