
Added a few new parameters to the nearest neighbor classifier.
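
A minimal sketch of what the new options map to, using made-up toy data (the
labels and values below are purely illustrative; KNeighborsClassifier,
predict_proba, and classes_ are the actual scikit-learn API):

    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    # Toy two-class dataset, invented for illustration.
    X = np.array([[0.0], [0.1], [0.2], [1.0], [1.1], [1.2]])
    y = np.array(['alice', 'alice', 'alice', 'bob', 'bob', 'bob'])

    # -k/--k-neighbors and -w/--weight pass straight through to sklearn.
    kn = KNeighborsClassifier(n_neighbors=3, weights='distance')
    kn.fit(X, y)

    # -t/--top counts a sample as correct when its true label is among the
    # N classes with the highest predicted probability.
    probs = kn.predict_proba(np.array([[0.15]]))[0]
    ranked = kn.classes_[np.argsort(probs)[::-1]]  # labels, best guess first
    print(ranked[:2])  # the top-2 guesses for this query point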

Thomas Flucke 6 years ago
parent
current commit
693e396395
1 changed file with 28 additions and 4 deletions

+ 28 - 4
src/classifiers/nearestneighbors.py

@@ -24,7 +24,8 @@ def main():
     data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
                          for p in samples])
     res = kNearestNeighbors(np.array(data), np.array(labels),
-                            n=args.folds, verbose=args.verbose)
+                            n=args.folds, verbose=args.verbose, k=args.k_neighbors,
+                            weights=args.weight, guesses=args.top)
     print("Overall Accuracy: %f" % np.average(res))
     if args.p_value:
         _, p = t_test(res, labels)
@@ -40,33 +41,56 @@ def parse_args():
                         help='Show more information')
     parser.add_argument('-n', '--folds', type=int, default=5,
                         help='Number of cross-validation folds (default: 5)')
+    parser.add_argument('-k', '--k-neighbors', type=int, default=5,
+                        help='Number of neighbors to consider (default: 5)')
+    parser.add_argument('-w', '--weight', choices=["uniform", "distance"],
+                        default="uniform", help='Neighbor weighting function '
+                        'used in prediction (default: "uniform")')
     parser.add_argument('-f', '--feature', action='append', type=str,
                         help='Add feature to list of features to test with.')
     parser.add_argument('-p', '--p-value', action='store_const', default=False,
                         const=True, help='Calculate a p-value from a t-test.')
+    parser.add_argument('-t', '--top', type=int, default=1,
+                        help='Number of top guesses within which a prediction '
+                        'counts as correct (default: 1)')
     return parser.parse_args()
 
-def kNearestNeighbors(data: list, labels: list, n=5, verbose=0):
+def kNearestNeighbors(data: list, labels: list,
+                      n=5, verbose=0, k=5, weights="uniform", guesses=1):
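+    # n: number of cross-validation folds; k: neighbors consulted per query;
+    # weights: neighbor weighting scheme ("uniform" or "distance");
+    # guesses: how many top-ranked guesses count as a correct prediction.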
     folds = KFold(n_splits=n)
     i = 1
     avg = 0
     accuracies = []
     for train_index, test_index in folds.split(data):
         if verbose >= 1:
             print("Round %d:" % i)
             i += 1
         if verbose >= 2:
             print("Training on: ", train_index)
-        kn = KNeighborsClassifier(n_neighbors=2)
+        kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
         kn.fit(data[train_index], labels[train_index])
         predictions = kn.predict(data[test_index])
-        correct = [a == p for a, p in zip(labels[test_index], predictions)]
+        if guesses <= 1:
+            correct = [a == p for a, p in zip(labels[test_index], predictions)]
+        else:
+            # A sample counts as correct when its true label ranks within
+            # the top `guesses` classes by predicted probability.
+            ranks = find_in_predictions(kn.predict_proba(data[test_index]),
+                                        labels[test_index], kn.classes_)
+            correct = [rank < guesses for rank in ranks]
         accuracy = correct.count(True)/len(correct)
         if verbose >= 1:
             print(accuracy)
         accuracies.append(accuracy)
     return accuracies
 
+def find_in_predictions(probabilities: list, tests: list, labels: list):
+    # For each sample, sort the class labels by descending predicted
+    # probability and return the 0-based rank of the true label.
+    return [[label for label, _ in
+             sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
+             ].index(test)
+            for probs, test in zip(probabilities, tests)]
+
 def t_test(accuracy: list, labels: list):
     from scipy import stats
     random_avg = 1.0/len(np.unique(labels))