|
@@ -24,7 +24,8 @@ def main():
|
|
|
data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
|
|
data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
|
|
|
for p in samples])
|
|
for p in samples])
|
|
|
res = kNearestNeighbors(np.array(data), np.array(labels),
|
|
res = kNearestNeighbors(np.array(data), np.array(labels),
|
|
|
- n=args.folds, verbose=args.verbose)
|
|
|
|
|
|
|
+ n=args.folds, verbose=args.verbose, k=args.k_neighbors,
|
|
|
|
|
+ weights=args.weight, guesses=args.top)
|
|
|
print("Overall Accuracy: %f" % np.average(res))
|
|
print("Overall Accuracy: %f" % np.average(res))
|
|
|
if args.p_value:
|
|
if args.p_value:
|
|
|
_, p = t_test(res, labels)
|
|
_, p = t_test(res, labels)
|
|
@@ -40,33 +41,56 @@ def parse_args():
|
|
|
help='Show more information')
|
|
help='Show more information')
|
|
|
parser.add_argument('-n', '--folds', type=int, default=5,
|
|
parser.add_argument('-n', '--folds', type=int, default=5,
|
|
|
help='Number of cross-validation folds (default: 5)')
|
|
help='Number of cross-validation folds (default: 5)')
|
|
|
|
|
+ parser.add_argument('-k', '--k-neighbors', type=int, default=5,
|
|
|
|
|
+ help='Number of neighbors to consider (default: 5)')
|
|
|
|
|
+ parser.add_argument('-w', '--weight', choices=["uniform", "distance"],
|
|
|
|
|
+ default="uniform", help='Weight function for determining \
|
|
|
|
|
+ distance (default: \"Uniform\")')
|
|
|
parser.add_argument('-f', '--feature', action='append', type=str,
|
|
parser.add_argument('-f', '--feature', action='append', type=str,
|
|
|
help='Add feature to list of features to test with.')
|
|
help='Add feature to list of features to test with.')
|
|
|
parser.add_argument('-p', '--p-value', action='store_const', default=False,
|
|
parser.add_argument('-p', '--p-value', action='store_const', default=False,
|
|
|
const=True, help='Calculate a p-value from a t-test.')
|
|
const=True, help='Calculate a p-value from a t-test.')
|
|
|
|
|
+ parser.add_argument('-t', '--top', type=int, default=1,
|
|
|
|
|
+ help='Number of guesses to be considered \"correct\" \
|
|
|
|
|
+ (default: 1)')
|
|
|
return parser.parse_args()
|
|
return parser.parse_args()
|
|
|
|
|
|
|
|
def kNearestNeighbors(data: list, labels: list,
                      n=5, verbose=0, k=5, weights="uniform", guesses=1):
    """Cross-validated k-nearest-neighbors classification accuracy.

    Args:
        data: indexable array of feature vectors (rows align with labels).
        labels: indexable array of class labels, one per row of data.
        n: number of cross-validation folds.
        verbose: 0 = silent, >=1 = print per-fold round and accuracy,
            >=2 = also print the training indices of each fold.
        k: number of neighbors the classifier consults.
        weights: neighbor weighting scheme passed to KNeighborsClassifier
            ("uniform" or "distance").
        guesses: when > 1, a sample counts as correct if its rank reported
            by find_in_predictions satisfies rank <= guesses; when <= 1,
            the single top prediction must match exactly.

    Returns:
        List of per-fold accuracies in the range [0, 1], length n.
    """
    folds = KFold(n_splits=n)
    accuracies = []
    # Global class ordering used to interpret predict_proba columns.
    # NOTE(review): predict_proba columns follow each fitted classifier's
    # own classes_ for its training fold; this assumes every fold's
    # training set contains every label so the orderings line up —
    # TODO confirm for datasets with rare classes.
    label_list = sorted(np.unique(labels))
    for fold_no, (train_index, test_index) in enumerate(folds.split(data), 1):
        if verbose >= 1:
            print("Round %d:" % fold_no)
        if verbose >= 2:
            print("Training on: ", train_index)
        kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
        kn.fit(data[train_index], labels[train_index])
        predictions = kn.predict(data[test_index])
        if guesses <= 1:
            # Strict accuracy: the single top prediction must match.
            correct = [a == p for a, p in zip(labels[test_index], predictions)]
        else:
            # Relaxed accuracy: accept a sample when the rank computed by
            # find_in_predictions is within the allowed number of guesses.
            ranks = find_in_predictions(kn.predict_proba(data[test_index]),
                                        labels[test_index],
                                        label_list)
            correct = [r <= guesses for r in ranks]
        accuracy = correct.count(True) / len(correct)
        if verbose >= 1:
            print(accuracy)
        accuracies.append(accuracy)
    return accuracies
|
|
|
|
|
|
|
|
|
|
def find_in_predictions(probabilities: list, tests: list, labels: list):
    """Rank each true label within its sample's probability ordering.

    Args:
        probabilities: per-sample class probabilities, one row per sample,
            columns aligned with ``labels``.
        tests: the true label for each sample (same length as
            ``probabilities``).
        labels: class labels in the same order as the probability columns.

    Returns:
        A list with one 1-based rank per sample: 1 means the true label
        was the most probable class, 2 the second most probable, etc., so
        a caller's ``rank <= guesses`` test accepts exactly the top
        ``guesses`` classes.
    """
    ranks = []
    for probs, test in zip(probabilities, tests):
        # Order labels most-probable first. (BUGFIX: the previous version
        # sorted ascending, so the returned index counted from the LEAST
        # likely class and inverted the caller's top-k accuracy check.)
        ordered = [lab for lab, _ in sorted(zip(labels, probs),
                                            key=lambda pair: pair[1],
                                            reverse=True)]
        # +1 converts the 0-based position into a 1-based rank.
        ranks.append(ordered.index(test) + 1)
    return ranks
|
|
|
def t_test(accuracy: list, labels: list):
|
|
def t_test(accuracy: list, labels: list):
|
|
|
from scipy import stats
|
|
from scipy import stats
|
|
|
random_avg = 1.0/len(np.unique(labels))
|
|
random_avg = 1.0/len(np.unique(labels))
|