|
@@ -1,4 +1,4 @@
|
|
|
-#!/usr/bin/python3
|
|
|
|
|
|
|
+#!/home/tflucke/bin/bin/python3
|
|
|
|
|
|
|
|
from sklearn.model_selection import KFold
|
|
from sklearn.model_selection import KFold
|
|
|
import numpy as np
|
|
import numpy as np
|
|
@@ -26,10 +26,57 @@ def main(options: list):
|
|
|
zip(*[(FeatureVector(p, features).get(), p.user)
|
|
zip(*[(FeatureVector(p, features).get(), p.user)
|
|
|
for p in samples]))
|
|
for p in samples]))
|
|
|
num_users = len(np.unique([s.user for s in samples]))
|
|
num_users = len(np.unique([s.user for s in samples]))
|
|
|
- avg, p = classify(data, labels, num_users, args)
|
|
|
|
|
- print("Overall Accuracy: %f" % avg)
|
|
|
|
|
|
|
+ s = np.arange(data.shape[0])
|
|
|
|
|
+ np.random.shuffle(s)
|
|
|
|
|
+ if args.graph_top:
|
|
|
|
|
+ graph_top(args, data[s], labels[s])
|
|
|
|
|
+ res, matrix = kNearestNeighbors(data[s], labels[s], n=args.folds,
|
|
|
|
|
+ verbose=args.verbose, guesses=args.top,
|
|
|
|
|
+ k=args.k_neighbors, weights=args.weight)
|
|
|
|
|
+ print("Overall Accuracy: %f" % np.average(res))
|
|
|
if args.p_value:
|
|
if args.p_value:
|
|
|
- print("P-Value: %f" % p)
|
|
|
|
|
|
|
+ print("P-Value: %f" % t_test(res, num_users)[1] / 2)
|
|
|
|
|
+ if args.graph:
|
|
|
|
|
+ gen_confusion_matrix(matrix, labels)
|
|
|
|
|
+
|
|
|
|
|
def graph_top(args, data, labels):
    """Plot cross-validated accuracy against the number of guesses allowed.

    For every N from 1 up to the number of distinct labels, runs
    ``kNearestNeighbors`` with ``guesses=N`` and records the mean of the
    per-fold accuracies it returns.  The resulting curve is written to
    ``nearest-neighbor-top-n.png`` in the current working directory.

    Args:
        args: Parsed command-line options; ``folds``, ``verbose``,
            ``k_neighbors`` and ``weight`` are read from it.
        data: Feature vectors, forwarded unchanged to ``kNearestNeighbors``.
        labels: True label of each row of ``data``.
    """
    # Imported lazily so the plotting stack is only required when a graph
    # is actually requested.
    import seaborn as sns
    from pandas import DataFrame
    from matplotlib import pyplot as plt

    x_col = "Top-N Guesses"
    # Accuracies are fractions in [0, 1], not percentages, so the column
    # (and therefore the axis label) must not claim "%".
    y_col = "Accuracy"

    num_labels = len(np.unique(labels))
    res = []
    for t in range(1, num_labels + 1):
        fold_accuracies = kNearestNeighbors(
            data, labels, n=args.folds, guesses=t,
            verbose=args.verbose, k=args.k_neighbors,
            weights=args.weight)[0]
        res.append((t, np.average(fold_accuracies)))

    dataset = DataFrame(res, columns=[x_col, y_col])

    # Draw on a dedicated figure so this plot neither inherits artists from
    # nor leaks artists into any other plot produced in the same run.
    fig, ax = plt.subplots()
    # x/y are passed by keyword and must name columns that exist in
    # `dataset`; newer seaborn releases reject positional x/y.
    sns.lineplot(x=x_col, y=y_col, data=dataset, ax=ax)
    # The upper bounds are exclusive in np.arange, hence the +1 / 1.1 so
    # the last N and an accuracy of 1.0 can receive a tick.
    ax.set_xticks(np.arange(1, num_labels + 1, 2))
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    ax.set_title('K-Nearest Neighbor Accuracy on Nth Guess')
    fig.savefig("nearest-neighbor-top-n.png")
    plt.close(fig)
|
|
|
|
|
+
|
|
|
|
|
def gen_confusion_matrix(matrix, labels):
    """Render a confusion matrix as a heatmap and save it as a PNG.

    The image is written to ``nearest-neighbor.png`` in the current working
    directory.  Rows are labelled as true labels and columns as predicted
    labels.

    Args:
        matrix: Square confusion matrix whose rows/columns follow the sorted
            unique values of ``labels``.
        labels: Labels of the data set; each unique value is truncated to
            its first six characters for display, so the values must be
            sliceable (presumably strings -- confirm against the caller).
    """
    # Imported lazily so the plotting stack is only required when a graph
    # is actually requested.
    import seaborn as sns
    from pandas import DataFrame
    from matplotlib import pyplot as plt

    label_list = [label[0:6] for label in sorted(np.unique(labels))]
    dataset = DataFrame(matrix, columns=label_list, index=label_list)

    sns.set(font_scale=0.8)
    # Use a fresh figure: drawing on the implicit current axes would put the
    # heatmap on top of any plot created earlier in the same process.
    fig, ax = plt.subplots()
    # Integer counts are annotated in full; seaborn's default ".2g" would
    # show counts of 100 or more in scientific notation.
    is_integral = np.issubdtype(np.asarray(matrix).dtype, np.integer)
    fmt = "d" if is_integral else ".2g"
    sns.heatmap(data=dataset, annot=True, fmt=fmt, cbar=False, ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=50,
                       horizontalalignment="right")
    # Leave room for the rotated tick labels.
    fig.subplots_adjust(left=0.15, bottom=0.2)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    ax.set_title('K-Nearest Neighbor Confusion Matrix')
    fig.savefig("nearest-neighbor.png")
    plt.close(fig)
|
|
|
|
|
|
|
|
def parse_args(args: list):
|
|
def parse_args(args: list):
|
|
|
import argparse
|
|
import argparse
|
|
@@ -53,23 +100,30 @@ def parse_args(args: list):
|
|
|
parser.add_argument('-t', '--top', type=int, default=1,
|
|
parser.add_argument('-t', '--top', type=int, default=1,
|
|
|
help='Number of guesses to be considered \"correct\" \
|
|
help='Number of guesses to be considered \"correct\" \
|
|
|
(default: 1)')
|
|
(default: 1)')
|
|
|
|
|
+ parser.add_argument('-g', '--graph', action="store_true",
|
|
|
|
|
+ help='Generates a confusion matrix.')
|
|
|
|
|
+ parser.add_argument('--graph-top', action="store_true",
|
|
|
|
|
+ help='Generates a graph of accuracy in top N guesses.')
|
|
|
return parser.parse_args(args)
|
|
return parser.parse_args(args)
|
|
|
|
|
|
|
|
def classify(data, labels, num_users: int, args):
    """Run k-nearest-neighbour cross-validation on a shuffled copy of the data.

    Args:
        data: Feature vectors; must support ``.shape`` and index-array lookup.
        labels: Label of each row of ``data``; indexed with the same order.
        num_users: Number of distinct users, forwarded to ``t_test``.
        args: Parsed command-line options; ``folds``, ``verbose``, ``top``,
            ``k_neighbors`` and ``weight`` are read from it.

    Returns:
        A ``(mean_accuracy, p_value)`` tuple, where ``p_value`` is half of
        the second element returned by ``t_test`` (presumably a one-sided
        p-value -- confirm against ``t_test``).
    """
    # Shuffle rows and labels with one shared permutation so they stay paired.
    order = np.arange(data.shape[0])
    np.random.shuffle(order)

    # The confusion matrix returned alongside the accuracies is not needed.
    fold_accuracies, _ = kNearestNeighbors(
        data[order], labels[order],
        n=args.folds,
        verbose=args.verbose,
        guesses=args.top,
        k=args.k_neighbors,
        weights=args.weight)

    mean_accuracy = np.average(fold_accuracies)
    halved_p = t_test(fold_accuracies, num_users)[1] / 2
    return (mean_accuracy, halved_p)
|
|
|
|
|
|
|
|
def kNearestNeighbors(data: list, labels: list,
|
|
def kNearestNeighbors(data: list, labels: list,
|
|
|
n=5, verbose=0, k=5, weights="uniform", guesses=1):
|
|
n=5, verbose=0, k=5, weights="uniform", guesses=1):
|
|
|
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
|
|
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
|
|
|
|
|
+ from sklearn.metrics import confusion_matrix
|
|
|
folds = KFold(n_splits=n)
|
|
folds = KFold(n_splits=n)
|
|
|
i = 1
|
|
i = 1
|
|
|
avg = 0
|
|
avg = 0
|
|
|
accuracies = []
|
|
accuracies = []
|
|
|
|
|
+ output = []
|
|
|
|
|
+ truth = []
|
|
|
label_list = sorted(np.unique(labels))
|
|
label_list = sorted(np.unique(labels))
|
|
|
for train_index, test_index in folds.split(data):
|
|
for train_index, test_index in folds.split(data):
|
|
|
if verbose >= 1:
|
|
if verbose >= 1:
|
|
@@ -80,10 +134,12 @@ def kNearestNeighbors(data: list, labels: list,
|
|
|
kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
|
|
kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
|
|
|
kn.fit(data[train_index], labels[train_index])
|
|
kn.fit(data[train_index], labels[train_index])
|
|
|
predictions = kn.predict(data[test_index])
|
|
predictions = kn.predict(data[test_index])
|
|
|
|
|
+ output.extend(predictions)
|
|
|
|
|
+ truth.extend(labels[test_index])
|
|
|
if guesses <= 1:
|
|
if guesses <= 1:
|
|
|
correct = [a == p for a, p in zip(labels[test_index], predictions)]
|
|
correct = [a == p for a, p in zip(labels[test_index], predictions)]
|
|
|
else:
|
|
else:
|
|
|
- correct = list(map(lambda x: x <= guesses,
|
|
|
|
|
|
|
+ correct = list(map(lambda x: x < guesses,
|
|
|
find_in_predictions(
|
|
find_in_predictions(
|
|
|
kn.predict_proba(data[test_index]),
|
|
kn.predict_proba(data[test_index]),
|
|
|
labels[test_index],
|
|
labels[test_index],
|
|
@@ -92,8 +148,15 @@ def kNearestNeighbors(data: list, labels: list,
|
|
|
if verbose >= 1:
|
|
if verbose >= 1:
|
|
|
print(accuracy)
|
|
print(accuracy)
|
|
|
accuracies.append(accuracy)
|
|
accuracies.append(accuracy)
|
|
|
- return accuracies
|
|
|
|
|
|
|
+ return (accuracies, confusion_matrix(truth, output, labels=label_list))
|
|
|
|
|
|
|
|
|
|
def find_in_predictions(probabilities: list, truth: list, labels: list):
    """Return the zero-based rank of each true label among the predictions.

    For every sample the candidate labels are ordered from most to least
    probable (ties keep the order in which they appear in ``labels``) and
    the position of the true label in that ordering is reported: 0 means
    the true label was the top guess, 1 the second guess, and so on.

    Args:
        probabilities: One sequence of per-label probabilities per sample.
        truth: The true label of each sample, parallel to ``probabilities``.
        labels: The label belonging to each probability column.
            NOTE(review): must be in the same order as the columns of
            ``probabilities`` -- confirm against the caller.

    Returns:
        A list with one rank per sample.

    Raises:
        ValueError: If a true label is not among the ranked labels.
    """
    from operator import itemgetter

    ranks = []
    for probs, actual in zip(probabilities, truth):
        # A stable sort with reverse=True keeps equally probable labels in
        # their original relative order.
        ranked = [label for label, _ in
                  sorted(zip(labels, probs), key=itemgetter(1), reverse=True)]
        try:
            ranks.append(ranked.index(actual))
        except ValueError:
            # Same exception type as before, but say which label is missing
            # instead of the opaque "x is not in list".
            raise ValueError(
                "true label %r is not among the predicted labels" % (actual,)
            ) from None
    return ranks
|
|
|
|
|
|
|
|
# TODO: This should be in a separate file.
|
|
# TODO: This should be in a separate file.
|
|
|
# If we need a unified interface we can make an aggregator.
|
|
# If we need a unified interface we can make an aggregator.
|
|
@@ -116,31 +179,6 @@ def multiLayerPerceptronClassifier(classifications: int, data: list, results: li
|
|
|
print(loss)
|
|
print(loss)
|
|
|
print(accuracy)
|
|
print(accuracy)
|
|
|
|
|
|
|
|
-# TODO: This should be in a separate file.
|
|
|
|
|
-# If we need a unified interface we can make an aggregater.
|
|
|
|
|
-# TODO: KFold validation
|
|
|
|
|
-def randomForest(data: list, labels: list, test_data: list, test_data_labels: list):
|
|
|
|
|
- from sklearn.ensemble import RandomForestClassifier
|
|
|
|
|
- rfc = RandomForestClassifier(n_estimators=10)
|
|
|
|
|
- rfc.fit(data, labels)
|
|
|
|
|
- predictions = rfc.predict(test_data)
|
|
|
|
|
- for t in range(len(test_data)):
|
|
|
|
|
- print(str(test_data[t]) + "prediction: " + str(predictions[t]))
|
|
|
|
|
- if len(test_data) == 0:
|
|
|
|
|
- return
|
|
|
|
|
- accuracysum = 0
|
|
|
|
|
- for t in range(len(test_data)):
|
|
|
|
|
- accuracysum += 1 if predictions[t] == test_data_labels[t] else 0
|
|
|
|
|
- print("Accuracy: " + str(accuracysum/len(test_data_labels)))
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-def find_in_predictions(probabilities: list, tests: int, labels: list):
|
|
|
|
|
- return [list(map(lambda x: x[0],
|
|
|
|
|
- sorted(list(zip(labels, probs)), key=lambda x: x[1]))
|
|
|
|
|
- ).index(test)
|
|
|
|
|
- for probs, test in zip(probabilities, tests)]
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
def t_test(accuracy: list, num_users: int):
|
|
def t_test(accuracy: list, num_users: int):
|
|
|
from scipy import stats
|
|
from scipy import stats
|
|
|
random_avg = 1.0/num_users
|
|
random_avg = 1.0/num_users
|
|
@@ -148,7 +186,6 @@ def t_test(accuracy: list, num_users: int):
|
|
|
# If all numbers are identical, p-value = 1
|
|
# If all numbers are identical, p-value = 1
|
|
|
return res if not np.isnan(res[0]) else (0, 1)
|
|
return res if not np.isnan(res[0]) else (0, 1)
|
|
|
|
|
|
|
|
-
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
|
import sys
|
|
import sys
|
|
|
main(sys.argv[1:])
|
|
main(sys.argv[1:])
|