#!/usr/bin/python3
from sklearn.model_selection import KFold
import numpy as np

# `sample` must be importable so pickle can reconstruct the extracted samples.
try:
    import sample
except ImportError:
    import os
    import sys
    sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) +
                    '/../feature-extractor')
    import sample

DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]


def main():
    # An ad-hoc test of this method on an arbitrarily generated list of 5
    # vectors with 3 features each (see the smoke-test sketch further down):
    # kNearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
    args = parse_args()

    # cPickle was folded into pickle in Python 3, which this script requires.
    import pickle

    samples = pickle.load(args.features_file)
    from random import shuffle
    shuffle(samples)

    features = args.feature if args.feature else DEFAULT_FEATURES

    from Vector import FeatureVector
    data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
                         for p in samples])

    res = kNearestNeighbors(np.array(data), np.array(labels), n=args.folds,
                            verbose=args.verbose, k=args.k_neighbors,
                            weights=args.weight, guesses=args.top)
    print("Overall Accuracy: %f" % np.average(res))
    if args.p_value:
        # Halving the two-sided p-value gives the one-sided test that
        # accuracy exceeds chance.
        _, p = t_test(res, labels)
        print("P-Value: %f" % (p / 2))


def parse_args():
    import argparse
    parser = argparse.ArgumentParser(
        description='Run a data set through a kNearestNeighbors classifier.')
    parser.add_argument('features_file', type=argparse.FileType('rb'),
                        help='File of extracted features.')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help='Show more information.')
    parser.add_argument('-n', '--folds', type=int, default=5,
                        help='Number of cross-validation folds (default: 5).')
    parser.add_argument('-k', '--k-neighbors', type=int, default=5,
                        help='Number of neighbors to consider (default: 5).')
    parser.add_argument('-w', '--weight', choices=['uniform', 'distance'],
                        default='uniform',
                        help='Neighbor weighting function '
                             '(default: "uniform").')
    parser.add_argument('-f', '--feature', action='append', type=str,
                        help='Add a feature to the list of features to test with.')
    parser.add_argument('-p', '--p-value', action='store_true',
                        help='Calculate a p-value from a t-test.')
    parser.add_argument('-t', '--top', type=int, default=1,
                        help='Number of guesses to be considered "correct" '
                             '(default: 1).')
    return parser.parse_args()


def kNearestNeighbors(data: np.ndarray, labels: np.ndarray, n=5, verbose=0,
                      k=5, weights="uniform", guesses=1):
    """Cross-validate a k-NN classifier; return one accuracy per fold."""
    from sklearn.neighbors import KNeighborsClassifier
    folds = KFold(n_splits=n)
    accuracies = []
    for i, (train_index, test_index) in enumerate(folds.split(data), start=1):
        if verbose >= 1:
            print("Round %d:" % i)
        if verbose >= 2:
            print("Training on: ", train_index)
        kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
        kn.fit(data[train_index], labels[train_index])
        if guesses <= 1:
            predictions = kn.predict(data[test_index])
            correct = [a == p for a, p in zip(labels[test_index], predictions)]
        else:
            # Count a sample as correct if its true label ranks within the
            # top `guesses` most probable labels. predict_proba's columns
            # are ordered by kn.classes_, so rank against that ordering.
            ranks = find_in_predictions(kn.predict_proba(data[test_index]),
                                        labels[test_index], kn.classes_)
            correct = [rank < guesses for rank in ranks]
        accuracy = correct.count(True) / len(correct)
        if verbose >= 1:
            print(accuracy)
        accuracies.append(accuracy)
    return accuracies
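
# A minimal, self-contained sketch of exercising kNearestNeighbors, in the
# spirit of the ad-hoc test commented out in main(). The helper name, the
# random feature matrix, and the four fake "user" labels are all invented
# for illustration; any (n_samples, n_features) array with one label per
# row would work the same way.
def _knn_smoke_test():
    rng = np.random.default_rng(0)
    data = rng.random((20, 3))          # 20 samples, 3 features
    labels = np.array(["user%d" % (i % 4) for i in range(20)])  # 4 fake users
    # 5-fold cross-validation with k=3 neighbors; returns per-fold accuracies.
    return kNearestNeighbors(data, labels, n=5, k=3)
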
# TODO: This should be in a separate file.
# If we need a unified interface we can make an aggregator.
# TODO: KFold validation
def multiLayerPerceptronClassifier(classifications: int, data: list,
                                   results: list, testdata: list,
                                   testresults: list):
    import tensorflow as tf
    # Hidden-layer width: the mean of the input width and the class count
    # (integer division, since Dense needs an integral unit count).
    numberOfNeurons = (len(data[0]) + classifications) // 2
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(numberOfNeurons, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(numberOfNeurons, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(classifications, activation=tf.nn.softmax))
    model.compile(optimizer='SGD', loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(np.asarray(data), np.asarray(results), epochs=5)
    loss, accuracy = model.evaluate(np.asarray(testdata),
                                    np.asarray(testresults))
    print(loss)
    print(accuracy)


# TODO: This should be in a separate file.
# If we need a unified interface we can make an aggregator.
# TODO: KFold validation
def randomForest(data: list, labels: list, test_data: list,
                 test_data_labels: list):
    from sklearn.ensemble import RandomForestClassifier
    if len(test_data) == 0:
        return
    rfc = RandomForestClassifier(n_estimators=10)
    rfc.fit(data, labels)
    predictions = rfc.predict(test_data)
    for sample_vector, prediction in zip(test_data, predictions):
        print(str(sample_vector) + " prediction: " + str(prediction))
    correct = sum(1 for p, actual in zip(predictions, test_data_labels)
                  if p == actual)
    print("Accuracy: " + str(correct / len(test_data_labels)))


def find_in_predictions(probabilities: list, tests: list, labels: list):
    """For each test sample, return the rank (0 = most probable) of its true
    label when the candidate labels are sorted by predicted probability."""
    return [[lbl for lbl, _ in sorted(zip(labels, probs),
                                      key=lambda pair: pair[1],
                                      reverse=True)].index(test)
            for probs, test in zip(probabilities, tests)]


def t_test(accuracy: list, labels: list):
    """One-sample t-test of the per-fold accuracies against chance level."""
    from scipy import stats
    random_avg = 1.0 / len(np.unique(labels))
    return stats.ttest_1samp(accuracy, random_avg)


if __name__ == '__main__':
    main()
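
# Example invocation (the script and pickle file names here are hypothetical;
# pass whatever features file the feature extractor produced):
#   ./knn.py features.pickle --folds 10 --k-neighbors 3 --weight distance \
#       --top 2 --p-value --verbose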