# nearestneighbors.py
  1. #!/usr/bin/python3
  2. from sklearn.model_selection import KFold
  3. from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
  4. from sklearn.ensemble import RandomForestClassifier
  5. import numpy as np
  6. import sys
  7. from Vector import FeatureVector
  8. DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
  9. def main():
  10. # a test of this method using an arbitrarily generated list of 5 vectors with
  11. # 3 features each
  12. # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
  13. args = parse_args()
  14. try:
  15. import cPickle as pickle
  16. except:
  17. import pickle
  18. samples = pickle.load(args.features_file)
  19. features = args.feature if args.feature else DEFAULT_FEATURES
  20. from random import shuffle
  21. shuffle(samples)
  22. data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
  23. for p in samples])
  24. res = kNearestNeighbors(np.array(data), np.array(labels),
  25. n=args.folds, verbose=args.verbose)
  26. print("Overall Accuracy: %f" % res)
  27. def parse_args():
  28. import argparse
  29. parser = argparse.ArgumentParser(
  30. description='Run a data set through a kNearestNeighbors classifier.')
  31. parser.add_argument('features_file', type=argparse.FileType('rb'),
  32. help='File of extracted features.')
  33. parser.add_argument('-v', '--verbose', action="count", default=0,
  34. help='Show more information')
  35. parser.add_argument('-n', '--folds', type=int, default=5,
  36. help='Number of cross-validation folds (default: 5)')
  37. parser.add_argument('-f', '--feature', action='append', type=str,
  38. help='Add feature to list of features to test with.')
  39. return parser.parse_args()
  40. def kNearestNeighbors(data: list, labels: list, n=5, verbose=0):
  41. folds = KFold(n_splits=n)
  42. i = 1
  43. avg = 0
  44. for train_index, test_index in folds.split(data):
  45. if verbose >= 1:
  46. print("Round %d:" % i)
  47. i += 1
  48. if verbose >= 2:
  49. print("Training on: ", train_index)
  50. kn = KNeighborsClassifier(n_neighbors=2)
  51. kn.fit(data[train_index], labels[train_index])
  52. predictions = kn.predict(data[test_index])
  53. correct = [a == p for a, p in zip(labels[test_index], predictions)]
  54. accuracy = correct.count(True)/len(correct)
  55. if verbose >= 1:
  56. print(accuracy)
  57. avg += accuracy
  58. return avg/n
# Script entry point: only run when executed directly, not on import.
if __name__ == '__main__':
    main()