# nearestneighbors.py
  1. #!/usr/bin/python3
  2. from sklearn.model_selection import KFold
  3. from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
  4. from sklearn.ensemble import RandomForestClassifier
  5. import numpy as np
  6. import sys
  7. from Vector import FeatureVector
  8. DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
  9. def main():
  10. # a test of this method using an arbitrarily generated list of 5 vectors with
  11. # 3 features each
  12. # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
  13. args = parse_args()
  14. try:
  15. import cPickle as pickle
  16. except:
  17. import pickle
  18. samples = pickle.load(args.features_file)
  19. features = args.feature if args.feature else DEFAULT_FEATURES
  20. from random import shuffle
  21. shuffle(samples)
  22. data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
  23. for p in samples])
  24. res = kNearestNeighbors(np.array(data), np.array(labels),
  25. n=args.folds, verbose=args.verbose)
  26. print("Overall Accuracy: %f" % res)
  27. def parse_args():
  28. import argparse
  29. parser = argparse.ArgumentParser(
  30. description='Run a data set through a kNearestNeighbors classifier.')
  31. parser.add_argument('features_file', type=argparse.FileType('rb'),
  32. help='File of extracted features.')
  33. parser.add_argument('-v', '--verbose', action="count", default=0,
  34. help='Show more information')
  35. parser.add_argument('-n', '--folds', type=int, default=5,
  36. help='Number of cross-validation folds (default: 5)')
  37. parser.add_argument('-f', '--feature', action='append', type=str,
  38. help='Add feature to list of features to test with.')
  39. return parser.parse_args()
  40. def kNearestNeighbors(data: list, labels: list, n=5, verbose=0):
  41. folds = KFold(n_splits=n)
  42. i = 1
  43. avg = 0
  44. for train_index, test_index in folds.split(data):
  45. if verbose >= 1:
  46. print("Round %d:" % i)
  47. i += 1
  48. if verbose >= 2:
  49. print("Training on: ", train_index)
  50. kn = KNeighborsClassifier(n_neighbors=2)
  51. kn.fit(data[train_index], labels[train_index])
  52. predictions = kn.predict(data[test_index])
  53. correct = [a == p for a, p in zip(labels[test_index], predictions)]
  54. accuracy = correct.count(True)/len(correct)
  55. if verbose >= 1:
  56. print(accuracy)
  57. avg += accuracy
  58. return avg/n
# Script entry point: only run when executed directly, not on import.
if __name__ == '__main__':
    main()