# nearestneighbors.py
  1. #!/usr/bin/python3
  2. from sklearn.model_selection import KFold
  3. import numpy as np
  4. try:
  5. import sample
  6. except ImportError:
  7. import os
  8. import sys
  9. sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
  10. '/../feature-extractor')
  11. import sample
# Feature names used when no -f/--feature flags are given on the command line.
DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
  13. def main():
  14. # a test of this method using an arbitrarily generated list of 5 vectors with
  15. # 3 features each
  16. # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
  17. args = parse_args()
  18. try:
  19. import cPickle as pickle
  20. except:
  21. import pickle
  22. samples = pickle.load(args.features_file)
  23. from random import shuffle
  24. shuffle(samples)
  25. features = args.feature if args.feature else DEFAULT_FEATURES
  26. from Vector import FeatureVector
  27. data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
  28. for p in samples])
  29. res = kNearestNeighbors(np.array(data), np.array(labels),
  30. n=args.folds, verbose=args.verbose, k=args.k_neighbors,
  31. weights=args.weight, guesses=args.top)
  32. print("Overall Accuracy: %f" % np.average(res))
  33. if args.p_value:
  34. _, p = t_test(res, labels)
  35. print("P-Value: %f" % (p / 2))
  36. def parse_args():
  37. import argparse
  38. parser = argparse.ArgumentParser(
  39. description='Run a data set through a kNearestNeighbors classifier.')
  40. parser.add_argument('features_file', type=argparse.FileType('rb'),
  41. help='File of extracted features.')
  42. parser.add_argument('-v', '--verbose', action="count", default=0,
  43. help='Show more information')
  44. parser.add_argument('-n', '--folds', type=int, default=5,
  45. help='Number of cross-validation folds (default: 5)')
  46. parser.add_argument('-k', '--k-neighbors', type=int, default=5,
  47. help='Number of neighbors to consider (default: 5)')
  48. parser.add_argument('-w', '--weight', choices=["uniform", "distance"],
  49. default="uniform", help='Weight function for determining \
  50. distance (default: \"Uniform\")')
  51. parser.add_argument('-f', '--feature', action='append', type=str,
  52. help='Add feature to list of features to test with.')
  53. parser.add_argument('-p', '--p-value', action='store_const', default=False,
  54. const=True, help='Calculate a p-value from a t-test.')
  55. parser.add_argument('-t', '--top', type=int, default=1,
  56. help='Number of guesses to be considered \"correct\" \
  57. (default: 1)')
  58. return parser.parse_args()
  59. def kNearestNeighbors(data: list, labels: list,
  60. n=5, verbose=0, k=5, weights="uniform", guesses=1):
  61. from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
  62. folds = KFold(n_splits=n)
  63. i = 1
  64. avg = 0
  65. accuracies = []
  66. label_list = sorted(np.unique(labels))
  67. for train_index, test_index in folds.split(data):
  68. if verbose >= 1:
  69. print("Round %d:" % i)
  70. i += 1
  71. if verbose >= 2:
  72. print("Training on: ", train_index)
  73. kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
  74. kn.fit(data[train_index], labels[train_index])
  75. predictions = kn.predict(data[test_index])
  76. if guesses <= 1:
  77. correct = [a == p for a, p in zip(labels[test_index], predictions)]
  78. else:
  79. correct = list(map(lambda x: x <= guesses,
  80. find_in_predictions(
  81. kn.predict_proba(data[test_index]),
  82. labels[test_index],
  83. label_list)))
  84. accuracy = correct.count(True)/len(correct)
  85. if verbose >= 1:
  86. print(accuracy)
  87. accuracies.append(accuracy)
  88. return accuracies
# TODO: This should be in a separate file.
# If we need a unified interface we can make an aggregator.
# TODO: KFold validation
  92. def multiLayerPerceptronClassifier(classifications: int, data: list, results: list, testdata: list, testresults: list):
  93. import tensorflow as tf
  94. numberOfNeurons = (len(data[0]) + classifications)/2
  95. model = tf.keras.models.Sequential()
  96. model.add(tf.keras.layers.Flatten())
  97. model.add(tf.keras.layers.Dense(numberOfNeurons, activation=tf.nn.relu))
  98. model.add(tf.keras.layers.Dense(numberOfNeurons, activation=tf.nn.relu))
  99. model.add(tf.keras.layers.Dense(classifications, tf.nn.softmax))
  100. model.compile(optimizer='SGD',
  101. loss='sparse_categorical_crossentropy',
  102. metrics=['accuracy'])
  103. model.fit(data, results, epochs=5)
  104. loss, accuracy = model.evaluate(testdata, testresults)
  105. print(loss)
  106. print(accuracy)
# TODO: This should be in a separate file.
# If we need a unified interface we can make an aggregator.
# TODO: KFold validation
  110. def randomForest(data: list, labels: list, test_data: list, test_data_labels: list):
  111. from sklearn.ensemble import RandomForestClassifier
  112. rfc = RandomForestClassifier(n_estimators=10)
  113. rfc.fit(data, labels)
  114. predictions = rfc.predict(test_data)
  115. for t in range(len(test_data)):
  116. print(str(test_data[t]) + "prediction: " + str(predictions[t]))
  117. if len(test_data) == 0:
  118. return
  119. accuracysum = 0
  120. for t in range(len(test_data)):
  121. accuracysum += 1 if predictions[t] == test_data_labels[t] else 0
  122. print("Accuracy: " + str(accuracysum/len(test_data_labels)))
  123. def find_in_predictions(probabilities: list, tests: int, labels: list):
  124. return [list(map(lambda x: x[0],
  125. sorted(list(zip(labels, probs)), key=lambda x: x[1]))
  126. ).index(test)
  127. for probs, test in zip(probabilities, tests)]
  128. def t_test(accuracy: list, labels: list):
  129. from scipy import stats
  130. random_avg = 1.0/len(np.unique(labels))
  131. return stats.ttest_1samp(accuracy, random_avg)
# Allow use both as a script and as an importable module.
if __name__ == '__main__':
    main()