| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- #!/usr/bin/python3
- from sklearn.model_selection import KFold
- import numpy as np
- try:
- import sample
- except ImportError:
- import os
- import sys
- sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
- '/../feature-extractor')
- import sample
- DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
def main():
    """Load pickled samples, extract feature vectors, and report
    cross-validated kNN accuracy (optionally with a one-sided p-value).
    """
    # a test of this method using an arbitrarily generated list of 5 vectors
    # with 3 features each:
    # nearestNeighbors([[1, 1, 0], [1, 0, 0], [0, 0, 0], [0, 5, 5]], [[1, 1, 4]])
    args = parse_args()
    # Python 2 compatibility shim; on Python 3 cPickle does not exist and
    # the plain pickle module is used.  (Was a bare `except:`, which would
    # also have swallowed unrelated errors.)
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    # NOTE(security): pickle.load executes arbitrary code from the file —
    # only feed this trusted feature files.
    samples = pickle.load(args.features_file)
    from random import shuffle
    shuffle(samples)  # randomize sample order before splitting into folds
    features = args.feature if args.feature else DEFAULT_FEATURES
    from Vector import FeatureVector
    data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
                         for p in samples])
    res = kNearestNeighbors(np.array(data), np.array(labels),
                            n=args.folds, verbose=args.verbose,
                            k=args.k_neighbors, weights=args.weight,
                            guesses=args.top)
    print("Overall Accuracy: %f" % np.average(res))
    if args.p_value:
        # ttest_1samp is two-sided; halve for the one-sided "better than
        # chance" hypothesis.
        _, p = t_test(res, labels)
        print("P-Value: %f" % (p / 2))
def parse_args():
    """Parse command-line arguments for the kNN classifier runner.

    Returns the argparse Namespace; `features_file` is an already-opened
    binary file handle.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Run a data set through a kNearestNeighbors classifier.')
    parser.add_argument('features_file', type=argparse.FileType('rb'),
                        help='File of extracted features.')
    parser.add_argument('-v', '--verbose', action="count", default=0,
                        help='Show more information')
    parser.add_argument('-n', '--folds', type=int, default=5,
                        help='Number of cross-validation folds (default: 5)')
    parser.add_argument('-k', '--k-neighbors', type=int, default=5,
                        help='Number of neighbors to consider (default: 5)')
    parser.add_argument('-w', '--weight', choices=["uniform", "distance"],
                        default="uniform",
                        help='Weight function for determining distance '
                             '(default: "Uniform")')
    parser.add_argument('-f', '--feature', action='append', type=str,
                        help='Add feature to list of features to test with.')
    # store_true is the idiomatic spelling of
    # store_const/const=True/default=False.
    parser.add_argument('-p', '--p-value', action='store_true',
                        help='Calculate a p-value from a t-test.')
    parser.add_argument('-t', '--top', type=int, default=1,
                        help='Number of guesses to be considered "correct" '
                             '(default: 1)')
    return parser.parse_args()
def kNearestNeighbors(data: list, labels: list,
                      n=5, verbose=0, k=5, weights="uniform", guesses=1):
    """n-fold cross-validated k-nearest-neighbors classification.

    data     -- 2-D array of feature vectors
    labels   -- 1-D array of class labels, parallel to `data`
    n        -- number of cross-validation folds
    verbose  -- 0 silent, 1 per-round accuracy, 2 also training indices
    k        -- neighbors considered by the classifier
    weights  -- 'uniform' or 'distance' neighbor weighting
    guesses  -- a prediction counts as correct if the true label is among
                the `guesses` most probable classes
    Returns the list of per-fold accuracies.
    """
    from sklearn.neighbors import KNeighborsClassifier
    folds = KFold(n_splits=n)
    accuracies = []
    label_list = sorted(np.unique(labels))
    for round_no, (train_index, test_index) in enumerate(folds.split(data), 1):
        if verbose >= 1:
            print("Round %d:" % round_no)
        if verbose >= 2:
            print("Training on: ", train_index)
        kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
        kn.fit(data[train_index], labels[train_index])
        if guesses <= 1:
            predictions = kn.predict(data[test_index])
            correct = [a == p for a, p in zip(labels[test_index], predictions)]
        else:
            # find_in_predictions ranks the true label by *ascending*
            # probability, so the most likely label has rank
            # len(label_list) - 1.  The true label is within the top
            # `guesses` candidates when rank >= len(label_list) - guesses.
            # (The previous `rank <= guesses` test counted the *least*
            # likely labels as correct.)
            threshold = len(label_list) - guesses
            correct = [rank >= threshold
                       for rank in find_in_predictions(
                           kn.predict_proba(data[test_index]),
                           labels[test_index],
                           label_list)]
        accuracy = correct.count(True) / len(correct)
        if verbose >= 1:
            print(accuracy)
        accuracies.append(accuracy)
    return accuracies
# TODO: This should be in a separate file.
# If we need a unified interface we can make an aggregator.
# TODO: KFold validation
def multiLayerPerceptronClassifier(classifications: int, data: list,
                                   results: list, testdata: list,
                                   testresults: list):
    """Train a two-hidden-layer MLP on (data, results) and print the loss
    and accuracy obtained on (testdata, testresults).

    classifications -- number of output classes
    results/testresults -- integer class indices (sparse categorical labels)
    """
    import tensorflow as tf
    # Rule-of-thumb hidden width: mean of input and output sizes.  Must be
    # an int — the original true division produced a float for odd sums,
    # which Dense() rejects.
    numberOfNeurons = (len(data[0]) + classifications) // 2
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(numberOfNeurons, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(numberOfNeurons, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(classifications, activation=tf.nn.softmax))
    model.compile(optimizer='SGD',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(data, results, epochs=5)
    loss, accuracy = model.evaluate(testdata, testresults)
    print(loss)
    print(accuracy)
# TODO: This should be in a separate file.
# If we need a unified interface we can make an aggregator.
# TODO: KFold validation
def randomForest(data: list, labels: list, test_data: list,
                 test_data_labels: list):
    """Train a 10-tree random forest on (data, labels), print each test
    sample's prediction, then print overall accuracy on the test set.
    """
    from sklearn.ensemble import RandomForestClassifier
    # Guard *before* predicting: the original checked for an empty test set
    # only after calling predict() and printing, so the guard never helped,
    # and the final division could still hit a zero denominator.
    if len(test_data) == 0:
        return
    rfc = RandomForestClassifier(n_estimators=10)
    rfc.fit(data, labels)
    predictions = rfc.predict(test_data)
    for sample_vec, prediction in zip(test_data, predictions):
        print(str(sample_vec) + "prediction: " + str(prediction))
    correct = sum(1 for predicted, actual in zip(predictions, test_data_labels)
                  if predicted == actual)
    print("Accuracy: " + str(correct / len(test_data_labels)))
def find_in_predictions(probabilities: list, tests: list, labels: list):
    """For each test sample, return the rank of its true label when the
    candidate labels are sorted by ascending predicted probability.

    probabilities -- per-sample probability rows (parallel to `labels`)
    tests         -- the true label of each sample (was mis-annotated `int`)
    labels        -- candidate labels, in the same order as each probability
                     row (e.g. classifier.classes_)
    A rank of len(labels) - 1 means the true label was the most probable.
    """
    ranks = []
    for probs, true_label in zip(probabilities, tests):
        ordered = [lbl for lbl, _ in
                   sorted(zip(labels, probs), key=lambda pair: pair[1])]
        ranks.append(ordered.index(true_label))
    return ranks
def t_test(accuracy: list, labels: list):
    """One-sample t-test of the fold accuracies against chance level.

    Chance level is 1/C for C distinct labels (a uniform random guesser).
    Returns scipy's (statistic, pvalue) pair; the p-value is two-sided.
    """
    from scipy import stats
    chance_level = 1.0 / len(np.unique(labels))
    return stats.ttest_1samp(accuracy, chance_level)
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
|