|
|
@@ -1,34 +1,94 @@
|
|
|
import tensorflow as tf
|
|
|
+import os
|
|
|
+from sklearn.model_selection import KFold
|
|
|
+import sklearn.linear_model
|
|
|
+import sklearn.metrics
|
|
|
+import keras
|
|
|
+from keras.models import Sequential
|
|
|
+from keras.layers import Dense, Activation
|
|
|
+import numpy as np
|
|
|
+import pandas as pd
|
|
|
+import typing
|
|
|
+try:
|
|
|
+ import sample
|
|
|
+except ImportError:
|
|
|
+ import os, sys
|
|
|
+ sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
|
|
|
+ '/../feature-extractor')
|
|
|
+ import sample
|
|
|
|
|
|
+DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
|
|
|
|
|
|
def main(options: list):
    """Load pickled feature samples, build a shuffled labeled feature matrix,
    and train/evaluate the neural-network classifier.

    Args:
        options: Command-line argument list (e.g. ``sys.argv[1:]``), parsed
            by ``parse_args``.
    """
    args = parse_args(options)
    try:
        import cPickle as pickle  # Python 2 accelerated pickle, if present
    except ImportError:  # was a bare except; only ImportError is expected here
        import pickle
    # NOTE(review): pickle.load can execute arbitrary code from a malicious
    # file -- only run this on feature files you generated yourself.
    samples = pickle.load(args.features_file)
    features = args.feature if args.feature else DEFAULT_FEATURES

    from Vector import FeatureVector
    data, labels = map(np.array,
                       zip(*[(FeatureVector(p, features).get(), p.user)
                             for p in samples]))
    num_users = len(np.unique([s.user for s in samples]))

    # Shuffled index vector; only consumed by the commented-out kNN path below.
    s = np.arange(data.shape[0])
    np.random.shuffle(s)

    # Map each distinct label to a small integer class id, then translate the
    # label column to those ids (replaces the manual counter loops).
    labeldict = {label: idx for idx, label in enumerate(set(labels))}
    lin = np.array([labeldict[label] for label in labels])

    # Join class ids with the feature columns, then shuffle rows together.
    # After the merge the label column is named '0_x': both frames have a
    # column named 0, so pandas suffixes the duplicates '_x'/'_y'.
    shuffledf = pd.DataFrame(data=lin.flatten())
    trainingdf = pd.merge(shuffledf, pd.DataFrame(data=data),
                          left_index=True, right_index=True).sample(frac=1)
    labelsshuffled = trainingdf['0_x']
    datashuffled = trainingdf.drop(['0_x'], axis=1)

    # Hold out the last 20 shuffled rows as the test set.
    classify(num_users,
             datashuffled.to_numpy()[:-20], labelsshuffled.to_numpy()[:-20],
             datashuffled.to_numpy()[-20:], labelsshuffled.to_numpy()[-20:])
    '''if args.graph_top:
        graph_top(args, data[s], labels[s])
    if args.graph_k:
        graph_k(args, data[s], labels[s])
    if args.graph_weights:
        graph_w(args, data[s], labels[s])
    res, matrix = kNearestNeighbors(data[s], labels[s], n=args.folds,
                                    verbose=args.verbose, guesses=args.top,
                                    k=args.k_neighbors, weights=args.weight)'''
    #print("Overall Accuracy: %f" % np.average(res))
    #if args.p_value:
    #    print("P-Value: %f" % t_test(res, num_users)[1] / 2)
    #if args.graph:
    #    gen_confusion_matrix(matrix, labels)
|
def classify(classifications: int, sampleData: list, labels: list, testdata: list, testlabels: list):
    """Train a small feed-forward network on sampleData and print its
    loss/accuracy on the held-out test set.

    Args:
        classifications: Number of output classes (distinct users).
        sampleData: 2-D array-like of training feature vectors.
        labels: Integer class id per training row.
        testdata: 2-D array-like of test feature vectors.
        testlabels: Integer class id per test row.
    """
    n_features = len(sampleData[0])  # hoisted; used four times below
    model = Sequential()
    model.add(Dense(n_features, input_dim=n_features))
    model.add(Activation('sigmoid'))
    model.add(Dense(n_features // 2))
    model.add(Activation('sigmoid'))
    model.add(Dense(classifications))
    # BUG FIX: the output layer used 'relu', which does not produce a
    # probability distribution; categorical_crossentropy requires softmax
    # outputs, otherwise the loss is ill-defined and training degenerates.
    model.add(Activation('softmax'))
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    one_hot_labels = keras.utils.to_categorical(labels, num_classes=classifications)
    # NOTE(review): batch_size is tied to the feature count here, which looks
    # accidental -- confirm the intended batch size.
    model.fit(sampleData, one_hot_labels, epochs=10, batch_size=n_features)
    one_hot_test_labels = keras.utils.to_categorical(testlabels, num_classes=classifications)
    results = model.evaluate(testdata, one_hot_test_labels, batch_size=128)
    print(results)  # [loss, accuracy]
|
|
|
|
|
|
|
|
|
# data and results arrays (training and testing) should be paired. Classifications is number of ways to classify data.
|
|
|
-def multiLayerPerceptronClassifier(classifications: int, data: list, results: list, testdata: list, testresults: list):
|
|
|
+'''def multiLayerPerceptronClassifier(classifications: int, data: list, labels: list, testdata: list, testresults: list):
|
|
|
numberOfNeurons = (len(data[0]) + classifications)/2
|
|
|
model = tf.keras.models.Sequential()
|
|
|
model.add(tf.keras.layers.Flatten())
|
|
|
@@ -39,12 +99,53 @@ def multiLayerPerceptronClassifier(classifications: int, data: list, results: li
|
|
|
model.compile(optimizer='SGD',
|
|
|
loss='sparse_categorical_crossentropy',
|
|
|
metrics=['accuracy'])
|
|
|
- model.fit(data, results, epochs=5)
|
|
|
+ print(data)
|
|
|
+ model.fit(data, labels, epochs=5)
|
|
|
|
|
|
loss, accuracy = model.evaluate(testdata, testresults)
|
|
|
print(loss)
|
|
|
- print(accuracy)
|
|
|
+ print(accuracy)'''
|
|
|
+
|
|
|
+
|
|
|
def parse_args(args: list):
    """Parse command-line options for a classifier run.

    Args:
        args: Argument list (e.g. ``sys.argv[1:]``).

    Returns:
        argparse.Namespace with the parsed options; ``features_file`` is an
        already-opened binary file handle.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Run a data set through a kNearestNeighbors classifier.')
    parser.add_argument('features_file', type=argparse.FileType('rb'),
                        help='File of extracted features.')
    parser.add_argument('-v', '--verbose', action="count", default=0,
                        help='Show more information')
    parser.add_argument('-n', '--folds', type=int, default=5,
                        help='Number of cross-validation folds (default: 5)')
    parser.add_argument('-k', '--k-neighbors', type=int, default=5,
                        help='Number of neighbors to consider (default: 5)')
    # Implicit string concatenation replaces backslash continuations that
    # embedded long runs of stray spaces inside the help strings.
    parser.add_argument('-w', '--weight', choices=["uniform", "distance"],
                        default="uniform",
                        help='Weight function for determining distance '
                             '(default: "Uniform")')
    parser.add_argument('-f', '--feature', action='append', type=str,
                        help='Add feature to list of features to test with.')
    # store_true is the idiomatic equivalent of
    # action='store_const', const=True, default=False.
    parser.add_argument('-p', '--p-value', action='store_true',
                        help='Calculate a p-value from a t-test.')
    parser.add_argument('-t', '--top', type=int, default=1,
                        help='Number of guesses to be considered "correct" '
                             '(default: 1)')
    parser.add_argument('-g', '--graph', action="store_true",
                        help='Generates a confusion matrix.')
    parser.add_argument('--graph-top', action="store_true",
                        help='Generates a graph of accuracy in top N guesses.')
    parser.add_argument('--graph-k', action="store_true",
                        help='Generates a graph of accuracy for k-nearest neighbors.')
    parser.add_argument('--graph-weights', action="store_true",
                        help='Generates a graph comparing weights.')
    return parser.parse_args(args)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    import sys

    # Batch mode: run the pipeline once per pickled feature file found in
    # ./samples2.0 (relative to the current working directory).
    # NOTE(review): "-f*" is parsed by argparse as option -f with the literal
    # value "*" -- presumably intended as a wildcard feature selector; confirm
    # that FeatureVector actually treats "*" as "all features".
    for filename in os.listdir('./samples2.0'):
        print(os.path.join('./samples2.0', filename))
        main(["-f*", os.path.join('./samples2.0', filename)])
    # Previous single-run entry point, kept for reference:
    #main(sys.argv[1:])
    #print(sys.argv[1:])
    #main(sys.argv[1:])