Jelajahi Sumber

changes to classifier

Ethan Goldfarb 6 tahun lalu
induk
melakukan
40d9774c2e
2 mengubah file dengan 139 tambahan dan 23 penghapusan
  1. 124 23
      src/classifiers/classifier.py
  2. 15 0
      src/classifiers/interpres.py

+ 124 - 23
src/classifiers/classifier.py

@@ -1,34 +1,94 @@
 import tensorflow as tf
+import os
+from sklearn.model_selection import KFold
+import sklearn.linear_model
+import sklearn.metrics
+import keras
+from keras.models import Sequential
+from keras.layers import Dense, Activation
+import numpy as np
+import pandas as pd
+import typing
+try:
+    import sample
+except ImportError:
+    import os, sys
+    sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
+                    '/../feature-extractor')
+    import sample
 
+DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
 
-def main():
-    # train(4, [[.1, .2, .3], [.3, .2, .3], [.3, .3, .3], [.3, .4, .5]], [1,2,3,0], [[.1, .2, .3], [.3, .2, .3], [.3, .3, .3], [.3, .4, .5]], [1,2,3,0])
-    pass
-    '''
-    mnist = tf.keras.datasets.mnist
 
-    (x_train, y_train), (x_test, y_test) = mnist.load_data()
def main(options: list):
    """Train and evaluate the neural-net classifier on extracted features.

    options: argv-style list of strings; see parse_args for accepted flags.
    Loads pickled samples from the features file, builds one feature
    vector per sample, shuffles the rows, trains on all but the last 20
    and evaluates on those 20 held-out rows.
    """
    args = parse_args(options)
    try:
        import cPickle as pickle  # Python 2 C-accelerated pickle
    except ImportError:  # Python 3: cPickle no longer exists
        import pickle
    # SECURITY NOTE: pickle.load executes arbitrary code from the file;
    # only load feature files from trusted sources.
    samples = pickle.load(args.features_file)
    features = args.feature if args.feature else DEFAULT_FEATURES
    from Vector import FeatureVector
    data, labels = map(np.array,
                       zip(*[(FeatureVector(p, features).get(), p.user)
                             for p in samples]))
    num_users = len(np.unique([s.user for s in samples]))
    # NOTE(review): this shuffled index is only consumed by the disabled
    # kNN path that used to live below; it is kept so the global numpy RNG
    # state (and hence the pandas .sample() call) is unchanged.
    s = np.arange(data.shape[0])
    np.random.shuffle(s)
    # Map each distinct label to a dense integer id, then encode all labels.
    labeldict = {label: idx for idx, label in enumerate(set(labels))}
    lin = np.array([labeldict[label] for label in labels])
    # Merge the integer labels (column '0_x') with the feature matrix on the
    # row index, then shuffle rows; merging first keeps labels aligned.
    shuffledf = pd.DataFrame(data=lin.flatten())
    trainingdf = pd.merge(shuffledf, pd.DataFrame(data=data),
                          left_index=True, right_index=True).sample(frac=1)
    labelsshuffled = trainingdf['0_x']
    datashuffled = trainingdf.drop(['0_x'], axis=1)

    # Hold out the last 20 shuffled rows as the test set.
    classify(num_users,
             datashuffled.to_numpy()[:-20], labelsshuffled.to_numpy()[:-20],
             datashuffled.to_numpy()[-20:], labelsshuffled.to_numpy()[-20:])
 
-    print(x_train[0])
 
-    model = tf.keras.models.Sequential()
-    model.add(tf.keras.layers.Flatten())
-    model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
-    model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
-    model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))
-
-    model.compile(optimizer='SGD',
-                  loss='sparse_categorical_crossentropy',
def classify(classifications: int, sampleData: list, labels: list, testdata: list, testlabels: list):
    """Train a small feed-forward network and print held-out test metrics.

    classifications: number of output classes.
    sampleData, labels: training feature rows and integer class labels.
    testdata, testlabels: held-out feature rows and integer class labels.
    Prints the [loss, accuracy] list returned by model.evaluate.
    """
    n_features = len(sampleData[0])
    model = Sequential()
    model.add(Dense(n_features, input_dim=n_features))
    model.add(Activation('sigmoid'))
    model.add(Dense(n_features // 2))
    model.add(Activation('sigmoid'))
    model.add(Dense(classifications))
    # BUG FIX: the output activation was 'relu', which is not a probability
    # distribution and can emit exact zeros, so categorical_crossentropy can
    # diverge on log(0). Softmax is the standard pairing with this loss.
    model.add(Activation('softmax'))
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    one_hot_labels = keras.utils.to_categorical(labels, num_classes=classifications)
    model.fit(sampleData, one_hot_labels, epochs=10, batch_size=n_features)
    one_hot_test_labels = keras.utils.to_categorical(testlabels, num_classes=classifications)
    results = model.evaluate(testdata, one_hot_test_labels, batch_size=128)
    print(results)
+
 
 
 # data and results arrays (training and testing) should be paired. Classifications is number of ways to classify data.
-def multiLayerPerceptronClassifier(classifications: int, data: list, results: list, testdata: list, testresults: list):
+'''def multiLayerPerceptronClassifier(classifications: int, data: list, labels: list, testdata: list, testresults: list):
     numberOfNeurons = (len(data[0]) + classifications)/2
     model = tf.keras.models.Sequential()
     model.add(tf.keras.layers.Flatten())
@@ -39,12 +99,53 @@ def multiLayerPerceptronClassifier(classifications: int, data: list, results: li
     model.compile(optimizer='SGD',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])
-    model.fit(data, results, epochs=5)
+    print(data)
+    model.fit(data, labels, epochs=5)
 
     loss, accuracy = model.evaluate(testdata, testresults)
     print(loss)
-    print(accuracy)
+    print(accuracy)'''
+
+
def parse_args(args: list):
    """Parse command-line options for the classifier.

    args: argv-style list of strings (typically sys.argv[1:]).
    Returns the populated argparse.Namespace; 'features_file' is opened
    in binary mode by argparse.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Run a data set through a kNearestNeighbors classifier.')
    add = parser.add_argument  # local alias: one lookup, shorter lines
    add('features_file', type=argparse.FileType('rb'),
        help='File of extracted features.')
    add('-v', '--verbose', action="count", default=0,
        help='Show more information')
    add('-n', '--folds', type=int, default=5,
        help='Number of cross-validation folds (default: 5)')
    add('-k', '--k-neighbors', type=int, default=5,
        help='Number of neighbors to consider (default: 5)')
    add('-w', '--weight', choices=["uniform", "distance"], default="uniform",
        help='Weight function for determining \
                        distance (default: \"Uniform\")')
    add('-f', '--feature', action='append', type=str,
        help='Add feature to list of features to test with.')
    add('-p', '--p-value', action='store_const', default=False, const=True,
        help='Calculate a p-value from a t-test.')
    add('-t', '--top', type=int, default=1,
        help='Number of guesses to be considered \"correct\" \
                        (default: 1)')
    add('-g', '--graph', action="store_true",
        help='Generates a confusion matrix.')
    add('--graph-top', action="store_true",
        help='Generates a graph of accuracy in top N guesses.')
    add('--graph-k', action="store_true",
        help='Generates a graph of accuracy for k-nearest neighbors.')
    add('--graph-weights', action="store_true",
        help='Generates a graph comparing weights.')
    return parser.parse_args(args)
 
 
 if __name__ == '__main__':
-    main()
+    import sys
+
+    for filename in os.listdir('./samples2.0'):
+        print(os.path.join('./samples2.0', filename))
+        main(["-f*", os.path.join('./samples2.0', filename)])
+        #main(sys.argv[1:])
+    #print(sys.argv[1:])
+    #main(sys.argv[1:])

+ 15 - 0
src/classifiers/interpres.py

@@ -0,0 +1,15 @@
+
def main(fname):
    """Filter a log file: print '.plo' lines comma-separated (newline
    stripped) and echo lines that start with '['.

    fname: argv-style list whose first element is the path to read.
    """
    # BUG FIX: the original opened the file and never closed it; a context
    # manager guarantees the handle is released even on error.
    with open(fname[0], 'r') as f:
        # Direct iteration replaces the readline()/"" sentinel loop; the
        # per-line behavior is unchanged.
        for line in f:
            if '.plo' in line:
                # Drop the trailing newline and join entries with ", ".
                print(line[:-1], end=", ")
            elif line[0] == '[':
                print(line)


if __name__ == '__main__':
    import sys
    main(sys.argv[1:])