Browse Source

Added graph generation to scripts.

Also fixed an off-by-one in the top-n check in kNearestNeighbors.
Tom Flucke 6 years ago
parent
commit
145b514131

+ 72 - 35
src/classifiers/nearestneighbors.py

@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/home/tflucke/bin/bin/python3
 
 from sklearn.model_selection import KFold
 import numpy as np
@@ -26,10 +26,57 @@ def main(options: list):
                        zip(*[(FeatureVector(p, features).get(), p.user)
                              for p in samples]))
     num_users = len(np.unique([s.user for s in samples]))
-    avg, p = classify(data, labels, num_users, args)
-    print("Overall Accuracy: %f" % avg)
+    s = np.arange(data.shape[0])
+    np.random.shuffle(s)
+    if args.graph_top:
+        graph_top(args, data[s], labels[s])
+    res, matrix = kNearestNeighbors(data[s], labels[s], n=args.folds,
+                                    verbose=args.verbose, guesses=args.top,
+                                    k=args.k_neighbors, weights=args.weight)
+    print("Overall Accuracy: %f" % np.average(res))
     if args.p_value:
-        print("P-Value: %f" % p)
+        print("P-Value: %f" % t_test(res, num_users)[1] / 2)
+    if args.graph:
+        gen_confusion_matrix(matrix, labels)
+
+def graph_top(args, data, labels):
+    t = 0
+    label_list = np.unique(labels)
+    res = []
+    while t < len(label_list):
+        t += 1
+        res.append(
+            (t, np.average(
+                kNearestNeighbors(data, labels, n=args.folds, guesses=t,
+                                  verbose=args.verbose, k=args.k_neighbors,
+                                  weights=args.weight)[0]))
+        )
+    import seaborn as sns
+    from pandas import DataFrame
+    from matplotlib import pyplot as plt
+    dataset = DataFrame(res, columns=["Top-N Guesses", "Accuracy"])
+    graph = sns.lineplot("Top-N Guesses", "Accuracy", data=dataset)
+    graph.set_xticks(np.arange(1, len(label_list), 2))
+    graph.set_yticks(np.arange(0, 1, 0.1))
+    plt.title('K-Nearest Neighbor Accuracy on Nth Guess')
+    graph.get_figure().savefig("nearest-neighbor-top-n.png")
+
+def gen_confusion_matrix(matrix, labels):
+    import seaborn as sns
+    from pandas import DataFrame
+    from matplotlib import pyplot as plt
+    label_list = list(map(lambda l: l[0:6], sorted(np.unique(labels))))
+    dataset = DataFrame(matrix, columns=label_list, index=label_list)
+    sns.set(font_scale=0.8)
+    graph = sns.heatmap(data=dataset, annot=True, cbar=False)
+    graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
+                          horizontalalignment="right")
+    plt.subplots_adjust(left=0.15, bottom=0.2)
+    plt.ylabel('True Label')
+    plt.xlabel('Predicted Label')
+    plt.title('K-Nearest Neighbor Confusion Matrix')
+    graph.get_figure().savefig("nearest-neighbor.png")
 
 def parse_args(args: list):
     import argparse
@@ -53,23 +100,30 @@ def parse_args(args: list):
     parser.add_argument('-t', '--top', type=int, default=1,
                         help='Number of guesses to be considered \"correct\" \
                         (default: 1)')
+    parser.add_argument('-g', '--graph', action="store_true",
+                        help='Generates a confusion matrix.')
+    parser.add_argument('--graph-top', action="store_true",
+                        help='Generates a graph of accuracy in top N guesses.')
     return parser.parse_args(args)
 
 def classify(data, labels, num_users: int, args):
     s = np.arange(data.shape[0])
     np.random.shuffle(s)
-    res = kNearestNeighbors(data[s], labels[s],
-                            n=args.folds, verbose=args.verbose, k=args.k_neighbors,
-                            weights=args.weight, guesses=args.top)
+    res, _ = kNearestNeighbors(data[s], labels[s], n=args.folds,
+                               verbose=args.verbose, guesses=args.top,
+                               k=args.k_neighbors, weights=args.weight)
     return (np.average(res), t_test(res, num_users)[1] / 2)
 
 def kNearestNeighbors(data: list, labels: list,
                       n=5, verbose=0, k=5, weights="uniform", guesses=1):
     from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
+    from sklearn.metrics import confusion_matrix
     folds = KFold(n_splits=n)
     i = 1
     avg = 0
     accuracies = []
+    output = []
+    truth = []
     label_list = sorted(np.unique(labels))
     for train_index, test_index in folds.split(data):
         if verbose >= 1:
@@ -80,10 +134,12 @@ def kNearestNeighbors(data: list, labels: list,
         kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
         kn.fit(data[train_index], labels[train_index])
         predictions = kn.predict(data[test_index])
+        output.extend(predictions)
+        truth.extend(labels[test_index])
         if guesses <= 1:
             correct = [a == p for a, p in zip(labels[test_index], predictions)]
         else:
-            correct = list(map(lambda x: x <= guesses,
+            correct = list(map(lambda x: x < guesses,
                                find_in_predictions(
                                    kn.predict_proba(data[test_index]),
                                    labels[test_index],
@@ -92,8 +148,15 @@ def kNearestNeighbors(data: list, labels: list,
         if verbose >= 1:
             print(accuracy)
         accuracies.append(accuracy)
-    return accuracies
+    return (accuracies, confusion_matrix(truth, output, labels=label_list))
 
+def find_in_predictions(probabilities: list, truth: list, labels: list):
+    return [
+        list(map(lambda x: x[0],
+                 sorted(zip(labels, probs), key=lambda x: -x[1])
+             )).index(actual)
+        for probs, actual in zip(probabilities, truth)
+    ]
 
 # TODO: This should be in a separate file.
 # If we need a unified interface we can make an aggregator.
@@ -116,31 +179,6 @@ def multiLayerPerceptronClassifier(classifications: int, data: list, results: li
     print(loss)
     print(accuracy)
 
-# TODO: This should be in a separate file.
-# If we need a unified interface we can make an aggregater.
-# TODO: KFold validation
-def randomForest(data: list, labels: list, test_data: list, test_data_labels: list):
-    from sklearn.ensemble import RandomForestClassifier
-    rfc = RandomForestClassifier(n_estimators=10)
-    rfc.fit(data, labels)
-    predictions = rfc.predict(test_data)
-    for t in range(len(test_data)):
-        print(str(test_data[t]) + "prediction: " + str(predictions[t]))
-    if len(test_data) == 0:
-        return
-    accuracysum = 0
-    for t in range(len(test_data)):
-        accuracysum += 1 if predictions[t] == test_data_labels[t] else 0
-    print("Accuracy: " + str(accuracysum/len(test_data_labels)))
-
-
-def find_in_predictions(probabilities: list, tests: int, labels: list):
-    return [list(map(lambda x: x[0],
-                     sorted(list(zip(labels, probs)), key=lambda x: x[1]))
-    ).index(test)
-            for probs, test in zip(probabilities, tests)]
-
-
 def t_test(accuracy: list, num_users: int):
     from scipy import stats
     random_avg = 1.0/num_users
@@ -148,7 +186,6 @@ def t_test(accuracy: list, num_users: int):
     # If all numbers are identical, p-value = 1
     return res if not np.isnan(res[0]) else (0, 1)
 
-
 if __name__ == '__main__':
     import sys
     main(sys.argv[1:])
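
The top-n fix in this file is easiest to see in isolation: find_in_predictions returns the 0-based rank of the true label among the classes sorted by descending predicted probability, so "correct within n guesses" means rank < n; the old rank <= n check also accepted the (n+1)-th guess. A minimal standalone sketch (synthetic labels and probabilities, not project data):

    def find_in_predictions(probabilities, truth, labels):
        # 0-based position of the true label, classes sorted by
        # descending predicted probability (as in the diff above)
        return [
            [label for label, _ in sorted(zip(labels, probs),
                                          key=lambda x: -x[1])].index(actual)
            for probs, actual in zip(probabilities, truth)
        ]

    labels = ["alice", "bob", "carol"]
    ranks = find_in_predictions([[0.2, 0.5, 0.3]], ["alice"], labels)
    print(ranks)           # [2]: "alice" is only the third guess
    print(ranks[0] < 2)    # False -- new check: not within the top 2
    print(ranks[0] <= 2)   # True  -- old check wrongly counted this as correct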

+ 37 - 8
src/classifiers/randomforest.py

@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/home/tflucke/bin/bin/python3
 
 from sklearn.model_selection import KFold
 import numpy as np
@@ -26,10 +26,29 @@ def main(options: list):
                        zip(*[(FeatureVector(p, features).get(), p.user)
                              for p in samples]))
     num_users = len(np.unique([s.user for s in samples]))
-    avg, p = classify(data, labels, num_users, args)
-    print("Overall Accuracy: %f" % avg)
+    s = np.arange(data.shape[0])
+    np.random.shuffle(s)
+    res, matrix = random_forest(data[s], labels[s], fn=args.criterion,
+                                n=args.folds, verbose=args.verbose,
+                                estimators=args.estimators)
+    print("Overall Accuracy: %f" % np.average(res))
     if args.p_value:
         print("P-Value: %f" % p)
+    if args.graph:
+        import seaborn as sns
+        from pandas import DataFrame
+        from matplotlib import pyplot as plt
+        label_list = list(map(lambda l: l[0:6], sorted(np.unique(labels))))
+        dataset = DataFrame(matrix, columns=label_list, index=label_list)
+        sns.set(font_scale=0.8)
+        graph = sns.heatmap(data=dataset, annot=True, cbar=False)
+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
+                              horizontalalignment="right")
+        plt.subplots_adjust(left=0.15, bottom=0.2)
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.title('Random Forest Confusion Matrix')
+        graph.get_figure().savefig("random-forest.png")
 
 def parse_args(args: list):
     import argparse
@@ -50,23 +69,28 @@ def parse_args(args: list):
                         help='Add feature to list of features to test with.')
     parser.add_argument('-p', '--p-value', action='store_const', default=False,
                         const=True, help='Calculate a p-value from a t-test.')
+    parser.add_argument('-g', '--graph', action="store_true",
+                        help='Generates a confusion matrix.')
     return parser.parse_args(args)
 
 def classify(data, labels, num_users: int, args):
     s = np.arange(data.shape[0])
     np.random.shuffle(s)
-    res = random_forest(data[s], labels[s],
-                       n=args.folds, verbose=args.verbose, fn=args.criterion,
-                       estimators=args.estimators)
+    res, _ = random_forest(data[s], labels[s],
+                           n=args.folds, verbose=args.verbose, fn=args.criterion,
+                           estimators=args.estimators)
     return (np.average(res), t_test(res, num_users)[1] / 2)
 
 def random_forest(data: list, labels: list, n=5, verbose=0, estimators=100,
                  fn="gini"):
     from sklearn.ensemble import RandomForestClassifier
+    from sklearn.metrics import confusion_matrix
     folds = KFold(n_splits=n)
     i = 1
     avg = 0
     accuracies = []
+    output = []
+    truth = []
     label_list = sorted(np.unique(labels))
     for train_index, test_index in folds.split(data):
         if verbose >= 1:
@@ -76,11 +100,16 @@ def random_forest(data: list, labels: list, n=5, verbose=0, estimators=100,
             print("Training on: ", train_index)
         rfc = RandomForestClassifier(n_estimators=estimators, criterion=fn)
         rfc.fit(data[train_index], labels[train_index])
-        accuracy = rfc.score(data[test_index], labels[test_index])
+        predictions = rfc.predict(data[test_index])
+        output.extend(predictions)
+        truth.extend(labels[test_index])
+        accuracy = ([a == p for a, p in zip(labels[test_index], predictions)]
+                    .count(True) / len(predictions))
         if verbose >= 1:
             print(accuracy)
         accuracies.append(accuracy)
-    return accuracies
+    return (accuracies, confusion_matrix(truth, output, labels=label_list))
 
 def t_test(accuracy: list, num_users: int):
     from scipy import stats
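
Note that swapping rfc.score for the hand-rolled accuracy above is behavior-preserving; the point of the change is to expose per-fold predictions for the confusion matrix. A quick sketch on synthetic arrays (sklearn's accuracy_score stands in for rfc.score, which reports the same mean accuracy):

    import numpy as np
    from sklearn.metrics import accuracy_score

    truth = np.array(["a", "b", "a", "c"])
    predictions = np.array(["a", "b", "c", "c"])
    # written as in the diff above
    manual = [a == p for a, p in zip(truth, predictions)].count(True) \
             / len(predictions)
    print(manual)                              # 0.75
    print(accuracy_score(truth, predictions))  # 0.75, identical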

+ 180 - 0
src/distributer/collect_compressed.py

@@ -0,0 +1,180 @@
+#!/home/tflucke/bin/bin/python3
+
+import typing
+import numpy as np
+
+EXTRACTION_PARAMS = {
+    "-a": ("Small Paste Size (Blocks)", int, np.arange(1,7)),
+    "-p": ("Large Paste Size (Blocks)", lambda x: int(x.split(".", 1)[0]),
+           np.arange(2,8)),
+    "-l": ("Low Activity Threshold (k/s)", lambda x: float(x) - 0.001,
+           np.arange(1,7)),
+    "-i": ("High Activity Threshold (k/s)", lambda x: float(x) + 0.001,
+           [1.5, 2, 2.5, 3, 3.5, 4, 4.5]),
+    "-b": ("Lookback (s)", int, np.arange(1,7)),
+    "-s": ("Sample Size (Count)", int, [100, 150, 200, 300, 400, 500, 600]),
+    "-m": ("Minimum Number of Samples (Samples/Tag)", int, [5, 10, 15, 20, 25])
+}
+
+def main(options: list):
+    args = parse_args(options)
+    heaps = [load_file(f, args.compression) for f in args.in_files]
+    if args.action == "print":
+        import heapq
+        heap = list(heapq.merge(*heaps))
+        while heap:
+            print("Accuracy: %0.02f; P-Value: %0.04f; File: %80s; Options: %s" %
+                  heapq.heappop(heap))
+    elif args.action == "count-files":
+        from numpy import unique
+        files = list(zip(*unique([res[2] for h in heaps for res in h],
+                                 return_counts=True)))
+        files.sort(key=lambda x: x[1])
+        for f in files:
+            print("Count: %4d; File: %80s" % (f[1], f[0]))
+    elif args.action == "count-feature-sets":
+        from numpy import unique
+        files = list(zip(*unique([res[3] for h in heaps for res in h],
+                                 return_counts=True)))
+        files.sort(key=lambda x: x[1])
+        for f in files:
+            print("Count: %4d; Features: %s" % (f[1], f[0]))
+    elif args.action == "count-features":
+        count_features(args, heaps)
+    elif args.action == "count-extraction-flag":
+        count_preprocess_params(args, heaps)
+    elif args.action == "feature-relation":
+        count_features_2d(args, heaps)
+
+def count_features(args, heaps):
+    from numpy import unique
+    features = [feature for h in heaps
+                for res in h
+                for feature in res[3]]
+    feature_counts = list(zip(*unique(features, return_counts=True)))
+    feature_counts.sort(key=lambda x: x[1])
+    for f in feature_counts:
+        print("Count: %4d; Feature: %s" % (f[1], f[0]))
+    if args.graph:
+        import seaborn as sns
+        from pandas import DataFrame
+        from matplotlib import pyplot as plt
+        dataset = DataFrame(features, columns=["Feature"])
+        sns.set(font_scale=0.65)
+        graph = sns.countplot("Feature", data=dataset, palette="Set1",
+                              order=dataset['Feature'].value_counts().index)
+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
+                              horizontalalignment="right")
+        plt.subplots_adjust(bottom=0.25)
+        if args.log_scale:
+            graph.set_yscale('log')
+        graph.get_figure().savefig("%s.png" % args.action)
+
+def count_features_2d(args, heaps):
+    from numpy import unique
+    features = unique([feature for h in heaps
+                       for res in h
+                       for feature in res[3]])
+    feature_pairs = []
+    mi = float('inf')
+    ma = 0
+    for feature in features:
+        print("%s:" % feature)
+        pairings = []
+        for feature_pair in features:
+            if feature_pair == feature:
+                pairings.append(0)
+                continue
+            count = sum([feature_pair in res[3]
+                         for h in heaps
+                         for res in h
+                         if feature in res[3]])
+            print("\tFeature: %s; Count: %4d" % (feature_pair, count))
+            pairings.append(count)
+        ma = max(*pairings, ma)
+        mi = min(*pairings, mi)
+        feature_pairs.append(pairings)
+    if args.graph:
+        import seaborn as sns
+        from pandas import DataFrame
+        from matplotlib import pyplot as plt
+        dataset = DataFrame(feature_pairs, index=features, columns=features)
+        sns.set(font_scale=0.65)
+        if args.log_scale:
+            from matplotlib.colors import LogNorm
+            # self-pair counts are stored as 0, so clamp vmin to 1 for the log norm
+            graph = sns.heatmap(data=dataset, norm=LogNorm(max(mi, 1), ma))
+        else:
+            graph = sns.heatmap(data=dataset)
+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
+                              horizontalalignment="right")
+        plt.subplots_adjust(left=0.25, bottom=0.25)
+        plt.ylabel('Feature')
+        plt.xlabel('Feature')
+        plt.title('Occurrences of Feature Pairs')
+        graph.get_figure().savefig("%s.png" % args.action)
+
+def count_preprocess_params(args, heaps):
+    from numpy import unique
+    parameters = [EXTRACTION_PARAMS[args.flag][1](
+        res[2][res[2].index(args.flag) + 2:].split("-", 1)[0]
+    ) for h in heaps for res in h]
+    flags = list(zip(*unique(parameters, return_counts=True)))
+    flags.sort(key=lambda x: x[1])
+    for f in flags:
+        print("Count: %4d; Value: %s" % (f[1], f[0]))
+    if args.graph:
+        import seaborn as sns
+        from pandas import Series
+        sns.set(font_scale=0.65)
+        graph = sns.distplot(Series(parameters,
+                                    name=EXTRACTION_PARAMS[args.flag][0]),
+                             bins=EXTRACTION_PARAMS[args.flag][2], kde=False)
+        graph.set(xlim=(EXTRACTION_PARAMS[args.flag][2][0],
+                        EXTRACTION_PARAMS[args.flag][2][-1]))
+        graph.get_figure().savefig("%s%s.png" % (args.action, args.flag))
+
+def load_file(filename: str, compression=None):
+    import compress_pickle
+    return [r[0:3] + (r[3].feature,)
+            for r in compress_pickle.load(filename, compression=compression)]
+
+def parse_args(args: list):
+    import argparse
+    parser = argparse.ArgumentParser(description='Collect the output from \
+    distributed distribute_compressor.')
+    parser.add_argument('action', choices=["print", "count-files",
+                                           "count-feature-sets",
+                                           "count-features", "feature-relation",
+                                           "count-extraction-flag"],
+                        help='Action to take.')
+    parser.add_argument('in_files', nargs='+', type=str, #argparse.FileType('wb')
+                        help='Input file(s) to collect.')
+    parser.add_argument('-v', '--verbose', action="count", default=0,
+                        help='Show more information')
+    parser.add_argument('-c', '--compression', default="bz2",
+                        choices=["bz2", "gzip", "lzma", "zipfile", None],
+                        help='Compression algorithm to use. (default: bzip2)')
+    try:
+        import seaborn
+        parser.add_argument('-g', '--graph', action="store_true",
+                            help='Generate and store a graph of the results.')
+        parser.add_argument('--log-scale', action="store_true",
+                            help='Y-axis log scale')
+    except ImportError:
+        pass
+    res, unknown = parser.parse_known_args(args)
+    if res.action == "count-extraction-flag":
+        if len(unknown) != 1:
+            import sys
+            print("count-extraction-flag requires exactly 1 flag to search for.",
+                  file=sys.stderr)
+            exit(2)
+        else:
+            vars(res)["flag"] = unknown[0]
+    elif unknown:
+        parser.print_help()
+        exit(2)
+    return res
+
+if __name__ == '__main__':
+    import sys
+    main(sys.argv[1:])
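
The least obvious part of this new script is count_preprocess_params, which recovers an extraction parameter from each result's file name rather than from stored options: index(args.flag) + 2 skips the two flag characters, and the value runs up to the next '-'. A sketch under an assumed file-name layout (the real naming scheme is not shown in this commit):

    EXTRACTION_PARAMS = {"-a": ("Small Paste Size (Blocks)", int, None)}
    filename = "samples-a3-p5-l2.pickle"   # hypothetical name embedding flags
    flag = "-a"
    raw = filename[filename.index(flag) + 2:].split("-", 1)[0]
    print(EXTRACTION_PARAMS[flag][1](raw))  # 3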

+ 52 - 43
src/distributer/distribute_compressor.py

@@ -1,71 +1,80 @@
 #!/home/tflucke/bin/bin/python3
 
-import os, sys, struct, typing, numpy as np
+import os, sys, typing, numpy as np
 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
                     '/../classifiers/')
 from Vector import FeatureVector
-try:
-    import compress_pickle
-except ImportError:
-    try:
-        import cPickle as compress_pickle
-    except ImportError:
-        import pickle as compress_pickle
-
-USHRT_MAX=(1 << 16 - 1)
+import pickle
 
 def main(options: list):
     args = parse_args(options)
+    heap = []
+    import heapq, signal
+    def dump(sig, frame):
+        print("Dumping to file %s." % args.out_file, file=sys.stderr)
+        with open(args.out_file, 'wb') as f:
+            pickle.dump(heap, f)
+    signal.signal(signal.SIGUSR1, dump)
+    def dump_exit(sig, frame, i):
+        dump(sig, frame)
+        exit(i)
+    signal.signal(signal.SIGTERM, lambda sig, frame: dump_exit(sig, frame, 3))
     if args.classifier == "nearest-neighbor":
         import nearestneighbors as classifier
+    if args.classifier == "random-forest":
+        import randomforest as classifier
     if args.final_statistic == "median":
-        avg_fn = np.median
+        avg_fn = lambda values: np.median([classifier.classify(*values)
+                                    for i in range(0, args.reruns)], 0)
     else:
-        avg_fn = np.average
+        avg_fn = lambda values: np.average([classifier.classify(*values)
+                                    for i in range(0, args.reruns)], 0)
     print("Ready for input!", flush=True)
     for line in sys.stdin:
-        options = classifier.parse_args(line.split())
-        samples = compress_pickle.load(options.features_file, compression=None)
-        num_users = len(np.unique([s.user for s in samples]))
-        if num_users < args.min_users:
-            print("%s: Too few users.  Skipping..." % args.features_file.name, file=sys.stderr, flush=True)
+        res = process_options(line, classifier, avg_fn)
+        if len(heap) == args.max_outputs:
+            heapq.heappushpop(heap, res)
         else:
-            features = options.feature if options.feature else classifier.DEFAULT_FEATURES
-            data, labels = map(np.array,
-                               zip(*[(FeatureVector(p, features).get(), p.user)
-                                     for p in samples]))
-            runs = [classifier.classify(data, labels, num_users, options) for i in range(0, args.reruns)]
-            #print(runs, file=sys.stderr)
-            write_to_file(args.out_file, args.compression, options, avg_fn(runs, 0))
-            print("Finished: '%s'" % line.strip(), flush=True)
+            heapq.heappush(heap, res)
+        print("Finished: '%s'" % line.strip(), flush=True)
+    dump(0, 0)
 
-def write_to_file(out: typing.BinaryIO, compression: str, options: list, results: (float, float)):
+def process_options(line: str, classifier, avg_fn):
+    options = classifier.parse_args(line.split())
+    samples = pickle.load(options.features_file)
+    num_users = len(np.unique([s.user for s in samples]))
+    features = options.feature if options.feature else classifier.DEFAULT_FEATURES
+    data, labels = map(np.array,
+                       zip(*[(FeatureVector(p, features).get(), p.user)
+                             for p in samples]))
+    runs = avg_fn((data, labels, num_users, options))
     filename = options.features_file.name
     del options.features_file
-    #print("Args: %s; results: %s" % (options, results), file=sys.stderr)
-    res_packed = struct.pack("@HH", *[int(USHRT_MAX*v) for v in results])
-    compress_pickle.dump((filename, options, res_packed), out, compression=compression)
-
-def read_from_file(in_file: typing.BinaryIO, compression: str = None):
-    res = compress_pickle.load(in_file, compression = compression)
-    return (res[0], res[1], tuple(float(v)/USHRT_MAX for v in struct.unpack("@HH", res[1])))
+    return (*runs, filename, options)
 
 def parse_args(args: list):
     import argparse
-    parser = argparse.ArgumentParser(description='Run a series of tests and compress the output.')
-    parser.add_argument('classifier', choices=["nearest-neighbor"], help='Classifier to use.')
-    parser.add_argument('out_file', type=argparse.FileType('wb'), help='Output file name.')
+    parser = argparse.ArgumentParser(description='Run a series of tests and \
+    compress the output.')
+    parser.add_argument('classifier', choices=["nearest-neighbor", "random-forest"],
+                        help='Classifier to use.')
+    parser.add_argument('out_file', type=str, #argparse.FileType('wb')
+                        help='Output file name.')
     parser.add_argument('-v', '--verbose', action="count", default=0,
                         help='Show more information')
-    parser.add_argument('-m', '--min-users', type=int, default=10,
-                        help='Minimum number of unique users to consider a sample\
-                        file valid. (default: 10)')
     parser.add_argument('-r', '--reruns', type=int, default=3,
-                        help='Number of times to rerun a sample set. (default: 3)')
-    parser.add_argument('-f', '--final-statistic', choices=["mean", "median"], default="median",
-                        help='Final statistic to show. (default: median)')
-    parser.add_argument('-c', '--compression', choices=["bz2", "gzip", "lzma", "zipfile", None], default="bz2",
+                        help='Number of times to rerun a sample set. \
+                        (default: 3)')
+    parser.add_argument('-f', '--final-statistic', choices=["mean", "median"],
+                        default="median", help='Final statistic to show. \
+                        (default: median)')
+    parser.add_argument('-c', '--compression', default="bz2",
+                        choices=["bz2", "gzip", "lzma", "zipfile", None],
                         help='Compression algorithm to use. (default: bzip2)')
+    parser.add_argument('-m', '--max-outputs', type=int, default=100,
+                        help='Maximum number of best outputs to print. \
+                        (default: 100)')
     return parser.parse_args(args)
 
 if __name__ == '__main__':
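
The dump-on-signal pattern introduced above (SIGUSR1 writes the current heap without stopping; SIGTERM dumps and exits) can be exercised standalone. A minimal sketch, assuming a POSIX system and a writable temp path:

    import os, pickle, signal, tempfile

    out_path = os.path.join(tempfile.gettempdir(), "heap-demo.pickle")
    heap = [(0.9, 0.01, "some-file", "some-options")]   # stand-in results

    def dump(sig, frame):
        with open(out_path, 'wb') as f:
            pickle.dump(heap, f)

    signal.signal(signal.SIGUSR1, dump)
    os.kill(os.getpid(), signal.SIGUSR1)   # or `kill -USR1 <pid>` from a shell
    with open(out_path, 'rb') as f:
        print(pickle.load(f))              # the checkpointed heap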