il y a 6 ans · 145b514131
--- a/src/classifiers/nearestneighbors.py
+++ b/src/classifiers/nearestneighbors.py
@@ -1,4 +1,4 @@
 
															-#!/usr/bin/python3
														
 
															+#!/home/tflucke/bin/bin/python3
														
 
															 from sklearn.model_selection import KFold
														
 
															 import numpy as np
														
@@ -26,10 +26,57 @@ def main(options: list):
 
															                        zip(*[(FeatureVector(p, features).get(), p.user)
														
 
															                              for p in samples]))
														
 
															     num_users = len(np.unique([s.user for s in samples]))
														
 
															-    avg, p = classify(data, labels, num_users, args)
														
 
															-    print("Overall Accuracy: %f" % avg)
														
 
															+    s = np.arange(data.shape[0])
														
 
															+    np.random.shuffle(s)
														
 
															+    if args.graph_top:
														
 
															+        graph_top(args, data[s], labels[s])
														
 
															+    res, matrix = kNearestNeighbors(data[s], labels[s], n=args.folds,
														
 
															+                                    verbose=args.verbose, guesses=args.top,
														
 
															+                                    k=args.k_neighbors, weights=args.weight)
														
 
															+    print("Overall Accuracy: %f" % np.average(res))
														
 
															     if args.p_value:
														
 
															-        print("P-Value: %f" % p)
														
 
															+        print("P-Value: %f" % (t_test(res, num_users)[1] / 2))  # parenthesised: `%` and `/` share precedence, so the string was being divided by 2
														
 
															+    if args.graph:
														
 
															+        gen_confusion_matrix(matrix, labels)
														
 
															+
														
 
															+def graph_top(args, data, labels):  # Plot mean k-fold accuracy against the number of guesses allowed (1..number of labels) and save it as a PNG
														
 
															+    t = 0
														
 
															+    label_list = np.unique(labels)
														
 
															+    res = []
														
 
															+    while t < len(label_list):
														
 
															+        t += 1
														
 
															+        res.append(  # one (guesses, mean accuracy over folds) row per value of t
														
 
															+            (t, np.average(
														
 
															+                kNearestNeighbors(data, labels, n=args.folds, guesses=t,
														
 
															+                                  verbose=args.verbose, k=args.k_neighbors,
														
 
															+                                  weights=args.weight)[0])
														
 
															+         )
														
 
															+        )
														
 
															+    import seaborn as sns
														
 
															+    from pandas import DataFrame
														
 
															+    from matplotlib import pyplot as plt
														
 
															+    dataset = DataFrame(res, columns=["Top-N Guesses", "Accuracy (%)"])
														
 
															+    graph = sns.lineplot(x="Top-N Guesses", y="Accuracy (%)", data=dataset)  # y must name the column created above; keywords work on every seaborn release
														
 
															+    graph.set_xticks(np.arange(1, len(label_list), 2))
														
 
															+    graph.set_yticks(np.arange(0, 1, 0.1))
														
 
															+    plt.title('K-Nearest Neighbor Accuracy on Nth Guess')
														
 
															+    graph.get_figure().savefig("nearest-neighbor-top-n.png")
														
 
															+
														
 
															+def gen_confusion_matrix(matrix, labels):  # Draw `matrix` as an annotated heatmap and save it to nearest-neighbor.png
														
 
															+    import seaborn as sns  # plotting libraries are imported inside the function rather than at module top
														
 
															+    from pandas import DataFrame
														
 
															+    from matplotlib import pyplot as plt
														
 
															+    label_list = list(map(lambda l: l[0:6], sorted(np.unique(labels))))  # tick labels: sorted unique labels, each cut to its first 6 characters
														
 
															+    dataset = DataFrame(matrix, columns=label_list, index=label_list)  # the same label list names both rows and columns
														
 
															+    sns.set(font_scale=0.8)
														
 
															+    graph = sns.heatmap(data=dataset, annot=True, cbar=False)  # annot=True writes each cell's value into the cell; no colour bar
														
 
															+    graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
														
 
															+                          horizontalalignment="right")
														
 
															+    plt.subplots_adjust(left=0.15, bottom=0.2)  # extra margin for the tick labels
														
 
															+    plt.ylabel('True Label')
														
 
															+    plt.xlabel('Predicted Label')
														
 
															+    plt.title('K-Nearest Neighbor Confusion Matrix')
														
 
															+    graph.get_figure().savefig("nearest-neighbor.png")
														
 
															 def parse_args(args: list):
														
 
															     import argparse
														
@@ -53,23 +100,30 @@ def parse_args(args: list):
 
															     parser.add_argument('-t', '--top', type=int, default=1,
														
 
															                         help='Number of guesses to be considered \"correct\" \
														
 
															                         (default: 1)')
														
 
															+    parser.add_argument('-g', '--graph', action="store_true",
														
 
															+                        help='Generates a confusion matrix.')
														
 
															+    parser.add_argument('--graph-top', action="store_true",
														
 
															+                        help='Generates a graph of accuracy in top N guesses.')
														
 
															     return parser.parse_args(args)
														
 
															 def classify(data, labels, num_users: int, args):
														
 
															     s = np.arange(data.shape[0])
														
 
															     np.random.shuffle(s)
														
 
															-    res = kNearestNeighbors(data[s], labels[s],
														
 
															-                            n=args.folds, verbose=args.verbose, k=args.k_neighbors,
														
 
															-                            weights=args.weight, guesses=args.top)
														
 
															+    res, _ = kNearestNeighbors(data[s], labels[s], n=args.folds,
														
 
															+                               verbose=args.verbose, guesses=args.top,
														
 
															+                               k=args.k_neighbors, weights=args.weight)
														
 
															     return (np.average(res), t_test(res, num_users)[1] / 2)
														
 
															 def kNearestNeighbors(data: list, labels: list,
														
 
															                       n=5, verbose=0, k=5, weights="uniform", guesses=1):
														
 
															     from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
														
 
															+    from sklearn.metrics import confusion_matrix
														
 
															     folds = KFold(n_splits=n)
														
 
															     i = 1
														
 
															     avg = 0
														
 
															     accuracies = []
														
 
															+    output = []
														
 
															+    truth = []
														
 
															     label_list = sorted(np.unique(labels))
														
 
															     for train_index, test_index in folds.split(data):
														
 
															         if verbose >= 1:
														
@@ -80,10 +134,12 @@ def kNearestNeighbors(data: list, labels: list,
 
															         kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
														
 
															         kn.fit(data[train_index], labels[train_index])
														
 
															         predictions = kn.predict(data[test_index])
														
 
															+        output.extend(predictions)
														
 
															+        truth.extend(labels[test_index])
														
 
															         if guesses <= 1:
														
 
															             correct = [a == p for a, p in zip(labels[test_index], predictions)]
														
 
															         else:
														
 
															-            correct = list(map(lambda x: x <= guesses,
														
 
															+            correct = list(map(lambda x: x < guesses,
														
 
															                                find_in_predictions(
														
 
															                                    kn.predict_proba(data[test_index]),
														
 
															                                    labels[test_index],
														
@@ -92,8 +148,15 @@ def kNearestNeighbors(data: list, labels: list,
 
															         if verbose >= 1:
														
 
															             print(accuracy)
														
 
															         accuracies.append(accuracy)
														
 
															-    return accuracies
														
 
															+    return (accuracies, confusion_matrix(truth, output, labels=label_list))
														
 
															+def find_in_predictions(probabilities: list, truth: list, labels: list):  # For each sample, return the rank (0 = most probable) of its true label
														
 
															+    return [
														
 
															+        list(map(lambda x: x[0],
														
 
															+                 sorted(zip(labels, probs), key=lambda x: -x[1])  # labels ordered by descending probability; assumes labels[i] matches column i of probs - TODO confirm
														
 
															+             )).index(actual)  # list.index raises ValueError if `actual` is not in `labels`
														
 
															+        for probs, actual in zip(probabilities, truth)
														
 
															+    ]
														
 
															 # TODO: This should be in a separate file.
														
 
															 # If we need a unified interface we can make an aggregater.
														
@@ -116,31 +179,6 @@ def multiLayerPerceptronClassifier(classifications: int, data: list, results: li
 
															     print(loss)
														
 
															     print(accuracy)
														
 
															-# TODO: This should be in a separate file.
														
 
															-# If we need a unified interface we can make an aggregater.
														
 
															-# TODO: KFold validation
														
 
															-def randomForest(data: list, labels: list, test_data: list, test_data_labels: list):
														
 
															-    from sklearn.ensemble import RandomForestClassifier
														
 
															-    rfc = RandomForestClassifier(n_estimators=10)
														
 
															-    rfc.fit(data, labels)
														
 
															-    predictions = rfc.predict(test_data)
														
 
															-    for t in range(len(test_data)):
														
 
															-        print(str(test_data[t]) + "prediction: " + str(predictions[t]))
														
 
															-    if len(test_data) == 0:
														
 
															-        return
														
 
															-    accuracysum = 0
														
 
															-    for t in range(len(test_data)):
														
 
															-        accuracysum += 1 if predictions[t] == test_data_labels[t] else 0
														
 
															-    print("Accuracy: " + str(accuracysum/len(test_data_labels)))
														
 
															-
														
 
															-
														
 
															-def find_in_predictions(probabilities: list, tests: int, labels: list):
														
 
															-    return [list(map(lambda x: x[0],
														
 
															-                     sorted(list(zip(labels, probs)), key=lambda x: x[1]))
														
 
															-    ).index(test)
														
 
															-            for probs, test in zip(probabilities, tests)]
														
 
															-
														
 
															-
														
 
															 def t_test(accuracy: list, num_users: int):
														
 
															     from scipy import stats
														
 
															     random_avg = 1.0/num_users
														
@@ -148,7 +186,6 @@ def t_test(accuracy: list, num_users: int):
 
															     # If all numbers are identical, p-value = 1
														
 
															     return res if not np.isnan(res[0]) else (0, 1)
														
 
															-
														
 
															 if __name__ == '__main__':
														
 
															     import sys
														
 
															     main(sys.argv[1:])
														
--- a/src/classifiers/randomforest.py
+++ b/src/classifiers/randomforest.py
@@ -1,4 +1,4 @@
 
															-#!/usr/bin/python3
														
 
															+#!/home/tflucke/bin/bin/python3
														
 
															 from sklearn.model_selection import KFold
														
 
															 import numpy as np
														
@@ -26,10 +26,29 @@ def main(options: list):
 
															                        zip(*[(FeatureVector(p, features).get(), p.user)
														
 
															                              for p in samples]))
														
 
															     num_users = len(np.unique([s.user for s in samples]))
														
 
															-    avg, p = classify(data, labels, num_users, args)
														
 
															-    print("Overall Accuracy: %f" % avg)
														
 
															+    s = np.arange(data.shape[0])
														
 
															+    np.random.shuffle(s)
														
 
															+    res, matrix = random_forest(data[s], labels[s], fn=args.criterion,
														
 
															+                                n=args.folds, verbose=args.verbose,
														
 
															+                                estimators=args.estimators)
														
 
															+    print("Overall Accuracy: %f" % np.average(res))
														
 
															     if args.p_value:
														
 
															-        print("P-Value: %f" % p)
														
 
															+        print("P-Value: %f" % (t_test(res, num_users)[1] / 2))  # `p` is no longer assigned now that main() stopped calling classify(); same halved t-test value classify() returns
														
 
															+    if args.graph:  # render the confusion matrix returned by random_forest() as a heatmap
														
 
															+        import seaborn as sns
														
 
															+        from pandas import DataFrame
														
 
															+        from matplotlib import pyplot as plt
														
 
															+        label_list = list(map(lambda l: l[0:6], sorted(np.unique(labels))))  # tick labels cut to their first 6 characters
														
 
															+        dataset = DataFrame(matrix, columns=label_list, index=label_list)
														
 
															+        sns.set(font_scale=0.8)
														
 
															+        graph = sns.heatmap(data=dataset, annot=True, cbar=False)
														
 
															+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
														
 
															+                              horizontalalignment="right")
														
 
															+        plt.subplots_adjust(left=0.15, bottom=0.2)
														
 
															+        plt.ylabel('True Label')
														
 
															+        plt.xlabel('Predicted Label')
														
 
															+        plt.title('Random Forest Confusion Matrix')  # title was copied from the k-nearest-neighbor script
														
 
															+        graph.get_figure().savefig("random-forest.png")
														
 
															 def parse_args(args: list):
														
 
															     import argparse
														
@@ -50,23 +69,28 @@ def parse_args(args: list):
 
															                         help='Add feature to list of features to test with.')
														
 
															     parser.add_argument('-p', '--p-value', action='store_const', default=False,
														
 
															                         const=True, help='Calculate a p-value from a t-test.')
														
 
															+    parser.add_argument('-g', '--graph', action="store_true",
														
 
															+                        help='Generates a confusion matrix.')
														
 
															     return parser.parse_args(args)
														
 
															 def classify(data, labels, num_users: int, args):
														
 
															     s = np.arange(data.shape[0])
														
 
															     np.random.shuffle(s)
														
 
															-    res = random_forest(data[s], labels[s],
														
 
															-                       n=args.folds, verbose=args.verbose, fn=args.criterion,
														
 
															-                       estimators=args.estimators)
														
 
															+    res, _ = random_forest(data[s], labels[s],
														
 
															+                           n=args.folds, verbose=args.verbose, fn=args.criterion,
														
 
															+                           estimators=args.estimators)
														
 
															     return (np.average(res), t_test(res, num_users)[1] / 2)
														
 
															 def random_forest(data: list, labels: list, n=5, verbose=0, estimators=100,
														
 
															                  fn="gini"):
														
 
															     from sklearn.ensemble import RandomForestClassifier
														
 
															+    from sklearn.metrics import confusion_matrix
														
 
															     folds = KFold(n_splits=n)
														
 
															     i = 1
														
 
															     avg = 0
														
 
															     accuracies = []
														
 
															+    output = []
														
 
															+    truth = []
														
 
															     label_list = sorted(np.unique(labels))
														
 
															     for train_index, test_index in folds.split(data):
														
 
															         if verbose >= 1:
														
@@ -76,11 +100,16 @@ def random_forest(data: list, labels: list, n=5, verbose=0, estimators=100,
 
															             print("Training on: ", train_index)
														
 
															         rfc = RandomForestClassifier(n_estimators=estimators, criterion=fn)
														
 
															         rfc.fit(data[train_index], labels[train_index])
														
 
															-        accuracy = rfc.score(data[test_index], labels[test_index])
														
 
															+        predictions = rfc.predict(data[test_index])
														
 
															+        output.extend(predictions)
														
 
															+        truth.extend(labels[test_index])
														
 
															+        accuracy = [a == p
														
 
															+                    for a, p in zip(labels[test_index], predictions)
														
 
															+        ].count(True)/len(predictions)
														
 
															         if verbose >= 1:
														
 
															             print(accuracy)
														
 
															         accuracies.append(accuracy)
														
 
															-    return accuracies
														
 
															+    return (accuracies, confusion_matrix(truth, output, labels=label_list))
														
 
															 def t_test(accuracy: list, num_users: int):
														
 
															     from scipy import stats
														
--- a/src/distributer/collect_compressed.py
+++ b/src/distributer/collect_compressed.py
@@ -0,0 +1,180 @@
 
															+#!/home/tflucke/bin/bin/python3
														
 
															+
														
 
															+import typing
														
 
															+import numpy as np
														
 
															+
														
 
															+EXTRACTION_PARAMS = {  # flag -> (axis label, converter applied to the flag's value text, histogram bins); read by count_preprocess_params
														
 
															+    "-a": ("Small Paste Size (Blocks)", int, np.arange(1,7)),
														
 
															+    "-p": ("Large Paste Size (Blocks)", lambda x: int(x.split(".", 1)[0]),  # keeps only the text before the first "."
														
 
															+           np.arange(2,8)),
														
 
															+    "-l": ("Low Activity Threshold (k/s)", lambda x: float(x) - 0.001,  # NOTE(review): the 0.001 offset presumably keeps values off a bin edge - confirm
														
 
															+           np.arange(1,7)),
														
 
															+    "-i": ("High Activity Threshold (k/s)", lambda x: float(x) + 0.001,
														
 
															+           [1.5, 2, 2.5, 3, 3.5, 4, 4.5]),
														
 
															+    "-b": ("Lookback (s)", int, np.arange(1,7)),
														
 
															+    "-s": ("Sample Size (Count)", int, [100, 150, 200, 300, 400, 500, 600]),
														
 
															+    "-m": ("Minimum Number of Samples (Samples/Tag)", int, [5, 10, 15, 20, 25])
														
 
															+}
														
 
															+
														
 
															+def main(options: list):  # Load every input file and run the action chosen on the command line
														
 
															+    args = parse_args(options)
														
 
															+    heaps = [load_file(f, args.compression) for f in args.in_files]  # one list of result tuples per input file
														
 
															+    if args.action == "print":
														
 
															+        import heapq
														
 
															+        heap = list(heapq.merge(*heaps))  # NOTE(review): heapq.merge only yields sorted output if each input list is already sorted - confirm the files are stored that way
														
 
															+        while heap:
														
 
															+            print("Accuracy: %0.02f; P-Value: %0.04f; File: %80s; Options: %s" %
														
 
															+                  heapq.heappop(heap))  # each popped record must be a 4-tuple to fill the four placeholders
														
 
															+    elif args.action == "count-files":
														
 
															+        from numpy import unique
														
 
															+        files = list(zip(*unique([res[2] for h in heaps for res in h],  # res[2] is the field printed as "File" above
														
 
															+                                 return_counts=True)))
														
 
															+        files.sort(key=lambda x: x[1])  # ascending by count
														
 
															+        for f in files:
														
 
															+            print("Count: %4d; File: %80s" % (f[1], f[0]))
														
 
															+    elif args.action == "count-feature-sets":
														
 
															+        from numpy import unique
														
 
															+        files = list(zip(*unique([res[3] for h in heaps for res in h],  # res[3] is the field printed as "Options" above
														
 
															+                                 return_counts=True)))
														
 
															+        files.sort(key=lambda x: x[1])  # ascending by count
														
 
															+        for f in files:
														
 
															+            print("Count: %4d; Features: %s" % (f[1], f[0]))
														
 
															+    elif args.action == "count-features":
														
 
															+        count_features(args, heaps)
														
 
															+    elif args.action == "count-extraction-flag":
														
 
															+        count_preprocess_params(args, heaps)  # reads args.flag, which parse_args sets for this action
														
 
															+    elif args.action == "feature-relation":
														
 
															+        count_features_2d(args, heaps)
														
 
															+
														
 
															+def count_features(args, heaps):  # Print how often each feature occurs across all results; with args.graph also save a bar chart
														
 
															+    from numpy import unique
														
 
															+    features = [feature for h in heaps  # flat list of every feature in every result (res[3] is iterated as a collection)
														
 
															+                for res in h
														
 
															+                for feature in res[3]]
														
 
															+    feature_counts = list(zip(*unique(features, return_counts=True)))
														
 
															+    feature_counts.sort(key=lambda x: x[1])  # ascending by count
														
 
															+    for f in feature_counts:
														
 
															+        print("Count: %4d; Feature: %s" % (f[1], f[0]))
														
 
															+    if args.graph:
														
 
															+        import seaborn as sns
														
 
															+        from pandas import DataFrame
														
 
															+        from matplotlib import pyplot as plt
														
 
															+        dataset = DataFrame(features, columns=["Feature"])
														
 
															+        sns.set(font_scale=0.65)
														
 
															+        graph = sns.countplot(x="Feature", data=dataset, palette="Set1",  # x= keyword: newer seaborn rejects the positional form
														
 
															+                              order = dataset['Feature'].value_counts().index)  # bars ordered by frequency
														
 
															+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
														
 
															+                              horizontalalignment="right")
														
 
															+        plt.subplots_adjust(bottom=0.25)
														
 
															+        if args.log_scale:
														
 
															+            graph.set_yscale('log')
														
 
															+        graph.get_figure().savefig("%s.png" % args.action)  # file is named after the action
														
 
															+
														
 
															+def count_features_2d(args, heaps):  # Print, for every ordered pair of distinct features, how many results contain both; with args.graph also save a heatmap
														
 
															+    from numpy import unique
														
 
															+    features = unique([feature for h in heaps  # sorted distinct features over all results
														
 
															+                       for res in h
														
 
															+                       for feature in res[3]])
														
 
															+    feature_pairs = []  # one row of counts per feature, in the order of `features`
														
 
															+    mi=9999999999999999  # running minimum of all counts
														
 
															+    ma=0  # running maximum of all counts
														
 
															+    for feature in features:
														
 
															+        print("%s:" % feature)
														
 
															+        pairings = []
														
 
															+        for feature_pair in features:
														
 
															+            if feature_pair == feature:
														
 
															+                pairings.append(0)  # the diagonal is fixed at 0 rather than counted
														
 
															+                continue
														
 
															+            count = sum([feature_pair in res[3]  # results containing both `feature` and `feature_pair`
														
 
															+                         for h in heaps
														
 
															+                         for res in h
														
 
															+                         if feature in res[3]])
														
 
															+            print("\tFeature: %s; Count: %4d" % (feature_pair, count))
														
 
															+            pairings.append(count)
														
 
															+        ma=max(*pairings, ma)
														
 
															+        mi=min(*pairings, mi)  # becomes 0 on the first row because of the diagonal entry
														
 
															+        feature_pairs.append(pairings)
														
 
															+    if args.graph:
														
 
															+        import seaborn as sns
														
 
															+        from pandas import DataFrame
														
 
															+        from matplotlib import pyplot as plt
														
 
															+        dataset = DataFrame(feature_pairs, index=features, columns=features)
														
 
															+        sns.set(font_scale=0.65)
														
 
															+        if args.log_scale:
														
 
															+            from matplotlib.colors import LogNorm
														
 
															+            # NOTE(review): `norm=LogNorm(mi, ma)` is disabled below, so --log-scale currently has no effect on this heatmap
														
 
															+        graph = sns.heatmap(data=dataset)
														
 
															+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
														
 
															+                              horizontalalignment="right")
														
 
															+        plt.subplots_adjust(left=0.25, bottom=0.25)
														
 
															+        plt.ylabel('Feature')
														
 
															+        plt.xlabel('Feature')
														
 
															+        plt.title('Occurrences of Feature Pairs')
														
 
															+        graph.get_figure().savefig("%s.png" % args.action)  # file is named after the action
														
 
															+
														
 
															+def count_preprocess_params(args, heaps):  # Print how often each value of the extraction flag args.flag occurs in res[2]; with args.graph also save a histogram
														
 
															+    from numpy import unique
														
 
															+    parameters = [EXTRACTION_PARAMS[args.flag][1](  # KeyError if args.flag is not a key of EXTRACTION_PARAMS
														
 
															+        res[2][res[2].index(args.flag) + 2:].split("-", 1)[0]  # text after the flag (assumed 2 characters long) up to the next "-"; index() raises ValueError if the flag is absent
														
 
															+    ) for h in heaps for res in h]
														
 
															+    flags = list(zip(*unique(parameters, return_counts=True)))
														
 
															+    flags.sort(key=lambda x: x[1])  # ascending by count
														
 
															+    for f in flags:
														
 
															+        print("Count: %4d; Value: %s" % (f[1], f[0]))
														
 
															+    if args.graph:
														
 
															+        import seaborn as sns
														
 
															+        from pandas import Series
														
 
															+        sns.set(font_scale=0.65)
														
 
															+        graph = sns.distplot(Series(parameters,
														
 
															+                                    name=EXTRACTION_PARAMS[args.flag][0]),  # Series name comes from the flag's label
														
 
															+                             bins=EXTRACTION_PARAMS[args.flag][2], kde=False)  # histogram only, using the flag's predefined bins
														
 
															+        graph.set(xlim=(EXTRACTION_PARAMS[args.flag][2][0],  # x-axis spans the first to the last bin value
														
 
															+                        EXTRACTION_PARAMS[args.flag][2][-1]))
														
 
															+        graph.get_figure().savefig("%s%s.png" % (args.action, args.flag))
														
 
															+
														
 
															+def load_file(filename: str, compression=None):  # Load one result file as a list of 4-tuples
														
 
															+    import compress_pickle  # NOTE(review): presumably pickle-based, so only load files from a trusted source - confirm
														
 
															+    return [r[0:3] + (r[3].feature,)  # keep fields 0-2 and replace field 3 with its `.feature` attribute
														
 
															+            for r in compress_pickle.load(filename, compression=compression)]
														
 
															+
														
 
															+def parse_args(args: list):  # Parse the command line; returns a namespace that always has `graph` and `log_scale`, plus `flag` for count-extraction-flag
														
 
															+    import argparse
														
 
															+    parser = argparse.ArgumentParser(description='Collect the output from \
														
 
															+    distributed distribute_compressor.')
														
 
															+    parser.add_argument('action', choices=["print", "count-files",
														
 
															+                                           "count-feature-sets",
														
 
															+                                           "count-features", "feature-relation",
														
 
															+                                           "count-extraction-flag"],
														
 
															+                        help='Action to take.')
														
 
															+    parser.add_argument('in_files', nargs='+', type=str, #argparse.FileType('wb')
														
 
															+                        help='Input file name(s).')
														
 
															+    parser.add_argument('-v', '--verbose', action="count", default=0,
														
 
															+                        help='Show more information')
														
 
															+    parser.add_argument('-c', '--compression', default="bz2",
														
 
															+                        choices=["bz2", "gzip", "lzma", "zipfile", None],
														
 
															+                        help='Compression algorithm to use. (default: bzip2)')
														
 
															+    try:  # the graph options are only offered when seaborn can be imported
														
 
															+        import seaborn
														
 
															+        parser.add_argument('-g', '--graph', action="store_true",
														
 
															+                            help='Generate and store a graph of the results.')
														
 
															+        parser.add_argument('--log-scale', action="store_true",
														
 
															+                            help='Y-axis log scale')
														
 
															+    except ImportError:
														
 
															+        parser.set_defaults(graph=False, log_scale=False)  # callers read args.graph / args.log_scale unconditionally
														
 
															+    res, unknown = parser.parse_known_args(args)  # unrecognised arguments are collected instead of raising an error
														
 
															+    if res.action == "count-extraction-flag":
														
 
															+        if len(unknown) != 1:
														
 
															+            import sys
														
 
															+            print("count-extraction-flag requires exactly 1 flag to search for.",
														
 
															+                  file=sys.stderr)
														
 
															+            exit(2)
														
 
															+        else:
														
 
															+            vars(res)["flag"] = unknown[0]  # the single leftover argument is the extraction flag to count
														
 
															+    elif unknown:  # every other action accepts no extra arguments
														
 
															+        parser.print_help()
														
 
															+        exit(2)
														
 
															+    return res
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    import sys
														
 
															+    main(sys.argv[1:])
														
--- a/src/distributer/distribute_compressor.py
+++ b/src/distributer/distribute_compressor.py
@@ -1,71 +1,80 @@
 
															 #!/home/tflucke/bin/bin/python3
														
 
															-import os, sys, struct, typing, numpy as np
														
 
															+import os, sys, typing, numpy as np
														
 
															 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
														
 
															                     '/../classifiers/')
														
 
															 from Vector import FeatureVector
														
 
															-try:
														
 
															-    import compress_pickle
														
 
															-except ImportError:
														
 
															-    try:
														
 
															-        import cPickle as compress_pickle
														
 
															-    except ImportError:
														
 
															-        import pickle as compress_pickle
														
 
															-
														
 
															-USHRT_MAX=(1 << 16 - 1)
														
 
															+import pickle
														
 
															 def main(options: list):
														
 
															     args = parse_args(options)
														
 
															+    heap = []
														
 
															+    import heapq, signal
														
 
															+    def dump(sig, frame):
														
 
															+        print("Dumping to file %s." % args.out_file, file=sys.stderr)
														
 
															+        pickle.dump(heap, args.out_file)
														
 
															+    signal.signal(signal.SIGUSR1, dump)
														
 
															+    def dump_exit(sig, frame, i):
														
 
															+        dump(sig, frame)
														
 
															+        exit(i)
														
 
															+    signal.signal(signal.SIGTERM, lambda sig, frame: dumpexit(sig, frame, 3))
														
 
															     if args.classifier == "nearest-neighbor":
														
 
															         import nearestneighbors as classifier
														
 
															+    if args.classifier == "random-forest":
														
 
															+        import randomforest as classifier
														
 
															     if args.final_statistic == "median":
														
 
															-        avg_fn = np.median
														
 
															+        avg_fn = lambda values: np.median([classifier.classify(*values)
														
 
															+                                    for i in range(0, args.reruns)], 0)
														
 
															     else:
														
 
															-        avg_fn = np.average
														
 
															+        avg_fn = lambda values: np.average([classifier.classify(*values)
														
 
															+                                    for i in range(0, args.reruns)], 0)
														
 
															     print("Ready for input!", flush=True)
														
 
															     for line in sys.stdin:
														
 
															-        options = classifier.parse_args(line.split())
														
 
															-        samples = compress_pickle.load(options.features_file, compression=None)
														
 
															-        num_users = len(np.unique([s.user for s in samples]))
														
 
															-        if num_users < args.min_users:
														
 
															-            print("%s: Too few users.  Skipping..." % args.features_file.name, file=sys.stderr, flush=True)
														
 
															+        res = process_options(line, classifier, avg_fn)
														
 
															+        if len(heap) == args.max_outputs:
														
 
															+            heapq.heappushpop(heap, res)
														
 
															         else:
														
 
															-            features = options.feature if options.feature else classifier.DEFAULT_FEATURES
														
 
															-            data, labels = map(np.array,
														
 
															-                               zip(*[(FeatureVector(p, features).get(), p.user)
														
 
															-                                     for p in samples]))
														
 
															-            runs = [classifier.classify(data, labels, num_users, options) for i in range(0, args.reruns)]
														
 
															-            #print(runs, file=sys.stderr)
														
 
															-            write_to_file(args.out_file, args.compression, options, avg_fn(runs, 0))
														
 
															-            print("Finished: '%s'" % line.strip(), flush=True)
														
 
															+            heapq.heappush(heap, res)
														
 
															+        print("Finished: '%s'" % line.strip(), flush=True)
														
 
															+    dump(0, 0)
														
 
															-def write_to_file(out: typing.BinaryIO, compression: str, options: list, results: (float, float)):
														
 
															+def process_options(line: str, classifier, avg_fn):
														
 
															+    options = classifier.parse_args(line.split())
														
 
															+    samples = pickle.load(options.features_file)
														
 
															+    num_users = len(np.unique([s.user for s in samples]))
														
 
															+    features = options.feature if options.feature else classifier.DEFAULT_FEATURES
														
 
															+    data, labels = map(np.array,
														
 
															+                       zip(*[(FeatureVector(p, features).get(), p.user)
														
 
															+                             for p in samples]))
														
 
															+    runs = avg_fn((data, labels, num_users, options))
														
 
															+    #print(runs, file=sys.stderr)
														
 
															+    #write_to_file(args.out_file, args.compression, )
														
 
															     filename = options.features_file.name
														
 
															     del options.features_file
														
 
															-    #print("Args: %s; results: %s" % (options, results), file=sys.stderr)
														
 
															-    res_packed = struct.pack("@HH", *[int(USHRT_MAX*v) for v in results])
														
 
															-    compress_pickle.dump((filename, options, res_packed), out, compression=compression)
														
 
															-
														
 
															-def read_from_file(in_file: typing.BinaryIO, compression: str = None):
														
 
															-    res = compress_pickle.load(in_file, compression = compression)
														
 
															-    return (res[0], res[1], tuple(float(v)/USHRT_MAX for v in struct.unpack("@HH", res[1])))
														
 
															+    return (*runs, filename, options)
														
 
															 def parse_args(args: list):
														
 
															     import argparse
														
 
															-    parser = argparse.ArgumentParser(description='Run a series of tests and compress the output.')
														
 
															-    parser.add_argument('classifier', choices=["nearest-neighbor"], help='Classifier to use.')
														
 
															-    parser.add_argument('out_file', type=argparse.FileType('wb'), help='Output file name.')
														
 
															+    parser = argparse.ArgumentParser(description='Run a series of tests and \
														
 
															+    compress the output.')
														
 
															+    parser.add_argument('classifier', choices=["nearest-neighbor", "random-forest"],
														
 
															+                        help='Classifier to use.')
														
 
															+    parser.add_argument('out_file', type=str, #argparse.FileType('wb')
														
 
															+                        help='Output file name.')
														
 
															     parser.add_argument('-v', '--verbose', action="count", default=0,
														
 
															                         help='Show more information')
														
 
															-    parser.add_argument('-m', '--min-users', type=int, default=10,
														
 
															-                        help='Minimum number of unique users to consider a sample\
														
 
															-                        file valid. (default: 10)')
														
 
															     parser.add_argument('-r', '--reruns', type=int, default=3,
														
 
															-                        help='Number of times to rerun a sample set. (default: 3)')
														
 
															-    parser.add_argument('-f', '--final-statistic', choices=["mean", "median"], default="median",
														
 
															-                        help='Final statistic to show. (default: median)')
														
 
															-    parser.add_argument('-c', '--compression', choices=["bz2", "gzip", "lzma", "zipfile", None], default="bz2",
														
 
															+                        help='Number of times to rerun a sample set. \
														
 
															+                        (default: 3)')
														
 
															+    parser.add_argument('-f', '--final-statistic', choices=["mean", "median"],
														
 
															+                        default="median", help='Final statistic to show. \
														
 
															+                        (default: median)')
														
 
															+    parser.add_argument('-c', '--compression', default="bz2",
														
 
															+                        choices=["bz2", "gzip", "lzma", "zipfile", None],
														
 
															                         help='Compression algorithm to use. (default: bzip2)')
														
 
															+    parser.add_argument('-m', '--max-outputs', type=int, default="100",
														
 
															+                        help='Maximum number of best outputs to print. \
														
 
															+                        (default: 100)')
														
 
															     return parser.parse_args(args)
														
 
															 if __name__ == '__main__':