Browse Source

Added graph generation to scripts.

Also fixed an off-by-one in the top-n check in kNearestNeighbors.
Tom Flucke 6 years ago
parent
commit
145b514131

+ 72 - 35
src/classifiers/nearestneighbors.py

@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/home/tflucke/bin/bin/python3
 
 from sklearn.model_selection import KFold
 import numpy as np
@@ -26,10 +26,57 @@ def main(options: list):
                        zip(*[(FeatureVector(p, features).get(), p.user)
                              for p in samples]))
     num_users = len(np.unique([s.user for s in samples]))
-    avg, p = classify(data, labels, num_users, args)
-    print("Overall Accuracy: %f" % avg)
+    s = np.arange(data.shape[0])
+    np.random.shuffle(s)
+    if args.graph_top:
+        graph_top(args, data[s], labels[s])
+    res, matrix = kNearestNeighbors(data[s], labels[s], n=args.folds,
+                                    verbose=args.verbose, guesses=args.top,
+                                    k=args.k_neighbors, weights=args.weight)
+    print("Overall Accuracy: %f" % np.average(res))
     if args.p_value:
-        print("P-Value: %f" % p)
+        print("P-Value: %f" % t_test(res, num_users)[1] / 2)
+    if args.graph:
+        gen_confusion_matrix(matrix, labels)
+
+def graph_top(args, data, labels):
+    t = 0
+    label_list = np.unique(labels)
+    res = []
+    while t < len(label_list):
+        t += 1
+        res.append(
+            (t, np.average(
+                kNearestNeighbors(data, labels, n=args.folds, guesses=t,
+                                  verbose=args.verbose, k=args.k_neighbors,
+                                  weights=args.weight)[0]))
+        )
+    import seaborn as sns
+    from pandas import DataFrame
+    from matplotlib import pyplot as plt
+    dataset = DataFrame(res, columns=["Top-N Guesses", "Accuracy"])
+    graph = sns.lineplot("Top-N Guesses", "Accuracy", data=dataset)
+    graph.set_xticks(np.arange(1, len(label_list), 2))
+    graph.set_yticks(np.arange(0, 1, 0.1))
+    plt.title('K-Nearest Neighbor Accuracy on Nth Guess')
+    graph.get_figure().savefig("nearest-neighbor-top-n.png")
+
+def gen_confusion_matrix(matrix, labels):
+    import seaborn as sns
+    from pandas import DataFrame
+    from matplotlib import pyplot as plt
+    label_list = list(map(lambda l: l[0:6], sorted(np.unique(labels))))
+    dataset = DataFrame(matrix, columns=label_list, index=label_list)
+    sns.set(font_scale=0.8)
+    graph = sns.heatmap(data=dataset, annot=True, cbar=False)
+    graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
+                          horizontalalignment="right")
+    plt.subplots_adjust(left=0.15, bottom=0.2)
+    plt.ylabel('True Label')
+    plt.xlabel('Predicted Label')
+    plt.title('K-Nearest Neighbor Confusion Matrix')
+    graph.get_figure().savefig("nearest-neighbor.png")
 
 def parse_args(args: list):
     import argparse
@@ -53,23 +100,30 @@ def parse_args(args: list):
     parser.add_argument('-t', '--top', type=int, default=1,
                         help='Number of guesses to be considered \"correct\" \
                         (default: 1)')
+    parser.add_argument('-g', '--graph', action="store_true",
+                        help='Generates a confusion matrix.')
+    parser.add_argument('--graph-top', action="store_true",
+                        help='Generates a graph of accuracy in top N guesses.')
     return parser.parse_args(args)
 
 def classify(data, labels, num_users: int, args):
     s = np.arange(data.shape[0])
     np.random.shuffle(s)
-    res = kNearestNeighbors(data[s], labels[s],
-                            n=args.folds, verbose=args.verbose, k=args.k_neighbors,
-                            weights=args.weight, guesses=args.top)
+    res, _ = kNearestNeighbors(data[s], labels[s], n=args.folds,
+                               verbose=args.verbose, guesses=args.top,
+                               k=args.k_neighbors, weights=args.weight)
     return (np.average(res), t_test(res, num_users)[1] / 2)
 
 def kNearestNeighbors(data: list, labels: list,
                       n=5, verbose=0, k=5, weights="uniform", guesses=1):
     from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
+    from sklearn.metrics import confusion_matrix
     folds = KFold(n_splits=n)
     i = 1
     avg = 0
     accuracies = []
+    output = []
+    truth = []
     label_list = sorted(np.unique(labels))
     for train_index, test_index in folds.split(data):
         if verbose >= 1:
@@ -80,10 +134,12 @@ def kNearestNeighbors(data: list, labels: list,
         kn = KNeighborsClassifier(n_neighbors=k, weights=weights)
         kn.fit(data[train_index], labels[train_index])
         predictions = kn.predict(data[test_index])
+        output.extend(predictions)
+        truth.extend(labels[test_index])
         if guesses <= 1:
             correct = [a == p for a, p in zip(labels[test_index], predictions)]
         else:
-            correct = list(map(lambda x: x <= guesses,
+            correct = list(map(lambda x: x < guesses,
                                find_in_predictions(
                                    kn.predict_proba(data[test_index]),
                                    labels[test_index],
@@ -92,8 +148,15 @@ def kNearestNeighbors(data: list, labels: list,
         if verbose >= 1:
             print(accuracy)
         accuracies.append(accuracy)
-    return accuracies
+    return (accuracies, confusion_matrix(truth, output, labels=label_list))
 
+def find_in_predictions(probabilities: list, truth: list, labels: list):
+    return [
+        list(map(lambda x: x[0],
+                 sorted(zip(labels, probs), key=lambda x: -x[1])
+             )).index(actual)
+        for probs, actual in zip(probabilities, truth)
+    ]
 
 # TODO: This should be in a separate file.
 # If we need a unified interface we can make an aggregator.
@@ -116,31 +179,6 @@ def multiLayerPerceptronClassifier(classifications: int, data: list, results: li
     print(loss)
     print(accuracy)
 
-# TODO: This should be in a separate file.
-# If we need a unified interface we can make an aggregater.
-# TODO: KFold validation
-def randomForest(data: list, labels: list, test_data: list, test_data_labels: list):
-    from sklearn.ensemble import RandomForestClassifier
-    rfc = RandomForestClassifier(n_estimators=10)
-    rfc.fit(data, labels)
-    predictions = rfc.predict(test_data)
-    for t in range(len(test_data)):
-        print(str(test_data[t]) + "prediction: " + str(predictions[t]))
-    if len(test_data) == 0:
-        return
-    accuracysum = 0
-    for t in range(len(test_data)):
-        accuracysum += 1 if predictions[t] == test_data_labels[t] else 0
-    print("Accuracy: " + str(accuracysum/len(test_data_labels)))
-
-
-def find_in_predictions(probabilities: list, tests: int, labels: list):
-    return [list(map(lambda x: x[0],
-                     sorted(list(zip(labels, probs)), key=lambda x: x[1]))
-    ).index(test)
-            for probs, test in zip(probabilities, tests)]
-
-
 def t_test(accuracy: list, num_users: int):
     from scipy import stats
     random_avg = 1.0/num_users
@@ -148,7 +186,6 @@ def t_test(accuracy: list, num_users: int):
     # If all numbers are identical, p-value = 1
     return res if not np.isnan(res[0]) else (0, 1)
 
-
 if __name__ == '__main__':
     import sys
     main(sys.argv[1:])
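
The top-n fix in this file is easiest to see in isolation: find_in_predictions returns the 0-based rank of the true label among the classes sorted by descending predicted probability, so "correct within n guesses" means rank < n; the old rank <= n check also accepted the (n+1)-th guess. A minimal standalone sketch (synthetic labels and probabilities, not project data):

    def find_in_predictions(probabilities, truth, labels):
        # 0-based position of the true label, classes sorted by
        # descending predicted probability (as in the diff above)
        return [
            [label for label, _ in sorted(zip(labels, probs),
                                          key=lambda x: -x[1])].index(actual)
            for probs, actual in zip(probabilities, truth)
        ]

    labels = ["alice", "bob", "carol"]
    ranks = find_in_predictions([[0.2, 0.5, 0.3]], ["alice"], labels)
    print(ranks)           # [2]: "alice" is only the third guess
    print(ranks[0] < 2)    # False -- new check: not within the top 2
    print(ranks[0] <= 2)   # True  -- old check wrongly counted this as correct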

+ 37 - 8
src/classifiers/randomforest.py

@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/home/tflucke/bin/bin/python3
 
 from sklearn.model_selection import KFold
 import numpy as np
@@ -26,10 +26,29 @@ def main(options: list):
                        zip(*[(FeatureVector(p, features).get(), p.user)
                              for p in samples]))
     num_users = len(np.unique([s.user for s in samples]))
-    avg, p = classify(data, labels, num_users, args)
-    print("Overall Accuracy: %f" % avg)
+    s = np.arange(data.shape[0])
+    np.random.shuffle(s)
+    res, matrix = random_forest(data[s], labels[s], fn=args.criterion,
+                                n=args.folds, verbose=args.verbose,
+                                estimators=args.estimators)
+    print("Overall Accuracy: %f" % np.average(res))
     if args.p_value:
         print("P-Value: %f" % p)
+    if args.graph:
+        import seaborn as sns
+        from pandas import DataFrame
+        from matplotlib import pyplot as plt
+        label_list = list(map(lambda l: l[0:6], sorted(np.unique(labels))))
+        dataset = DataFrame(matrix, columns=label_list, index=label_list)
+        sns.set(font_scale=0.8)
+        graph = sns.heatmap(data=dataset, annot=True, cbar=False)
+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
+                              horizontalalignment="right")
+        plt.subplots_adjust(left=0.15, bottom=0.2)
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.title('Random Forest Confusion Matrix')
+        graph.get_figure().savefig("random-forest.png")
 
 def parse_args(args: list):
     import argparse
@@ -50,23 +69,28 @@ def parse_args(args: list):
                         help='Add feature to list of features to test with.')
     parser.add_argument('-p', '--p-value', action='store_const', default=False,
                         const=True, help='Calculate a p-value from a t-test.')
+    parser.add_argument('-g', '--graph', action="store_true",
+                        help='Generates a confusion matrix.')
     return parser.parse_args(args)
 
 def classify(data, labels, num_users: int, args):
     s = np.arange(data.shape[0])
     np.random.shuffle(s)
-    res = random_forest(data[s], labels[s],
-                       n=args.folds, verbose=args.verbose, fn=args.criterion,
-                       estimators=args.estimators)
+    res, _ = random_forest(data[s], labels[s],
+                           n=args.folds, verbose=args.verbose, fn=args.criterion,
+                           estimators=args.estimators)
     return (np.average(res), t_test(res, num_users)[1] / 2)
 
 def random_forest(data: list, labels: list, n=5, verbose=0, estimators=100,
                  fn="gini"):
     from sklearn.ensemble import RandomForestClassifier
+    from sklearn.metrics import confusion_matrix
     folds = KFold(n_splits=n)
     i = 1
     avg = 0
     accuracies = []
+    output = []
+    truth = []
     label_list = sorted(np.unique(labels))
     for train_index, test_index in folds.split(data):
         if verbose >= 1:
@@ -76,11 +100,16 @@ def random_forest(data: list, labels: list, n=5, verbose=0, estimators=100,
             print("Training on: ", train_index)
         rfc = RandomForestClassifier(n_estimators=estimators, criterion=fn)
         rfc.fit(data[train_index], labels[train_index])
-        accuracy = rfc.score(data[test_index], labels[test_index])
+        predictions = rfc.predict(data[test_index])
+        output.extend(predictions)
+        truth.extend(labels[test_index])
+        accuracy = ([a == p for a, p in zip(labels[test_index], predictions)]
+                    .count(True) / len(predictions))
         if verbose >= 1:
             print(accuracy)
         accuracies.append(accuracy)
-    return accuracies
+    return (accuracies, confusion_matrix(truth, output, labels=label_list))
 
 def t_test(accuracy: list, num_users: int):
     from scipy import stats
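
Note that swapping rfc.score for the hand-rolled accuracy above is behavior-preserving; the point of the change is to expose per-fold predictions for the confusion matrix. A quick sketch on synthetic arrays (sklearn's accuracy_score stands in for rfc.score, which reports the same mean accuracy):

    import numpy as np
    from sklearn.metrics import accuracy_score

    truth = np.array(["a", "b", "a", "c"])
    predictions = np.array(["a", "b", "c", "c"])
    # written as in the diff above
    manual = [a == p for a, p in zip(truth, predictions)].count(True) \
             / len(predictions)
    print(manual)                              # 0.75
    print(accuracy_score(truth, predictions))  # 0.75, identical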

+ 180 - 0
src/distributer/collect_compressed.py

@@ -0,0 +1,180 @@
+#!/home/tflucke/bin/bin/python3
+
+import typing
+import numpy as np
+
+EXTRACTION_PARAMS = {
+    "-a": ("Small Paste Size (Blocks)", int, np.arange(1,7)),
+    "-p": ("Large Paste Size (Blocks)", lambda x: int(x.split(".", 1)[0]),
+           np.arange(2,8)),
+    "-l": ("Low Activity Threshold (k/s)", lambda x: float(x) - 0.001,
+           np.arange(1,7)),
+    "-i": ("High Activity Threshold (k/s)", lambda x: float(x) + 0.001,
+           [1.5, 2, 2.5, 3, 3.5, 4, 4.5]),
+    "-b": ("Lookback (s)", int, np.arange(1,7)),
+    "-s": ("Sample Size (Count)", int, [100, 150, 200, 300, 400, 500, 600]),
+    "-m": ("Minimum Number of Samples (Samples/Tag)", int, [5, 10, 15, 20, 25])
+}
+
+def main(options: list):
+    args = parse_args(options)
+    heaps = [load_file(f, args.compression) for f in args.in_files]
+    if args.action == "print":
+        import heapq
+        heap = list(heapq.merge(*heaps))
+        while heap:
+            print("Accuracy: %0.02f; P-Value: %0.04f; File: %80s; Options: %s" %
+                  heapq.heappop(heap))
+    elif args.action == "count-files":
+        from numpy import unique
+        files = list(zip(*unique([res[2] for h in heaps for res in h],
+                                 return_counts=True)))
+        files.sort(key=lambda x: x[1])
+        for f in files:
+            print("Count: %4d; File: %80s" % (f[1], f[0]))
+    elif args.action == "count-feature-sets":
+        from numpy import unique
+        files = list(zip(*unique([res[3] for h in heaps for res in h],
+                                 return_counts=True)))
+        files.sort(key=lambda x: x[1])
+        for f in files:
+            print("Count: %4d; Features: %s" % (f[1], f[0]))
+    elif args.action == "count-features":
+        count_features(args, heaps)
+    elif args.action == "count-extraction-flag":
+        count_preprocess_params(args, heaps)
+    elif args.action == "feature-relation":
+        count_features_2d(args, heaps)
+
+def count_features(args, heaps):
+    from numpy import unique
+    features = [feature for h in heaps
+                for res in h
+                for feature in res[3]]
+    feature_counts = list(zip(*unique(features, return_counts=True)))
+    feature_counts.sort(key=lambda x: x[1])
+    for f in feature_counts:
+        print("Count: %4d; Feature: %s" % (f[1], f[0]))
+    if args.graph:
+        import seaborn as sns
+        from pandas import DataFrame
+        from matplotlib import pyplot as plt
+        dataset = DataFrame(features, columns=["Feature"])
+        sns.set(font_scale=0.65)
+        graph = sns.countplot("Feature", data=dataset, palette="Set1",
+                              order=dataset['Feature'].value_counts().index)
+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
+                              horizontalalignment="right")
+        plt.subplots_adjust(bottom=0.25)
+        if args.log_scale:
+            graph.set_yscale('log')
+        graph.get_figure().savefig("%s.png" % args.action)
+
+def count_features_2d(args, heaps):
+    from numpy import unique
+    features = unique([feature for h in heaps
+                       for res in h
+                       for feature in res[3]])
+    feature_pairs = []
+    mi = float('inf')
+    ma = 0
+    for feature in features:
+        print("%s:" % feature)
+        pairings = []
+        for feature_pair in features:
+            if feature_pair == feature:
+                pairings.append(0)
+                continue
+            count = sum([feature_pair in res[3]
+                         for h in heaps
+                         for res in h
+                         if feature in res[3]])
+            print("\tFeature: %s; Count: %4d" % (feature_pair, count))
+            pairings.append(count)
+        ma = max(*pairings, ma)
+        mi = min(*pairings, mi)
+        feature_pairs.append(pairings)
+    if args.graph:
+        import seaborn as sns
+        from pandas import DataFrame
+        from matplotlib import pyplot as plt
+        dataset = DataFrame(feature_pairs, index=features, columns=features)
+        sns.set(font_scale=0.65)
+        if args.log_scale:
+            from matplotlib.colors import LogNorm
+            # self-pair counts are stored as 0, so clamp vmin to 1 for the log norm
+            graph = sns.heatmap(data=dataset, norm=LogNorm(max(mi, 1), ma))
+        else:
+            graph = sns.heatmap(data=dataset)
+        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
+                              horizontalalignment="right")
+        plt.subplots_adjust(left=0.25, bottom=0.25)
+        plt.ylabel('Feature')
+        plt.xlabel('Feature')
+        plt.title('Occurrences of Feature Pairs')
+        graph.get_figure().savefig("%s.png" % args.action)
+
+def count_preprocess_params(args, heaps):
+    from numpy import unique
+    parameters = [EXTRACTION_PARAMS[args.flag][1](
+        res[2][res[2].index(args.flag) + 2:].split("-", 1)[0]
+    ) for h in heaps for res in h]
+    flags = list(zip(*unique(parameters, return_counts=True)))
+    flags.sort(key=lambda x: x[1])
+    for f in flags:
+        print("Count: %4d; Value: %s" % (f[1], f[0]))
+    if args.graph:
+        import seaborn as sns
+        from pandas import Series
+        sns.set(font_scale=0.65)
+        graph = sns.distplot(Series(parameters,
+                                    name=EXTRACTION_PARAMS[args.flag][0]),
+                             bins=EXTRACTION_PARAMS[args.flag][2], kde=False)
+        graph.set(xlim=(EXTRACTION_PARAMS[args.flag][2][0],
+                        EXTRACTION_PARAMS[args.flag][2][-1]))
+        graph.get_figure().savefig("%s%s.png" % (args.action, args.flag))
+
+def load_file(filename: str, compression=None):
+    import compress_pickle
+    return [r[0:3] + (r[3].feature,)
+            for r in compress_pickle.load(filename, compression=compression)]
+
+def parse_args(args: list):
+    import argparse
+    parser = argparse.ArgumentParser(description='Collect the output from \
+    distributed distribute_compressor.')
+    parser.add_argument('action', choices=["print", "count-files",
+                                           "count-feature-sets",
+                                           "count-features", "feature-relation",
+                                           "count-extraction-flag"],
+                        help='Action to take.')
+    parser.add_argument('in_files', nargs='+', type=str, #argparse.FileType('wb')
+                        help='Input file(s) to collect.')
+    parser.add_argument('-v', '--verbose', action="count", default=0,
+                        help='Show more information')
+    parser.add_argument('-c', '--compression', default="bz2",
+                        choices=["bz2", "gzip", "lzma", "zipfile", None],
+                        help='Compression algorithm to use. (default: bzip2)')
+    try:
+        import seaborn
+        parser.add_argument('-g', '--graph', action="store_true",
+                            help='Generate and store a graph of the results.')
+        parser.add_argument('--log-scale', action="store_true",
+                            help='Y-axis log scale')
+    except ImportError:
+        pass
+    res, unknown = parser.parse_known_args(args)
+    if res.action == "count-extraction-flag":
+        if len(unknown) != 1:
+            import sys
+            print("count-extraction-flag requires exactly 1 flag to search for.",
+                  file=sys.stderr)
+            exit(2)
+        else:
+            vars(res)["flag"] = unknown[0]
+    elif unknown:
+        parser.print_help()
+        exit(2)
+    return res
+
+if __name__ == '__main__':
+    import sys
+    main(sys.argv[1:])
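
The least obvious part of this new script is count_preprocess_params, which recovers an extraction parameter from each result's file name rather than from stored options: index(args.flag) + 2 skips the two flag characters, and the value runs up to the next '-'. A sketch under an assumed file-name layout (the real naming scheme is not shown in this commit):

    EXTRACTION_PARAMS = {"-a": ("Small Paste Size (Blocks)", int, None)}
    filename = "samples-a3-p5-l2.pickle"   # hypothetical name embedding flags
    flag = "-a"
    raw = filename[filename.index(flag) + 2:].split("-", 1)[0]
    print(EXTRACTION_PARAMS[flag][1](raw))  # 3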

+ 52 - 43
src/distributer/distribute_compressor.py

@@ -1,71 +1,80 @@
 #!/home/tflucke/bin/bin/python3
 
-import os, sys, struct, typing, numpy as np
+import os, sys, typing, numpy as np
 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
                     '/../classifiers/')
 from Vector import FeatureVector
-try:
-    import compress_pickle
-except ImportError:
-    try:
-        import cPickle as compress_pickle
-    except ImportError:
-        import pickle as compress_pickle
-
-USHRT_MAX=(1 << 16 - 1)
+import pickle
 
 def main(options: list):
     args = parse_args(options)
+    heap = []
+    import heapq, signal
+    def dump(sig, frame):
+        print("Dumping to file %s." % args.out_file, file=sys.stderr)
+        with open(args.out_file, 'wb') as f:
+            pickle.dump(heap, f)
+    signal.signal(signal.SIGUSR1, dump)
+    def dump_exit(sig, frame, i):
+        dump(sig, frame)
+        exit(i)
+    signal.signal(signal.SIGTERM, lambda sig, frame: dump_exit(sig, frame, 3))
     if args.classifier == "nearest-neighbor":
         import nearestneighbors as classifier
+    if args.classifier == "random-forest":
+        import randomforest as classifier
     if args.final_statistic == "median":
-        avg_fn = np.median
+        avg_fn = lambda values: np.median([classifier.classify(*values)
+                                    for i in range(0, args.reruns)], 0)
     else:
-        avg_fn = np.average
+        avg_fn = lambda values: np.average([classifier.classify(*values)
+                                    for i in range(0, args.reruns)], 0)
     print("Ready for input!", flush=True)
     for line in sys.stdin:
-        options = classifier.parse_args(line.split())
-        samples = compress_pickle.load(options.features_file, compression=None)
-        num_users = len(np.unique([s.user for s in samples]))
-        if num_users < args.min_users:
-            print("%s: Too few users.  Skipping..." % args.features_file.name, file=sys.stderr, flush=True)
+        res = process_options(line, classifier, avg_fn)
+        if len(heap) == args.max_outputs:
+            heapq.heappushpop(heap, res)
         else:
-            features = options.feature if options.feature else classifier.DEFAULT_FEATURES
-            data, labels = map(np.array,
-                               zip(*[(FeatureVector(p, features).get(), p.user)
-                                     for p in samples]))
-            runs = [classifier.classify(data, labels, num_users, options) for i in range(0, args.reruns)]
-            #print(runs, file=sys.stderr)
-            write_to_file(args.out_file, args.compression, options, avg_fn(runs, 0))
-            print("Finished: '%s'" % line.strip(), flush=True)
+            heapq.heappush(heap, res)
+        print("Finished: '%s'" % line.strip(), flush=True)
+    dump(0, 0)
 
-def write_to_file(out: typing.BinaryIO, compression: str, options: list, results: (float, float)):
+def process_options(line: str, classifier, avg_fn):
+    options = classifier.parse_args(line.split())
+    samples = pickle.load(options.features_file)
+    num_users = len(np.unique([s.user for s in samples]))
+    features = options.feature if options.feature else classifier.DEFAULT_FEATURES
+    data, labels = map(np.array,
+                       zip(*[(FeatureVector(p, features).get(), p.user)
+                             for p in samples]))
+    runs = avg_fn((data, labels, num_users, options))
     filename = options.features_file.name
     del options.features_file
-    #print("Args: %s; results: %s" % (options, results), file=sys.stderr)
-    res_packed = struct.pack("@HH", *[int(USHRT_MAX*v) for v in results])
-    compress_pickle.dump((filename, options, res_packed), out, compression=compression)
-
-def read_from_file(in_file: typing.BinaryIO, compression: str = None):
-    res = compress_pickle.load(in_file, compression = compression)
-    return (res[0], res[1], tuple(float(v)/USHRT_MAX for v in struct.unpack("@HH", res[1])))
+    return (*runs, filename, options)
 
 def parse_args(args: list):
     import argparse
-    parser = argparse.ArgumentParser(description='Run a series of tests and compress the output.')
-    parser.add_argument('classifier', choices=["nearest-neighbor"], help='Classifier to use.')
-    parser.add_argument('out_file', type=argparse.FileType('wb'), help='Output file name.')
+    parser = argparse.ArgumentParser(description='Run a series of tests and \
+    compress the output.')
+    parser.add_argument('classifier', choices=["nearest-neighbor", "random-forest"],
+                        help='Classifier to use.')
+    parser.add_argument('out_file', type=str, #argparse.FileType('wb')
+                        help='Output file name.')
     parser.add_argument('-v', '--verbose', action="count", default=0,
                         help='Show more information')
-    parser.add_argument('-m', '--min-users', type=int, default=10,
-                        help='Minimum number of unique users to consider a sample\
-                        file valid. (default: 10)')
     parser.add_argument('-r', '--reruns', type=int, default=3,
-                        help='Number of times to rerun a sample set. (default: 3)')
-    parser.add_argument('-f', '--final-statistic', choices=["mean", "median"], default="median",
-                        help='Final statistic to show. (default: median)')
-    parser.add_argument('-c', '--compression', choices=["bz2", "gzip", "lzma", "zipfile", None], default="bz2",
+                        help='Number of times to rerun a sample set. \
+                        (default: 3)')
+    parser.add_argument('-f', '--final-statistic', choices=["mean", "median"],
+                        default="median", help='Final statistic to show. \
+                        (default: median)')
+    parser.add_argument('-c', '--compression', default="bz2",
+                        choices=["bz2", "gzip", "lzma", "zipfile", None],
                         help='Compression algorithm to use. (default: bzip2)')
+    parser.add_argument('-m', '--max-outputs', type=int, default=100,
+                        help='Maximum number of best outputs to print. \
+                        (default: 100)')
     return parser.parse_args(args)
 
 if __name__ == '__main__':
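
The dump-on-signal pattern introduced above (SIGUSR1 writes the current heap without stopping; SIGTERM dumps and exits) can be exercised standalone. A minimal sketch, assuming a POSIX system and a writable temp path:

    import os, pickle, signal, tempfile

    out_path = os.path.join(tempfile.gettempdir(), "heap-demo.pickle")
    heap = [(0.9, 0.01, "some-file", "some-options")]   # stand-in results

    def dump(sig, frame):
        with open(out_path, 'wb') as f:
            pickle.dump(heap, f)

    signal.signal(signal.SIGUSR1, dump)
    os.kill(os.getpid(), signal.SIGUSR1)   # or `kill -USR1 <pid>` from a shell
    with open(out_path, 'rb') as f:
        print(pickle.load(f))              # the checkpointed heap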