#!/home/tflucke/bin/bin/python3
"""Collect and summarise the pickled results of distributed compressor runs.

Every loaded result is a 4-tuple that the ``print`` action formats as
``(accuracy, p-value, file, options)``; the last element is an iterable of
feature names and the file name embeds the extraction flags (e.g. ``-a3``).
"""
import sys
import typing

import numpy as np

# Extraction flag -> (axis label for graphs,
#                     parser for the text following the flag in a file name,
#                     histogram bin edges used when graphing).
EXTRACTION_PARAMS = {
    "-a": ("Small Paste Size (Blocks)", int, np.arange(1, 7)),
    "-p": ("Large Paste Size (Blocks)",
           lambda x: int(x.split(".", 1)[0]),
           np.arange(2, 8)),
    "-l": ("Low Activity Threshold (k/s)",
           lambda x: float(x) - 0.001,
           np.arange(1, 7)),
    "-i": ("High Activity Threshold (k/s)",
           lambda x: float(x) + 0.001,
           [1.5, 2, 2.5, 3, 3.5, 4, 4.5]),
    "-b": ("Lookback (s)", int, np.arange(1, 7)),
    "-s": ("Sample Size (Count)", int, [100, 150, 200, 300, 400, 500, 600]),
    "-m": ("Minimum Number of Samples (Samples/Tag)", int,
           [5, 10, 15, 20, 25]),
}


def _sorted_counts(values):
    """Return ``(value, count)`` pairs for *values*, least frequent first."""
    counts = list(zip(*np.unique(values, return_counts=True)))
    # list.sort is stable, so ties keep numpy's sorted-by-value order.
    counts.sort(key=lambda pair: pair[1])
    return counts


def _print_results(heaps):
    """Print every result of every input file in ascending tuple order."""
    # heapq.merge() is only correct for inputs that are already sorted, and
    # heappop() is only correct on a real heap.  Neither is guaranteed for
    # the loaded lists, so sort the combined results explicitly.
    for result in sorted(res for h in heaps for res in h):
        print("Accuracy: %0.02f; P-Value: %0.04f; File: %80s; Options: %s"
              % result)


def main(options: typing.List[str]):
    """Run the action selected on the command line.

    Args:
        options: Command-line arguments without the program name.
    """
    args = parse_args(options)
    heaps = [load_file(f, args.compression) for f in args.in_files]
    if args.action == "print":
        _print_results(heaps)
    elif args.action == "count-files":
        files = _sorted_counts([res[2] for h in heaps for res in h])
        for name, count in files:
            print("Count: %4d; File: %80s" % (count, name))
    elif args.action == "count-feature-sets":
        # NOTE(review): np.unique flattens equal-length feature sets, so this
        # may count individual features rather than whole sets -- confirm.
        feature_sets = _sorted_counts([res[3] for h in heaps for res in h])
        for features, count in feature_sets:
            print("Count: %4d; Features: %s" % (count, features))
    elif args.action == "count-features":
        count_features(args, heaps)
    elif args.action == "count-extraction-flag":
        count_preprocess_params(args, heaps)
    elif args.action == "feature-relation":
        count_features_2d(args, heaps)


def count_features(args, heaps):
    """Print how often each feature occurs across all results.

    When ``args.graph`` is set, also save a count plot to ``<action>.png``.

    Args:
        args: Parsed arguments; reads ``graph``, ``log_scale`` and ``action``.
        heaps: One list of result tuples per input file.
    """
    features = [feature for h in heaps for res in h for feature in res[3]]
    for feature, count in _sorted_counts(features):
        print("Count: %4d; Feature: %s" % (count, feature))
    if args.graph:
        import seaborn as sns
        from pandas import DataFrame
        from matplotlib import pyplot as plt
        dataset = DataFrame(features, columns=["Feature"])
        sns.set(font_scale=0.65)
        # ``x`` must be passed by keyword: seaborn >= 0.12 treats the first
        # positional argument as ``data``.
        graph = sns.countplot(x="Feature", data=dataset, palette="Set1",
                              order=dataset['Feature'].value_counts().index)
        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
                              horizontalalignment="right")
        plt.subplots_adjust(bottom=0.25)
        if args.log_scale:
            graph.set_yscale('log')
        graph.get_figure().savefig("%s.png" % args.action)


def count_features_2d(args, heaps):
    """Print, for each feature, how often every other feature co-occurs.

    When ``args.graph`` is set, also save a heatmap of the pair counts to
    ``<action>.png``.  The diagonal (a feature paired with itself) is 0.

    Args:
        args: Parsed arguments; reads ``graph`` and ``action``.
        heaps: One list of result tuples per input file.
    """
    feature_sets = [res[3] for h in heaps for res in h]
    features = np.unique([feature for fs in feature_sets for feature in fs])
    feature_pairs = []
    for feature in features:
        print("%s:" % feature)
        # Only results containing ``feature`` can contribute to its row.
        containing = [fs for fs in feature_sets if feature in fs]
        pairings = []
        for feature_pair in features:
            if feature_pair == feature:
                pairings.append(0)
                continue
            count = sum(feature_pair in fs for fs in containing)
            print("\tFeature: %s; Count: %4d" % (feature_pair, count))
            pairings.append(count)
        feature_pairs.append(pairings)
    if args.graph:
        import seaborn as sns
        from pandas import DataFrame
        from matplotlib import pyplot as plt
        dataset = DataFrame(feature_pairs, index=features, columns=features)
        sns.set(font_scale=0.65)
        # NOTE(review): --log-scale has no effect on the heatmap; a LogNorm
        # colour scale would need a strategy for the zero cells first.
        graph = sns.heatmap(data=dataset)
        graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
                              horizontalalignment="right")
        plt.subplots_adjust(left=0.25, bottom=0.25)
        plt.ylabel('Feature')
        plt.xlabel('Feature')
        plt.title('Occurrences of Feature Pairs')
        graph.get_figure().savefig("%s.png" % args.action)


def count_preprocess_params(args, heaps):
    """Print how often each value of one extraction flag occurs.

    The value is read from each result's file name: the text between the
    flag and the next ``-`` is parsed with the flag's parser from
    ``EXTRACTION_PARAMS``.  When ``args.graph`` is set, a histogram is saved
    to ``<action><flag>.png``.

    Args:
        args: Parsed arguments; reads ``flag``, ``graph`` and ``action``.
        heaps: One list of result tuples per input file.

    Raises:
        KeyError: If ``args.flag`` is not a key of ``EXTRACTION_PARAMS``.
        ValueError: If a file name does not contain the flag.
    """
    label, parse_value, bins = EXTRACTION_PARAMS[args.flag]
    skip = len(args.flag)
    parameters = [
        parse_value(res[2][res[2].index(args.flag) + skip:].split("-", 1)[0])
        for h in heaps for res in h
    ]
    for value, count in _sorted_counts(parameters):
        print("Count: %4d; Value: %s" % (count, value))
    if args.graph:
        import seaborn as sns
        from pandas import Series
        sns.set(font_scale=0.65)
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # histplot is the documented replacement.
        graph = sns.distplot(Series(parameters, name=label), bins=bins,
                             kde=False)
        graph.set(xlim=(bins[0], bins[-1]))
        graph.get_figure().savefig("%s%s.png" % (args.action, args.flag))


def load_file(filename: str, compression=None):
    """Load one result file and flatten each record to a plain 4-tuple.

    Args:
        filename: Path of the compressed pickle to read.
        compression: Compression name forwarded to ``compress_pickle.load``.

    Returns:
        A list with the first three fields of every record followed by the
        ``feature`` attribute of its fourth field.
    """
    import compress_pickle
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    return [r[0:3] + (r[3].feature,)
            for r in compress_pickle.load(filename, compression=compression)]


def parse_args(args: typing.List[str]):
    """Parse the command line.

    Args:
        args: Command-line arguments without the program name.

    Returns:
        The parsed namespace.  ``graph`` and ``log_scale`` are always
        present; ``flag`` is present for the ``count-extraction-flag``
        action.

    Raises:
        SystemExit: With status 2 on invalid or unexpected arguments.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Collect the output from '
                    'distributed distribute_compressor.')
    parser.add_argument('action',
                        choices=["print", "count-files", "count-feature-sets",
                                 "count-features", "feature-relation",
                                 "count-extraction-flag"],
                        help='Action to take.')
    parser.add_argument('in_files', nargs='+', type=str,
                        help='Input file name(s).')
    parser.add_argument('-v', '--verbose', action="count", default=0,
                        help='Show more information')
    parser.add_argument('-c', '--compression', default="bz2",
                        choices=["bz2", "gzip", "lzma", "zipfile", None],
                        help='Compression algorithm to use. (default: bzip2)')
    try:
        import seaborn  # noqa: F401 -- only probing for availability
    except ImportError:
        # Graphing is unavailable, but the actions still read these
        # attributes, so they must exist on the namespace.
        parser.set_defaults(graph=False, log_scale=False)
    else:
        parser.add_argument('-g', '--graph', action="store_true",
                            help='Generate and store a graph of the results.')
        parser.add_argument('--log-scale', action="store_true",
                            help='Y-axis log scale')
    res, unknown = parser.parse_known_args(args)
    if res.action == "count-extraction-flag":
        # The flag to analyse (e.g. "-a") is not a declared option, so it
        # arrives among the unknown arguments.
        if len(unknown) != 1:
            print("count-extraction-flag requires exactly 1 flag to search "
                  "for.", file=sys.stderr)
            sys.exit(2)
        if unknown[0] not in EXTRACTION_PARAMS:
            print("Unknown extraction flag %s; expected one of: %s"
                  % (unknown[0], ", ".join(sorted(EXTRACTION_PARAMS))),
                  file=sys.stderr)
            sys.exit(2)
        res.flag = unknown[0]
    elif unknown:
        parser.print_help()
        sys.exit(2)
    return res


if __name__ == '__main__':
    main(sys.argv[1:])