| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180 |
- #!/home/tflucke/bin/bin/python3
- import typing
- import numpy as np
# Extraction flags that count_preprocess_params knows how to analyse.
# Each flag maps to a 3-tuple:
#   [0] human-readable label (used as the plotted series name),
#   [1] converter applied to the text that follows the flag,
#   [2] bin edges handed to the histogram plot.
EXTRACTION_PARAMS = {
    "-a": ("Small Paste Size (Blocks)", int, np.arange(1, 7)),
    "-p": (
        "Large Paste Size (Blocks)",
        # Keep only the integer part in front of the first ".".
        lambda text: int(text.partition(".")[0]),
        np.arange(2, 8),
    ),
    "-l": (
        "Low Activity Threshold (k/s)",
        # NOTE(review): the 0.001 shift presumably keeps values off a bin
        # edge in the histogram -- confirm with the author.
        lambda text: float(text) - 0.001,
        np.arange(1, 7),
    ),
    "-i": (
        "High Activity Threshold (k/s)",
        lambda text: float(text) + 0.001,
        [1.5, 2, 2.5, 3, 3.5, 4, 4.5],
    ),
    "-b": ("Lookback (s)", int, np.arange(1, 7)),
    "-s": ("Sample Size (Count)", int, [100, 150, 200, 300, 400, 500, 600]),
    "-m": ("Minimum Number of Samples (Samples/Tag)", int, [5, 10, 15, 20, 25]),
}
def _print_counts_ascending(values, template):
    """Print one line per distinct value, least frequent first.

    ``template`` is a %-format string that receives ``(count, value)``.
    Ties in the count are ordered by the value itself.
    """
    from collections import Counter

    ordered = sorted(Counter(values).items(),
                     key=lambda item: (item[1], item[0]))
    for value, count in ordered:
        print(template % (count, value))


def main(options: list):
    """Load the result files named in ``options`` and run the chosen action.

    ``options`` is the command line without the program name; it is parsed
    by ``parse_args``.  Every input file is loaded with ``load_file`` into a
    list of result tuples, which are indexed here as ``res[2]`` (printed as
    "File") and ``res[3]`` (printed as "Options" / "Features").

    Actions:
      * ``print``: print every result in ascending tuple order.
      * ``count-files``: count how often each ``res[2]`` value occurs.
      * ``count-feature-sets``: count how often each complete feature set
        (``res[3]``) occurs.
      * ``count-features``, ``count-extraction-flag``, ``feature-relation``:
        delegated to the dedicated helper functions.
    """
    args = parse_args(options)
    heaps = [load_file(f, args.compression) for f in args.in_files]
    if args.action == "print":
        # Sort everything in one go.  heapq.merge() followed by heappop() is
        # only correct when every input list is already fully sorted, which a
        # pickled heap (heap-ordered, not sorted) does not guarantee.
        for res in sorted(res for h in heaps for res in h):
            print("Accuracy: %0.02f; P-Value: %0.04f; File: %80s; Options: %s"
                  % res)
    elif args.action == "count-files":
        _print_counts_ascending((res[2] for h in heaps for res in h),
                                "Count: %4d; File: %80s")
    elif args.action == "count-feature-sets":
        # Count whole feature sets.  numpy.unique() flattens nested input, so
        # equal-length sets were counted feature by feature and ragged sets
        # are rejected by current NumPy.  Tuples keep each set intact and
        # hashable.
        # NOTE(review): assumes res[3] is an ordered collection of hashable,
        # mutually comparable features (e.g. strings) -- confirm.
        _print_counts_ascending((tuple(res[3]) for h in heaps for res in h),
                                "Count: %4d; Features: %s")
    elif args.action == "count-features":
        count_features(args, heaps)
    elif args.action == "count-extraction-flag":
        count_preprocess_params(args, heaps)
    elif args.action == "feature-relation":
        count_features_2d(args, heaps)
def count_features(args, heaps):
    """Print how often each feature occurs and optionally plot the counts.

    Parameters:
      args:  parsed command line; ``action`` names the output image and the
             optional ``graph`` / ``log_scale`` switches control plotting.
      heaps: list of result lists; ``res[3]`` of every result is iterated as
             that result's collection of features.

    Output lines are ordered by ascending count.  When ``args.graph`` is set,
    a bar chart is written to ``<action>.png`` in the working directory.
    """
    from numpy import unique

    features = [feature
                for h in heaps
                for res in h
                for feature in res[3]]
    feature_counts = list(zip(*unique(features, return_counts=True)))
    feature_counts.sort(key=lambda pair: pair[1])
    for name, count in feature_counts:
        print("Count: %4d; Feature: %s" % (count, name))

    # parse_args only registers --graph/--log-scale when seaborn can be
    # imported, so the attributes may be absent from the namespace.
    if not getattr(args, "graph", False):
        return

    import seaborn as sns
    from pandas import DataFrame
    from matplotlib import pyplot as plt

    dataset = DataFrame(features, columns=["Feature"])
    sns.set(font_scale=0.65)
    # Pass x by keyword: seaborn >= 0.12 no longer accepts it positionally.
    graph = sns.countplot(x="Feature", data=dataset, palette="Set1",
                          order=dataset["Feature"].value_counts().index)
    graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
                          horizontalalignment="right")
    # Leave room for the rotated tick labels.
    plt.subplots_adjust(bottom=0.25)
    if getattr(args, "log_scale", False):
        graph.set_yscale("log")
    graph.get_figure().savefig("%s.png" % args.action)
def count_features_2d(args, heaps):
    """Print, for every pair of features, how many results contain both.

    Parameters:
      args:  parsed command line; ``action`` names the output image and the
             optional ``graph`` switch enables the heat map.
      heaps: list of result lists; ``res[3]`` of every result is treated as
             that result's collection of features.

    For each feature (in ``numpy.unique`` order) one header line is printed,
    followed by one line per other feature with the co-occurrence count.  A
    feature is never paired with itself; that cell is 0 in the plotted
    matrix.  When ``args.graph`` is set the matrix is written to
    ``<action>.png`` as a heat map.
    """
    from collections import Counter
    from itertools import permutations
    from numpy import unique

    features = unique([feature
                       for h in heaps
                       for res in h
                       for feature in res[3]])

    # One pass over the results instead of rescanning them for every pair.
    # A set makes each result count at most once per (feature, other) pair,
    # matching a plain membership test.
    pair_counts = Counter()
    for h in heaps:
        for res in h:
            pair_counts.update(permutations(set(res[3]), 2))

    feature_pairs = []
    for feature in features:
        print("%s:" % feature)
        pairings = []
        for feature_pair in features:
            if feature_pair == feature:
                pairings.append(0)
                continue
            count = pair_counts[(feature, feature_pair)]
            print("\tFeature: %s; Count: %4d" % (feature_pair, count))
            pairings.append(count)
        feature_pairs.append(pairings)

    # parse_args only registers --graph when seaborn can be imported, so the
    # attribute may be absent from the namespace.
    if not getattr(args, "graph", False):
        return

    import seaborn as sns
    from pandas import DataFrame
    from matplotlib import pyplot as plt

    dataset = DataFrame(feature_pairs, index=features, columns=features)
    sns.set(font_scale=0.65)
    # NOTE(review): --log-scale has no effect on the heat map.  A LogNorm
    # colour scale was apparently tried and disabled; the zero diagonal has
    # no logarithm -- confirm whether a masked log scale is wanted.
    graph = sns.heatmap(data=dataset)
    graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
                          horizontalalignment="right")
    # Leave room for the feature names on both axes.
    plt.subplots_adjust(left=0.25, bottom=0.25)
    plt.ylabel('Feature')
    plt.xlabel('Feature')
    plt.title('Occurrences of Feature Pairs')
    graph.get_figure().savefig("%s.png" % args.action)
def count_preprocess_params(args, heaps):
    """Print how often each value of one extraction flag occurs.

    Parameters:
      args:  parsed command line; ``flag`` selects the entry of
             ``EXTRACTION_PARAMS`` to analyse, ``action`` names the output
             image and the optional ``graph`` switch enables the histogram.
      heaps: list of result lists; ``res[2]`` of every result is the string
             that is searched for the flag.

    The value is the text between the flag and the next ``-`` in ``res[2]``,
    converted with the flag's converter.  Output lines are ordered by
    ascending count.  When ``args.graph`` is set a histogram is written to
    ``<action><flag>.png``.

    Raises:
      KeyError:   ``args.flag`` is not a key of ``EXTRACTION_PARAMS``.
      ValueError: a result's ``res[2]`` does not contain the flag, or its
                  value cannot be converted.
    """
    from numpy import unique

    flag = args.flag
    try:
        label, convert, bins = EXTRACTION_PARAMS[flag]
    except KeyError:
        raise KeyError("Unknown extraction flag %r; expected one of: %s"
                       % (flag, ", ".join(EXTRACTION_PARAMS))) from None

    parameters = []
    for h in heaps:
        for res in h:
            try:
                start = res[2].index(flag) + len(flag)
            except ValueError:
                raise ValueError("Flag %s not found in %r"
                                 % (flag, res[2])) from None
            # The value runs up to the "-" that starts the next flag.
            parameters.append(convert(res[2][start:].split("-", 1)[0]))

    flags = list(zip(*unique(parameters, return_counts=True)))
    flags.sort(key=lambda pair: pair[1])
    for value, count in flags:
        print("Count: %4d; Value: %s" % (count, value))

    # parse_args only registers --graph when seaborn can be imported, so the
    # attribute may be absent from the namespace.
    if not getattr(args, "graph", False):
        return

    import seaborn as sns
    from pandas import Series

    sns.set(font_scale=0.65)
    graph = sns.distplot(Series(parameters, name=label), bins=bins, kde=False)
    # Clamp the x axis to the first and last bin edge.
    graph.set(xlim=(bins[0], bins[-1]))
    graph.get_figure().savefig("%s%s.png" % (args.action, flag))
def load_file(filename: str, compression=None):
    """Load one result file and return its records as plain tuples.

    ``filename`` is read with ``compress_pickle.load`` using the given
    ``compression``.  For every stored record the first three fields are
    kept and the fourth is replaced by its ``feature`` attribute.
    """
    import compress_pickle

    # NOTE(review): this unpickles the file, which can execute arbitrary
    # code -- only load result files from a trusted source.
    records = compress_pickle.load(filename, compression=compression)
    flattened = []
    for record in records:
        flattened.append(record[:3] + (record[3].feature,))
    return flattened
def parse_args(args: list):
    """Parse the command line and return the resulting namespace.

    Parameters:
      args: command-line arguments without the program name.

    Returns a namespace with ``action``, ``in_files``, ``verbose``,
    ``compression``, ``graph`` and ``log_scale``.  For the
    ``count-extraction-flag`` action it also carries ``flag``, the single
    otherwise-unrecognised argument (e.g. ``-a``).

    The ``--graph`` and ``--log-scale`` switches are only offered when
    seaborn can be imported, but both attributes always exist (default
    ``False``) so callers can test them unconditionally.

    Raises:
      SystemExit: exit status 2 on invalid usage (unknown arguments, or not
                  exactly one flag for ``count-extraction-flag``).
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description='Collect the output from distributed '
                    'distribute_compressor.')
    parser.add_argument('action', choices=["print", "count-files",
                                           "count-feature-sets",
                                           "count-features",
                                           "feature-relation",
                                           "count-extraction-flag"],
                        help='Action to take.')
    parser.add_argument('in_files', nargs='+', type=str,
                        help='Input file name(s) to load results from.')
    parser.add_argument('-v', '--verbose', action="count", default=0,
                        help='Show more information')
    parser.add_argument('-c', '--compression', default="bz2",
                        choices=["bz2", "gzip", "lzma", "zipfile", None],
                        help='Compression algorithm to use. (default: bzip2)')
    # Guarantee the plotting attributes exist even without seaborn, so the
    # count_* functions do not fail with AttributeError.
    parser.set_defaults(graph=False, log_scale=False)
    try:
        import seaborn  # noqa: F401 -- availability probe only
    except ImportError:
        pass
    else:
        parser.add_argument('-g', '--graph', action="store_true",
                            help='Generate and store a graph of the results.')
        parser.add_argument('--log-scale', action="store_true",
                            help='Y-axis log scale')

    res, unknown = parser.parse_known_args(args)
    if res.action == "count-extraction-flag":
        # The flag to analyse is not a declared option, so it arrives as the
        # one and only unknown argument.
        if len(unknown) != 1:
            print("count-extraction-flag requires exactly 1 flag to search for.",
                  file=sys.stderr)
            sys.exit(2)
        res.flag = unknown[0]
    elif unknown:
        parser.print_help()
        sys.exit(2)
    return res
# Script entry point: hand everything after the program name to main().
if __name__ == "__main__":
    from sys import argv

    main(argv[1:])
|