tflucke
/
SSH-Master-Thesis


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
							#!/home/tflucke/bin/bin/python3

import typing
import numpy as np

def _whole_number(text):
    """Return the integer in front of the first '.' of *text*."""
    return int(text.partition(".")[0])


def _nudge_down(text):
    """Parse *text* as a float and move it 0.001 downwards."""
    return float(text) - 0.001


def _nudge_up(text):
    """Parse *text* as a float and move it 0.001 upwards."""
    return float(text) + 0.001


# Per extraction flag: (label used to name the plotted series,
#                       converter applied to the text found after the flag,
#                       bin edges handed to the histogram plot).
EXTRACTION_PARAMS = {
    "-a": ("Small Paste Size (Blocks)", int, np.arange(1, 7)),
    "-p": ("Large Paste Size (Blocks)", _whole_number, np.arange(2, 8)),
    "-l": ("Low Activity Threshold (k/s)", _nudge_down, np.arange(1, 7)),
    "-i": (
        "High Activity Threshold (k/s)",
        _nudge_up,
        [1.5, 2, 2.5, 3, 3.5, 4, 4.5],
    ),
    "-b": ("Lookback (s)", int, np.arange(1, 7)),
    "-s": ("Sample Size (Count)", int, [100, 150, 200, 300, 400, 500, 600]),
    "-m": (
        "Minimum Number of Samples (Samples/Tag)",
        int,
        [5, 10, 15, 20, 25],
    ),
}

def main(options: list):
    """Load every input file and run the action chosen on the command line.

    Each loaded record is indexed as (accuracy, p-value, file name,
    features), matching the labels of the "print" format string.

    Args:
        options: Command-line arguments without the program name.
    """
    args = parse_args(options)
    heaps = [load_file(f, args.compression) for f in args.in_files]
    if args.action == "print":
        # heapq.merge() is only correct for fully sorted inputs and
        # heappop() only for a valid heap.  Neither is guaranteed for the
        # loaded lists, so sort the combined records explicitly.
        for record in sorted(res for h in heaps for res in h):
            print("Accuracy: %0.02f; P-Value: %0.04f; File: %80s; Options: %s" %
                  record)
    elif args.action == "count-files":
        _print_value_counts((res[2] for h in heaps for res in h),
                            "Count: %4d; File: %80s")
    elif args.action == "count-feature-sets":
        _print_value_counts((res[3] for h in heaps for res in h),
                            "Count: %4d; Features: %s")
    elif args.action == "count-features":
        count_features(args, heaps)
    elif args.action == "count-extraction-flag":
        count_preprocess_params(args, heaps)
    elif args.action == "feature-relation":
        count_features_2d(args, heaps)


def _print_value_counts(values, line_format):
    """Print how often each value occurs, least frequent first.

    Values are counted as whole objects.  numpy.unique() is deliberately not
    used here: it flattens equal-length feature collections into individual
    features and cannot handle collections of differing lengths.

    Args:
        values: Iterable of values to count; unhashable ones (e.g. lists)
            are converted to tuples first.
        line_format: %-format string taking (count, value).
    """
    counts = {}
    for value in values:
        try:
            hash(value)
        except TypeError:
            value = tuple(value)
        counts[value] = counts.get(value, 0) + 1
    # Ties are ordered by the value's text so plain strings (file names) stay
    # in alphabetical order.
    ordered = sorted(counts.items(), key=lambda item: (item[1], str(item[0])))
    for value, count in ordered:
        print(line_format % (count, value))

def count_features(args, heaps):
    """Print how often each individual feature occurs; optionally plot it.

    Args:
        args: Parsed options.  Reads ``action`` (used for the image file
            name) and, when present, ``graph`` and ``log_scale``.
        heaps: Iterable of result lists; element 3 of every result is an
            iterable of feature names.
    """
    from numpy import unique
    features = [feature
                for h in heaps
                for res in h
                for feature in res[3]]
    names, occurrences = unique(features, return_counts=True)
    # sorted() is stable, so equally frequent features keep numpy's
    # value-sorted order.
    for name, count in sorted(zip(names, occurrences), key=lambda nc: nc[1]):
        print("Count: %4d; Feature: %s" % (count, name))

    # parse_args only registers the graph options when seaborn is importable,
    # so the attributes may be missing altogether.
    if not getattr(args, "graph", False):
        return
    import seaborn as sns
    from pandas import DataFrame
    from matplotlib import pyplot as plt
    dataset = DataFrame(features, columns=["Feature"])
    sns.set(font_scale=0.65)
    # The column must be passed as x=...: a positional "Feature" binds to the
    # first parameter, which is `data` on current seaborn releases and then
    # clashes with the data= keyword.
    graph = sns.countplot(x="Feature", data=dataset, palette="Set1",
                          order=dataset['Feature'].value_counts().index)
    graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
                          horizontalalignment="right")
    # Leave room below the axes for the rotated tick labels.
    plt.subplots_adjust(bottom=0.25)
    if getattr(args, "log_scale", False):
        graph.set_yscale('log')
    graph.get_figure().savefig("%s.png" % args.action)

def count_features_2d(args, heaps):
    """Print, for every feature, how often each other feature accompanies it.

    With the graph option set, the co-occurrence matrix is also saved as a
    heat map named after the action.

    Args:
        args: Parsed options.  Reads ``action`` (used for the image file
            name) and, when present, ``graph`` and ``log_scale``.
        heaps: Iterable of result lists; element 3 of every result is a
            re-iterable collection of feature names.
    """
    from numpy import unique
    feature_lists = [res[3] for h in heaps for res in h]
    features = unique([feature
                       for members in feature_lists
                       for feature in members])
    feature_pairs = []
    for feature in features:
        print("%s:" % feature)
        # Only results containing `feature` matter for this row; filter them
        # once instead of once per partner feature.
        containing = [members for members in feature_lists
                      if feature in members]
        pairings = []
        for feature_pair in features:
            if feature_pair == feature:
                # A feature is not paired with itself.
                pairings.append(0)
                continue
            count = sum(feature_pair in members for members in containing)
            print("\tFeature: %s; Count: %4d" % (feature_pair, count))
            pairings.append(count)
        feature_pairs.append(pairings)

    # parse_args only registers the graph options when seaborn is importable,
    # so the attributes may be missing altogether.
    if not getattr(args, "graph", False):
        return
    import seaborn as sns
    from pandas import DataFrame
    from matplotlib import pyplot as plt
    dataset = DataFrame(feature_pairs, index=features, columns=features)
    sns.set(font_scale=0.65)
    heatmap_options = {}
    if getattr(args, "log_scale", False):
        positive = [count for row in feature_pairs for count in row
                    if count > 0]
        if positive:
            from matplotlib.colors import LogNorm
            # A logarithmic norm needs strictly positive limits, and the
            # diagonal is always 0: scale over the positive counts only and
            # leave zero cells blank.
            heatmap_options["norm"] = LogNorm(vmin=min(positive),
                                              vmax=max(positive))
            heatmap_options["mask"] = dataset == 0
    graph = sns.heatmap(data=dataset, **heatmap_options)
    graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
                          horizontalalignment="right")
    # Leave room for the feature names on both axes.
    plt.subplots_adjust(left=0.25, bottom=0.25)
    plt.ylabel('Feature')
    plt.xlabel('Feature')
    plt.title('Occurrences of Feature Pairs')
    graph.get_figure().savefig("%s.png" % args.action)

def count_preprocess_params(args, heaps):
    """Count the values one extraction flag takes across result file names.

    The value is the text that follows ``args.flag`` in a result's file name
    (element 2), up to the next '-', converted with the flag's converter from
    EXTRACTION_PARAMS.

    Args:
        args: Parsed options.  Reads ``flag`` and ``action`` and, when
            present, ``graph``.
        heaps: Iterable of result lists.

    Raises:
        KeyError: If ``args.flag`` is not a key of EXTRACTION_PARAMS.
        ValueError: If a file name does not contain the flag, or the text
            after it cannot be converted.
    """
    from numpy import unique
    # Look the flag up once, before doing any work, so an unknown flag fails
    # immediately.
    label, convert, bins = EXTRACTION_PARAMS[args.flag]
    flag_width = len(args.flag)
    parameters = []
    for h in heaps:
        for res in h:
            file_name = res[2]
            try:
                value_start = file_name.index(args.flag) + flag_width
            except ValueError:
                raise ValueError("flag %r not found in file name %r"
                                 % (args.flag, file_name)) from None
            parameters.append(convert(file_name[value_start:].split("-", 1)[0]))
    flags = list(zip(*unique(parameters, return_counts=True)))
    flags.sort(key=lambda x: x[1])
    for value, count in flags:
        print("Count: %4d; Value: %s" % (count, value))

    # parse_args only registers the graph option when seaborn is importable,
    # so the attribute may be missing altogether.
    if not getattr(args, "graph", False):
        return
    import seaborn as sns
    from pandas import Series
    sns.set(font_scale=0.65)
    # NOTE(review): distplot is reported as deprecated in newer seaborn
    # releases -- confirm against the installed version before upgrading.
    graph = sns.distplot(Series(parameters, name=label), bins=bins, kde=False)
    graph.set(xlim=(bins[0], bins[-1]))
    graph.get_figure().savefig("%s%s.png" % (args.action, args.flag))

def load_file(filename: str, compression=None):
    """Read a compressed pickle of results and simplify its records.

    Every record keeps its first three fields unchanged; the fourth field is
    replaced by that object's ``feature`` attribute.

    NOTE(review): the file is unpickled, so only load files from a trusted
    source.

    Args:
        filename: Path handed to ``compress_pickle.load``.
        compression: Compression name forwarded to ``compress_pickle.load``.

    Returns:
        A list with one simplified record per stored record.
    """
    import compress_pickle
    stored = compress_pickle.load(filename, compression=compression)
    simplified = []
    for record in stored:
        head = record[:3]
        simplified.append(head + (record[3].feature,))
    return simplified

def parse_args(args: list):
    """Parse the command line.

    Args:
        args: Command-line arguments without the program name.

    Returns:
        An ``argparse.Namespace`` with ``action``, ``in_files``, ``verbose``,
        ``compression``, ``graph`` and ``log_scale``.  For the
        "count-extraction-flag" action it also carries ``flag``, the single
        otherwise-unrecognised argument.

    Raises:
        SystemExit: With status 2 on invalid usage.
    """
    import argparse
    import sys
    parser = argparse.ArgumentParser(description='Collect the output from \
    distributed distribute_compressor.')
    parser.add_argument('action', choices=["print", "count-files",
                                           "count-feature-sets",
                                           "count-features", "feature-relation",
                                           "count-extraction-flag"],
                        help='Action to take.')
    parser.add_argument('in_files', nargs='+', type=str,
                        help='Result files to read.')
    parser.add_argument('-v', '--verbose', action="count", default=0,
                        help='Show more information')
    parser.add_argument('-c', '--compression', default="bz2",
                        choices=["bz2", "gzip", "lzma", "zipfile", None],
                        help='Compression algorithm to use. (default: bzip2)')
    # The graph options are only offered when seaborn is installed, but the
    # attributes must always exist because the actions read them
    # unconditionally.
    parser.set_defaults(graph=False, log_scale=False)
    try:
        import seaborn  # noqa: F401 -- availability probe only
    except ImportError:
        pass
    else:
        parser.add_argument('-g', '--graph', action="store_true",
                            help='Generate and store a graph of the results.')
        parser.add_argument('--log-scale', action="store_true",
                            help='Y-axis log scale')
    res, unknown = parser.parse_known_args(args)
    if res.action == "count-extraction-flag":
        # The flag to search for (e.g. "-a") is not a declared option, so it
        # arrives among the unrecognised arguments.
        if len(unknown) != 1:
            print("count-extraction-flag requires exactly 1 flag to search for.",
                  file=sys.stderr)
            # sys.exit() rather than the exit() builtin, which only exists
            # when the site module has been loaded.
            sys.exit(2)
        res.flag = unknown[0]
    elif unknown:
        parser.print_help()
        sys.exit(2)
    return res

# Script entry point: hand every argument after the program name to main().
if __name__ == '__main__':
    from sys import argv
    main(argv[1:])