collect_compressed.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. #!/home/tflucke/bin/bin/python3
  2. import typing
  3. import numpy as np
  4. EXTRACTION_PARAMS = {
  5. "-a": ("Small Paste Size (Blocks)", int, np.arange(1,7)),
  6. "-p": ("Large Paste Size (Blocks)", lambda x: int(x.split(".", 1)[0]),
  7. np.arange(2,8)),
  8. "-l": ("Low Activity Threshold (k/s)", lambda x: float(x) - 0.001,
  9. np.arange(1,7)),
  10. "-i": ("High Activity Threshold (k/s)", lambda x: float(x) + 0.001,
  11. [1.5, 2, 2.5, 3, 3.5, 4, 4.5]),
  12. "-b": ("Lookback (s)", int, np.arange(1,7)),
  13. "-s": ("Sample Size (Count)", int, [100, 150, 200, 300, 400, 500, 600]),
  14. "-m": ("Minimum Number of Samples (Samples/Tag)", int, [5, 10, 15, 20, 25])
  15. }
  16. def main(options: list):
  17. args = parse_args(options)
  18. heaps = [load_file(f, args.compression) for f in args.in_files]
  19. if args.action == "print":
  20. import heapq
  21. heap = list(heapq.merge(*heaps))
  22. while heap:
  23. print("Accuracy: %0.02f; P-Value: %0.04f; File: %80s; Options: %s" %
  24. heapq.heappop(heap))
  25. elif args.action == "count-files":
  26. from numpy import unique
  27. files = list(zip(*unique([res[2] for h in heaps for res in h],
  28. return_counts=True)))
  29. files.sort(key=lambda x: x[1])
  30. for f in files:
  31. print("Count: %4d; File: %80s" % (f[1], f[0]))
  32. elif args.action == "count-feature-sets":
  33. from numpy import unique
  34. files = list(zip(*unique([res[3] for h in heaps for res in h],
  35. return_counts=True)))
  36. files.sort(key=lambda x: x[1])
  37. for f in files:
  38. print("Count: %4d; Features: %s" % (f[1], f[0]))
  39. elif args.action == "count-features":
  40. count_features(args, heaps)
  41. elif args.action == "count-extraction-flag":
  42. count_preprocess_params(args, heaps)
  43. elif args.action == "feature-relation":
  44. count_features_2d(args, heaps)
  45. def count_features(args, heaps):
  46. from numpy import unique
  47. features = [feature for h in heaps
  48. for res in h
  49. for feature in res[3]]
  50. feature_counts = list(zip(*unique(features, return_counts=True)))
  51. feature_counts.sort(key=lambda x: x[1])
  52. for f in feature_counts:
  53. print("Count: %4d; Feature: %s" % (f[1], f[0]))
  54. if args.graph:
  55. import seaborn as sns
  56. from pandas import DataFrame
  57. from matplotlib import pyplot as plt
  58. dataset = DataFrame(features, columns=["Feature"])
  59. sns.set(font_scale=0.65)
  60. graph = sns.countplot("Feature", data=dataset, palette="Set1",
  61. order = dataset['Feature'].value_counts().index)
  62. graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
  63. horizontalalignment="right")
  64. plt.subplots_adjust(bottom=0.25)
  65. if args.log_scale:
  66. graph.set_yscale('log')
  67. graph.get_figure().savefig("%s.png" % args.action)
  68. def count_features_2d(args, heaps):
  69. from numpy import unique
  70. features = unique([feature for h in heaps
  71. for res in h
  72. for feature in res[3]])
  73. feature_pairs = []
  74. mi=9999999999999999
  75. ma=0
  76. for feature in features:
  77. print("%s:" % feature)
  78. pairings = []
  79. for feature_pair in features:
  80. if feature_pair == feature:
  81. pairings.append(0)
  82. continue
  83. count = sum([feature_pair in res[3]
  84. for h in heaps
  85. for res in h
  86. if feature in res[3]])
  87. print("\tFeature: %s; Count: %4d" % (feature_pair, count))
  88. pairings.append(count)
  89. ma=max(*pairings, ma)
  90. mi=min(*pairings, mi)
  91. feature_pairs.append(pairings)
  92. if args.graph:
  93. import seaborn as sns
  94. from pandas import DataFrame
  95. from matplotlib import pyplot as plt
  96. dataset = DataFrame(feature_pairs, index=features, columns=features)
  97. sns.set(font_scale=0.65)
  98. if args.log_scale:
  99. from matplotlib.colors import LogNorm
  100. # , norm=LogNorm(mi, ma)
  101. graph = sns.heatmap(data=dataset)
  102. graph.set_xticklabels(graph.get_xticklabels(), rotation=50,
  103. horizontalalignment="right")
  104. plt.subplots_adjust(left=0.25, bottom=0.25)
  105. plt.ylabel('Feature')
  106. plt.xlabel('Feature')
  107. plt.title('Occurrences of Feature Pairs')
  108. graph.get_figure().savefig("%s.png" % args.action)
  109. def count_preprocess_params(args, heaps):
  110. from numpy import unique
  111. parameters = [EXTRACTION_PARAMS[args.flag][1](
  112. res[2][res[2].index(args.flag) + 2:].split("-", 1)[0]
  113. ) for h in heaps for res in h]
  114. flags = list(zip(*unique(parameters, return_counts=True)))
  115. flags.sort(key=lambda x: x[1])
  116. for f in flags:
  117. print("Count: %4d; Value: %s" % (f[1], f[0]))
  118. if args.graph:
  119. import seaborn as sns
  120. from pandas import Series
  121. sns.set(font_scale=0.65)
  122. graph = sns.distplot(Series(parameters,
  123. name=EXTRACTION_PARAMS[args.flag][0]),
  124. bins=EXTRACTION_PARAMS[args.flag][2], kde=False)
  125. graph.set(xlim=(EXTRACTION_PARAMS[args.flag][2][0],
  126. EXTRACTION_PARAMS[args.flag][2][-1]))
  127. graph.get_figure().savefig("%s%s.png" % (args.action, args.flag))
  128. def load_file(filename: str, compression=None):
  129. import compress_pickle
  130. return [r[0:3] + (r[3].feature,)
  131. for r in compress_pickle.load(filename, compression=compression)]
  132. def parse_args(args: list):
  133. import argparse
  134. parser = argparse.ArgumentParser(description='Collect the output from \
  135. distributed distribute_compressor.')
  136. parser.add_argument('action', choices=["print", "count-files",
  137. "count-feature-sets",
  138. "count-features", "feature-relation",
  139. "count-extraction-flag"],
  140. help='Action to take.')
  141. parser.add_argument('in_files', nargs='+', type=str, #argparse.FileType('wb')
  142. help='Output file name.')
  143. parser.add_argument('-v', '--verbose', action="count", default=0,
  144. help='Show more information')
  145. parser.add_argument('-c', '--compression', default="bz2",
  146. choices=["bz2", "gzip", "lzma", "zipfile", None],
  147. help='Compression algorithm to use. (default: bzip2)')
  148. try:
  149. import seaborn
  150. parser.add_argument('-g', '--graph', action="store_true",
  151. help='Generate and store a graph of the results.')
  152. parser.add_argument('--log-scale', action="store_true",
  153. help='Y-axis log scale')
  154. except ImportError:
  155. pass
  156. res, unknown = parser.parse_known_args(args)
  157. if res.action == "count-extraction-flag":
  158. if len(unknown) != 1:
  159. import sys
  160. print("count-extraction-flag requires exactly 1 flag to search for.",
  161. file=sys.stderr)
  162. exit(2)
  163. else:
  164. vars(res)["flag"] = unknown[0]
  165. elif unknown:
  166. parser.print_help()
  167. exit(2)
  168. return res
  169. if __name__ == '__main__':
  170. import sys
  171. main(sys.argv[1:])