Explorar o código

Streamlined vectorization/removed privacy violations.

Thomas Flucke hai 6 anos
pai
achega
f00e2146f7
Modificáronse 2 ficheiros con 35 adicións e 45 borrados
  1. 25 42
      ethan_data_processing_scripts/Vector.py
  2. 10 3
      src/feature-extractor/sample.py

+ 25 - 42
ethan_data_processing_scripts/Vector.py

@@ -1,54 +1,35 @@
+#!/usr/bin/python3
 try:
     import cPickle as pickle
 except ImportError:
     import pickle
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
+                '/../src/feature-extractor')
 import sample
 import typing
 from typing import List
 
 
 class FeatureVector:
-    def __init__(self):
-        self.features = []
+    def __init__(self, s: sample):
         self.activefeatures = []
         # list of key, value tuples representing values for features.
-        self.sampleInfo = {}
-        self.classification = None
+        self.sampleInfo = s
+        if isinstance(s, sample.Sample):
+            self.classification = s.user
+        else:
+            self.classification = "DUMMY DATA - DO NOT USE"
 
     # set which features are active using a binary list
-    # ex: For 2nd, 4th and 5th features to be active, pass [0,1,0,1,1]
-    def setFeatures(self, binlist):
-        activefeatures = []
-        if len(binlist) != len(self.features):
-            print("Feature choice list must equal length of feature list")
-        for b in range(len(binlist)):
-            if binlist[b] == 1:
-                activefeatures.append(self.features[b])
-        self.activefeatures = activefeatures
+    # ~~ex: For 2nd, 4th and 5th features to be active, pass [0,1,0,1,1]~~
+    # ex: ["total_time", "average_iat", "dead_time"]
+    def set_features(self, features: typing.List[str]):
+        self.activefeatures = [self.sampleInfo[feature] for feature in features]
 
     def __repr__(self):
-        return str(self.features)
-
-
-# use to make a sample into a vector
-def SampleToFeatureVector(s: sample):
-    print(s.__activities)
-    v = FeatureVector()
-    for a in s.__activities.keys():
-        for d in s.__activities[a].keys():
-            # a + d looks like: high.total_packets
-            v.sampleInfo.update({a + "." + d: s.__activities[a][d]})
-        v.features.extend(a["high"].values())
-        v.features.extend(a["mid"].values())
-        v.features.extend(a["low"].values())
-    v.sampleInfo.update(("total_time", s.total_time))
-    v.sampleInfo.update(("average_iat", s.average_iat))
-    v.sampleInfo.update(("dead_time", s.dead_time))
-    v.features.append(s.total_time)
-    v.features.append(s.average_iat)
-    v.features.append(s.dead_time)
-    return v
-
+        return str(self.activefeatures)
 
 def writePickledData(filename):
     v = FeatureVector()
@@ -67,15 +48,17 @@ def readPickledData(filename):
 
 
 def main():
-    fv = FeatureVector()
+    # fv = FeatureVector()
     # writePickledData("test.bin")
     # readPickledData("test.txt")
-    s = sample.Sample([{"delta": 0, "time": '09/19/18 13:55:26', }], open("./results.txt"))
-    s.dead_time = 30
-    s.average_iat = 3
-    s.total_time = 30
-    fv = SampleToFeatureVector(s)
-
+    # s = sample.Sample([{"delta": 0, "time": '09/19/18 13:55:26', }], open("./results.txt"))
+    # s.dead_time = 30
+    # s.average_iat = 3
+    # s.total_time = 30
+    s = {"dead_time": 30, "average_iat": 3, "total_time": 30}
+    fv = FeatureVector(s)
+    fv.set_features(["total_time", "average_iat", "dead_time"])
+    print(fv)
 
 if __name__ == '__main__':
     main()

+ 10 - 3
src/feature-extractor/sample.py

@@ -47,9 +47,9 @@ class Sample:
     def __extract_tag(self, keylog: typing.TextIO):
         import os
         dir_guided = os.path.dirname(keylog)
-        self["is_guided"] = os.path.basename(dir_guided) == "y"
+        self.is_guided = os.path.basename(dir_guided) == "y"
         dir_user = os.path.dirname(dir_guided)
-        self["user"] = os.path.basename(dir_user)
+        self.user = os.path.basename(dir_user)
 
     def __extract_activity_stats(self, packets):
         high_activity = []
@@ -165,7 +165,14 @@ class Sample:
             return self.__activities[vals[0]][vals[1]]
         else:
             raise ValueError('Unable to access value at %s, unknown prefix.' % key)
-    
+
+    def keys(self):
+        """Return a list of every key name held by this sample.
+
+        The list holds the keys of ``self.__general`` followed by one
+        ``"<prefix>.<suffix>"`` string for each suffix key found under
+        each prefix key of ``self.__activities``.
+        """
+        # A dict keys() view has no extend(), and list.extend() returns
+        # None in any case, so build and return a fresh list instead.
+        return list(self.__general) + [
+            "%s.%s" % (prefix, suffix)
+            for prefix, stats in self.__activities.items()
+            for suffix in stats
+        ]
+        
     def __str__(self):
         return "Sample: {%s, high: %s, mid: %s, low: %s}" % \
             (self.__general, self.__activities["high"],