Merge branch 'master' of git.tflucke.name:tflucke/SSH-Master-Thesis

Thomas Flucke, 6 years ago
Parent commit 677ddb532e

+ 1 - 0
.gitignore

@@ -9,6 +9,7 @@ data/*/
 *.so
 *.pdf
 *.plo
+*.pyc
 src/flow-seperator/flow-seperator
 src/pcap-matcher/pcap-matcher
 src/packet-matcher/packet-matcher

+ 8 - 5
src/classifiers/Vector.py

@@ -3,13 +3,16 @@ try:
     import cPickle as pickle
 except ImportError:
     import pickle
-import os
-import sys
 import typing
 from typing import List
-sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
-                '/../feature-extractor')
-import sample
+try:
+    import sample
+except ImportError:
+    import os
+    import sys
+    sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
+                    '/../feature-extractor')
+    import sample
 
 class FeatureVector:
     def __init__(self, s: sample, features = None):
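
The try/except above replaces an unconditional sys.path hack: the plain import is attempted first, and the path is only patched when sample cannot be found. The same pattern in isolation, assuming this repository's src/feature-extractor sibling directory:

    # Fallback import: only touch sys.path when the plain import fails.
    try:
        import sample
    except ImportError:
        import os
        import sys
        # realpath resolves symlinks, so the relative hop to the sibling
        # directory works regardless of the current working directory.
        here = os.path.dirname(os.path.realpath(__file__))
        sys.path.insert(0, os.path.join(here, '..', 'feature-extractor'))
        import sample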

+ 1 - 1
src/classifiers/classifier.py

@@ -28,7 +28,7 @@ def main():
 
 
 # data and results arrays (training and testing) should be paired. Classifications is number of ways to classify data.
-def train(classifications: int, data: list, results: list, testdata: list, testresults: list):
+def multiLayerPerceptronClassifier(classifications: int, data: list, results: list, testdata: list, testresults: list):
     numberOfNeurons = (len(data[0]) + classifications)/2
     model = tf.keras.models.Sequential()
     model.add(tf.keras.layers.Flatten())

+ 54 - 4
src/classifiers/nearestneighbors.py

@@ -1,10 +1,14 @@
 #!/usr/bin/python3
 from sklearn.model_selection import KFold
-from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
-from sklearn.ensemble import RandomForestClassifier
 import numpy as np
-import sys
-from Vector import FeatureVector
+try:
+    import sample
+except ImportError:
+    import os
+    import sys
+    sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + \
+                    '/../feature-extractor')
+    import sample
 
 DEFAULT_FEATURES = ["average_iat", "high.avg_burst_size", "high.burst_count"]
 
@@ -21,6 +25,7 @@ def main():
     from random import shuffle
     shuffle(samples)
     features = args.feature if args.feature else DEFAULT_FEATURES
+    from Vector import FeatureVector
     data, labels = zip(*[(FeatureVector(p, features).get(), p.user)
                          for p in samples])
     res = kNearestNeighbors(np.array(data), np.array(labels),
@@ -31,6 +36,7 @@ def main():
         _, p = t_test(res, labels)
         print("P-Value: %f" % (p / 2))
 
+
 def parse_args():
     import argparse
     parser = argparse.ArgumentParser(
@@ -55,8 +61,10 @@ def parse_args():
                         (default: 1)')
     return parser.parse_args()
 
+
 def kNearestNeighbors(data: list, labels: list,
                       n=5, verbose=0, k=5, weights="uniform", guesses=1):
+    from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
     folds = KFold(n_splits=n)
     i = 1
     avg = 0
@@ -85,16 +93,58 @@ def kNearestNeighbors(data: list, labels: list,
         accuracies.append(accuracy)
     return accuracies
 
+
+# TODO: This should be in a separate file.
+# If we need a unified interface we can make an aggregator.
+# TODO: KFold validation
+def multiLayerPerceptronClassifier(classifications: int, data: list, results: list, testdata: list, testresults: list):
+    import tensorflow as tf
+    numberOfNeurons = (len(data[0]) + classifications) // 2  # Dense needs an int unit count
+    model = tf.keras.models.Sequential()
+    model.add(tf.keras.layers.Flatten())
+    model.add(tf.keras.layers.Dense(numberOfNeurons, activation=tf.nn.relu))
+    model.add(tf.keras.layers.Dense(numberOfNeurons, activation=tf.nn.relu))
+    model.add(tf.keras.layers.Dense(classifications, activation=tf.nn.softmax))
+
+    model.compile(optimizer='SGD',
+                  loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+    model.fit(data, results, epochs=5)
+
+    loss, accuracy = model.evaluate(testdata, testresults)
+    print(loss)
+    print(accuracy)
+
+# TODO: This should be in a separate file.
+# If we need a unified interface we can make an aggregator.
+# TODO: KFold validation
+def randomForest(data: list, labels: list, test_data: list, test_data_labels: list):
+    from sklearn.ensemble import RandomForestClassifier
+    rfc = RandomForestClassifier(n_estimators=10)
+    rfc.fit(data, labels)
+    predictions = rfc.predict(test_data)
+    for t in range(len(test_data)):
+        print(str(test_data[t]) + " prediction: " + str(predictions[t]))
+    if len(test_data) == 0:
+        return
+    accuracysum = 0
+    for t in range(len(test_data)):
+        accuracysum += 1 if predictions[t] == test_data_labels[t] else 0
+    print("Accuracy: " + str(accuracysum/len(test_data_labels)))
+
+
 def find_in_predictions(probabilities: list, tests: int, labels: list):
     return [list(map(lambda x: x[0],
                      sorted(list(zip(labels, probs)), key=lambda x: x[1]))
     ).index(test)
             for probs, test in zip(probabilities, tests)]
 
+
 def t_test(accuracy: list, labels: list):
     from scipy import stats
     random_avg = 1.0/len(np.unique(labels))
     return stats.ttest_1samp(accuracy, random_avg)
 
+
 if __name__ == '__main__':
     main()
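
Both new classifiers carry a "TODO: KFold validation". One way to close that gap for randomForest is to reuse the KFold splitter that kNearestNeighbors already uses; a minimal sketch (function name and fold count are illustrative, not part of this commit):

    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.ensemble import RandomForestClassifier

    def random_forest_kfold(data: np.ndarray, labels: np.ndarray, n=5):
        accuracies = []
        for train_idx, test_idx in KFold(n_splits=n).split(data):
            rfc = RandomForestClassifier(n_estimators=10)
            rfc.fit(data[train_idx], labels[train_idx])
            # score() returns the mean accuracy on the held-out fold.
            accuracies.append(rfc.score(data[test_idx], labels[test_idx]))
        return accuracies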

+ 15 - 8
src/distributer/distribute.sh

@@ -1,7 +1,12 @@
 #!/bin/sh
 
 readonly DEFAULT_OUT_FMT="%s.out"
-readonly CMD_FEED="$(mktemp -u distributer-XXX.fifo)"
+readonly CMD_FEED="$(mktemp -u /tmp/distributer-XXX.fifo)"
+readonly LOCK="$CMD_FEED.lock"
+readonly LOCK_TIMEOUT="1"
+readonly PROC_BUFFER=10
+readonly MAX_PROCS=$(expr $(ulimit -u) / 3 - $PROC_BUFFER)
+
 readonly CONF_LIST="$1"
 readonly SERVER_LIST="$2"
 readonly OUT_FMT="${3:-$DEFAULT_OUT_FMT}"
@@ -20,7 +25,7 @@ help() {
     printf "    server_list: A text file containing a list of servers to\n" >&2
     printf "    connect to and run commands on.\n" >&2
     printf "    out_file_fmt: File name format to write the output of each\n" >&2
-    printf "    command to (default: $DEFAULT_OUT_FMT).\n\n" >&2
+    printf "    command to (default: %s).\n\n" "$DEFAULT_OUT_FMT" >&2
     printf "All commands will be allocated to the first available server.\n" >&2
     printf "Each command must be valid on every server.\n" >&2
     printf "The output will be saved to a text file on the remote systems.\n" >&2
@@ -38,13 +43,15 @@ run_server() {
     loop="/tmp/$(basename $CMD_FEED .fifo)-$server.fifo"
     mkfifo "$loop"
     trap "clean_server $loop" 2 15
-    while read cmd; do
+    while lockfile -$LOCK_TIMEOUT "$LOCK"; read cmd; do
+        rm -f "$LOCK"
         cmd_sanitized="$(echo "$cmd" | sed "$S_SPACE;$R_DASH;$S_QUOTE")"
         out_file="$(printf "$OUT_FMT" "$cmd_sanitized")"
         printf "$server: $cmd\n" >&2
         printf "$cmd > $out_file\necho\n"
         read line < "$loop" > /dev/null # Block until command completes
     done | ssh -oBatchMode=yes -oStrictHostKeyChecking=no "$server" "sh" > "$loop"
+    rm -f "$LOCK"
     clean_server "$loop"
     echo "Server '$server' finished!" >&2
 }
@@ -54,20 +61,20 @@ clean_up() {
         pkill -P $pid
     done
     rm "$CMD_FEED"
+    [ -e "$LOCK" ] && rm -f "$LOCK"
     exit 2
 }
 
 main() {
     mkfifo "$CMD_FEED"
     trap clean_up 2 15
-    sed '/^[[:space:]]*$/d' "$CONF_LIST" > "$CMD_FEED" &
+    cat "$CONF_LIST" | sed '/^[[:space:]]*$/d' > "$CMD_FEED" &
     pids=""
-    for server in $(cat $SERVER_LIST); do
+    for server in $(head -n$MAX_PROCS "$SERVER_LIST"); do
         run_server "$server" < "$CMD_FEED" > /dev/null &
-        pids="$pids $!"
     done
-    for pid in $pids; do
-        wait $pids
+    for pid in $(pgrep -P $$); do
+        wait $pid
     done
     clean_up
     echo "All jobs finished!"

+ 1 - 1
src/feature-extractor/sample.py

@@ -1,4 +1,3 @@
-import pyshark
 from datetime import datetime
 import typing
 from typing import List
@@ -50,6 +49,7 @@ class Sample:
             .total_seconds()
         
     def __init__(self, packets, keylog: typing.TextIO):
+        import pyshark
         self.__general = {}
         self.__extract_tag(keylog)
         self.__extract_activity_stats(packets)
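
Moving "import pyshark" from module level into Sample.__init__ defers the heavy dependency until a capture is actually parsed, so code that only unpickles existing Sample objects no longer needs pyshark installed. The deferred-import pattern in isolation (hypothetical class):

    class Capture:
        def __init__(self, pcap_path):
            # Imported lazily: pyshark (and its tshark dependency) is only
            # required when a capture file is actually opened.
            import pyshark
            self.packets = pyshark.FileCapture(pcap_path)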