diff --git a/assets/avg_out_edges.png b/assets/avg_out_edges.png
index beeb00e3..07153a0b 100644
Binary files a/assets/avg_out_edges.png and b/assets/avg_out_edges.png differ
diff --git a/assets/reduced_ranks/0/in_out.png b/assets/reduced_ranks/0/in_out.png
new file mode 100644
index 00000000..c087e8f0
Binary files /dev/null and b/assets/reduced_ranks/0/in_out.png differ
diff --git a/assets/reduced_ranks/0/pr.png b/assets/reduced_ranks/0/pr.png
new file mode 100644
index 00000000..637bb12b
Binary files /dev/null and b/assets/reduced_ranks/0/pr.png differ
diff --git a/assets/reduced_ranks/0/sr.png b/assets/reduced_ranks/0/sr.png
new file mode 100644
index 00000000..4b44633d
Binary files /dev/null and b/assets/reduced_ranks/0/sr.png differ
diff --git a/assets/reduced_ranks/1/in_out.png b/assets/reduced_ranks/1/in_out.png
new file mode 100644
index 00000000..cfd5d103
Binary files /dev/null and b/assets/reduced_ranks/1/in_out.png differ
diff --git a/assets/reduced_ranks/1/pr.png b/assets/reduced_ranks/1/pr.png
new file mode 100644
index 00000000..3a966a75
Binary files /dev/null and b/assets/reduced_ranks/1/pr.png differ
diff --git a/assets/reduced_ranks/1/sr.png b/assets/reduced_ranks/1/sr.png
new file mode 100644
index 00000000..6c80948e
Binary files /dev/null and b/assets/reduced_ranks/1/sr.png differ
diff --git a/assets/reduced_ranks/2/in_out.png b/assets/reduced_ranks/2/in_out.png
new file mode 100644
index 00000000..138136ea
Binary files /dev/null and b/assets/reduced_ranks/2/in_out.png differ
diff --git a/assets/reduced_ranks/2/pr.png b/assets/reduced_ranks/2/pr.png
new file mode 100644
index 00000000..6bc4016e
Binary files /dev/null and b/assets/reduced_ranks/2/pr.png differ
diff --git a/assets/reduced_ranks/2/sr.png b/assets/reduced_ranks/2/sr.png
new file mode 100644
index 00000000..7b5e7b95
Binary files /dev/null and b/assets/reduced_ranks/2/sr.png differ
diff --git a/assets/reduced_ranks/3/in_out.png b/assets/reduced_ranks/3/in_out.png
new file mode 100644
index 00000000..fe870d48
Binary files /dev/null and b/assets/reduced_ranks/3/in_out.png differ
diff --git a/assets/reduced_ranks/3/pr.png b/assets/reduced_ranks/3/pr.png
new file mode 100644
index 00000000..24bdad3d
Binary files /dev/null and b/assets/reduced_ranks/3/pr.png differ
diff --git a/assets/reduced_ranks/3/sr.png b/assets/reduced_ranks/3/sr.png
new file mode 100644
index 00000000..a80d8b04
Binary files /dev/null and b/assets/reduced_ranks/3/sr.png differ
diff --git a/assets/time_deriv.png b/assets/time_deriv.png
new file mode 100644
index 00000000..1d880c78
Binary files /dev/null and b/assets/time_deriv.png differ
diff --git a/codes/frequency_deriv/.envrc b/codes/frequency_deriv/.envrc
new file mode 100644
index 00000000..051d09d2
--- /dev/null
+++ b/codes/frequency_deriv/.envrc
@@ -0,0 +1 @@
+eval "$(lorri direnv)"
diff --git a/codes/frequency_deriv/frequency_deriv.py b/codes/frequency_deriv/frequency_deriv.py
new file mode 100644
index 00000000..a351ada1
--- /dev/null
+++ b/codes/frequency_deriv/frequency_deriv.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+from collections import defaultdict
+from typing import Dict
+import matplotlib.pyplot as plt
+from datetime import datetime
+
+
+def load_log(path: str) -> Dict[datetime, str]:
+    # each log line has the format: '<unix nanos> , <crawler id> , ...'
+    time_crawler = {}
+    with open(path, 'r') as f:
+        for line in f:
+            unix_nanos, crawler, _ = line.split(' , ')
+            when = datetime.utcfromtimestamp(int(unix_nanos) / 1000000000)
+            time_crawler[when] = crawler
+
+    return time_crawler
+
+
+def plot_deriv(data: Dict[datetime, str]):
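+    """Plot how far consecutive crawl events deviate from the expected interval.
+
+    Expects the schedule from the evaluation setup: four crawlers, one crawl
+    event every 2.5 s overall and one event every 10 s per individual crawler.
+    """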
+    diffs = []
+    per_crawler = defaultdict(list)
+    # sort the events chronologically before calculating the deltas
+    sor = list(sorted(data.items(), key=lambda kv: kv[0]))
+    for prev, nxt in zip(sor, sor[1:]):
+        # deviation from the expected 2.5 s between two consecutive events
+        diffs.append(abs(2.5 - (nxt[0].timestamp() - prev[0].timestamp())))
+        per_crawler[prev[1]].append(prev[0])
+
+    xs = [2.5 * i for i in range(len(diffs))]
+    fig, ax = plt.subplots()
+    ax.set_title('Timedelta between crawl events in seconds')
+    ax.set_xlabel('Time passed in seconds')
+    ax.set_ylabel('Deviation in seconds')
+    ax.plot(xs, diffs, label='Deviation from the expected value')
+    fig.legend()
+    plt.savefig('./time_deriv.png')
+    plt.close()
+
+    # the same plot per crawler, with an expected interval of 10 s
+    for c, t in per_crawler.items():
+        devi = []
+        for pre, nex in zip(t, t[1:]):
+            devi.append(abs(10 - (nex.timestamp() - pre.timestamp())))
+        xs = [10 * i for i in range(len(devi))]
+        fig, ax = plt.subplots()
+        ax.plot(xs, devi)
+        ax.set_title(f'Time deviation for {c}')
+        ax.set_xlabel('Time passed in seconds')
+        ax.set_ylabel('Deviation in seconds')
+        plt.savefig(f'./time_deriv_{c}.png')
+        plt.close()
+
+
+def main():
+    data = load_log('./dummy.log')
+    plot_deriv(data)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/codes/frequency_deriv/shell.nix b/codes/frequency_deriv/shell.nix
new file mode 100644
index 00000000..d4c84585
--- /dev/null
+++ b/codes/frequency_deriv/shell.nix
@@ -0,0 +1,16 @@
+{ pkgs ? import <nixpkgs> {} }:
+let
+  py-packages = python-packages: with python-packages; [
+    matplotlib
+    numpy
+    networkx
+    scipy
+  ];
+  py-package = pkgs.python3.withPackages py-packages;
+in
+
+pkgs.mkShell {
+  buildInputs = [
+    py-package
+  ];
+}
diff --git a/codes/frequency_deriv/time_deriv.png b/codes/frequency_deriv/time_deriv.png
new file mode 100644
index 00000000..3663f214
Binary files /dev/null and b/codes/frequency_deriv/time_deriv.png differ
diff --git a/codes/frequency_deriv/time_deriv_c0.png b/codes/frequency_deriv/time_deriv_c0.png
new file mode 100644
index 00000000..8a1e190c
Binary files /dev/null and b/codes/frequency_deriv/time_deriv_c0.png differ
diff --git a/codes/frequency_deriv/time_deriv_c1.png b/codes/frequency_deriv/time_deriv_c1.png
new file mode 100644
index 00000000..aa83a025
Binary files /dev/null and b/codes/frequency_deriv/time_deriv_c1.png differ
diff --git a/codes/frequency_deriv/time_deriv_c2.png b/codes/frequency_deriv/time_deriv_c2.png
new file mode 100644
index 00000000..3885521f
Binary files /dev/null and b/codes/frequency_deriv/time_deriv_c2.png differ
diff --git a/codes/frequency_deriv/time_deriv_c3.png b/codes/frequency_deriv/time_deriv_c3.png
new file mode 100644
index 00000000..47318db7
Binary files /dev/null and b/codes/frequency_deriv/time_deriv_c3.png differ
diff --git a/codes/node-ranking/plot_in_out_avg.py b/codes/node-ranking/plot_in_out_avg.py
index e8344eb9..e8718852 100644
--- a/codes/node-ranking/plot_in_out_avg.py
+++ b/codes/node-ranking/plot_in_out_avg.py
@@ -23,7 +23,7 @@ def main():
         avg_in.append(v['avg_in'])
         avg_out.append(v['avg_out'])
         known_in.append(v['known_in'])
-        known_out.append(v['known_out'])
+        # known_out.append(v['known_out'])
         number_of_nodes.append(v['number_of_nodes'])
@@ -31,11 +31,12 @@
     ax.plot(times, avg_in, label='Avg. In')
     # ax.plot(times, avg_out, label='Avg. Out')
     ax.plot(times, known_in, label='Known In')
-    ax.plot(times, known_out, label='Known Out')
-    ax.plot(times, number_of_nodes, label='Number of nodes')
+    # ax.plot(times, known_out, label='Known Out')
+    ax.plot(times, number_of_nodes, label='Total number of nodes')
     ax.set_title(f'Average edge count per hour')
     fig.autofmt_xdate()
     fig.legend()
+    ax.set_ylim(ymin=0)
     plt.savefig(f'./tmp_plot.png')
     # print('created sr plot')
     plt.show()
diff --git a/codes/node-ranking/plot_reduced.py b/codes/node-ranking/plot_reduced.py
index 407ed1d5..0a759c5f 100644
--- a/codes/node-ranking/plot_reduced.py
+++ b/codes/node-ranking/plot_reduced.py
@@ -124,7 +124,7 @@ def plot2(percentage, algo, data):
     fig, ax = plt.subplots()
     a = 'SensorRank' if algo == 'sr' else 'PageRank'
     ax.set_ylabel(f'{a}')
-    ax.plot(times, mean, label='Avg. Rank')
+    # ax.plot(times, mean, label='Avg. Rank')
     # ax.errorbar(times, mean, stdev, label='Avg. Rank')
     ax.plot(times, mean, label='Avg. Rank')
     # ax.plot(times, known_in, label='Known In') # TODO
@@ -156,13 +156,14 @@
         known_out.append(d[algo]['known_out'])
 
     fig, ax = plt.subplots()
-    a = 'SensorRank' if algo == 'sr' else 'PageRank'
-    ax.set_ylabel(f'{a}')
+    # a = 'SensorRank' if algo == 'sr' else 'PageRank'
+    ax.set_ylabel('Incoming edges')
     ax.plot(times, avg_in, label='Avg. In')
     # ax.plot(times, known_in, label='Known In') # TODO
     ax.plot(times, known_in, label='Known In')
-    ax.plot(times, known_out, label='Known out')
-    title = f'In And Out after removing {percentage * 100}% edges'
+    # ax.plot(times, known_out, label='Known out')
+    ax.set_ylim(ymin=0)
+    title = f'In degree after removing {percentage * 100}% edges'
     ax.set_title(title)
 
     fig.autofmt_xdate()
diff --git a/codes/node-ranking/rank_reduced.py b/codes/node-ranking/rank_reduced.py
index cbe262f7..ac0db5e0 100644
--- a/codes/node-ranking/rank_reduced.py
+++ b/codes/node-ranking/rank_reduced.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import networkx as nx
 import statistics
 import multiprocessing
 from random import sample
@@ -34,6 +35,19 @@ def sensor_rank(graph):
         lambda g, node: sr(g, node, number_of_nodes)
     )
 
+def pr_nx(graph):
+    # plain PageRank without damping (alpha=1.0), matching our definition
+    return nx.pagerank(graph, alpha=1.0)
+
+def sr_nx(graph, pr_ranks):
+    # SensorRank(v) = PR(v) / |succ(v)| * |pred(v)| / |V|
+    ranks = {}
+    V = graph.number_of_nodes()
+    for node, rank in pr_ranks.items():
+        succs = len(list(graph.successors(node)))
+        if succs != 0:
+            preds = len(list(graph.predecessors(node)))
+            ranks[node] = (rank / succs) * (preds / V)
+    return ranks
+
 def find_known(g, known):
     nodes = list(filter(lambda n: n.node == known.node, g.nodes()))
     n = len(nodes)
@@ -133,6 +147,49 @@ def rank(path):
     res = {'sr': res_sr, 'pr': res_pr}
     return res
 
+
+def analyze2(g, data):
+    known = find_known(g, rank_with_churn.KNOWN)
+    # avg_r = rank_with_churn.avg_without_known(g)
+    avg_in = rank_with_churn.avg_in(g)
+    kn_in = known_in(g, known)
+    kn_out = known_out(g, known)
+    d = list(data.values())
+    mean = statistics.mean(d)
+    stddev = statistics.stdev(d)
+    r_known = None
+    for k, v in data.items():
+        if k.node == known.node:
+            r_known = v
+            break
+    return {
+        'known_rank': r_known,
+        'known_in': kn_in,
+        'known_out': kn_out,
+        # 'avg_rank': avg_r,
+        'avg_in': avg_in,
+        'mean': mean,
+        'stdev': stddev,
+    }
+
+def rank2(path):
+    edges = reduce_edges.load_data(path)
+    g = build_graph(edges, initial_rank=rank_with_churn.INITIAL_RANK)
+
+    print('pr start')
+    g_pr = pr_nx(g)
+    print('sr start')
+    g_sr = sr_nx(g, g_pr)
+    print('analyze pr start')
+    res_pr = analyze2(g, g_pr)
+    print('analyze sr start')
+    res_sr = analyze2(g, g_sr)
+    print('done!')
+    res = {'sr': res_sr, 'pr': res_pr}
+    return res
+
 def main():
     # pool = multiprocessing.Pool(processes=4)
     params = []
@@ -150,6 +207,7 @@ def main():
     with multiprocessing.Pool(processes=8) as pool:
         l_path_data = pool.map(wohoo, params)
         for path_data in l_path_data:
+            print(f'{path_data=}')
             for path, data in path_data.items():
                 with reduce_edges.open_mkdir(path, 'w') as f:
                     json.dump(data, f)
@@ -171,8 +229,8 @@ def wohoo(p):
     # with open() as f:
     #     json.dump(result, f)
     when = datetime.fromtimestamp(float(file.split('/')[-1][:-4]))
-    path = f'./data_reduced/{reduced_percentage:.02f}/{when.timestamp()}.json'
-    result = rank(file)
+    path = f'./data_reduced2/{reduced_percentage:.02f}/{when.timestamp()}.json'
+    result = rank2(file)
     return {path: result}
diff --git a/codes/node-ranking/shell.nix b/codes/node-ranking/shell.nix
index d8a81955..d4c84585 100644
--- a/codes/node-ranking/shell.nix
+++ b/codes/node-ranking/shell.nix
@@ -4,6 +4,7 @@ let
     matplotlib
     numpy
     networkx
+    scipy
   ];
   py-package = pkgs.python3.withPackages py-packages;
 in
diff --git a/content.tex b/content.tex
index 01148f18..c991186a 100644
--- a/content.tex
+++ b/content.tex
@@ -445,7 +445,7 @@ It also allows us to get rid of the state in our strategy since we don't have to
 %}}} load balancing
 
 %{{{ frequency reduction
-\subsection{Reduction of Request Frequency}
+\subsection{Reduction of Request Frequency}\label{sec:stratRedReqFreq}
 
 The GameOver Zeus botnet limited the number of requests a peer was allowed to perform and blacklisted peers that exceeded the limit as an anti-monitoring mechanism~\cite{bib:andriesse_goz_2013}.
 In an uncoordinated crawler approach, the crawl frequency has to be limited to prevent hitting the request limit.
@@ -584,7 +584,11 @@ Based on this, \emph{SensorRank} is defined as
 Since crawlers never respond to peer list requests, they will always be detectable by the described approach, but sensors might benefit from the following technique.
-By responding to peer list requests with plausible data and thereby producing valid outgoing edges from the sensors, we will try to make those metrics less suspicious.
+The PageRank and SensorRank metrics are calculated from the sum of the ranks of a node's predecessors.
+We will investigate how limiting the number of predecessors helps produce inconspicuous ranks for a sensor.
+
+% By responding to peer list requests with plausible data and thereby producing valid outgoing edges from the sensors, we will try to make those metrics less suspicious.
+To counter the SensorBuster metric, outgoing edges to valid peers from the botnet are required so the sensor does not form its own \ac{wcc}.
 
 The challenge here is deciding which peers can be returned without actually supporting the network.
 The following candidates to place on the neighbor list will be investigated:
@@ -606,10 +610,6 @@ The following candidates to place on the neighbor list will be investigated:
 Returning all the other sensors when responding to peer list requests, thereby effectively creating a complete graph \(K_{\abs{C}}\) among the workers, creates valid outgoing edges.
 The resulting graph will still form a \ac{wcc} with no edges back into the main network.
 
-PageRank is the sum of a node's predecessors ranks divided by the amount of successors each predecessor's successors.
-Predecessors with many successors should therefore reduce the rank.
-By their nature, crawlers have many successors, so they are good candidates to reduce the PageRank of a sensor.
-\todo{crawlers as predecessors}
 
 %{{{ churned peers
 \subsubsection{Churned Peers After IP Rotation}
@@ -630,6 +630,20 @@ Those peers can be used as fake neighbors and create valid-looking outgoing edge
 
 %}}} cg nat
 
+\clearpage{}
+\todo{clearpage?}
+In theory, it would be possible to detect churned peers or peers behind carrier-grade \acs{nat} without coordinating the sensors, but coordination gives us a few advantages:
+
+\begin{itemize}
+
+	\item A peer might blacklist a sensor, which, from the point of view of an uncoordinated sensor, looks exactly like a churned peer.
+	      The coordination backend has more knowledge and can detect this if another sensor is still being contacted by the peer in question.
+
+	\item The coordination backend can include different streams of information to decide which peers to place in the sensor's neighborhood.
+	      Knowledge about geolocations, \ac{as} and their IP rotation behavior can be consulted to make better-informed choices for neighborhood candidates.
+
+\end{itemize}
+
 %}}} against graph metrics
 
 %}}} strategies
@@ -650,7 +664,7 @@ We will compare the variance \(\sigma^2\) and standard deviation \(\sigma\) to
 %{{{ fig:ipPartC02
 \begin{figure}[H]
 \centering
-\includegraphics[width=.7\linewidth]{ip_part_c02.png}
+\includegraphics[width=1\linewidth]{ip_part_c02.png}
 \caption{IP based partitioning for 2 crawlers}\label{fig:ipPartC02}
 \begin{align*}
 	n &= 2 \\
@@ -668,7 +682,7 @@
 %{{{ fig:ipPartC04
 \begin{figure}[H]
 \centering
-\includegraphics[width=.7\linewidth]{ip_part_c04.png}
+\includegraphics[width=1\linewidth]{ip_part_c04.png}
 \caption{IP based partitioning for 4 crawlers}\label{fig:ipPartC04}
 \begin{align*}
 	n &= 4 \\
@@ -687,7 +701,7 @@
 %{{{ fig:ipPartC06
 \begin{figure}[H]
 \centering
-\includegraphics[width=.7\linewidth]{ip_part_c06.png}
+\includegraphics[width=1\linewidth]{ip_part_c06.png}
 \caption{IP based partitioning for 6 crawlers}\label{fig:ipPartC06}
 \begin{align*}
 	n &= 6 \\
@@ -706,7 +720,7 @@
 %{{{ fig:ipPartC10
 \begin{figure}[H]
 \centering
-\includegraphics[width=.7\linewidth]{ip_part_c10.png}
+\includegraphics[width=1\linewidth]{ip_part_c10.png}
 \caption{IP based partitioning for 10 crawlers}\label{fig:ipPartC10}
 \begin{align*}
 	n &= 10 \\
@@ -742,7 +756,7 @@ Therefore, we simulate the partitioning on a bigger sample of \num{1000000} rand
 %{{{ fig:randIpPartC02
 \begin{figure}[H]
 \centering
-\includegraphics[width=.8\linewidth]{rand_ip_part_c02.png}
+\includegraphics[width=1\linewidth]{rand_ip_part_c02.png}
 \caption{IP based partitioning for 2 crawlers on generated dataset}\label{fig:randIpPartC02}
 \begin{align*}
 	n &= 2 \\
@@ -761,7 +775,7 @@
 %{{{ fig:randIpPartC04
 \begin{figure}[H]
 \centering
-\includegraphics[width=.8\linewidth]{rand_ip_part_c04.png}
+\includegraphics[width=1\linewidth]{rand_ip_part_c04.png}
 \caption{IP based partitioning for 4 crawlers on generated dataset}\label{fig:randIpPartC04}
 \begin{align*}
 	n &= 4 \\
@@ -780,7 +794,7 @@
 %{{{ fig:randIpPartC06
 \begin{figure}[H]
 \centering
-\includegraphics[width=.8\linewidth]{rand_ip_part_c06.png}
+\includegraphics[width=1\linewidth]{rand_ip_part_c06.png}
 \caption{IP based partitioning for 6 crawlers on generated dataset}\label{fig:randIpPartC06}
 \begin{align*}
 	n &= 6 \\
@@ -831,10 +845,74 @@ Therefore, we simulate the partitioning on a bigger sample of \num{1000000} rand
 
 As expected, the work is still not perfectly distributed among the crawlers but evenly enough for our use case.
 The deviation for larger botnets is within \SI{0.5}{\percent} of the even distribution.
-This is good enough for balancing the work among workers.
+This is good enough for balancing the tasks among workers.
 
 %}}} eval load balancing
 
+%{{{ eval redu requ freq
+\subsection{Reduction of Request Frequency}
+
+To evaluate the request frequency optimization described in \Fref{sec:stratRedReqFreq}, we crawl a simulated peer and check whether the requests are evenly distributed and how large the deviation from the theoretically optimal result is.
+To get more realistic results, the crawlers and the simulated peer run on different machines, so they are not within the same LAN.
+We use the same parameters as in the example above:
+
+\begin{align*}
+	n &= 4 \\
+	l &= \SI{6}{\request\per\minute} \\
+	f &= \SI{24}{\request\per\minute} \\
+	o &= \SI{2.5}{\second}
+\end{align*}
+
+To recap, this is what the optimal timeline would look like:
+
+\begin{center}
+\begin{chronology}[10]{0}{60}{0.9\textwidth}
+	\event{0}{\(C_0\)}
+	\event{10}{\(C_0\)}
+	\event{20}{\(C_0\)}
+	\event{30}{\(C_0\)}
+	\event{40}{\(C_0\)}
+	\event{50}{\(C_0\)}
+	\event{60}{\(C_0\)}
+
+	\event{2.5}{\(C_1\)}
+	\event{12.5}{\(C_1\)}
+	\event{22.5}{\(C_1\)}
+	\event{32.5}{\(C_1\)}
+	\event{42.5}{\(C_1\)}
+	\event{52.5}{\(C_1\)}
+
+	\event{5}{\(C_2\)}
+	\event{15}{\(C_2\)}
+	\event{25}{\(C_2\)}
+	\event{35}{\(C_2\)}
+	\event{45}{\(C_2\)}
+	\event{55}{\(C_2\)}
+
+	\event{7.5}{\(C_3\)}
+	\event{17.5}{\(C_3\)}
+	\event{27.5}{\(C_3\)}
+	\event{37.5}{\(C_3\)}
+	\event{47.5}{\(C_3\)}
+	\event{57.5}{\(C_3\)}
+\end{chronology}
+\end{center}
+
+
+The ideal distribution would be \SI{2.5}{\second} between any two consecutive events.
+Due to network latency and load from crawling other peers, we expect the actual result to deviate from the optimal value over time.
+With this experiment, we try to estimate the impact of this latency.
+If it is present and measurable, the crawlers have to be rescheduled periodically to keep the deviation at an acceptable level.
+
+\begin{figure}[H]
+	\centering
+	\includegraphics[width=1\linewidth]{time_deriv.png}
+	\caption{Deviation from the expected interval}\label{fig:timeDeriv}
+\end{figure}
+
+
+%}}} eval redu requ freq
+
 %{{{ eval creating edges
 \subsection{Impact of Additional Edges on Graph Metrics}
 
@@ -908,8 +986,6 @@ SensorBuster relies on the assumption that sensors don't have any outgoing edges
 For the \ac{wcc} metric, it is obvious that even a single edge back into the main network is enough to connect the sensor back to the main graph and therefore beat this metric.
 
-\todo{formulieren}
-
 \subsubsection{Effectiveness against Page- and SensorRank}
 
 In this section we will evaluate how adding outgoing edges to a sensor impacts its PageRank and SensorRank values.
@@ -1014,59 +1090,158 @@ Looking at the data in smaller buckets of one hour each, the average number of s
 \end{figure}
 %}}}fig:avg_out_edges
 
+Experiments were performed in which the incoming edges of the known sensor are reduced by increasing factors, to see when the sensor's rank reaches the overall average.
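+Intuitively, an approximately linear relationship between the removed incoming edges and the sensor's rank is to be expected: with the damping factor of \(1.0\) used in our calculations, the PageRank of a sensor \(s\) is the sum of its predecessors' contributions
+
+\begin{align*}
+	\text{PR}(s) &= \sum_{p \in \text{pred}(s)} \frac{\text{PR}(p)}{\abs{\text{succ}(p)}}
+\end{align*}
+
+so randomly removing a fraction of the incoming edges removes, on average, the same fraction of the summands and thereby of the rank.
+For SensorRank the effect is stronger, since the factor \(\frac{\abs{\text{pred}(s)}}{\abs{V}}\) in its definition shrinks with the removed edges as well, which is consistent with SensorRank reaching the average earlier than PageRank.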
+
+% \begin{figure}[H]
+% \centering
+% \includegraphics[width=1\textwidth]{reduced_ranks/0/in_out.png}
+% % \caption{PageRank after adding \(0.75 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr75}
+% \end{figure}%
+
+\begin{figure}[H]
+	\centering
+\begin{subfigure}[b]{1\textwidth}
+	\centering
+	\includegraphics[width=.8\linewidth]{reduced_ranks/0/pr.png}
+	\caption{PageRank after removing \SI{0}{\percent} of edges}\label{fig:pr0}
+\end{subfigure}%
+\hfill
+\begin{subfigure}[b]{1\textwidth}
+	\centering
+	\includegraphics[width=.8\linewidth]{reduced_ranks/0/sr.png}
+	\caption{SensorRank after removing \SI{0}{\percent} of edges}\label{fig:sr0}
+\end{subfigure}%
+	% \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
+\end{figure}
+
+\Fref{fig:pr0} and \Fref{fig:sr0} show the situation for the unmodified ground truth.
+
+\begin{figure}[H]
+	\centering
+	\includegraphics[width=.8\textwidth]{reduced_ranks/1/in_out.png}
+	\caption{Incoming edges after removing \SI{10}{\percent} of edges}\label{fig:in1}
+	% \caption{PageRank after adding \(0.75 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr75}
+\end{figure}%
+
+\begin{figure}[H]
+	\centering
+\begin{subfigure}[b]{1\textwidth}
+	\centering
+	\includegraphics[width=.8\linewidth]{reduced_ranks/1/pr.png}
+	\caption{PageRank after removing \SI{10}{\percent} of edges}\label{fig:pr1}
+\end{subfigure}%
+\hfill
+\begin{subfigure}[b]{1\textwidth}
+	\centering
+	\includegraphics[width=.8\linewidth]{reduced_ranks/1/sr.png}
+	\caption{SensorRank after removing \SI{10}{\percent} of edges}\label{fig:sr1}
+\end{subfigure}%
+	% \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
+\end{figure}
+
+\begin{figure}[H]
+	\centering
+	\includegraphics[width=.8\textwidth]{reduced_ranks/2/in_out.png}
+	\caption{Incoming edges after removing \SI{20}{\percent} of edges}\label{fig:in2}
+	% \caption{PageRank after adding \(0.75 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr75}
+\end{figure}%
+
+\begin{figure}[H]
+	\centering
+\begin{subfigure}[b]{1\textwidth}
+	\centering
+	\includegraphics[width=.8\linewidth]{reduced_ranks/2/pr.png}
+	\caption{PageRank after removing \SI{20}{\percent} of edges}\label{fig:pr2}
+\end{subfigure}%
+\hfill
+\begin{subfigure}[b]{1\textwidth}
+	\centering
+	\includegraphics[width=.8\linewidth]{reduced_ranks/2/sr.png}
+	\caption{SensorRank after removing \SI{20}{\percent} of edges}\label{fig:sr2}
+\end{subfigure}%
+	% \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
+\end{figure}
+
+\begin{figure}[H]
+	\centering
+	\includegraphics[width=.8\textwidth]{reduced_ranks/3/in_out.png}
+	\caption{Incoming edges after removing \SI{30}{\percent} of edges}\label{fig:in3}
+	% \caption{PageRank after adding \(0.75 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr75}
+\end{figure}%
+
+\begin{figure}[H]
+	\centering
+\begin{subfigure}[b]{1\textwidth}
+	\centering
+	\includegraphics[width=.8\linewidth]{reduced_ranks/3/pr.png}
+	\caption{PageRank after removing \SI{30}{\percent} of edges}\label{fig:pr3}
+\end{subfigure}%
+\hfill
+\begin{subfigure}[b]{1\textwidth}
+	\centering
+	\includegraphics[width=.8\linewidth]{reduced_ranks/3/sr.png}
+	\caption{SensorRank after removing \SI{30}{\percent} of edges}\label{fig:sr3}
+\end{subfigure}%
+	% \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
+\end{figure}
+
+We can see in \Fref{fig:sr2} and \Fref{fig:pr3} that we have to reduce the incoming edges by \SI{20}{\percent} and \SI{30}{\percent} respectively to obtain average values for SensorRank and PageRank.
+This also means that the number of incoming edges for a sensor must be about the same as the average number of incoming edges, as can be seen in \Fref{fig:in3}.
+Depending on the protocol details of the botnet (\eg{} how many incoming edges are allowed per peer), this means that a large number of sensors is needed if we want to monitor the whole network.
+
 % Experiments were performed, in which a percentage of random outgoing edges were added to the known sensor, based on the amount of incoming edges:
-We evaluate the impact of outgoing edges by picking a percentage of random nodes in each bucket and creating edges from the sensor to each of the sampled peers, thereby evening the ratio between \(\deg^{+}\) and \(\deg^{-}\).
+% We evaluate the impact of outgoing edges by picking a percentage of random nodes in each bucket and creating edges from the sensor to each of the sampled peers, thereby evening the ratio between \(\deg^{+}\) and \(\deg^{-}\).
 
-\begin{figure}[H]
-	\centering
-\begin{subfigure}[b]{.75\textwidth}
-	\centering
-	\includegraphics[width=1\linewidth]{ranks/pr_75.png}
-	\caption{PageRank after adding \(0.75 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr75}
-\end{subfigure}%
-\hfill
-\begin{subfigure}[b]{.75\textwidth}
-	\centering
-	\includegraphics[width=1\linewidth]{ranks/sr_75.png}
-	\caption{SensorRank after adding \(0.75 \times \abs{\text{pred}(v)}\) edges}\label{fig:sr75}
-\end{subfigure}%
-	% \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
-\end{figure}
+% \begin{figure}[H]
+% 	\centering
+% \begin{subfigure}[b]{.75\textwidth}
+% 	\centering
% 	\includegraphics[width=1\linewidth]{ranks/pr_75.png}
+% 	\caption{PageRank after adding \(0.75 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr75}
+% \end{subfigure}%
+% \hfill
+% \begin{subfigure}[b]{.75\textwidth}
+% 	\centering
+% 	\includegraphics[width=1\linewidth]{ranks/sr_75.png}
+% 	\caption{SensorRank after adding \(0.75 \times \abs{\text{pred}(v)}\) edges}\label{fig:sr75}
+% \end{subfigure}%
+% % \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
+% \end{figure}
 
-\begin{figure}[H]
-	\centering
-\begin{subfigure}[b]{.75\textwidth}
-	\centering
-	\includegraphics[width=1\linewidth]{ranks/pr_100.png}
-	\caption{PageRank after adding \(1.0 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr100}
-\end{subfigure}%
-\hfill
-\begin{subfigure}[b]{.75\textwidth}
-	\centering
-	\includegraphics[width=1\linewidth]{ranks/sr_100.png}
-	\caption{SensorRank after adding \(1.0 \times \abs{\text{pred}(v)}\) edges}\label{fig:sr100}
-\end{subfigure}%
-	% \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
-\end{figure}
+% \begin{figure}[H]
+% 	\centering
+% \begin{subfigure}[b]{.75\textwidth}
+% 	\centering
+% 	\includegraphics[width=1\linewidth]{ranks/pr_100.png}
+% 	\caption{PageRank after adding \(1.0 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr100}
+% \end{subfigure}%
+% \hfill
+% \begin{subfigure}[b]{.75\textwidth}
+% 	\centering
+% 	\includegraphics[width=1\linewidth]{ranks/sr_100.png}
+% 	\caption{SensorRank after adding \(1.0 \times \abs{\text{pred}(v)}\) edges}\label{fig:sr100}
+% \end{subfigure}%
+% % \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
+% \end{figure}
 
-\begin{figure}[H]
-	\centering
-\begin{subfigure}[b]{.75\textwidth}
-	\centering
-	\includegraphics[width=1\linewidth]{ranks/pr_150.png}
-	\caption{PageRank after adding \(1.5 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr150}
-\end{subfigure}%
-\hfill
-\begin{subfigure}[b]{.75\textwidth}
-	\centering
-	\includegraphics[width=1\linewidth]{ranks/sr_150.png}
-	\caption{SensorRank after adding \(1.5 \times \abs{\text{pred}(v)}\) edges}\label{fig:sr150}
-\end{subfigure}%
-	% \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
-\end{figure}
+% \begin{figure}[H]
+% 	\centering
+% \begin{subfigure}[b]{.75\textwidth}
+% 	\centering
+% 	\includegraphics[width=1\linewidth]{ranks/pr_150.png}
+% 	\caption{PageRank after adding \(1.5 \times \abs{\text{pred}(v)}\) edges}\label{fig:pr150}
+% \end{subfigure}%
+% \hfill
+% \begin{subfigure}[b]{.75\textwidth}
+% 	\centering
+% 	\includegraphics[width=1\linewidth]{ranks/sr_150.png}
+% 	\caption{SensorRank after adding \(1.5 \times \abs{\text{pred}(v)}\) edges}\label{fig:sr150}
+% \end{subfigure}%
+% % \caption{SensorRank distribution with initial rank \(\forall v \in V : \text{PR}(v) = 0.75\)}\label{fig:dist_sr_75}
+% \end{figure}
 
-These results show, that simply adding new edges is not enough and we need to limit the incoming edges to improve the Page- and SensorRank metrics.
+% These results show, that simply adding new edges is not enough and we need to limit the incoming edges to improve the Page- and SensorRank metrics.
 
 %}}} eval creating edges
 
@@ -1138,6 +1313,11 @@ Collaborative monitoring of \ac{p2p} botnets allows circumventing some anti-moni
 It also enables more effective monitoring systems for larger botnets, since each peer can be visited by only one crawler.
 The current concept of independent crawlers in \ac{bms} can also use multiple workers, but there is no way to ensure a peer is not watched by multiple crawlers, thereby using unnecessary resources.
 
+We were able to show that a collaborative monitoring approach for \ac{p2p} botnets helps circumvent anti-monitoring and monitoring detection mechanisms and improves resource usage when monitoring large botnets.
+On the other hand, graph ranking algorithms have proven hard to bypass without requiring a large number of sensor nodes.
+
+Luckily, most of the anti-monitoring and monitoring detection techniques discussed in this work are of an academic nature and have not yet been deployed in real-world botnets.
+Further investigation and improvements in \ac{p2p} botnet monitoring are required to prevent a situation where a botmaster implements the currently theoretical concepts and renders monitoring, as it is currently done, ineffective.
 
 %}}} conclusion
diff --git a/report.pdf b/report.pdf
index 8f933e9e..bc218c0d 100644
Binary files a/report.pdf and b/report.pdf differ