This commit is contained in:
Valentin Brandl
2022-04-21 23:05:11 +02:00
parent 9e2d55815e
commit b5b78140aa
28 changed files with 401 additions and 73 deletions

View File

@ -0,0 +1 @@
# .envrc: let direnv load the lorri-managed nix-shell environment for this project.
eval "$(lorri direnv)"

View File

@ -0,0 +1,70 @@
#!/usr/bin/env python3
from collections import defaultdict
from typing import Dict
import matplotlib.pyplot as plt
import time
from datetime import datetime
def load_log(path: str) -> Dict[datetime, str]:
    """Parse a crawl log into a mapping of event time -> crawler name.

    Each line is expected to look like ``<unix_nanos> , <crawler> , <rest>``.
    Blank lines (e.g. a trailing newline) are skipped instead of crashing
    the ``line.split`` unpacking, which the original version did.

    Returns a dict keyed by naive UTC datetimes; later duplicate timestamps
    overwrite earlier ones.
    """
    time_crawler: Dict[datetime, str] = {}
    with open(path, 'r') as f:
        for line in f:
            # Skip empty/whitespace-only lines; unpacking would raise otherwise.
            if not line.strip():
                continue
            unix_nanos, crawler, _ = line.split(' , ')
            # Timestamps are recorded in nanoseconds since the epoch.
            when = datetime.utcfromtimestamp(int(unix_nanos) / 1000000000)
            time_crawler[when] = crawler
    return time_crawler
def plot_deriv(data: Dict[datetime, str]):
    """Plot the deviation of crawl-event spacing from its expected interval.

    Writes ``./time_deriv.png`` for the combined event stream (expected
    spacing 2.5 s) and one ``./time_deriv_<crawler>.png`` per crawler
    (expected spacing 10 s).

    Fixes over the original: no longer shadows the builtin ``next`` or the
    outer ``x`` inside the comprehension, drops the redundant ``list()``
    around ``sorted()``, iterates ``.items()`` instead of ``.keys()`` plus
    lookup, and removes dead commented-out code.
    """
    events = sorted(data.items(), key=lambda kv: kv[0])
    diffs = []
    per_crawler = defaultdict(list)
    for prev, nxt in zip(events, events[1:]):
        # Absolute deviation from the expected 2.5 s global interval.
        diffs.append(abs(2.5 - (nxt[0].timestamp() - prev[0].timestamp())))
        # NOTE(review): only `prev` is recorded, so the very last event is
        # never attributed to its crawler -- behavior kept from the original.
        per_crawler[prev[1]].append(prev[0])
    # x axis in seconds, assuming one event every 2.5 s.
    x = [2.5 * i for i in range(len(diffs))]
    fig, ax = plt.subplots()
    ax.set_title('Timedelta between crawl events in seconds')
    ax.set_xlabel('Time passed in seconds')
    ax.set_ylabel('Deviation in seconds')
    ax.plot(x, diffs, label='Deviation from the expected value')
    fig.legend()
    plt.savefig('./time_deriv.png')
    plt.close()
    for crawler, times in per_crawler.items():
        # Per-crawler events are expected every 10 s.
        devi = [abs(10 - (nex.timestamp() - pre.timestamp()))
                for pre, nex in zip(times, times[1:])]
        x = [10 * i for i in range(len(devi))]
        fig, ax = plt.subplots()
        ax.plot(x, devi)
        ax.set_title(f'Timedeviation for {crawler}')
        ax.set_xlabel('Time passed in seconds')
        ax.set_ylabel('Deviation in seconds')
        plt.savefig(f'./time_deriv_{crawler}.png')
        plt.close()
# for ts in per_crawler[c]:
def main():
    """Entry point: load the dummy crawl log and emit all deviation plots."""
    plot_deriv(load_log('./dummy.log'))


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,16 @@
# Nix development shell: Python 3 plus the plotting/graph libraries the
# analysis scripts in this directory import.
{ pkgs ? import <nixpkgs> {} }:
let
# Package-set function selecting the required Python libraries.
py-packages = python-packages: with python-packages; [
matplotlib
numpy
networkx
scipy
];
# Python 3 interpreter with the packages above on its import path.
py-package = pkgs.python3.withPackages py-packages;
in
pkgs.mkShell {
buildInputs = [
py-package
];
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

View File

@ -23,7 +23,7 @@ def main():
avg_in.append(v['avg_in'])
avg_out.append(v['avg_out'])
known_in.append(v['known_in'])
known_out.append(v['known_out'])
# known_out.append(v['known_out'])
number_of_nodes.append(v['number_of_nodes'])
@ -31,11 +31,12 @@ def main():
ax.plot(times, avg_in, label='Avg. In')
# ax.plot(times, avg_out, label='Avg. Out')
ax.plot(times, known_in, label='Known In')
ax.plot(times, known_out, label='Known Out')
ax.plot(times, number_of_nodes, label='Number of nodes')
# ax.plot(times, known_out, label='Known Out')
ax.plot(times, number_of_nodes, label='Total number of nodes')
ax.set_title(f'Average edge count per hour')
fig.autofmt_xdate()
fig.legend()
ax.set_ylim(ymin=0)
plt.savefig(f'./tmp_plot.png')
# print('created sr plot')
plt.show()

View File

@ -124,7 +124,7 @@ def plot2(percentage, algo, data):
fig, ax = plt.subplots()
a = 'SensorRank' if algo == 'sr' else 'RageRank'
ax.set_ylabel(f'{a}')
ax.plot(times, mean, label='Avg. Rank')
# ax.plot(times, mean, label='Avg. Rank')
# ax.errorbar(times, mean, stdev, label='Avg. Rank')
ax.plot(times, mean, label='Avg. Rank')
# ax.plot(times, known_in, label='Known In') # TODO
@ -156,13 +156,14 @@ def plot_in_out2(percentage, data):
known_out.append(d[algo]['known_out'])
fig, ax = plt.subplots()
a = 'SensorRank' if algo == 'sr' else 'RageRank'
ax.set_ylabel(f'{a}')
# a = 'SensorRank' if algo == 'sr' else 'RageRank'
ax.set_ylabel('Incoming edges')
ax.plot(times, avg_in, label='Avg. In')
# ax.plot(times, known_in, label='Known In') # TODO
ax.plot(times, known_in, label='Known In')
ax.plot(times, known_out, label='Known out')
title = f'In And Out after removing {percentage * 100}% edges'
# ax.plot(times, known_out, label='Known out')
ax.set_ylim(ymin=0)
title = f'In degree after removing {percentage * 100}% edges'
ax.set_title(title)
fig.autofmt_xdate()

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
import networkx as nx
import statistics
import multiprocessing
from random import sample
@ -34,6 +35,19 @@ def sensor_rank(graph):
lambda g, node: sr(g, node, number_of_nodes)
)
def pr_nx(graph):
    """Compute PageRank for *graph* via networkx with damping disabled (alpha=1.0)."""
    return nx.algorithms.link_analysis.pagerank(graph, alpha=1.0)
def sr_nx(graph, pr_nx):
    """Derive SensorRank from PageRank values.

    For each node with at least one successor:
    ``rank / out_degree * (in_degree / |V|)``. Nodes without successors are
    omitted from the returned dict.
    """
    ranks = {}
    total_nodes = len(list(graph.nodes()))
    for node, rank in pr_nx.items():
        out_degree = len(list(graph.successors(node)))
        if out_degree == 0:
            # No outgoing edges: SensorRank is undefined here; skip the node.
            continue
        in_degree = len(list(graph.predecessors(node)))
        ranks[node] = (rank / out_degree) * (in_degree / total_nodes)
    return ranks
def find_known(g, known):
nodes = list(filter(lambda n: n.node == known.node, g.nodes()))
n = len(nodes)
@ -133,6 +147,49 @@ def rank(path):
res = {'sr': res_sr, 'pr': res_pr}
return res
def analyze2(g, data):
    """Summarize rank statistics for graph *g* given per-node ranks *data*.

    *data* maps node objects (with a ``.node`` attribute) to a numeric rank.
    Returns a dict with the known node's rank (``None`` if absent from
    *data*), its in/out edge counts, the average in-degree, and the
    mean/stdev across all ranks.

    Improvements: ``list(data.values())`` instead of a ``map`` over
    ``items()``, ``next()`` instead of a manual search loop, and dead
    commented-out code removed.
    """
    known = find_known(g, rank_with_churn.KNOWN)
    avg_in = rank_with_churn.avg_in(g)
    kn_in = known_in(g, known)
    kn_out = known_out(g, known)
    ranks = list(data.values())
    mean = statistics.mean(ranks)
    stddev = statistics.stdev(ranks)
    # Rank of the known node, or None if it does not appear in *data*.
    r_known = next((v for k, v in data.items() if k.node == known.node), None)
    return {
        'known_rank': r_known,
        'known_in': kn_in,
        'known_out': kn_out,
        'avg_in': avg_in,
        'mean': mean,
        'stdev': stddev,
    }
def rank2(path):
    """Load edges from *path*, build the graph, and return PageRank and
    SensorRank summaries keyed by ``'pr'`` and ``'sr'``."""
    graph = build_graph(
        reduce_edges.load_data(path),
        initial_rank=rank_with_churn.INITIAL_RANK,
    )
    print('pr start')
    page_ranks = pr_nx(graph)
    print('sr start')
    sensor_ranks = sr_nx(graph, page_ranks)
    print('analyze pr start')
    res_pr = analyze2(graph, page_ranks)
    print('analyze sr start')
    res_sr = analyze2(graph, sensor_ranks)
    print('done!')
    return {'sr': res_sr, 'pr': res_pr}
def main():
# pool = multiprocessing.Pool(processes=4)
params = []
@ -150,6 +207,7 @@ def main():
with multiprocessing.Pool(processes=8) as pool:
l_path_data = pool.map(wohoo, params)
for path_data in l_path_data:
print(f'{path_data=}')
for path, data in path_data.items():
with reduce_edges.open_mkdir(path, 'w') as f:
json.dump(data, f)
@ -171,8 +229,8 @@ def wohoo(p):
# with open() as f:
# json.dump(result, f)
when = datetime.fromtimestamp(float(file.split('/')[-1][:-4]))
path = f'./data_reduced/{reduced_percentage:.02f}/{when.timestamp()}.json'
result = rank(file)
path = f'./data_reduced2/{reduced_percentage:.02f}/{when.timestamp()}.json'
result = rank2(file)
return {path: result}

View File

@ -4,6 +4,7 @@ let
matplotlib
numpy
networkx
scipy
];
py-package = pkgs.python3.withPackages py-packages;
in