2022-04-21 23:05:11 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2022-04-24 23:23:22 +02:00
|
|
|
import numpy as np
|
2022-04-22 18:09:22 +02:00
|
|
|
import statistics
|
2022-04-21 23:05:11 +02:00
|
|
|
from collections import defaultdict
|
|
|
|
from typing import Dict
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
import time
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
def load_log(path: str) -> Dict[datetime, str]:
    """Read a crawl log and map each event time to the crawler that logged it.

    Every non-blank line must look like ``<unix_nanos> , <crawler> , <rest>``
    with the literal separator ``' , '``.  Only the first two fields are used;
    everything after the second separator is ignored.

    Args:
        path: Location of the log file.

    Returns:
        A dict from naive :class:`datetime` objects expressed in UTC to the
        crawler name.  Lines whose timestamps map to the same datetime
        overwrite each other; the last one read wins.

    Raises:
        ValueError: If a non-blank line has fewer than three fields or its
            first field is not an integer.
        OSError: If the file cannot be opened.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list being extended.
    from datetime import timezone

    time_crawler = {}
    with open(path, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # Tolerate empty lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            try:
                # maxsplit=2 keeps any further ' , ' inside the ignored tail
                # instead of breaking the three-way unpacking.
                unix_nanos, crawler, _ = line.split(' , ', 2)
                seconds = int(unix_nanos) / 1_000_000_000
            except ValueError as err:
                raise ValueError(
                    f'{path}:{line_no}: malformed log line {line!r}'
                ) from err
            # Same value datetime.utcfromtimestamp() produced (naive, UTC),
            # without relying on that deprecated constructor.
            when = datetime.fromtimestamp(seconds, tz=timezone.utc)
            time_crawler[when.replace(tzinfo=None)] = crawler

    return time_crawler
|
|
|
|
|
|
|
|
|
|
|
|
|
2022-04-22 18:09:22 +02:00
|
|
|
def plot_devi(data: Dict[datetime, str]):
    """Plot how far crawl events deviate from their expected spacing.

    The events are sorted by time and three kinds of output are produced:

    * ``./time_devi.png`` - deviation of the gap between every pair of
      consecutive events from the expected 2.5 seconds.
    * ``./xxx.png`` - the same deviations, grouped by the crawler that
      logged the earlier event of each pair.
    * ``./time_devi_<crawler>.png`` - one figure per crawler showing the
      deviation of the gap between that crawler's own consecutive events
      from the expected 10 seconds, with a linear trend line.

    For every crawler that has at least one such gap, a LaTeX table row with
    the mean deviation is printed to stdout.

    Args:
        data: Mapping from event time to crawler name, as returned by
            ``load_log``.  Crawler names are expected to be a single
            character followed by an integer (e.g. ``c0``) because the
            legend labels are derived from that number.

    Raises:
        ValueError: If a crawler name does not end in an integer after its
            first character.
    """
    expected_gap = 2.5    # seconds expected between two consecutive events
    expected_cycle = 10   # seconds expected between two events of one crawler

    events = sorted(data.items(), key=lambda kv: kv[0])

    diffs = []
    per_crawler = defaultdict(list)
    per_diff = defaultdict(list)
    for (prev_time, prev_crawler), (next_time, _) in zip(events, events[1:]):
        # Subtract the datetimes directly.  Calling .timestamp() on naive
        # datetimes interprets them as local time, which skews the gap by
        # about an hour whenever a pair straddles a local DST change.
        diff = (next_time - prev_time).total_seconds() - expected_gap
        diffs.append(diff)
        # Only the earlier event of each pair is recorded, so the very last
        # event of the log never appears in per_crawler.
        per_crawler[prev_crawler].append(prev_time)
        per_diff[prev_crawler].append(diff)

    # Figure 1: deviation of every consecutive gap.
    x = [expected_gap * i for i in range(len(diffs))]
    fig, ax = plt.subplots()
    ax.set_title('Deviation between crawl events')
    ax.set_xlabel('Time passed in seconds')
    ax.set_ylabel('Deviation in seconds')
    ax.scatter(x, diffs, label='Deviation from the expected value', s=10)
    fig.legend()
    plt.savefig('./time_devi.png')
    plt.close()

    # Figure 2: the same deviations, one series per originating crawler.
    fig, ax = plt.subplots()
    ax.set_title('Deviation between crawl events')
    ax.set_xlabel('Time passed in seconds')
    ax.set_ylabel('Deviation in seconds')
    for c, vals in per_diff.items():
        x = [expected_cycle * i for i in range(len(vals))]
        # NOTE(review): the label assumes four crawlers named c0..c3 that
        # fire in round-robin order - confirm against the crawler setup.
        n = int(c[1:])
        ax.scatter(x, vals, label=f'Deviation between c{n} and c{(n+1)%4}', s=10)
    fig.legend()
    plt.savefig('./xxx.png')
    plt.close()

    # Figures 3..n: per-crawler deviation from the expected cycle time.
    for c, stamps in per_crawler.items():
        devi = np.array([
            (later - earlier).total_seconds() - expected_cycle
            for earlier, later in zip(stamps, stamps[1:])
        ])
        if devi.size == 0:
            # A crawler recorded only once has no gap to plot or average;
            # np.polyfit and statistics.mean would both raise on no data.
            continue
        x = np.array([expected_cycle * i for i in range(len(devi))])

        fig, ax = plt.subplots()
        ax.scatter(x, devi, s=10)
        if devi.size >= 2:
            # A straight-line fit needs at least two points.
            m, b = np.polyfit(x, devi, 1)
            ax.plot(x, m * x + b, color='red')
        ax.set_title(f'Timedeviation for {c}')
        ax.set_xlabel('Time passed in seconds')
        ax.set_ylabel('Deviation in seconds')
        plt.savefig(f'./time_devi_{c}.png')
        plt.close()

        print(f'{c} & \\num{{{statistics.mean(devi)}}} \\\\')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2022-04-22 18:09:22 +02:00
|
|
|
|
2022-04-21 23:05:11 +02:00
|
|
|
def main(path: str = './dummy.log') -> None:
    """Load a crawl log and generate every deviation plot for it.

    Args:
        path: Log file to analyse.  Defaults to ``./dummy.log``, the file
            this script has always read, so ``main()`` behaves as before.
    """
    data = load_log(path)
    plot_devi(data)
|
2022-04-21 23:05:11 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|