masterthesis/codes/node-ranking/rank_with_churn.py

#!/usr/bin/env python3
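"""Simulate churn around a known node in a graph snapshot.

For each edge snapshot in ./edges/, a fraction of the known node's incoming
edges is removed, new outgoing edges are added in proportion to the remaining
in-degree, and PageRank / SensorRank statistics are written to
percent/<initial_rank>/<percent_edges>/<timestamp>.json.
"""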
import glob
import json
import asyncio
import os
import numpy as np
from random import sample
from datetime import datetime
from functools import partial
from node_ranking import (
    page_rank,
    sensor_rank,
    find_rank,
    parse_csv,
    csv_loader,
    build_graph,
    Node,
    RankedNode,
)

def background(f):
    def wrapped(*args, **kwargs):
        # run_in_executor() does not forward keyword arguments, so bind them first
        return asyncio.get_event_loop().run_in_executor(None, partial(f, *args, **kwargs))
    return wrapped

def load_data(path):
    with open(path, 'r') as f:
        return [parse_csv(line) for line in f]
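
# Experiment parameters. KNOWN is the node whose rank is tracked; REMOVE_EDGES
# is the fraction of its incoming edges dropped per run. COUNT_NEW_EDGES and
# PERCENT_EDGES are currently unused (main() sweeps its own ratio list).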
INITIAL_RANK = 0.5
COUNT_NEW_EDGES = 50
KNOWN = RankedNode(Node('34.204.196.211', 9796), INITIAL_RANK)
PR_ITERATIONS = 5
REMOVE_EDGES = 0.5
PERCENT_EDGES = 2.5

def avg_rank(g):
    nodes = list(g.nodes())
    return sum(n.rank for n in nodes) / len(nodes)

def avg_without_known(g):
    nodes = [n for n in g.nodes() if n.node != KNOWN.node]
    return sum(n.rank for n in nodes) / len(nodes)

def avg_in(g):
    nodes = [n for n in g.nodes() if n.node != KNOWN.node]
    return sum(len(list(g.predecessors(n))) for n in nodes) / len(nodes)

def avg_out(g):
    nodes = [n for n in g.nodes() if n.node != KNOWN.node]
    return sum(len(list(g.successors(n))) for n in nodes) / len(nodes)

def known_rank(g):
    for n in g.nodes():
        if n.node == KNOWN.node:
            return n.rank
    return None

# @background
def do_work(path, initial_rank, percent_edges, remove_edges, known):
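    """Churn edges around ``known`` in one snapshot and write rank stats to JSON."""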
    print(f'starting {path} {initial_rank} {percent_edges} {known}')
    # the snapshot timestamp is encoded in the file name ('.txt' stripped)
    when = datetime.fromtimestamp(float(path.split('/')[-1][:-4]))
    edges = load_data(path)
    g = build_graph(edges, initial_rank=initial_rank)
    # simulate churn: drop a random fraction of the known node's incoming edges
    edges = list(filter(lambda e: e[1] == known, g.edges()))
    for_removal = sample(edges, int(len(edges) * remove_edges))
    print(f'removing {len(for_removal)} incoming edges')
    for edge in for_removal:
        g.remove_edge(edge[0], edge[1])
    n_known_in = len(list(filter(lambda e: e[1] == known, g.edges())))
    a_in = avg_in(g)
    a_out = avg_out(g)
    # add outgoing edges in proportion to the remaining in-degree
    count_new_edges = int(n_known_in * percent_edges)
    print(f'adding {count_new_edges} outgoing edges')
    candidates = sample(list(g.nodes()), count_new_edges)
    for node in candidates:
        g.add_edge(known, node)
    n_known_out = len(list(filter(lambda e: e[0] == known, g.edges())))
    # run PageRank for a fixed number of iterations, then compute SensorRank
    for _ in range(PR_ITERATIONS):
        g = page_rank(g)
    pr_avg = avg_rank(g)
    pr_avg_filtered = avg_without_known(g)
    pr_known = known_rank(g)
    pr_perc_50 = np.percentile(np.array([n.rank for n in g]), 50)
    sr_g = sensor_rank(g)
    sr_avg = avg_rank(sr_g)
    sr_avg_filtered = avg_without_known(sr_g)
    sr_known = known_rank(sr_g)
    sr_perc_50 = np.percentile(np.array([n.rank for n in sr_g]), 50)
    data = {
        'sr_avg': sr_avg,
        'sr_avg_filtered': sr_avg_filtered,
        'sr_known': sr_known,
        'sr_perc_50': sr_perc_50,
        'pr_avg': pr_avg,
        'pr_avg_filtered': pr_avg_filtered,
        'pr_known': pr_known,
        'pr_perc_50': pr_perc_50,
        'known_in': n_known_in,
        'known_out': n_known_out,
        'avg_in': a_in,
        'avg_out': a_out,
    }
    base_path = f'percent/{initial_rank}/{percent_edges}'
    os.makedirs(base_path, exist_ok=True)
    file = f'{base_path}/{when.timestamp()}.json'
    with open(file, 'w') as f:
        json.dump(data, f)
    print(f'finished {path} {initial_rank} {percent_edges} {known}')

def main():
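    # sweep a range of outgoing-edge ratios over every snapshot in ./edges/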
    for file in glob.glob('./edges/*.txt'):
        for perc in [0.5, 0.75, 0.8, 1.0, 1.5, 2.0, 2.5]:
            do_work(file, INITIAL_RANK, perc, REMOVE_EDGES, KNOWN)

if __name__ == '__main__':
    main()