-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
221 lines (172 loc) · 7.31 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import pandas as pd
import numpy as np
import argparse
# Command-line options: data folder, log file and (optionally) the ids of the
# modularity classes to keep.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-p', '--path',
    type=str,
    default='',
    help='folder path',
)
parser.add_argument(
    '-l', '--log_file',
    type=str,
    default='',
    help='log file path',
)
# Zero or more integer ids; an empty list is the default.
parser.add_argument(
    '-c', '--clusters',
    type=int,
    nargs='*',
    default=[],
    help='most representative clusters ids',
)
def nodes_dict(edges, groups):
    '''
    Function to create dictionary with nodes attributes

    Arguments:
        edges{DataFrame} -- dataset with the edges in the graph
            (columns 'source' and 'target')
        groups{DataFrame} -- dataset with the modularity class of each node
            in the graph (columns 'name' and 'group')

    Returns:
        [dict] -- nodes attributes, keyed by node id. Each value contains:
            'gp'    -- the rows of groups['group'] whose 'name' is the node
                       (kept as a Series, as selected from `groups`)
            'type'  -- always None (placeholder, not filled in here)
            'edges' -- {'source': {'id': [...], 'gp': [...]},
                        'target': {'id': [...], 'gp': [...]}}
                       'source' describes the nodes that point to this node,
                       'target' the nodes this node points to.
                       The 'gp' lists are the modularity classes of the rows
                       of `groups` whose name is among the matching ids; they
                       follow the row order of `groups`, not of the 'id' list.
    '''
    nodes = np.unique([edges['source'], edges['target']])
    d = {}
    for node in nodes:
        # source nodes --> nodes with node as target
        source = list(edges[edges['target'] == node]['source'])
        gp_source = list(groups[groups['name'].isin(source)]['group'])
        # target nodes --> nodes with node as source
        target = list(edges[edges['source'] == node]['target'])
        gp_target = list(groups[groups['name'].isin(target)]['group'])
        d[node] = {
            # node's modularity
            'gp': groups[groups['name'] == node]['group'],
            'type': None,
            'edges': {
                'source': {'id': source, 'gp': gp_source},
                # The classes of the target neighbours belong under 'target';
                # they were previously appended to the 'source' list by mistake.
                'target': {'id': target, 'gp': gp_target},
            },
        }
    return d
def filter_graph(edges, groups, cluster_ids):
    '''
    Function to filter the graph, keeping only the most relevant modularity classes

    Arguments:
        edges{DataFrame} -- dataset with the edges in the graph
            (columns 'source' and 'target')
        groups{DataFrame} -- dataset with the modularity class of each node
            in the graph (columns 'name' and 'group')
        cluster_ids{list} -- optional list with the relevant modularity class
            ids; when empty (or None) the classes holding more than 5% of the
            nodes are kept

    Returns:
        [list] -- [edges, groups]: the remaining edges, with the added columns
            'gp_source' and 'gp_target' (modularity class of each endpoint),
            and the remaining rows of groups
    '''
    if cluster_ids is None or len(cluster_ids) == 0:
        # Removes clusters with less than 5% of nodes
        freqs = groups['group'].value_counts()
        ids = freqs[freqs > .05 * freqs.sum()].index
        groups = groups[groups['group'].isin(ids)]
    else:
        groups = groups[groups['group'].isin(cluster_ids)]

    # Keep only edges whose two endpoints survived the filter
    kept = groups['name']
    edges = edges[edges['source'].isin(kept) & edges['target'].isin(kept)]

    # Join edges and nodes' modularity. The lookup table is renamed before
    # each merge so no temporary 'name_x'/'name_y' columns need dropping
    # (DataFrame.drop no longer accepts the axis positionally in pandas 2.x)
    # and any extra columns of `edges` keep their own names.
    membership = groups[['name', 'group']]
    edges = edges.merge(
        membership.rename(columns={'name': 'source', 'group': 'gp_source'}),
        on='source',
    )
    edges = edges.merge(
        membership.rename(columns={'name': 'target', 'group': 'gp_target'}),
        on='target',
    )
    return [edges, groups]
def get_internal(edges):
    '''
    Function to create set of internal nodes

    A node is internal when every edge touching it stays inside one single
    cluster (modularity class).

    Arguments:
        edges{DataFrame} -- dataset with the edges in the graph, including the
            'gp_source' and 'gp_target' columns

    Returns:
        [list] -- list of internal nodes
    '''
    def _single_cluster(candidate):
        # edges linking this node to others
        incident = edges[(edges['source'] == candidate) | (edges['target'] == candidate)]
        # modularity classes seen on either end of those edges
        classes = np.unique([incident['gp_source'].values, incident['gp_target'].values])
        return len(classes) == 1

    candidates = np.unique([edges['source'], edges['target']])
    return [candidate for candidate in candidates if _single_cluster(candidate)]
def get_boundaries(edges, groups, I):
    '''
    Function to create set of nodes on boundaries

    A node is kept when it has at least one edge crossing to another cluster
    (condition 1) and at least one same-cluster edge whose endpoints include
    an internal node of its own cluster (condition 2).

    Arguments:
        edges{DataFrame} -- dataset with the edges in the graph, including the
            'gp_source' and 'gp_target' columns
        groups{DataFrame} -- dataset with the modularity class of each node in the graph
        I{list} -- list with internal nodes

    Returns:
        [list] -- list of nodes on boundaries
    '''
    # condition 1 in the article: endpoints of cross-cluster edges
    crossing = edges[edges['gp_source'] != edges['gp_target']]
    candidates = np.unique([crossing['source'], crossing['target']])

    # rows of `groups` describing internal nodes (same for every candidate)
    is_internal = groups['name'].isin(I)

    boundaries = []
    for candidate in candidates:
        # candidate's modularity class id
        own_class = groups[groups['name'] == candidate]['group']
        # internal nodes belonging to that same class
        internal_peers = groups[is_internal & groups['group'].isin(own_class)]

        # same-cluster edges touching the candidate (condition 2)
        incident = edges[(edges['source'] == candidate) | (edges['target'] == candidate)]
        intra = incident[incident['gp_source'] == incident['gp_target']]
        if intra.shape[0] == 0:
            continue

        endpoints = np.unique([intra['source'].values, intra['target'].values])
        if any(endpoint in internal_peers['name'].values for endpoint in endpoints):
            boundaries.append(candidate)
    return boundaries
def get_internal_edges(edges, I, B):
    '''
    Function to create set of edges linking internal nodes with nodes on boundaries

    Only edges whose two endpoints share the same cluster are returned.

    Arguments:
        edges{DataFrame} -- dataset with the edges in the graph, including the
            'gp_source' and 'gp_target' columns
        I{list} -- list with internal nodes
        B{list} -- list with nodes on boundaries

    Returns:
        [DataFrame] -- edges of internal and boundaries nodes (boundary -> internal
            rows first, then internal -> boundary rows)
    '''
    same_cluster = edges['gp_source'] == edges['gp_target']
    # source is boundary and target is internal
    boundary_to_internal = edges['source'].isin(B) & edges['target'].isin(I)
    # source is internal and target is boundary
    internal_to_boundary = edges['source'].isin(I) & edges['target'].isin(B)
    return pd.concat([
        edges[boundary_to_internal & same_cluster],
        edges[internal_to_boundary & same_cluster],
    ])
def get_boundary_edges(edges, B):
    '''
    Function to create set of edges linking nodes on boundaries of different clusters

    Arguments:
        edges{DataFrame} -- dataset with the edges in the graph, including the
            'gp_source' and 'gp_target' columns
        B{list} -- list with nodes on boundaries

    Returns:
        [DataFrame] -- edges whose source and target are both on boundaries and
            belong to different clusters (modularity)
    '''
    both_on_boundary = edges['source'].isin(B) & edges['target'].isin(B)
    crosses_clusters = edges['gp_source'] != edges['gp_target']
    return edges[both_on_boundary & crosses_clusters]
def nodes_polarization(B, e_i, e_b):
    '''
    Function to calculate the polarization measure for each node on boundary

    For each node, with di = number of rows of e_i touching it and
    db = number of rows of e_b touching it, the measure is
    di / (di + db) - 0.5, i.e. a value between -0.5 and 0.5.

    Arguments:
        B{list} -- list with nodes on boundaries
        e_i{DataFrame} -- dataset with internal edges (columns 'source' and 'target')
        e_b{DataFrame} -- dataset with boundary edges (columns 'source' and 'target')

    Returns:
        [dict] -- polarization of each node; np.nan for a node that appears in
            neither e_i nor e_b, since the ratio is undefined there
    '''
    P = {}
    for node in B:
        # number of links with internal nodes
        di = int(((e_i['source'] == node) | (e_i['target'] == node)).sum())
        # number of links with boundary nodes
        db = int(((e_b['source'] == node) | (e_b['target'] == node)).sum())
        total = di + db
        # A node without any link has no defined ratio: report nan instead of
        # failing with ZeroDivisionError.
        P[node] = di / total - .5 if total else np.nan
    return P
def polarization(p_nodes):
    '''
    Function to calculate the polarization of the graph

    The graph measure is the mean of the per-node polarization values.

    Arguments:
        p_nodes{dict} -- dictionary with nodes polarization

    Returns:
        [float] -- graph polarization measure (np.nan when p_nodes is empty)
    '''
    if len(p_nodes) > 0:
        return np.mean(list(p_nodes.values()))
    # without boundary connections there is no polarization measure to report
    return np.nan