import copy
import random
from collections import defaultdict
from queue import Queue
import easygraph as eg
import numpy as np
from easygraph.utils import *
__all__ = [
"LPA",
"SLPA",
"HANP",
"BMLPA",
]
@not_implemented_for("multigraph")
def LPA(G):
"""Detect community by label propagation algorithm
Return the detected communities. But the result is random.
Each node in the network is initially assigned to its own community. At every iteration,nodes have
a label that the maximum number of their neighbors have. If there are more than one nodes fit and
available, choose a label randomly. Finally, nodes having the same labels are grouped together as
communities. In case two or more disconnected groups of nodes have the same label, we run a simple
breadth-first search to separate the disconnected communities
Parameters
----------
G : graph
        An EasyGraph graph.
    Returns
    -------
    communities : dictionary
        key: serial number of the community; value: nodes in the community.
Examples
----------
>>> LPA(G)
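
    A minimal toy run (a sketch only: it assumes a small undirected graph built
    with ``eg.Graph.add_edges``, and the community numbering varies between runs):

    >>> G = eg.Graph()
    >>> G.add_edges([(1, 2), (2, 3), (1, 3), (4, 5), (5, 6), (4, 6)])
    >>> LPA(G)  # e.g. {1: [1, 2, 3], 2: [4, 5, 6]}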
References
----------
.. [1] Usha Nandini Raghavan, Réka Albert, and Soundar Kumara:
Near linear time algorithm to detect community structures in large-scale networks
"""
i = 0
label_dict = dict()
cluster_community = dict()
Next_label_dict = dict()
nodes = list(G.nodes.keys())
if len(nodes) == 1:
return {1: [nodes[0]]}
for node in nodes:
label_dict[node] = i
i = i + 1
loop_count = 0
while True:
loop_count += 1
random.shuffle(nodes)
for node in nodes:
labels = SelectLabels(G, node, label_dict)
if labels == []:
Next_label_dict[node] = label_dict[node]
continue
Next_label_dict[node] = random.choice(labels)
            # Asynchronous update. For synchronous updates, comment out the line below.
label_dict[node] = Next_label_dict[node]
label_dict = Next_label_dict
        if estimate_stop_cond(G, label_dict):
            break
for node in label_dict.keys():
label = label_dict[node]
if label not in cluster_community.keys():
cluster_community[label] = [node]
else:
cluster_community[label].append(node)
result_community = CheckConnectivity(G, cluster_community)
return result_community
@not_implemented_for("multigraph")
def SLPA(G, T, r):
"""Detect Overlapping Communities by Speaker-listener Label Propagation Algorithm
Return the detected Overlapping communities. But the result is random.
Parameters
----------
    G : graph
        An EasyGraph graph.
    T : int
        The number of iterations. In general, T is set greater than 20, which
        produces relatively stable outputs.
    r : float
        A threshold between 0 and 1.
Returns
-------
communities : dictionary
        key: serial number of the community; value: nodes in the community.
Examples
----------
>>> SLPA(G,
... T = 20,
... r = 0.05
... )
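
    A toy sketch (assuming a small undirected graph; because the speaker rule
    samples labels at random, the overlapping result varies between runs):

    >>> G = eg.Graph()
    >>> G.add_edges([(1, 2), (2, 3), (1, 3), (3, 4), (4, 5), (5, 6), (4, 6)])
    >>> SLPA(G, T=20, r=0.05)  # e.g. {1: [1, 2, 3], 2: [3, 4, 5, 6]}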
References
----------
.. [1] Jierui Xie, Boleslaw K. Szymanski, Xiaoming Liu:
SLPA: Uncovering Overlapping Communities in Social Networks via A Speaker-listener Interaction Dynamic Process
"""
nodes = list(G.nodes.keys())
if len(nodes) == 1:
return {1: [nodes[0]]}
nodes = G.nodes
adj = G.adj
memory = {i: {i: 1} for i in nodes}
for i in range(0, T):
listenerslist = list(G.nodes)
random.shuffle(listenerslist)
for listener in listenerslist:
speakerlist = adj[listener]
if len(speakerlist) == 0:
continue
labels = defaultdict(int)
for speaker in speakerlist:
                # Speaker rule: sample one label from the speaker's memory with
                # probability proportional to its frequency. Do not round the
                # probabilities: rounded values can sum to more than 1, which
                # np.random.multinomial rejects.
                total = float(sum(memory[speaker].values()))
                keys = list(memory[speaker].keys())
                index = np.random.multinomial(
                    1, [freq / total for freq in memory[speaker].values()]
                ).argmax()
                chosen_label = keys[index]
labels[chosen_label] += 1
# Listener Rule
maxlabel = max(labels.items(), key=lambda x: x[1])[0]
if maxlabel in memory[listener]:
memory[listener][maxlabel] += 1
else:
memory[listener][maxlabel] = 1
    # Post-processing: delete labels whose frequency in a node's memory falls
    # below the threshold r (each memory holds T + 1 labels after T iterations).
    for node, labels in memory.items():
        name_list = []
        for label_name, label_number in labels.items():
            if round(label_number / float(T + 1), 2) < r:
                name_list.append(label_name)
        for name in name_list:
            del labels[name]
# Find nodes membership
communities = {}
for node, labels in memory.items():
for label in labels:
if label in communities:
communities[label].add(node)
else:
communities[label] = {node}
# Remove nested communities
RemoveNested(communities)
# Check Connectivity
result_community = CheckConnectivity(G, communities)
return result_community
@not_implemented_for("multigraph")
def HANP(G, m, delta, threshod=1, hier_open=0, combine_open=0):
"""Detect community by Hop attenuation & node preference algorithm
Return the detected communities. But the result is random.
Implement the basic HANP algorithm and give more freedom through the parameters, e.g., you can use threshod
to set the condition for node updating. If network are known to be Hierarchical and overlapping communities,
it's recommended to choose geodesic distance as the measure(instead of receiving the current hop scores
from the neighborhood and carry out a subtraction) and When an equilibrium is reached, treat newly combined
communities as a single node.
For using Floyd to get the shortest distance, the time complexity is a little high.
    Parameters
    ----------
    G : graph
        An EasyGraph graph.
    m : float
        Used to calculate the node score: when m > 0, more preference is given
        to nodes with more neighbors; when m < 0, less.
    delta : float
        Hop attenuation.
    threshod : float
        Between 0 and 1. Only update a node if the fraction of its neighbors
        sharing the maximal label does not exceed threshod; e.g., threshod == 1
        means all nodes are updated.
    hier_open : int
        1 means use geodesic distance as the score measure; 0 means not.
    combine_open : int
        Only valid when hier_open == 1. 1 means that, when an equilibrium is
        reached, newly combined communities are treated as single nodes; 0 means
        not.
    Returns
    -------
    communities : dictionary
        key: serial number of the community; value: nodes in the community.
Examples
----------
>>> HANP(G,
... m = 0.1,
... delta = 0.05,
... threshod = 1,
... hier_open = 0,
... combine_open = 0
... )
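
    A toy sketch (assuming a small undirected graph; the result is random and
    these parameter values are illustrative, not tuned):

    >>> G = eg.Graph()
    >>> G.add_edges([(1, 2), (2, 3), (1, 3), (3, 4), (4, 5), (5, 6), (4, 6)])
    >>> HANP(G, m=0.1, delta=0.05)  # e.g. {1: [1, 2, 3], 2: [4, 5, 6]}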
References
----------
.. [1] Ian X. Y. Leung, Pan Hui, Pietro Liò, and Jon Crowcrof:
Towards real-time community detection in large networks
"""
nodes = list(G.nodes.keys())
if len(nodes) == 1:
return {1: [nodes[0]]}
label_dict = dict()
score_dict = dict()
node_dict = dict()
Next_label_dict = dict()
cluster_community = dict()
degrees = G.degree()
records = []
loop_count = 0
i = 0
old_score = 1
ori_G = G
if hier_open == 1:
distance_dict = eg.Floyd(G)
for node in nodes:
label_dict[node] = i
score_dict[i] = 1
node_dict[i] = node
i = i + 1
while True:
loop_count += 1
random.shuffle(nodes)
score = 1
for node in nodes:
labels = SelectLabels_HANP(
G, node, label_dict, score_dict, degrees, m, threshod
)
if labels == []:
Next_label_dict[node] = label_dict[node]
continue
old_label = label_dict[node]
Next_label_dict[node] = random.choice(labels)
            # Asynchronous update. For synchronous updates, comment out the line below.
label_dict[node] = Next_label_dict[node]
if hier_open == 1:
score_dict[Next_label_dict[node]] = UpdateScore_Hier(
G, node, label_dict, node_dict, distance_dict
)
score = min(score, score_dict[Next_label_dict[node]])
else:
if old_label == Next_label_dict[node]:
cdelta = 0
else:
cdelta = delta
score_dict[Next_label_dict[node]] = UpdateScore(
G, node, label_dict, score_dict, cdelta
)
if hier_open == 1 and combine_open == 1:
if old_score - score > 1 / 3:
old_score = score
(
records,
G,
label_dict,
score_dict,
node_dict,
Next_label_dict,
nodes,
degrees,
distance_dict,
) = CombineNodes(
records,
G,
label_dict,
score_dict,
node_dict,
Next_label_dict,
nodes,
degrees,
distance_dict,
)
label_dict = Next_label_dict
        if estimate_stop_cond_HANP(G, label_dict, score_dict, degrees, m, threshod):
            break
"""As mentioned in the paper, it's suggested that the number of iterations
required is independent to the number of nodes and that after
five iterations, 95% of their nodes are already accurately clustered
"""
if loop_count > 20:
break
print("After %d iterations, HANP complete." % loop_count)
for node in label_dict.keys():
label = label_dict[node]
if label not in cluster_community.keys():
cluster_community[label] = [node]
else:
cluster_community[label].append(node)
if hier_open == 1 and combine_open == 1:
records.append(cluster_community)
cluster_community = ShowRecord(records)
result_community = CheckConnectivity(ori_G, cluster_community)
return result_community
@not_implemented_for("multigraph")
def BMLPA(G, p):
"""Detect community by Balanced Multi-Label Propagation algorithm
Return the detected communities.
Firstly, initialize 'old' using cores generated by RC function, the propagate label till the number and size
of communities stay no change, check if there are subcommunity and delete it. Finally, split discontinuous
communities.
For some directed graphs lead to oscillations of labels, modify the stop condition.
Parameters
----------
    G : graph
        An EasyGraph graph.
    p : float
        Between 0 and 1; determines whether a community identifier should be
        retained.
    Returns
    -------
    communities : dictionary
        key: serial number of the community; value: nodes in the community.
Examples
----------
>>> BMLPA(G,
... p = 0.1,
... )
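
    A toy sketch (assuming a small undirected graph; overlapping nodes may end
    up in more than one community):

    >>> G = eg.Graph()
    >>> G.add_edges([(1, 2), (2, 3), (1, 3), (3, 4), (4, 5), (5, 6), (4, 6)])
    >>> BMLPA(G, p=0.1)  # e.g. {1: [1, 2, 3], 2: [4, 5, 6]}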
References
----------
.. [1] Wu Zhihao, Lin You-Fang, Gregory Steve, Wan Huai-Yu, Tian Sheng-Feng
Balanced Multi-Label Propagation for Overlapping Community Detection in Social Networks
"""
nodes = list(G.nodes.keys())
if len(nodes) == 1:
return {1: [nodes[0]]}
cores = Rough_Cores(G)
nodes = G.nodes
i = 0
old_label_dict = dict()
new_label_dict = dict()
for core in cores:
for node in core:
if node not in old_label_dict:
old_label_dict[node] = {i: 1}
else:
old_label_dict[node][i] = 1
i += 1
    oldMin = dict()
    loop_count = 0
    while True:
        loop_count += 1
        for node in nodes:
            Propagate_bbc(G, node, old_label_dict, new_label_dict, p)
        if Id(old_label_dict) == Id(new_label_dict):
            Min = mc(count(old_label_dict), count(new_label_dict))
        else:
            Min = count(new_label_dict)
        # Some directed graphs make the labels oscillate, so cap the number of
        # iterations rather than waiting for full convergence.
        if loop_count > 50:
            break
        if Min != oldMin:
            old_label_dict = copy.deepcopy(new_label_dict)
            oldMin = copy.deepcopy(Min)
        else:
            break
print("After %d iterations, BMLPA complete." % loop_count)
communities = dict()
for node in nodes:
for label, _ in old_label_dict[node].items():
if label in communities:
communities[label].add(node)
else:
communities[label] = {node}
RemoveNested(communities)
result_community = CheckConnectivity(G, communities)
return result_community
def RemoveNested(communities):
    # Delete communities whose node set is contained in another community.
nestedCommunities = set()
keys = list(communities.keys())
for i, label0 in enumerate(keys[:-1]):
comm0 = communities[label0]
for label1 in keys[i + 1 :]:
comm1 = communities[label1]
if comm0.issubset(comm1):
nestedCommunities.add(label0)
elif comm0.issuperset(comm1):
nestedCommunities.add(label1)
for comm in nestedCommunities:
del communities[comm]
def SelectLabels(G, node, label_dict):
    # Return the label(s) shared by the largest number of the node's neighbors.
    adj = G.adj
    count = {}
for neighbor in adj[node]:
neighbor_label = label_dict[neighbor]
count[neighbor_label] = count.get(neighbor_label, 0) + 1
count_items = sorted(count.items(), key=lambda x: x[1], reverse=True)
labels = [k for k, v in count_items if v == count_items[0][1]]
return labels
def estimate_stop_cond(G, label_dict):
    # Stop once every node already holds one of its neighborhood's dominant labels.
    for node in G.nodes:
        labels = SelectLabels(G, node, label_dict)
        if labels != [] and label_dict[node] not in labels:
            return False
    return True
def SelectLabels_HANP(G, node, label_dict, score_dict, degrees, m, threshod):
adj = G.adj
count = defaultdict(float)
cnt = defaultdict(int)
for neighbor in adj[node]:
neighbor_label = label_dict[neighbor]
cnt[neighbor_label] += 1
count[neighbor_label] += (
score_dict[neighbor_label]
* (degrees[neighbor] ** m)
* adj[node][neighbor].get("weight", 1)
)
    count_items = sorted(count.items(), key=lambda x: x[1], reverse=True)
    if count_items == []:
        return []
    labels = [k for k, v in count_items if v == count_items[0][1]]
    # Only update a node if the fraction of its neighbors sharing the maximal
    # label does not exceed threshod.
    if round(cnt[count_items[0][0]] / len(adj[node]), 2) > threshod:
        return [label_dict[node]]
    return labels
def HopAttenuation_Hier(G, node, label_dict, node_dict, distance_dict):
    # Hop attenuation measured by geodesic distance from the label's origin node.
    distance = float("inf")
Max_distance = 0
adj = G.adj
label = label_dict[node]
ori_node = node_dict[label]
for _, distancex in distance_dict[ori_node].items():
Max_distance = max(Max_distance, distancex)
for neighbor in adj[node]:
if label_dict[neighbor] == label:
distance = min(distance, distance_dict[ori_node][neighbor])
return round((1 + distance) / Max_distance, 2)
def UpdateScore_Hier(G, node, label_dict, node_dict, distance_dict):
return 1 - HopAttenuation_Hier(G, node, label_dict, node_dict, distance_dict)
def UpdateScore(G, node, label_dict, score_dict, delta):
adj = G.adj
Max_score = 0
label = label_dict[node]
for neighbor in adj[node]:
if label_dict[neighbor] == label:
Max_score = max(Max_score, score_dict[label_dict[neighbor]])
return Max_score - delta
def estimate_stop_cond_HANP(G, label_dict, score_dict, degrees, m, threshod):
    for node in G.nodes:
        labels = SelectLabels_HANP(
            G, node, label_dict, score_dict, degrees, m, threshod
        )
        if labels != [] and label_dict[node] not in labels:
            return False
    return True
def CombineNodes(
records,
G,
label_dict,
score_dict,
node_dict,
Next_label_dict,
nodes,
degrees,
distance_dict,
):
onerecord = dict()
for node, label in label_dict.items():
if label in onerecord:
onerecord[label].append(node)
else:
onerecord[label] = [node]
records.append(onerecord)
Gx = eg.Graph()
label_dictx = dict()
score_dictx = dict()
node_dictx = dict()
nodesx = []
cnt = 0
for record_label in onerecord:
nodesx.append(cnt)
label_dictx[cnt] = record_label
score_dictx[record_label] = score_dict[record_label]
node_dictx[record_label] = cnt
cnt += 1
record_labels = list(onerecord.keys())
edge = dict()
adj = G.adj
for i in range(0, len(record_labels)):
edge[i] = dict()
for j in range(0, len(record_labels)):
if i == j:
continue
inodes = onerecord[record_labels[i]]
jnodes = onerecord[record_labels[j]]
for unode in inodes:
for vnode in jnodes:
if unode in adj and vnode in adj[unode]:
if j not in edge[i]:
edge[i][j] = 0
edge[i][j] += adj[unode][vnode].get("weight", 1)
for unode in edge:
for vnode, w in edge[unode].items():
if unode < vnode:
Gx.add_edge(unode, vnode, weight=w)
G = Gx
label_dict = label_dictx
score_dict = score_dictx
node_dict = node_dictx
Next_label_dict = label_dictx
nodes = nodesx
degrees = G.degree()
distance_dict = eg.Floyd(G)
return (
records,
G,
label_dict,
score_dict,
node_dict,
Next_label_dict,
nodes,
degrees,
distance_dict,
)
def ShowRecord(records):
"""
e.g.
records : [ {1:[1,2,3,4],2:[5,6,7,8],3:[9],4:[10],5:[11],6:[12]},
{2:[0,1,3],3:[2,4,5]},
{2:[0,1]} ]
process : {1:[1,2,3,4],2:[5,6,7,8],3:[9],4:[10],5:[11],6:[12]} ->
{2:[ [1,2,3,4] + [5,6,7,8] + [10] ], 3:[ [9] + [11] + [12] ]} ->
{2:[ ([ [1,2,3,4] + [5,6,7,8] + [10] ]) + ([ [9] + [11] + [12] ] ]) } ->
return : {2:[1,2,3,4,5,6,7,8,10,9,11,12]}
"""
result = dict()
first = records[0]
for i in range(1, len(records)):
keys = list(first.keys())
onerecord = records[i]
result = {}
for label, nodes in onerecord.items():
for unode in nodes:
for vnode in first[keys[unode]]:
if label not in result:
result[label] = []
result[label].append(vnode)
first = result
return first
def CheckConnectivity(G, communities):
    # Split every community into connected components and renumber the result.
    result_community = dict()
    community = [list(community) for label, community in communities.items()]
communityx = []
for nodes in community:
BFS(G, nodes, communityx)
i = 0
for com in communityx:
i += 1
result_community[i] = com
return result_community
def BFS(G, nodes, result):
    # Check whether the given nodes are connected in G. If not, separate them
    # into different connected subgraphs via breadth-first search.
    if len(nodes) == 0:
        return
    if len(nodes) == 1:
        result.append(nodes)
        return
    adj = G.adj
    queue = Queue()
    queue.put(nodes[0])
    seen = set()
    seen.add(nodes[0])
    count = 0
    while not queue.empty():
        vertex = queue.get()
        count += 1
        for w in adj[vertex]:
            if w in nodes and w not in seen:
                queue.put(w)
                seen.add(w)
    if count != len(nodes):
        result.append(list(seen))
        return BFS(G, [w for w in nodes if w not in seen], result)
    else:
        result.append(nodes)
        return
def Rough_Cores(G):
nodes = G.nodes
degrees = G.degree()
adj = G.adj
seen_dict = dict()
label_dict = dict()
cores = []
i = 0
for node in nodes:
label_dict[node] = i
seen_dict[node] = 1
i += 1
degree_list = sorted(degrees.items(), key=lambda x: x[1], reverse=True)
    for node, _ in degree_list:
        core = []
        if degrees[node] >= 3 and seen_dict[node] == 1:
            # Pick the not-yet-assigned neighbor with the largest degree.
            max_degree = 0
            j = None
            for neighbor in adj[node]:
                if seen_dict[neighbor] == 1 and degrees[neighbor] > max_degree:
                    max_degree = degrees[neighbor]
                    j = neighbor
            if j is not None:
                core = [node, j]
                # Common neighbors of node and j, in ascending order of degree.
                commNeiber = [i for i in adj[node] if i in adj[j]]
                commNeiber = [n for n, _ in degree_list if n in commNeiber]
                commNeiber = commNeiber[::-1]
                while commNeiber != []:
                    h = commNeiber[0]
                    core.append(h)
                    # Keep only the remaining common neighbors adjacent to h.
                    commNeiber = [x for x in commNeiber[1:] if x in adj[h]]
                if len(core) >= 3:
                    for i in core:
                        seen_dict[i] = 0
                    cores.append(core)
core_node = []
for core in cores:
core_node += core
for node in nodes:
if node not in core_node:
cores.append([node])
return cores
def Normalizer(l):
    # Normalize belonging coefficients so that they sum to (approximately) 1.
    Sum = sum(l.values())
    for identifier, coefficient in l.items():
        l[identifier] = round(coefficient / Sum, 2)
def Propagate_bbc(G, x, source, dest, p):
    # Accumulate belonging coefficients from x's neighbors, drop identifiers whose
    # coefficient falls below p times the maximum, then normalize the rest.
    adj = G.adj
    dest[x] = dict()
    max_b = 0
for y in adj[x]:
for identifier, coefficient in source[y].items():
b = coefficient
if identifier in dest[x]:
dest[x][identifier] += b
else:
dest[x][identifier] = b
max_b = max(dest[x][identifier], max_b)
if max_b == 0:
dest[x] = source[x]
return
for identifier in list(dest[x].keys()):
if dest[x][identifier] / max_b < p:
del dest[x][identifier]
Normalizer(dest[x])
def Id(l):
ids = dict()
for x in l:
ids[x] = Id1(l[x])
return ids
def Id1(x):
ids = []
for identifier, _ in x.items():
if identifier not in ids:
ids.append(identifier)
return ids
def count(l):
    # Count how many nodes carry each community identifier.
    counts = dict()
for x in l:
for identifier, _ in l[x].items():
if identifier in counts:
counts[identifier] += 1
else:
counts[identifier] = 1
return counts
def mc(cs1, cs2):
    # Element-wise minimum of two count dicts that share the same identifiers.
    cs = dict()
for identifier, _ in cs1.items():
cs[identifier] = min(cs1[identifier], cs2[identifier])
return cs