"""Source code for easygraph.functions.community.LPA."""

import copy
import random

from collections import defaultdict
from queue import Queue

import easygraph as eg
import numpy as np

from easygraph.utils import *


__all__ = [
    "LPA",
    "SLPA",
    "HANP",
    "BMLPA",
]


@not_implemented_for("multigraph")
def LPA(G):
    """Detect communities with the label propagation algorithm.

    Every node starts in its own community. On each sweep (in random
    order) a node takes the label carried by the largest number of its
    neighbors, breaking ties uniformly at random; iteration stops once
    every node already holds one of its locally dominant labels. Because
    ties are broken randomly, the returned partition is itself random.
    Disconnected groups that ended up sharing a label are split apart by
    a breadth-first search at the end.

    Parameters
    ----------
    G : graph
        An easygraph graph.

    Returns
    -------
    communities : dict
        Maps a community serial number to the list of nodes in it.

    Examples
    --------
    >>> LPA(G)

    References
    ----------
    .. [1] Usha Nandini Raghavan, Réka Albert, and Soundar Kumara:
       Near linear time algorithm to detect community structures
       in large-scale networks
    """
    nodes = list(G.nodes.keys())
    # Trivial graph: a single node is its own community.
    if len(nodes) == 1:
        return {1: [nodes[0]]}

    # Every node begins with a unique integer label.
    label_dict = {node: idx for idx, node in enumerate(nodes)}
    Next_label_dict = dict()

    loop_count = 0
    while True:
        loop_count += 1
        random.shuffle(nodes)
        for node in nodes:
            candidates = SelectLabels(G, node, label_dict)
            if candidates == []:
                # Isolated node: keep its current label.
                Next_label_dict[node] = label_dict[node]
                continue
            Next_label_dict[node] = random.choice(candidates)
            # Asynchronous updates. For synchronous updates, comment the line below.
            label_dict[node] = Next_label_dict[node]
        label_dict = Next_label_dict
        # Converged once every node holds one of its dominant neighbor labels.
        if estimate_stop_cond(G, label_dict):
            break

    # Group nodes that share a final label.
    cluster_community = dict()
    for node, label in label_dict.items():
        cluster_community.setdefault(label, []).append(node)

    # Split any community whose nodes are not actually connected.
    return CheckConnectivity(G, cluster_community)
@not_implemented_for("multigraph")
def SLPA(G, T, r):
    """Detect overlapping communities with the Speaker-listener Label
    Propagation Algorithm.

    Return the detected overlapping communities. Ties and sampling are
    random, so the result is random.

    Parameters
    ----------
    G : graph
        An easygraph graph.

    T : int
        The number of iterations. In general, T is set greater than 20,
        which produces relatively stable outputs.

    r : float
        A threshold between 0 and 1; labels heard less often than this
        fraction of the time are dropped from a node's memory.

    Returns
    -------
    communities : dict
        Maps a community serial number to the set of nodes in it.

    Examples
    --------
    >>> SLPA(G,
    ...      T = 20,
    ...      r = 0.05
    ...      )

    References
    ----------
    .. [1] Jierui Xie, Boleslaw K. Szymanski, Xiaoming Liu:
       SLPA: Uncovering Overlapping Communities in Social Networks via
       A Speaker-listener Interaction Dynamic Process
    """
    nodes = list(G.nodes.keys())
    # Trivial graph: a single node is its own community.
    if len(nodes) == 1:
        return {1: [nodes[0]]}
    nodes = G.nodes
    adj = G.adj

    # Each node's memory starts with its own id, heard once.
    memory = {i: {i: 1} for i in nodes}

    for _ in range(0, T):
        listenerslist = list(G.nodes)
        random.shuffle(listenerslist)
        for listener in listenerslist:
            speakerlist = adj[listener]
            if len(speakerlist) == 0:
                continue
            labels = defaultdict(int)
            for speaker in speakerlist:
                # Speaker Rule: sample one label from the speaker's memory,
                # weighted by how often each label has been heard.
                # BUGFIX: the previous implementation passed probabilities
                # rounded to 2 decimals to np.random.multinomial; the rounded
                # values could sum to more than 1 (raising ValueError) or less
                # than 1 (skewing the sample). random.choices uses the exact
                # integer counts as weights, so no normalization is needed.
                keys = list(memory[speaker].keys())
                chosen_label = random.choices(
                    keys, weights=list(memory[speaker].values()), k=1
                )[0]
                labels[chosen_label] += 1
            # Listener Rule: adopt the label heard most often this round.
            maxlabel = max(labels.items(), key=lambda x: x[1])[0]
            if maxlabel in memory[listener]:
                memory[listener][maxlabel] += 1
            else:
                memory[listener][maxlabel] = 1

    # Post-processing: drop labels whose observed frequency is below r.
    # Each node's memory holds T + 1 observations (initial id + T rounds).
    for node, labels in memory.items():
        name_list = [
            label_name
            for label_name, label_number in labels.items()
            if round(label_number / float(T + 1), 2) < r
        ]
        for name in name_list:
            del labels[name]

    # Find nodes membership: invert memory into label -> node set.
    communities = {}
    for node, labels in memory.items():
        for label in labels:
            if label in communities:
                communities[label].add(node)
            else:
                communities[label] = {node}

    # Remove nested communities
    RemoveNested(communities)
    # Check Connectivity
    result_community = CheckConnectivity(G, communities)
    return result_community
@not_implemented_for("multigraph")
def HANP(G, m, delta, threshod=1, hier_open=0, combine_open=0):
    """Detect community by Hop attenuation & node preference algorithm

    Return the detected communities. But the result is random.

    Implement the basic HANP algorithm and give more freedom through the
    parameters, e.g., you can use threshod to set the condition for node
    updating.

    If network are known to be Hierarchical and overlapping communities,
    it's recommended to choose geodesic distance as the measure (instead
    of receiving the current hop scores from the neighborhood and carry
    out a subtraction) and When an equilibrium is reached, treat newly
    combined communities as a single node.

    For using Floyd to get the shortest distance, the time complexity is
    a little high.

    Parameters
    ----------
    G : graph
        A easygraph graph

    m : float
        Used to calculate score, when m > 0, more preference is given to
        node with more neighbors; m < 0, less

    delta : float
        Hop attenuation

    threshod : float
        Between 0 and 1, only update node whose number of neighbors
        sharing the maximal label is less than the threshod. e.g.,
        threshod == 1 means updating all nodes.

    hier_open :
        1 means using geodesic distance as the score measure. 0 means not.

    combine_open :
        this option is valid only when hier_open = 1.
        1 means When an equilibrium is reached, treat newly combined
        communities as a single node. 0 means not.

    Returns
    ----------
    communities : dictionary
        key: serial number of community, value: nodes in the community.

    Examples
    ----------
    >>> HANP(G,
    ...      m = 0.1,
    ...      delta = 0.05,
    ...      threshod = 1,
    ...      hier_open = 0,
    ...      combine_open = 0
    ...      )

    References
    ----------
    .. [1] Ian X. Y. Leung, Pan Hui, Pietro Liò, and Jon Crowcrof:
       Towards real-time community detection in large networks
    """
    nodes = list(G.nodes.keys())
    # Trivial graph: a single node is its own community.
    if len(nodes) == 1:
        return {1: [nodes[0]]}
    label_dict = dict()      # node -> current label
    score_dict = dict()      # label -> current (attenuated) score
    node_dict = dict()       # label -> the node that originated the label
    Next_label_dict = dict()
    cluster_community = dict()
    nodes = list(G.nodes.keys())
    degrees = G.degree()
    records = []             # history of combined communities (hier+combine mode)
    loop_count = 0
    i = 0
    old_score = 1
    ori_G = G                # keep the original graph; G is replaced by CombineNodes
    if hier_open == 1:
        # Geodesic-distance mode needs all-pairs shortest paths.
        distance_dict = eg.Floyd(G)
    # Each node starts with a unique label of score 1, anchored at itself.
    for node in nodes:
        label_dict[node] = i
        score_dict[i] = 1
        node_dict[i] = node
        i = i + 1
    while True:
        loop_count += 1
        random.shuffle(nodes)
        score = 1  # running minimum of the scores updated this sweep
        for node in nodes:
            labels = SelectLabels_HANP(
                G, node, label_dict, score_dict, degrees, m, threshod
            )
            if labels == []:
                # Isolated node: keep its current label.
                Next_label_dict[node] = label_dict[node]
                continue
            old_label = label_dict[node]
            Next_label_dict[node] = random.choice(labels)
            # Asynchronous updates. If you want to use synchronous updates, comment the line below
            label_dict[node] = Next_label_dict[node]
            if hier_open == 1:
                # Score from geodesic distance to the label's origin node.
                score_dict[Next_label_dict[node]] = UpdateScore_Hier(
                    G, node, label_dict, node_dict, distance_dict
                )
                score = min(score, score_dict[Next_label_dict[node]])
            else:
                # Hop attenuation: decay the score only when the label changed.
                if old_label == Next_label_dict[node]:
                    cdelta = 0
                else:
                    cdelta = delta
                score_dict[Next_label_dict[node]] = UpdateScore(
                    G, node, label_dict, score_dict, cdelta
                )
        if hier_open == 1 and combine_open == 1:
            # A large drop of the minimum score signals an equilibrium:
            # collapse current communities into single nodes and continue
            # on the condensed graph.
            if old_score - score > 1 / 3:
                old_score = score
                (
                    records,
                    G,
                    label_dict,
                    score_dict,
                    node_dict,
                    Next_label_dict,
                    nodes,
                    degrees,
                    distance_dict,
                ) = CombineNodes(
                    records,
                    G,
                    label_dict,
                    score_dict,
                    node_dict,
                    Next_label_dict,
                    nodes,
                    degrees,
                    distance_dict,
                )
        label_dict = Next_label_dict
        if (
            estimate_stop_cond_HANP(G, label_dict, score_dict, degrees, m, threshod)
            is True
        ):
            break
        """As mentioned in the paper, it's suggested that the number of
        iterations required is independent to the number of nodes and that
        after five iterations, 95% of their nodes are already accurately clustered
        """
        if loop_count > 20:
            break
    print("After %d iterations, HANP complete." % loop_count)
    # Group nodes that share a final label.
    for node in label_dict.keys():
        label = label_dict[node]
        if label not in cluster_community.keys():
            cluster_community[label] = [node]
        else:
            cluster_community[label].append(node)
    if hier_open == 1 and combine_open == 1:
        # Expand the condensed-graph history back to original node ids.
        records.append(cluster_community)
        cluster_community = ShowRecord(records)
    # Split any community whose nodes are not actually connected,
    # against the ORIGINAL graph (G may have been condensed).
    result_community = CheckConnectivity(ori_G, cluster_community)
    return result_community
@not_implemented_for("multigraph")
def BMLPA(G, p):
    """Detect community by Balanced Multi-Label Propagation algorithm

    Return the detected communities.

    Firstly, initialize 'old' using cores generated by RC function, the
    propagate label till the number and size of communities stay no
    change, check if there are subcommunity and delete it. Finally,
    split discontinuous communities.

    For some directed graphs lead to oscillations of labels, modify the
    stop condition.

    Parameters
    ----------
    G : graph
        A easygraph graph

    p : float
        Between 0 and 1, judge Whether a community identifier should be
        retained

    Returns
    ----------
    communities : dictionary
        key: serial number of community, value: nodes in the community.

    Examples
    ----------
    >>> BMLPA(G,
    ...       p = 0.1,
    ...       )

    References
    ----------
    .. [1] Wu Zhihao, Lin You-Fang, Gregory Steve, Wan Huai-Yu,
       Tian Sheng-Feng:
       Balanced Multi-Label Propagation for Overlapping Community
       Detection in Social Networks
    """
    nodes = list(G.nodes.keys())
    # Trivial graph: a single node is its own community.
    if len(nodes) == 1:
        return {1: [nodes[0]]}
    # Seed labels from rough cores: every node of core i starts holding
    # identifier i with coefficient 1 (nodes may belong to several cores).
    cores = Rough_Cores(G)
    nodes = G.nodes
    i = 0
    old_label_dict = dict()  # node -> {identifier: coefficient}
    new_label_dict = dict()
    for core in cores:
        for node in core:
            if node not in old_label_dict:
                old_label_dict[node] = {i: 1}
            else:
                old_label_dict[node][i] = 1
        i += 1
    oldMin = dict()
    loop_count = 0
    old_label_dictx = dict()
    while True:
        loop_count += 1
        # NOTE(review): this is an alias, not a copy, so the
        # `old_label_dict == old_label_dictx` check below always holds and
        # the loop breaks unconditionally once loop_count > 50. Presumably a
        # snapshot (deepcopy) was intended — confirm before changing, since
        # the docstring says the stop condition was deliberately modified.
        old_label_dictx = old_label_dict
        for node in nodes:
            Propagate_bbc(G, node, old_label_dict, new_label_dict, p)
        if loop_count > 50 and old_label_dict == old_label_dictx:
            break
        # Compare community id sets; when they match, track the minimum of
        # each community's size between the two iterations.
        Min = dict()
        if Id(old_label_dict) == Id(new_label_dict):
            Min = mc(count(old_label_dict), count(new_label_dict))
        else:
            Min = count(new_label_dict)
        if loop_count > 500:
            break
        # Stop once the number and size of communities no longer change.
        if Min != oldMin:
            old_label_dict = copy.deepcopy(new_label_dict)
            oldMin = copy.deepcopy(Min)
        else:
            break
    print("After %d iterations, BMLPA complete." % loop_count)
    # Invert node -> identifiers into identifier -> node set.
    communities = dict()
    for node in nodes:
        for label, _ in old_label_dict[node].items():
            if label in communities:
                communities[label].add(node)
            else:
                communities[label] = {node}
    # Delete communities fully contained in another, then split
    # disconnected ones.
    RemoveNested(communities)
    result_community = CheckConnectivity(G, communities)
    return result_community
def RemoveNested(communities):
    """Delete, in place, every community that is a subset of another.

    communities maps label -> set of nodes.
    """
    nestedCommunities = set()
    keys = list(communities.keys())
    # Pairwise subset test over all label pairs.
    for i, label0 in enumerate(keys[:-1]):
        comm0 = communities[label0]
        for label1 in keys[i + 1 :]:
            comm1 = communities[label1]
            if comm0.issubset(comm1):
                nestedCommunities.add(label0)
            elif comm0.issuperset(comm1):
                nestedCommunities.add(label1)
    for comm in nestedCommunities:
        del communities[comm]


def SelectLabels(G, node, label_dict):
    """Return the list of labels occurring most often among node's
    neighbors (ties included); [] when the node has no neighbors."""
    adj = G.adj
    count = {}
    count_items = []
    for neighbor in adj[node]:
        neighbor_label = label_dict[neighbor]
        count[neighbor_label] = count.get(neighbor_label, 0) + 1
    count_items = sorted(count.items(), key=lambda x: x[1], reverse=True)
    # Keep every label tied with the top count. With no neighbors the
    # list is empty, so count_items[0] is never evaluated.
    labels = [k for k, v in count_items if v == count_items[0][1]]
    return labels


def estimate_stop_cond(G, label_dict):
    """LPA stop test: True when every node already carries one of its
    locally dominant labels."""
    for node in G.nodes:
        if SelectLabels(G, node, label_dict) != [] and (
            label_dict[node] not in SelectLabels(G, node, label_dict)
        ):
            return False
    return True


def SelectLabels_HANP(G, node, label_dict, score_dict, degrees, m, threshod):
    """HANP label choice: weight each neighbor label by
    score * degree**m * edge weight, and return the top-scoring labels.

    If the fraction of neighbors sharing the maximal label already
    exceeds threshod, freeze the node on its current label.
    """
    adj = G.adj
    count = defaultdict(float)  # label -> accumulated weighted score
    cnt = defaultdict(int)      # label -> plain neighbor count
    for neighbor in adj[node]:
        neighbor_label = label_dict[neighbor]
        cnt[neighbor_label] += 1
        count[neighbor_label] += (
            score_dict[neighbor_label]
            * (degrees[neighbor] ** m)
            * adj[node][neighbor].get("weight", 1)
        )
    count_items = sorted(count.items(), key=lambda x: x[1], reverse=True)
    labels = [k for k, v in count_items if v == count_items[0][1]]
    # only update node whose number of neighbors sharing the maximal label is less than a certain percentage.
    if count_items == []:
        return []
    if round(cnt[count_items[0][0]] / len(adj[node]), 2) > threshod:
        return [label_dict[node]]
    return labels


def HopAttenuation_Hier(G, node, label_dict, node_dict, distance_dict):
    """Geodesic attenuation: (1 + d) / D, where d is the shortest
    distance from the label's origin node to any same-labeled neighbor
    of node, and D is the origin's eccentricity in distance_dict."""
    distance = float("inf")
    Max_distance = 0
    adj = G.adj
    label = label_dict[node]
    ori_node = node_dict[label]  # node where this label originated
    for _, distancex in distance_dict[ori_node].items():
        Max_distance = max(Max_distance, distancex)
    for neighbor in adj[node]:
        if label_dict[neighbor] == label:
            distance = min(distance, distance_dict[ori_node][neighbor])
    return round((1 + distance) / Max_distance, 2)


def UpdateScore_Hier(G, node, label_dict, node_dict, distance_dict):
    """Hierarchical-mode score: 1 minus the geodesic attenuation."""
    return 1 - HopAttenuation_Hier(G, node, label_dict, node_dict, distance_dict)


def UpdateScore(G, node, label_dict, score_dict, delta):
    """Basic-mode score: maximum score among same-labeled neighbors,
    decayed by delta (delta is 0 when the node kept its label)."""
    adj = G.adj
    Max_score = 0
    label = label_dict[node]
    for neighbor in adj[node]:
        if label_dict[neighbor] == label:
            Max_score = max(Max_score, score_dict[label_dict[neighbor]])
    return Max_score - delta


def estimate_stop_cond_HANP(G, label_dict, score_dict, degrees, m, threshod):
    """HANP stop test: True when every node already carries one of its
    dominant labels under SelectLabels_HANP."""
    for node in G.nodes:
        if SelectLabels_HANP(
            G, node, label_dict, score_dict, degrees, m, threshod
        ) != [] and label_dict[node] not in SelectLabels_HANP(
            G, node, label_dict, score_dict, degrees, m, threshod
        ):
            return False
    return True


def CombineNodes(
    records,
    G,
    label_dict,
    score_dict,
    node_dict,
    Next_label_dict,
    nodes,
    degrees,
    distance_dict,
):
    """Collapse each current community into a single node of a new graph.

    The label -> members mapping is appended to records so ShowRecord can
    later expand condensed nodes back to original ids. Returns the full
    replacement state tuple for HANP's loop (new graph, label/score/node
    dicts, node list, degrees, and recomputed all-pairs distances).
    """
    # Snapshot the current communities: label -> member nodes.
    onerecord = dict()
    for node, label in label_dict.items():
        if label in onerecord:
            onerecord[label].append(node)
        else:
            onerecord[label] = [node]
    records.append(onerecord)
    Gx = eg.Graph()
    label_dictx = dict()   # new node id -> old label
    score_dictx = dict()   # old label -> carried-over score
    node_dictx = dict()    # old label -> new node id
    nodesx = []
    cnt = 0
    for record_label in onerecord:
        nodesx.append(cnt)
        label_dictx[cnt] = record_label
        score_dictx[record_label] = score_dict[record_label]
        node_dictx[record_label] = cnt
        cnt += 1
    record_labels = list(onerecord.keys())
    i = 0
    edge = dict()
    adj = G.adj
    # Aggregate inter-community edge weights into edges of the new graph.
    for i in range(0, len(record_labels)):
        edge[i] = dict()
        for j in range(0, len(record_labels)):
            if i == j:
                continue
            inodes = onerecord[record_labels[i]]
            jnodes = onerecord[record_labels[j]]
            for unode in inodes:
                for vnode in jnodes:
                    if unode in adj and vnode in adj[unode]:
                        if j not in edge[i]:
                            edge[i][j] = 0
                        edge[i][j] += adj[unode][vnode].get("weight", 1)
    # Add each aggregated edge once (unode < vnode avoids duplicates).
    for unode in edge:
        for vnode, w in edge[unode].items():
            if unode < vnode:
                Gx.add_edge(unode, vnode, weight=w)
    G = Gx
    label_dict = label_dictx
    score_dict = score_dictx
    node_dict = node_dictx
    # NOTE(review): Next_label_dict aliases label_dictx here, so after the
    # swap both names refer to the same dict — confirm this is intended.
    Next_label_dict = label_dictx
    nodes = nodesx
    degrees = G.degree()
    distance_dict = eg.Floyd(G)
    return (
        records,
        G,
        label_dict,
        score_dict,
        node_dict,
        Next_label_dict,
        nodes,
        degrees,
        distance_dict,
    )


def ShowRecord(records):
    """Expand a chain of condensed-community snapshots back to original
    node ids.

    e.g.
    records :
    [
    {1:[1,2,3,4],2:[5,6,7,8],3:[9],4:[10],5:[11],6:[12]},
    {2:[0,1,3],3:[2,4,5]},
    {2:[0,1]}
    ]
    process :
    {1:[1,2,3,4],2:[5,6,7,8],3:[9],4:[10],5:[11],6:[12]} ->
    {2:[ [1,2,3,4] + [5,6,7,8] + [10] ], 3:[ [9] + [11] + [12] ]} ->
    {2:[ ([ [1,2,3,4] + [5,6,7,8] + [10] ]) + ([ [9] + [11] + [12] ] ]) }
    ->
    return :
    {2:[1,2,3,4,5,6,7,8,10,9,11,12]}
    """
    result = dict()
    first = records[0]
    for i in range(1, len(records)):
        # Each later record indexes the previous level's communities by
        # position (condensed node id i is the i-th key of `first`).
        keys = list(first.keys())
        onerecord = records[i]
        result = {}
        for label, nodes in onerecord.items():
            for unode in nodes:
                for vnode in first[keys[unode]]:
                    if label not in result:
                        result[label] = []
                    result[label].append(vnode)
        first = result
    return first


def CheckConnectivity(G, communities):
    """Split every community into its connected pieces (via BFS) and
    renumber the pieces 1..k into a fresh result dict."""
    result_community = dict()
    community = [list(community) for label, community in communities.items()]
    communityx = []
    for nodes in community:
        BFS(G, nodes, communityx)
    i = 0
    for com in communityx:
        i += 1
        result_community[i] = com
    return result_community


def BFS(G, nodes, result):
    # check the nodes in G are connected or not. if not, desperate the nodes into different connected subgraphs.
    # Appends each connected subset of `nodes` to `result`, recursing on
    # whatever the first BFS could not reach.
    if len(nodes) == 0:
        return
    if len(nodes) == 1:
        result.append(nodes)
        return
    adj = G.adj
    queue = Queue()
    queue.put(nodes[0])
    seen = set()
    seen.add(nodes[0])
    count = 0
    # Standard BFS restricted to the given node subset.
    while queue.empty() == 0:
        vertex = queue.get()
        count += 1
        for w in adj[vertex]:
            if w in nodes and w not in seen:
                queue.put(w)
                seen.add(w)
    if count != len(nodes):
        # Not all nodes reached: emit the reached component and recurse
        # on the remainder.
        result.append([w for w in seen])
        return BFS(G, [w for w in nodes if w not in seen], result)
    else:
        result.append(nodes)
        return


def Rough_Cores(G):
    """Build BMLPA's initial rough cores.

    Greedily grows a core around each unseen node of degree >= 3 (seed
    node plus a chosen neighbor plus their mutual neighbors, pruned for
    mutual adjacency); keeps cores of size >= 3 and finally adds every
    uncovered node as a singleton core.
    """
    nodes = G.nodes
    degrees = G.degree()
    adj = G.adj
    seen_dict = dict()   # node -> 1 while still available for a core
    label_dict = dict()
    cores = []
    i = 0
    for node in nodes:
        label_dict[node] = i
        seen_dict[node] = 1
        i += 1
    # Process nodes from highest degree down.
    degree_list = sorted(degrees.items(), key=lambda x: x[1], reverse=True)
    for node, _ in degree_list:
        core = []
        if degrees[node] >= 3 and seen_dict[node] == 1:
            # NOTE(review): max_degree/j are reset on every loop pass, so j
            # ends up as the LAST unseen neighbor rather than the
            # max-degree one — looks like the reset belongs before the
            # loop; confirm before changing, as it alters which cores form.
            for neighbor in adj[node]:
                max_degree = 0
                j = node
                if seen_dict[neighbor] == 1:
                    if degrees[neighbor] > max_degree:
                        max_degree = degrees[neighbor]
                        j = neighbor
                    elif degrees[neighbor] == max_degree:
                        pass
            if j != []:
                core = [node] + [j]
                # Mutual neighbors of the seed pair, ordered by ascending
                # degree (degree_list reversed).
                commNeiber = [i for i in adj[node] if i in adj[j]]
                commNeiber = [node for node, _ in degree_list if node in commNeiber]
                commNeiber = commNeiber[::-1]
                while commNeiber != []:
                    # NOTE(review): commNeiber is mutated while being
                    # iterated, so some candidates may be skipped —
                    # order-dependent by construction.
                    for h in commNeiber:
                        core.append(h)
                        for x in commNeiber:
                            if x not in adj[h]:
                                commNeiber.remove(x)
                        if h in commNeiber:
                            commNeiber.remove(h)
            if len(core) >= 3:
                # Mark core members as consumed.
                for i in core:
                    seen_dict[i] = 0
                cores.append(core)
    # Every node not covered by a core becomes its own singleton core.
    core_node = []
    for core in cores:
        core_node += core
    for node in nodes:
        if node not in core_node:
            cores.append([node])
    return cores


def Normalizer(l):
    """Normalize, in place, the coefficients of l (identifier -> value)
    so they sum to ~1 (each rounded to 2 decimals)."""
    Sum = 0
    for identifier, coefficient in l.items():
        Sum += coefficient
    for identifier, coefficient in l.items():
        l[identifier] = round(coefficient / Sum, 2)


def Propagate_bbc(G, x, source, dest, p):
    """One BMLPA propagation step for node x.

    Sums, over x's neighbors, the coefficient of every identifier in
    source, drops identifiers whose total falls below p * max, and
    normalizes the rest into dest[x]. Isolated nodes keep their source
    labels unchanged.
    """
    adj = G.adj
    dest[x] = dict()
    max_b = 0
    for y in adj[x]:
        for identifier, coefficient in source[y].items():
            b = coefficient
            if identifier in dest[x]:
                dest[x][identifier] += b
            else:
                dest[x][identifier] = b
            max_b = max(dest[x][identifier], max_b)
    if max_b == 0:
        # No neighbors contributed anything: keep the old labels.
        dest[x] = source[x]
        return
    # Retain only identifiers within a factor p of the strongest one.
    for identifier in list(dest[x].keys()):
        if dest[x][identifier] / max_b < p:
            del dest[x][identifier]
    Normalizer(dest[x])


def Id(l):
    """Map each node of l to the list of identifiers it holds."""
    ids = dict()
    for x in l:
        ids[x] = Id1(l[x])
    return ids


def Id1(x):
    """Return the identifiers of one node's label dict, de-duplicated,
    preserving first-seen order."""
    ids = []
    for identifier, _ in x.items():
        if identifier not in ids:
            ids.append(identifier)
    return ids


def count(l):
    """Count, over all nodes of l, how many nodes hold each identifier
    (i.e. each community's size)."""
    counts = dict()
    for x in l:
        for identifier, _ in l[x].items():
            if identifier in counts:
                counts[identifier] += 1
            else:
                counts[identifier] = 1
    return counts


def mc(cs1, cs2):
    """Element-wise minimum of two size dicts; assumes every key of cs1
    is present in cs2 (guaranteed by the Id equality check in BMLPA)."""
    cs = dict()
    for identifier, _ in cs1.items():
        cs[identifier] = min(cs1[identifier], cs2[identifier])
    return cs