Source code for easygraph.functions.structural_holes.metrics

import math
import random

import easygraph as eg
import numpy as np

from easygraph.utils import *


__all__ = [
    "sum_of_shortest_paths",
    "nodes_of_max_cc_without_shs",
    "structural_hole_influence_index",
]


[docs]@not_implemented_for("multigraph")
def sum_of_shortest_paths(G, S):
    r"""Returns the difference between the sum of lengths of all pairs shortest paths in G and the one in G\S.
    The experiment ml_metrics in [1]_

    Parameters
    ----------
    G: easygraph.Graph or easygraph.DiGraph

    S: list of int
        A list of nodes witch are structural hole spanners.

    Returns
    -------
    differ_between_sum : int
        The difference between the sum of lengths of all pairs shortest paths in G and the one in G\S.
        C(G/S)-C(G)

    Examples
    --------
    >>> G_t=eg.datasets.get_graph_blogcatalog()
    >>> S_t=eg.AP_Greedy(G_t, 10000)
    >>> diff = sum_of_shortest_paths(G_t, S_t)
    >>> print(diff)

    References
    ----------
    .. [1] https://dl.acm.org/profile/81484650642

    """
    mat_G = eg.Floyd(G)
    sum_G = 0
    inf_const_G = math.ceil((G.number_of_nodes() ** 3) / 3)
    for i in mat_G.values():
        for j in i.values():
            if math.isinf(j):
                j = inf_const_G
            sum_G += j
    G_S = G.copy()
    G_S.remove_nodes(S)
    mat_G_S = eg.Floyd(G_S)
    sum_G_S = 0
    inf_const_G_S = math.ceil((G_S.number_of_nodes() ** 3) / 3)
    for i in mat_G_S.values():
        for j in i.values():
            if math.isinf(j):
                j = inf_const_G_S
            sum_G_S += j
    return sum_G_S - sum_G


[docs]@not_implemented_for("multigraph")
def nodes_of_max_cc_without_shs(G, S):
    r"""Returns the number of nodes in the maximum connected component in graph G\S.
    The experiment ml_metrics in [1]_

    Parameters
    ----------
    G: easygraph.Graph or easygraph.DiGraph

    S: list of int
        A list of nodes witch are structural hole spanners.

    Returns
    -------
    G_S_nodes_of_max_CC: int
        The number of nodes in the maximum connected component in graph G\S.

    Examples
    --------
    >>> G_t=eg.datasets.get_graph_blogcatalog()
    >>> S_t=eg.AP_Greedy(G_t, 10000)
    >>> maxx = nodes_of_max_cc_without_shs(G_t, S_t)
    >>> print(maxx)

    References
    ----------
    .. [1] https://dl.acm.org/profile/81484650642

    """
    G_S = G.copy()
    G_S.remove_nodes(S)
    ccs = eg.connected_components(G_S)
    max_num = 0
    for cc in ccs:
        if len(cc) > max_num:
            max_num = len(cc)
    return max_num


class NodeParams:
    def __init__(self, active, inWeight, threshold):
        self.active = active
        self.inWeight = inWeight
        self.threshold = threshold


[docs]@not_implemented_for("multigraph")
def structural_hole_influence_index(
    G_original,
    S,
    C,
    model,
    variant=False,
    seedRatio=0.05,
    randSeedIter=10,
    countIterations=100,
    Directed=True,
):
    """Returns the SHII metric of each seed.

    Parameters
    ----------
    G_original: easygraph.Graph or easygraph.DiGraph

    S: list of int
        A list of nodes which are structural hole spanners.

    C: list of list
        Each list includes the nodes in one community.

    model: string
        Propagation Model. Should be IC or LT.

    variant: bool, default is False
        Whether returns variant SHII ml_metrics or not.
        variant SHII = # of the influenced outsider / # of the influenced insiders
        SHII = # of the influenced outsiders / # of the total influenced nodes

    seedRatio: float, default is 0.05
        # of sampled seeds / # of nodes of the community that the given SHS belongs to.

    randSeedIter: int, default is 10
        How many iterations to sample seeds.

    countIterations: int default is 100
        Number of monte carlo simulations to be used.

    Directed: bool, default is True
        Whether the graph is directed or not.

    Returns
    -------
    seed_shii_pair : dict
        the SHII metric of each seed

    Examples
    --------
    # >>> structural_hole_influence_index(G, [3, 20, 9], Com, 'LT', seedRatio=0.1, Directed=False)

    References
    ----------
    .. [1] https://dl.acm.org/doi/pdf/10.1145/2939672.2939807
    .. [2] https://github.com/LifangHe/KDD16_HAM/tree/master/SHII_metric

    """
    if not Directed:
        G = eg.DiGraph()
        for edge in G_original.edges:
            G.add_edge(edge[0], edge[1])
            G.add_edge(edge[1], edge[0])
    else:
        G = G_original.copy()
    # form pair like {node_1:community_label_1,node_2:community_label_2}
    node_label_pair = {}
    for community_label in range(len(C)):
        for node_i in range(len(C[community_label])):
            node_label_pair[C[community_label][node_i]] = community_label
    # print(node_label_pair)
    seed_shii_pair = {}
    for community_label in range(len(C)):
        nodesInCommunity = []
        seedSetInCommunity = []
        for node in node_label_pair.keys():
            if node_label_pair[node] == community_label:
                nodesInCommunity.append(node)
                if node in S:
                    seedSetInCommunity.append(node)

        seedSetSize = int(math.ceil(len(nodesInCommunity) * seedRatio))

        if len(seedSetInCommunity) == 0:
            continue

        for seed in seedSetInCommunity:
            print(">>>>>> processing seed ", seed, " now.")
            oneSeedSet = []
            if node not in oneSeedSet:
                oneSeedSet.append(seed)
            seedNeighborSet = []
            # using BFS to add neighbors of the SH spanner to the seedNeighborSet as seed candidates
            queue = []
            queue.append(seed)
            while len(queue) > 0:
                cur_node = queue[0]
                count_neighbor = 0
                for neighbor in G.neighbors(node=cur_node):
                    if neighbor not in seedNeighborSet:
                        seedNeighborSet.append(neighbor)
                    count_neighbor = count_neighbor + 1
                if count_neighbor > 0:
                    if (
                        len(queue) == 1
                        and len(oneSeedSet) + len(seedNeighborSet) < seedSetSize
                    ):
                        for node in seedNeighborSet:
                            if node not in oneSeedSet:
                                oneSeedSet.append(node)
                            queue.append(node)
                        seedNeighborSet.clear()
                queue.pop(0)

            avg_censor_score_1 = 0.0
            avg_censor_score_2 = 0.0

            for randIter in range(randSeedIter):
                if randIter % 5 == 0:
                    print("seed ", seed, ": ", randIter, " in ", randSeedIter)
                randSeedSet = []
                for node in oneSeedSet:
                    randSeedSet.append(node)
                seedNeighbors = []
                for node in seedNeighborSet:
                    seedNeighbors.append(node)
                while len(seedNeighbors) > 0 and len(randSeedSet) < seedSetSize:
                    r = random.randint(0, len(seedNeighbors) - 1)
                    if seedNeighbors[r] not in randSeedSet:
                        randSeedSet.append(seedNeighbors[r])
                    seedNeighbors.pop(r)

                if model == "IC":
                    censor_score_1, censor_score_2 = _independent_cascade(
                        G,
                        randSeedSet,
                        community_label,
                        countIterations,
                        node_label_pair,
                    )
                elif model == "LT":
                    censor_score_1, censor_score_2 = _linear_threshold(
                        G,
                        randSeedSet,
                        community_label,
                        countIterations,
                        node_label_pair,
                    )
                avg_censor_score_1 += censor_score_1 / randSeedIter
                avg_censor_score_2 += censor_score_2 / randSeedIter
                # print("seed ", seed, " avg_censor_score in ", randIter, "is ", censor_score_1 / randSeedIter)
            if variant:
                seed_shii_pair[seed] = avg_censor_score_2
            else:
                seed_shii_pair[seed] = avg_censor_score_1
    return seed_shii_pair


def _independent_cascade(G, S, community_label, countIterations, node_label_pair):
    avg_result_1 = 0
    avg_result_2 = 0
    N = G.number_of_nodes()
    for b in range(countIterations):
        # print(b, " in ", countIterations)
        p_vw = np.zeros((N, N))  # 节点被激活时，激活其它节点的概率,a对b的影响等于b对a的影响
        for random_i in range(N):
            for random_j in range(random_i + 1, N):
                num = random.random()
                p_vw[random_i][random_j] = num
                p_vw[random_j][random_i] = num
        Q = []
        activeNodes = []
        for v in S:
            Q.append(v)
            activeNodes.append(v)
        while len(Q) > 0:
            v = Q[0]
            for neighbor in G.neighbors(node=v):
                if neighbor not in activeNodes:
                    toss = random.random() + 0.1
                    if v <= 0 or neighbor <= 0:
                        print(v, neighbor)
                    # if toss>0.5:
                    #     activeNodes.append(neighbor)
                    #     Q.append(neighbor)
                    if toss >= p_vw[v - 1][neighbor - 1]:
                        activeNodes.append(neighbor)
                        Q.append(neighbor)
            Q.pop(0)
        self_cov = 0
        total_cov = 0
        uniqueActiveNodes = []
        for i in activeNodes:
            if i not in uniqueActiveNodes:
                uniqueActiveNodes.append(i)
        for v in uniqueActiveNodes:
            total_cov += 1
            if node_label_pair[v] == community_label:
                self_cov += 1
        censor_score_1 = (total_cov - self_cov) / total_cov
        censor_score_2 = (total_cov - self_cov) / self_cov
        avg_result_1 += censor_score_1 / countIterations
        avg_result_2 += censor_score_2 / countIterations
    return avg_result_1, avg_result_2


def _linear_threshold(G, S, community_label, countIterations, node_label_pair):
    tol = 0.00001
    avg_result_1 = 0
    avg_result_2 = 0
    for b in range(countIterations):
        activeNodes = []
        # T is the set of nodes that are to be processed
        T = []
        Q = {}
        for v in S:
            activeNodes.append(v)
            for neighbor in G.neighbors(node=v):
                if neighbor not in S:
                    weight_degree = 1.0 / float(G.in_degree()[neighbor])
                    if neighbor not in Q.keys():
                        np = NodeParams(False, weight_degree, random.random())
                        Q[neighbor] = np
                        T.append(neighbor)
                    else:
                        Q[neighbor].inWeight += weight_degree

        while len(T) > 0:
            u = T[0]
            if Q[u].inWeight >= Q[u].threshold + tol and not Q[u].active:
                activeNodes.append(u)
                Q[u].active = True
                for neighbor in G.neighbors(node=u):
                    if neighbor in S:
                        continue
                    weight_degree = 1.0 / float(G.in_degree()[neighbor])
                    if neighbor not in Q.keys():
                        np = NodeParams(False, weight_degree, random.random())
                        Q[neighbor] = np
                        T.append(neighbor)
                    else:
                        if not Q[neighbor].active:
                            T.append(neighbor)
                            Q[neighbor].inWeight += weight_degree
                            if Q[neighbor].inWeight - 1 > tol:
                                print("Error: the inweight for a node is > 1.")
            T.pop(0)

        T.clear()
        Q.clear()

        self_cov = 0
        total_cov = 0
        uniqueActiveNodes = []
        for i in activeNodes:
            if i not in uniqueActiveNodes:
                uniqueActiveNodes.append(i)
        for v in uniqueActiveNodes:
            total_cov += 1
            if node_label_pair[v] == community_label:
                self_cov += 1
        censor_score_1 = (total_cov - self_cov) / total_cov  # ==> SHII
        censor_score_2 = (total_cov - self_cov) / self_cov
        avg_result_1 += censor_score_1 / countIterations
        avg_result_2 += censor_score_2 / countIterations
    return avg_result_1, avg_result_2


if __name__ == "__main__":
    G = eg.datasets.get_graph_karateclub()
    Com = []
    t1 = [1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 17, 18, 20, 22]
    Com.append(t1)
    t2 = [9, 10, 15, 16, 19, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
    Com.append(t2)
    print("community_label:", Com)
    result = structural_hole_influence_index(
        G, [3, 20, 9], Com, "IC", seedRatio=0.1, Directed=False
    )
    print(result)