Source code for easygraph.functions.structural_holes.AP_Greedy

import math
import random

import easygraph as eg

from easygraph.functions.components.biconnected import generator_articulation_points
from easygraph.functions.components.connected import connected_components
from easygraph.utils.decorators import *


__all__ = ["common_greedy", "AP_Greedy"]


[docs] @not_implemented_for("multigraph") @only_implemented_for_UnDirected_graph def common_greedy(G, k, c=1.0, weight="weight"): """Common greedy method for structural hole spanners detection. Returns top k nodes as structural hole spanners, Algorithm 1 of [1]_ Parameters ---------- G : easygraph.Graph An undirected graph. k : int top - k structural hole spanners c : float, optional (default : 1.0) To define zeta: zeta = c * (n*n*n), and zeta is the large value assigned as the shortest distance of two unreachable vertices. Default is 1. weight : String or None, optional (default : 'weight') Key for edge weight. None if not concerning about edge weight. Returns ------- common_greedy : list The list of each top-k structural hole spanners. See Also -------- AP_Greedy Examples -------- Returns the top k nodes as structural hole spanners, using **common_greedy**. >>> common_greedy(G, ... k = 3, # To find top three structural holes spanners. ... c = 1.0, # To define zeta: zeta = c * (n*n*n), and zeta is the large value assigned as the shortest distance of two unreachable vertices. ... weight = 'weight') References ---------- .. [1] https://dl.acm.org/profile/81484650642 """ v_sns = [] G_i = G.copy() N = len(G) for i in range(k): sorted_nodes = sort_nodes_by_degree(G_i, weight) C_max = 0 for j in range(N - i): G_i_j = G_i.copy() G_i_j.remove_node(sorted_nodes[j]) upper_bound = procedure1(G_i_j, c) if upper_bound < C_max: pass else: sum_all_shortest_paths = procedure2(G_i_j, c) if sum_all_shortest_paths >= C_max: v_i = sorted_nodes[j] C_max = sum_all_shortest_paths else: pass del G_i_j v_sns.append(v_i) G_i.remove_node(v_i) del G_i return v_sns
def sort_nodes_by_degree(G, weight="weight"): sorted_nodes = [] for node, degree in sorted( G.degree(weight=weight).items(), key=lambda x: x[1], reverse=True ): sorted_nodes.append(node) return sorted_nodes def procedure1(G, c=1.0): """ Procedure 1 of https://dl.acm.org/profile/81484650642 Parameters ----------- G : graph c : float To define zeta: zeta = c * (n*n*n) Default is 1. """ components = connected_components(G) upper_bound = 0 for component in components: component_subgraph = G.nodes_subgraph(from_nodes=list(component)) spanning_tree = _get_spanning_tree_of_component(component_subgraph) random_root = list(spanning_tree.nodes)[ random.randint(0, len(spanning_tree) - 1) ] num_subtree_nodes = _get_num_subtree_nodes(spanning_tree, random_root) N_tree = num_subtree_nodes[random_root] for node, num in num_subtree_nodes.items(): upper_bound += 2 * num * (N_tree - num) del component_subgraph, spanning_tree N_G = len(G) zeta = c * math.pow(N_G, 3) for component in components: N_c = len(component) upper_bound += N_c * (N_G - N_c) * zeta return upper_bound def _get_spanning_tree_of_component(G): spanning_tree = eg.Graph() seen = set() def _plain_dfs(u): for v, edge_data in G.adj[u].items(): if v not in seen: seen.add(v) spanning_tree.add_edge(u, v) _plain_dfs(v) random_node = list(G.nodes)[0] seen.add(random_node) spanning_tree.add_node(random_node) _plain_dfs(random_node) return spanning_tree def _get_num_subtree_nodes(G, root): num_subtree_nodes = dict() seen = set() def _plain_dfs(u): num_nodes = 1 for v, edge_data in G.adj[u].items(): if v not in seen: seen.add(v) num_nodes += _plain_dfs(v) num_subtree_nodes[u] = num_nodes return num_nodes seen.add(root) _plain_dfs(root) return num_subtree_nodes def procedure2(G, c=1.0): """ Procedure 2 of https://dl.acm.org/profile/81484650642 Parameters ----------- G : graph c : float To define zeta: zeta = c * (n*n*n) Default is 1. """ components = connected_components(G) C = 0 N_G = len(G) zeta = c * math.pow(N_G, 3) for component in components: component_subgraph = G.nodes_subgraph(from_nodes=list(component)) C_l = _get_sum_all_shortest_paths_of_component(component_subgraph) N_c = len(component) C += C_l + N_c * (N_G - N_c) * zeta del component_subgraph return C def _get_sum_all_shortest_paths_of_component(G): # TODO: Using randomized algorithm in http://de.arxiv.org/pdf/1503.08528 # instead of bfs method. def _plain_bfs(G, source): seen = {source} nextlevel = {source} level = 1 sum_paths_of_G = 0 while nextlevel: thislevel = nextlevel nextlevel = set() for u in thislevel: for v in G.adj[u]: if v not in seen: seen.add(v) nextlevel.add(v) sum_paths_of_G += level level += 1 return sum_paths_of_G sum_paths = 0 for node in G.nodes: sum_paths += _plain_bfs(G, node) return sum_paths
[docs] @not_implemented_for("multigraph") @only_implemented_for_UnDirected_graph def AP_Greedy(G, k, c=1.0, weight="weight"): """AP greedy method for structural hole spanners detection. Returns top k nodes as structural hole spanners, Algorithm 2 of [1]_ Parameters ---------- G : easygraph.Graph An undirected graph. k : int top - k structural hole spanners c : float, optional (default : 1.0) To define zeta: zeta = c * (n*n*n), and zeta is the large value assigned as the shortest distance of two unreachable vertices. Default is 1. weight : String or None, optional (default : 'weight') Key for edge weight. None if not concerning about edge weight. Returns ------- AP_greedy : list The list of each top-k structural hole spanners. Examples -------- Returns the top k nodes as structural hole spanners, using **AP_greedy**. >>> AP_greedy(G, ... k = 3, # To find top three structural holes spanners. ... c = 1.0, # To define zeta: zeta = c * (n*n*n), and zeta is the large value assigned as the shortest distance of two unreachable vertices. ... weight = 'weight') References ---------- .. [1] https://dl.acm.org/profile/81484650642 """ v_sns = [] G_i = G.copy() N = len(G) for i in range(k): v_ap, lower_bound = _get_lower_bound_of_ap_nodes(G_i, c) upper_bound = _get_upper_bound_of_non_ap_nodes(G_i, v_ap, c) lower_bound = sorted(lower_bound.items(), key=lambda x: x[1], reverse=True) # print(upper_bound) # print(lower_bound) if len(lower_bound) != 0 and lower_bound[0][1] > max(upper_bound): v_i = lower_bound[0][0] else: # If acticulation points not chosen, use common_greedy instead. sorted_nodes = sort_nodes_by_degree(G_i, weight) C_max = 0 for j in range(N - i): G_i_j = G_i.copy() G_i_j.remove_node(sorted_nodes[j]) upper_bound = procedure1(G_i_j, c) if upper_bound < C_max: pass else: sum_all_shortest_paths = procedure2(G_i_j, c) if sum_all_shortest_paths >= C_max: v_i = sorted_nodes[j] C_max = sum_all_shortest_paths else: pass del G_i_j v_sns.append(v_i) G_i.remove_node(v_i) del G_i return v_sns
def _get_lower_bound_of_ap_nodes(G, c=1.0): """ Returns the articulation points and lower bound for each of them. Procedure 3 of https://dl.acm.org/profile/81484650642 Parameters ---------- G : graph An undirected graph. c : float To define zeta: zeta = c * (n*n*n), and zeta is the large value assigned as the shortest distance of two unreachable vertices. Default is 1. """ v_ap = [] lower_bound = dict() N_G = len(G) zeta = c * math.pow(N_G, 3) components = connected_components(G) for component in components: component_subgraph = G.nodes_subgraph(from_nodes=list(component)) articulation_points = list(generator_articulation_points(component_subgraph)) N_component = len(component_subgraph) for articulation in articulation_points: component_subgraph_after_remove = component_subgraph.copy() component_subgraph_after_remove.remove_node(articulation) lower_bound_value = 0 lower_bound_value += sum( (len(temp) * (N_G - len(temp))) for temp in components ) lower_bound_value += sum( (len(temp) * (N_component - 1 - len(temp))) for temp in connected_components(component_subgraph_after_remove) ) lower_bound_value += 2 * N_component - 2 * N_G lower_bound_value *= zeta v_ap.append(articulation) lower_bound[articulation] = lower_bound_value del component_subgraph_after_remove del component_subgraph return v_ap, lower_bound def _get_upper_bound_of_non_ap_nodes(G, ap: list, c=1.0): """ Returns the upper bound value for each non-articulation points. Eq.(14) of https://dl.acm.org/profile/81484650642 Parameters ---------- G : graph An undirected graph. ap : list Articulation points of G. c : float To define zeta: zeta = c * (n*n*n), and zeta is the large value assigned as the shortest distance of two unreachable vertices. Default is 1. """ upper_bound = [] N_G = len(G) zeta = c * math.pow(N_G, 3) components = connected_components(G) for component in components: non_articulation_points = component - set(ap) for node in non_articulation_points: upper_bound_value = 0 upper_bound_value += sum( (len(temp) * (N_G - len(temp))) for temp in components ) upper_bound_value += 2 * len(component) + 1 - 2 * N_G upper_bound_value *= zeta upper_bound.append(upper_bound_value) return upper_bound