Source code for easygraph.functions.graph_embedding.line

import time
import warnings

import easygraph as eg
import numpy as np
import torch
import torch.nn as nn

from easygraph.utils import alias_draw
from easygraph.utils import alias_setup
from sklearn import preprocessing

# from easygraph.functions.graph_embedding import *
from tqdm import tqdm


warnings.filterwarnings("ignore")


[docs] class LINE(nn.Module): """Graph embedding via LINE. Parameters ---------- G : easygraph.Graph or easygraph.DiGraph dimension: int walk_length: int walk_num: int negative: int batch_size: int init_alpha: float order: int Returns ------- embedding_vector : dict The embedding vector of each node Examples -------- >>> model = LINE( ... dimension=128, ... walk_length=80, ... walk_num=20, ... negative=5, ... batch_size=128, ... init_alpha=0.025, ... order=3 ) >>> model.train() >>> emb = model(g, return_dict=True) # g: easygraph.Graph or easygraph.DiGraph References ---------- .. [1] Tang, J., Qu, M., Wang, M., Zhang, M., Yan, J., & Mei, Q. (2015, May). Line: Large-scale information network embedding. In Proceedings of the 24th international conference on world wide web (pp. 1067-1077). https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/frp0228-Tang.pdf """
[docs] @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument( "--walk-length", type=int, default=80, help="Length of walk per source. Default is 80.", ) parser.add_argument( "--walk-num", type=int, default=20, help="Number of walks per source. Default is 20.", ) parser.add_argument( "--negative", type=int, default=5, help="Number of negative node in sampling. Default is 5.", ) parser.add_argument( "--batch-size", type=int, default=1000, help="Batch size in SGD training process. Default is 1000.", ) parser.add_argument( "--alpha", type=float, default=0.025, help="Initial learning rate of SGD. Default is 0.025.", ) parser.add_argument( "--order", type=int, default=3, help="Order of proximity in LINE. Default is 3 for 1+2.", ) parser.add_argument("--hidden-size", type=int, default=128)
[docs] @classmethod def build_model_from_args(cls, args): return cls( args.hidden_size, args.walk_length, args.walk_num, args.negative, args.batch_size, args.alpha, args.order, )
def __init__( self, dimension=128, walk_length=80, walk_num=20, negative=5, batch_size=128, init_alpha=0.025, order=3, ): super(LINE, self).__init__() self.dimension = dimension self.walk_length = walk_length self.walk_num = walk_num self.negative = negative self.batch_size = batch_size self.init_alpha = init_alpha self.order = order
[docs] def forward(self, g, return_dict=True): # run LINE algorithm, 1-order, 2-order or 3(1-order + 2-order) self.G = g self.is_directed = g.is_directed() self.num_node = g.size() self.num_edge = g.number_of_edges() self.num_sampling_edge = self.walk_length * self.walk_num * self.num_node node2id = dict([(node, vid) for vid, node in enumerate(g.nodes)]) self.edges = [[node2id[e[0]], node2id[e[1]]] for e in self.G.edges] self.edges_prob = np.asarray([1.0 for e in g.edges]) self.edges_prob /= np.sum(self.edges_prob) self.edges_table, self.edges_prob = alias_setup(self.edges_prob) degree_weight = np.asarray([0] * self.num_node) degree_weight = np.array(list(g.degree(node2id[u] for u in g.nodes).values())) # for u,v in g.edges: # degree_weight[node2id[u]] += 1.0 # if not self.is_directed: # degree_weight[node2id[v]] += 1.0 self.node_prob = np.power(degree_weight, 0.75) self.node_prob /= np.sum(self.node_prob) self.node_table, self.node_prob = alias_setup(self.node_prob) if self.order == 3: self.dimension = int(self.dimension / 2) if self.order == 1 or self.order == 3: print("train line with 1-order") print(type(self.dimension)) self.emb_vertex = ( np.random.random((self.num_node, self.dimension)) - 0.5 ) / self.dimension self._train_line(order=1) embedding1 = preprocessing.normalize(self.emb_vertex, "l2") if self.order == 2 or self.order == 3: print("train line with 2-order") self.emb_vertex = ( np.random.random((self.num_node, self.dimension)) - 0.5 ) / self.dimension self.emb_context = self.emb_vertex self._train_line(order=2) embedding2 = preprocessing.normalize(self.emb_vertex, "l2") if self.order == 1: embeddings = embedding1 elif self.order == 2: embeddings = embedding2 else: print("concatenate two embedding...") embeddings = np.hstack((embedding1, embedding2)) if return_dict: features_matrix = dict() for vid, node in enumerate(g.nodes): features_matrix[node] = embeddings[vid] else: features_matrix = np.zeros((g.num_nodes, embeddings.shape[1])) nx_nodes = g.nodes() features_matrix[nx_nodes] = embeddings[np.arange(g.num_nodes)] return features_matrix
def _update(self, vec_u, vec_v, vec_error, label): # update vetex embedding and vec_error f = 1 / (1 + np.exp(-np.sum(vec_u * vec_v, axis=1))) g = (self.alpha * (label - f)).reshape((len(label), 1)) vec_error += g * vec_v vec_v += g * vec_u def _train_line(self, order): # train Line model with order self.alpha = self.init_alpha batch_size = self.batch_size t0 = time.time() num_batch = int(self.num_sampling_edge / batch_size) epoch_iter = tqdm(range(num_batch)) for b in epoch_iter: if b % 100 == 0: epoch_iter.set_description( # f"Progress: {b * 1.0 / num_batch * 100:.4f}, alpha: {self.alpha:.6f}, time: {time.time() - t0:.4f}" ) self.alpha = self.init_alpha * max((1 - b * 1.0 / num_batch), 0.0001) u, v = [0] * batch_size, [0] * batch_size for i in range(batch_size): edge_id = alias_draw(self.edges_table, self.edges_prob) u[i], v[i] = self.edges[edge_id] if not self.is_directed and np.random.rand() > 0.5: v[i], u[i] = self.edges[edge_id] vec_error = np.zeros((batch_size, self.dimension)) label, target = np.asarray([1 for i in range(batch_size)]), np.asarray(v) for j in range(1 + self.negative): if j != 0: label = np.asarray([0 for i in range(batch_size)]) for i in range(batch_size): target[i] = alias_draw(self.node_table, self.node_prob) if order == 1: self._update( self.emb_vertex[u], self.emb_vertex[target], vec_error, label ) else: self._update( self.emb_vertex[u], self.emb_context[target], vec_error, label ) self.emb_vertex[u] += vec_error
if __name__ == "__main__": dataset = eg.CiteseerGraphDataset( force_reload=True ) # Download CiteseerGraphDataset contained in EasyGraph num_classes = dataset.num_classes g = dataset[0] labels = g.ndata["label"] edge_list = [] for i in g.edges: edge_list.append((i[0], i[1])) g1 = eg.Graph() g1.add_edges_from(edge_list) # print(g.edges) # print(g.__dir__()) model = LINE( dimension=128, walk_length=80, walk_num=20, negative=5, batch_size=128, init_alpha=0.025, order=3, ) print(model) model.train() out = model(g1, return_dict=True) keylist = sorted(out) tmp = torch.cat( ( torch.unsqueeze(torch.tensor(out[keylist[0]]), -2), torch.unsqueeze(torch.tensor(out[keylist[1]]), -2), ), 0, ) for i in range(2, len(keylist)): tmp = torch.cat((tmp, torch.unsqueeze(torch.tensor(out[keylist[i]]), -2)), 0) torch.save(tmp, "line.emb") print(tmp, tmp.shape) line_emb = [] for i in range(0, len(tmp)): line_emb.append(list(tmp[i])) line_emb = np.array(line_emb) # tsne = TSNE(n_components=2) # z = tsne.fit_transform(line_emb) # z_data = np.vstack((z.T, labels)).T # df_tsne = pd.DataFrame(z_data, columns=['Dim1', 'Dim2', 'class']) # df_tsne['class'] = df_tsne['class'].astype(int) # df_tsne.head() # # plt.figure(figsize=(8, 8)) # sns.scatterplot(data=df_tsne, hue='class', x='Dim1', y='Dim2', palette=['green','orange','brown','red', 'blue','black']) # plt.savefig('torch_line_citeseer.pdf', bbox_inches='tight') # plt.show() # #