Source code for easygraph.datasets.github

"""GitHub Users Social Network Dataset (musae_git)

This dataset represents a directed social network of GitHub users collected in 2019.
Nodes represent GitHub developers, and a directed edge from user A to user B indicates that A follows B.

Each node also includes:
- Features: User profile and activity-based features.
- Labels: Binary developer type read from the ``ml_target`` column (web vs. machine learning developer).

Statistics:
- Nodes: 37,700
- Edges: 289,003
- Feature dim: 5,575
- Classes: 2

Reference:
J. Leskovec et al. "SNAP Datasets: Stanford Large Network Dataset Collection",
https://snap.stanford.edu/data/github-social.html
"""

import csv
import json
import os

import easygraph as eg
import numpy as np

from easygraph.classes.graph import Graph

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import download
from .utils import extract_archive


class GitHubUsersDataset(EasyGraphBuiltinDataset):
    r"""GitHub developers social graph (musae_git).

    The dataset contains a single directed graph built from the
    ``git_web_ml`` archive. Every node that appears in the edge list carries
    a ``feat`` attribute (a ``numpy.float32`` array) and an integer ``label``
    attribute (``-1`` when the node has no row in the target file).

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store raw data. Default: None
    force_reload : bool, optional
        Force re-download and processing. Default: False
    verbose : bool, optional
        Print processing information. Default: True
    transform : callable, optional
        Transform to apply to the graph on load.

    Examples
    --------
    >>> from easygraph.datasets import GitHubUsersDataset
    >>> dataset = GitHubUsersDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    >>> print("Feature shape:", g.nodes[0]['feat'].shape)
    >>> print("Label:", g.nodes[0]['label'])
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        name = "musae_git"
        url = "https://snap.stanford.edu/data/git_web_ml.zip"
        super().__init__(
            name=name,
            url=url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        """Download the dataset archive and extract it into ``self.raw_path``."""
        archive = os.path.join(self.raw_dir, self.name + ".zip")
        # ``download`` resolves to the module-level helper imported from
        # ``.utils``; the method name is not visible as a bare name here.
        download(self.url, path=archive)
        extract_archive(archive, self.raw_path)

    def process(self):
        """Build the graph from the extracted raw files.

        Reads ``musae_git_features.json``, ``musae_git_target.csv`` and
        ``musae_git_edges.csv`` from the ``git_web_ml`` directory under
        ``self.raw_path``, then stores the resulting graph in ``self._g`` and
        the number of distinct label values in ``self._num_classes``.

        Raises
        ------
        KeyError
            If a node referenced by an edge has no entry in the features file.
        """
        base_path = os.path.join(self.raw_path, "git_web_ml")

        # Load node features (JSON object keyed by the node id as a string).
        features_path = os.path.join(base_path, "musae_git_features.json")
        with open(features_path, "r", encoding="utf-8") as f:
            features = json.load(f)

        # Load labels. ``newline=""`` lets the csv module handle line endings.
        labels = {}
        target_path = os.path.join(base_path, "musae_git_target.csv")
        with open(target_path, "r", encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                labels[int(row["id"])] = int(row["ml_target"])

        # Load edges.
        g = eg.DiGraph()
        edges_path = os.path.join(base_path, "musae_git_edges.csv")
        with open(edges_path, "r", encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                u, v = int(row["id_1"]), int(row["id_2"])
                g.add_edge(u, v)

        # Add node attributes. Iterate over a snapshot of the node ids because
        # ``add_node`` is called on the same graph inside the loop.
        feat = None
        for node_id in list(g.nodes):
            feat = np.array(features[str(node_id)], dtype=np.float32)
            label = labels.get(node_id, -1)
            g.add_node(node_id, feat=feat, label=label)

        self._g = g
        self._num_classes = len(set(labels.values()))

        if self.verbose:
            # The reported dimension is the length of the last processed
            # node's feature array; an empty graph reports 0 instead of
            # failing on an unbound ``feat``.
            feat_dim = feat.shape[0] if feat is not None else 0
            print("Finished loading GitHub Users dataset.")
            print(f"  NumNodes: {g.number_of_nodes()}")
            print(f"  NumEdges: {g.number_of_edges()}")
            print(f"  Feature dim: {feat_dim}")
            print(f"  NumClasses: {self._num_classes}")

    def __getitem__(self, idx):
        """Return the single graph, passed through the transform if one is set.

        Only ``idx == 0`` is accepted.
        """
        # NOTE(review): kept as ``assert`` so callers see the same exception
        # type as before; be aware that asserts are skipped under ``python -O``.
        assert idx == 0, "GitHubUsersDataset only contains one graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        """Return the number of graphs in the dataset (always 1)."""
        return 1

    @property
    def num_classes(self):
        """Number of distinct label values found in the target file."""
        return self._num_classes