Source code for easygraph.datasets.arxiv
"""Arxiv HEP-TH Citation Network
This dataset represents the citation network of preprints from the High Energy Physics - Theory (HEP-TH) category on arXiv, covering the period from January 1993 to April 2003.
Each node corresponds to a paper, and a directed edge from paper A to paper B indicates that A cites B.
No features or labels are included in this dataset.
Statistics:
- Nodes: 27,770
- Edges: 352,807
- Features: None
- Labels: None
Reference:
J. Leskovec, J. Kleinberg and C. Faloutsos, "Graphs over Time: Densification Laws, Shrinking Diameters and Possible Explanations,"
in KDD 2005. Dataset: https://snap.stanford.edu/data/cit-HepTh.html
"""
import gzip
import os
import shutil
import easygraph as eg
from easygraph.classes.graph import Graph
from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import download
[docs]
class ArxivHEPTHDataset(EasyGraphBuiltinDataset):
    r"""Arxiv HEP-TH citation network dataset.

    The dataset contains a single directed graph built from the SNAP
    ``cit-HepTh`` edge list: every non-comment line ``u v`` of the file
    becomes the directed edge ``u -> v`` (paper ``u`` cites paper ``v``).

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.

    Examples
    --------
    >>> from easygraph.datasets import ArxivHEPTHDataset
    >>> dataset = ArxivHEPTHDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        # The dataset name and download URL are fixed; every other argument
        # is forwarded to the base class unchanged.
        super().__init__(
            name="cit-HepTh",
            url="https://snap.stanford.edu/data/cit-HepTh.txt.gz",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        r"""Download and decompress the .txt.gz file.

        The archive is saved as ``<raw_dir>/<name>.txt.gz`` and its
        decompressed content is written to ``<raw_path>/<name>.txt``.
        """
        compressed_path = os.path.join(self.raw_dir, self.name + ".txt.gz")
        extracted_path = os.path.join(self.raw_path, self.name + ".txt")

        # ``download`` here resolves to the module-level helper imported from
        # ``.utils``, not to this method (class scope is not searched from
        # inside a method body).
        download(self.url, path=compressed_path)

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.raw_path, exist_ok=True)

        with gzip.open(compressed_path, "rb") as f_in, open(
            extracted_path, "wb"
        ) as f_out:
            shutil.copyfileobj(f_in, f_out)

    def process(self):
        r"""Build the directed citation graph from the extracted edge list.

        Blank lines and lines whose first non-blank character is ``#`` are
        skipped. Every remaining line must hold exactly two integer node
        ids separated by whitespace.

        The graph is stored in ``self._g`` and its node/edge counts in
        ``self._num_nodes`` / ``self._num_edges``.

        Raises
        ------
        ValueError
            If a data line does not consist of exactly two integers. The
            message includes the file path and 1-based line number.
        """
        graph = eg.DiGraph()  # Citation network is directed
        edge_list_path = os.path.join(self.raw_path, self.name + ".txt")

        # Explicit encoding keeps parsing independent of the platform's
        # default locale encoding.
        with open(edge_list_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                record = line.strip()
                # Strip first so that indented comment lines are skipped too.
                if not record or record.startswith("#"):
                    continue
                try:
                    u, v = map(int, record.split())
                except ValueError as err:
                    # Same exception type as before, but pointing at the
                    # offending line of the file.
                    raise ValueError(
                        f"{edge_list_path}:{line_no}: expected two integer "
                        f"node ids, got {record!r}"
                    ) from err
                graph.add_edge(u, v)

        self._g = graph
        self._num_nodes = graph.number_of_nodes()
        self._num_edges = graph.number_of_edges()

        if self.verbose:
            print("Finished loading Arxiv HEP-TH dataset.")
            print(f"  NumNodes: {self._num_nodes}")
            print(f"  NumEdges: {self._num_edges}")

    def __getitem__(self, idx):
        r"""Return the only graph of the dataset.

        Parameters
        ----------
        idx : int
            Must be ``0``.

        Returns
        -------
        The stored graph, passed through ``self._transform`` when a
        transform is set.

        Raises
        ------
        AssertionError
            If ``idx`` is not ``0``.
        """
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type and message are
        # kept for backward compatibility.
        if idx != 0:
            raise AssertionError("ArxivHEPTHDataset only contains one graph")
        if self._transform is None:
            return self._g
        return self._transform(self._g)

    def __len__(self):
        r"""Return the number of graphs in the dataset, which is always 1."""
        return 1