# Source code for easygraph.datasets.wiki_topcats

"""Wikipedia Top Categories Dataset (wiki-topcats)

This dataset is a directed hyperlink graph of Wikipedia articles. It covers
the largest strongly connected component of the link graph, restricted to
articles in the top categories (those containing at least 100 articles).

Statistics:
- Nodes: 1,791,489
- Edges: 28,511,807
- Categories: 17,364
- Overlapping labels per node

Source:
H. Yin, A. Benson, J. Leskovec, D. Gleich.
"Local Higher-order Graph Clustering", KDD 2017
Data: https://snap.stanford.edu/data/wiki-topcats.html
"""

import gzip
import os

import easygraph as eg

from easygraph.datasets.graph_dataset_base import EasyGraphBuiltinDataset
from easygraph.datasets.utils import download
from easygraph.datasets.utils import extract_archive



# [docs]  (Sphinx "viewcode" link marker left over from HTML extraction; not code)
class WikiTopCatsDataset(EasyGraphBuiltinDataset):
    """Wikipedia Top Categories Snapshot from 2011 (SNAP).

    The dataset contains exactly one directed graph. After ``process()`` every
    node of that graph carries two attributes:

    - ``name``: the article title (``""`` when the names file has no entry);
    - ``label``: the list of category strings the article belongs to
      (``[]`` when the article is in none of the listed categories).

    Parameters
    ----------
    raw_dir : str, optional
        Forwarded unchanged to ``EasyGraphBuiltinDataset``.
    force_reload : bool, optional
        Forwarded unchanged to ``EasyGraphBuiltinDataset``.
    verbose : bool, optional
        Forwarded to the base class; ``process()`` prints a one-line summary
        when ``self.verbose`` is truthy.
    transform : callable, optional
        Forwarded to the base class and, if the base class exposes it as
        ``_transform``, applied to the graph returned by ``__getitem__``.
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        super().__init__(
            name="wiki_topcats",
            url="https://snap.stanford.edu/data/wiki-topcats.txt.gz",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        """Fetch the edge list, the category file and the page-name file.

        All three gzip archives are stored under ``self.raw_dir`` using the
        file names ``process()`` reads them back from.
        """
        # Main graph file (its URL was handed to the base class in __init__).
        gz_path = os.path.join(self.raw_dir, "wiki-topcats.txt.gz")
        download(self.url, path=gz_path)

        # Companion files: category membership and article titles.
        cat_url = "https://snap.stanford.edu/data/wiki-topcats-categories.txt.gz"
        names_url = "https://snap.stanford.edu/data/wiki-topcats-page-names.txt.gz"
        download(
            cat_url, path=os.path.join(self.raw_dir, "wiki-topcats-categories.txt.gz")
        )
        download(
            names_url, path=os.path.join(self.raw_dir, "wiki-topcats-page-names.txt.gz")
        )

    def process(self):
        """Build the directed graph and attach ``name`` / ``label`` to nodes.

        Raises
        ------
        ValueError
            If a non-empty, non-comment edge line is not exactly two integers,
            or if a category line lists a member that is not an integer.
        """
        raw = self.raw_dir

        # Decompress (once) and read edges.
        edge_txt = self._decompress_edges(raw)
        G = eg.DiGraph()
        edge_count = 0
        with open(edge_txt, "r", encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                # Tolerate blank lines and SNAP-style "#" header comments.
                if not fields or fields[0].startswith("#"):
                    continue
                u, v = map(int, fields)
                G.add_edge(u, v)
                edge_count += 1
        if self.verbose:
            print(f"Loaded graph: {G.number_of_nodes()} nodes, {edge_count} edges")

        # Load article titles and category memberships.
        names = self._read_page_names(
            os.path.join(raw, "wiki-topcats-page-names.txt.gz")
        )
        labels = self._read_categories(
            os.path.join(raw, "wiki-topcats-categories.txt.gz")
        )

        # Re-adding an existing node is used here to set its attributes.
        for n in G.nodes:
            G.add_node(n, name=names.get(n, ""), label=labels.get(n, []))

        self._graph = G
        self._graphs = [G]
        self._processed = True

    @staticmethod
    def _decompress_edges(raw_dir):
        """Return the path of the decompressed edge list, creating it if absent."""
        edge_gz = os.path.join(raw_dir, "wiki-topcats.txt.gz")
        edge_txt = os.path.join(raw_dir, "wiki-topcats.txt")
        if not os.path.exists(edge_txt):
            # Write to a scratch file and rename at the end, so an interrupted
            # run can never leave a truncated file that later runs would
            # mistake for the complete edge list.
            partial = edge_txt + ".part"
            with gzip.open(edge_gz, "rb") as fin, open(partial, "wb") as fout:
                fout.writelines(fin)
            os.replace(partial, edge_txt)
        return edge_txt

    @staticmethod
    def _read_page_names(path):
        """Return ``{node_id: title}`` parsed from the gzipped page-names file."""
        names = {}
        with gzip.open(path, "rt", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                text = line.strip()
                if not text:
                    continue
                # SNAP lines look like "<node id> <title>"; store the title
                # without the id prefix.
                head, _, title = text.partition(" ")
                if head.isdigit():
                    names[int(head)] = title
                else:
                    # No id prefix: fall back to the line position as the id.
                    names[idx] = text
        return names

    @staticmethod
    def _read_categories(path):
        """Return ``{node_id: [category, ...]}`` from the gzipped category file."""
        labels = {}  # mapping: node -> list of category strings
        with gzip.open(path, "rt", encoding="utf-8") as f:
            for line in f:
                # SNAP lines look like "Category:Name; id id id ...": one
                # category per line followed by the ids of its member nodes.
                category, _, members = line.partition(";")
                category = category.strip()
                if not category:
                    continue
                for token in members.split():
                    labels.setdefault(int(token), []).append(category)
        return labels

    def __getitem__(self, idx):
        """Return the single graph of the dataset.

        Raises
        ------
        IndexError
            If ``idx`` is not ``0``. An explicit raise (rather than ``assert``)
            survives ``python -O`` and lets index-based iteration terminate.
        """
        if idx != 0:
            raise IndexError("WikiTopCatsDataset contains a single graph; idx must be 0")
        # NOTE(review): assumes the base class keeps the ``transform`` argument
        # as ``_transform`` — confirm. When it is absent or None the graph is
        # returned untouched.
        transform = getattr(self, "_transform", None)
        if transform is None:
            return self._graph
        return transform(self._graph)

    def __len__(self):
        """Return the number of graphs in the dataset (always 1)."""
        return 1