Source code for easygraph.datasets.web_google

"""Web-Google Dataset

This dataset is a web graph of Google web pages and their hyperlink
structure, released by Google in 2002 as part of the Google Programming
Contest and distributed through the Stanford SNAP collection.

Each node represents a web page, and a directed edge from u to v indicates
a hyperlink from page u to page v.

Statistics:
- Nodes: 875713
- Edges: 5105039
- Features: None
- Labels: None

Reference:
J. Leskovec, A. Rajaraman, J. Ullman, “Mining of Massive Datasets.”
Dataset from SNAP: https://snap.stanford.edu/data/web-Google.html
"""

import gzip
import os
import shutil

import easygraph as eg

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import download


class WebGoogleDataset(EasyGraphBuiltinDataset):
    r"""Web-Google hyperlink network dataset.

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.

    Examples
    --------
    >>> from easygraph.datasets import WebGoogleDataset
    >>> dataset = WebGoogleDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        name = "web-Google"
        url = "https://snap.stanford.edu/data/web-Google.txt.gz"
        super(WebGoogleDataset, self).__init__(
            name=name,
            url=url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        graph = eg.DiGraph()  # Web-Google is directed
        edge_list_path = os.path.join(self.raw_path, self.name + ".txt")
        with open(edge_list_path, "r") as f:
            for line in f:
                if line.startswith("#") or line.strip() == "":
                    continue
                u, v = map(int, line.strip().split())
                graph.add_edge(u, v)
        self._g = graph
        self._num_nodes = graph.number_of_nodes()
        self._num_edges = graph.number_of_edges()
        if self.verbose:
            print("Finished loading Web-Google dataset.")
            print(f"  NumNodes: {self._num_nodes}")
            print(f"  NumEdges: {self._num_edges}")
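
    # A possible alternative (a sketch, not used above): stream edges straight
    # from the compressed download instead of decompressing to disk first,
    # where ``compressed_path`` would point at the downloaded ``.txt.gz``.
    # This assumes eg.DiGraph exposes a NetworkX-style ``add_edges_from``
    # accepting an iterable of (u, v) pairs:
    #
    #     with gzip.open(compressed_path, "rt") as f:
    #         graph.add_edges_from(
    #             tuple(map(int, line.split()))
    #             for line in f
    #             if line.strip() and not line.startswith("#")
    #         )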

    def __getitem__(self, idx):
        assert idx == 0, "WebGoogleDataset only contains one graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        return 1

    def download(self):
        r"""Download and decompress the .txt.gz file."""
        if self.url is not None:
            compressed_path = os.path.join(self.raw_dir, self.name + ".txt.gz")
            extracted_path = os.path.join(self.raw_path, self.name + ".txt")
            # Download .gz file
            download(self.url, path=compressed_path)
            # Ensure output directory exists
            if not os.path.exists(self.raw_path):
                os.makedirs(self.raw_path)
            # Decompress manually
            with gzip.open(compressed_path, "rb") as f_in:
                with open(extracted_path, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
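

if __name__ == "__main__":
    # Minimal smoke test mirroring the docstring example above; a sketch only,
    # since the first run downloads the dataset from snap.stanford.edu.
    dataset = WebGoogleDataset()
    g = dataset[0]
    print("Nodes:", g.number_of_nodes())
    print("Edges:", g.number_of_edges())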