Source code for easygraph.datasets.facebook_ego

"""Facebook Ego-Net Dataset

This dataset contains a subset of Facebook’s social network collected from
survey participants in the SNAP EgoNet project. Nodes represent users, and
edges indicate friendship links between them.

Each ego network is centered on a user and includes their friend connections
and friend-to-friend connections. The `.circles` files contain labeled groups
(i.e., communities) of friends identified by the ego user.

This version processes all ego-nets as a single undirected graph. Node features
are not provided. Labels (circles) are optional and not included by default.

Statistics (based on merged graph):
- Nodes: ~4,000+
- Edges: ~88,000+
- Features: None
- Classes: None

Reference:
J. McAuley and J. Leskovec, “Learning to Discover Social Circles in Ego Networks,”
in NIPS, 2012. [https://snap.stanford.edu/data/egonets-Facebook.html]
"""

import os

import easygraph as eg

from easygraph.classes.graph import Graph

from .graph_dataset_base import EasyGraphBuiltinDataset
from .utils import download
from .utils import extract_archive



[docs]
class FacebookEgoNetDataset(EasyGraphBuiltinDataset):
    r"""Facebook Ego-Net social network dataset.

    Each node is a user, and edges represent friendship. The dataset
    includes 10 ego networks centered on different users.

    Parameters
    ----------
    raw_dir : str, optional
        Directory to store the raw downloaded files. Default: None
    force_reload : bool, optional
        Whether to re-download and process the dataset. Default: False
    verbose : bool, optional
        Whether to print detailed processing logs. Default: True
    transform : callable, optional
        Optional transform to apply on the graph.

    Examples
    --------
    >>> from easygraph.datasets import FacebookEgoNetDataset
    >>> dataset = FacebookEgoNetDataset()
    >>> g = dataset[0]
    >>> print("Nodes:", g.number_of_nodes())
    >>> print("Edges:", g.number_of_edges())
    """

    def __init__(self, raw_dir=None, force_reload=False, verbose=True, transform=None):
        name = "facebook"
        url = "https://snap.stanford.edu/data/facebook.tar.gz"
        super(FacebookEgoNetDataset, self).__init__(
            name=name,
            url=url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )


[docs]
    def process(self):
        parent_dir = os.path.join(self.raw_path, "facebook")
        g = eg.Graph()

        # Iterate over all .edges files in the subdirectory
        for filename in os.listdir(parent_dir):
            if filename.endswith(".edges"):
                edge_file = os.path.join(parent_dir, filename)

                with open(edge_file, "r") as f:
                    for line in f:
                        u, v = map(int, line.strip().split())
                        g.add_edge(u, v)

        self._g = g
        self._num_nodes = g.number_of_nodes()
        self._num_edges = g.number_of_edges()

        if self.verbose:
            print("Finished loading Facebook Ego-Net dataset.")
            print(f"  NumNodes: {self._num_nodes}")
            print(f"  NumEdges: {self._num_edges}")


    def __getitem__(self, idx):
        assert idx == 0, "FacebookEgoNetDataset only contains one merged graph"
        return self._g if self._transform is None else self._transform(self._g)

    def __len__(self):
        return 1


[docs]
    def download(self):
        r"""Automatically download data and extract it."""
        if self.url is not None:
            archive_path = os.path.join(self.raw_dir, self.name + ".tar.gz")
            download(self.url, path=archive_path)
            extract_archive(archive_path, self.raw_path)