Source code for easygraph.datasets.hypergraph.loadDeepSetDatasets
import os.path as osp
import numpy as np
import scipy.sparse as sp
import torch
from torch_geometric.data import Data
from torch_sparse import coalesce
__all__ = ["load_line_expansion_dataset"]
def load_line_expansion_dataset(
    path=None, dataset="cocitation-cora", train_percent=0.5
):
    """Load a hypergraph stored as a ``.content`` / ``.edges`` file pair.

    Both files are read from ``<path>/<dataset>/``:

    * ``<dataset>.content`` -- one row per id, parsed as
      ``id, feature_1, ..., feature_k, label``.
    * ``<dataset>.edges`` -- one ``(node id, hyperedge id)`` pair per row.

    Ids in the ``.edges`` file are remapped to the row position of that id
    in the ``.content`` file.  After remapping, node indices must be
    ``0 .. num_nodes - 1`` and hyperedge indices must follow immediately
    after them with no gaps.

    Parameters
    ----------
    path : str or path-like
        Directory that contains the ``<dataset>`` folder.  The default
        ``None`` is not usable and raises ``TypeError``.
    dataset : str
        Name of the dataset folder, also used as the stem of both file names.
    train_percent : float
        Stored unchanged on the returned object as ``train_percent``.

    Returns
    -------
    torch_geometric.data.Data
        ``x`` and ``y`` hold the features and labels of the first
        ``num_nodes`` rows of the ``.content`` file.  ``edge_index`` holds
        every (node, hyperedge) pair in both directions after ``coalesce``;
        ``edge_attr`` is whatever ``coalesce`` returns for the (absent) edge
        values.  ``n_x`` is the number of nodes and ``num_hyperedges`` the
        number of hyperedges.

    Raises
    ------
    TypeError
        If ``path`` is ``None``.
    ValueError
        If the ``.edges`` file does not have exactly two columns, or if the
        remapped indices do not follow the layout described above.  (These
        checks used to be ``assert`` statements, which are skipped when
        Python runs with ``-O``.)
    """
    if path is None:
        # osp.join(None, ...) would raise TypeError anyway; say why instead.
        raise TypeError(
            "path must be the directory containing the dataset folder, not None"
        )

    # load edges, features, and labels.
    print("Loading {} dataset...".format(dataset))
    dataset_dir = osp.join(path, dataset)

    # Ids, features and labels share one table, so read everything as text
    # first and convert column by column.
    content = np.genfromtxt(
        osp.join(dataset_dir, f"{dataset}.content"), dtype=np.dtype(str)
    )
    features = content[:, 1:-1].astype(np.float32)
    # Labels go through float first so that values such as "3.0" parse too.
    label_ids = content[:, -1].astype(float).astype(np.int64)
    print("load features")

    # build graph
    raw_ids = np.array(content[:, 0], dtype=np.int32)
    position_of = {raw_id: row for row, raw_id in enumerate(raw_ids)}

    # genfromtxt returns a 1-D array for a one-line file; keep it 2-D.
    edges_unordered = np.atleast_2d(
        np.genfromtxt(osp.join(dataset_dir, f"{dataset}.edges"), dtype=np.int32)
    )
    if edges_unordered.ndim != 2 or edges_unordered.shape[1] != 2:
        raise ValueError(
            f"{dataset}.edges must contain two columns (node id, hyperedge id); "
            f"got an array of shape {edges_unordered.shape}"
        )
    edges = np.array(
        [position_of.get(raw_id) for raw_id in edges_unordered.flatten()],
        dtype=np.int32,
    ).reshape(edges_unordered.shape)
    print("load edges")

    # Row 0: node indices, row 1: hyperedge indices.
    edge_index = edges.T

    highest_node = edge_index[0].max()
    lowest_hyperedge = edge_index[1].min()
    if highest_node != lowest_hyperedge - 1:
        raise ValueError(
            "hyperedge indices must start right after the last node index "
            f"(last node index: {highest_node}, "
            f"first hyperedge index: {lowest_hyperedge})"
        )

    # Every index from 0 to the maximum must occur, i.e. no node or
    # hyperedge id may be missing.
    num_ids = len(np.unique(edge_index))
    if num_ids != edge_index.max() + 1:
        raise ValueError(
            "node/hyperedge indices must be consecutive: found "
            f"{num_ids} distinct indices but the largest is {edge_index.max()}"
        )

    num_nodes = highest_node + 1
    num_he = edge_index[1].max() - num_nodes + 1

    # Add the reversed pairs so that both directions are present.
    edge_index = np.hstack((edge_index, edge_index[::-1, :]))

    # build torch data class
    data = Data(
        x=torch.FloatTensor(features[:num_nodes]),
        edge_index=torch.LongTensor(edge_index),
        y=torch.LongTensor(label_ids)[:num_nodes],
    )

    # coalesce also sorts edge_index and removes duplicates.  Appending the
    # reversed pairs added no new index values, so num_ids is still valid.
    data.edge_index, data.edge_attr = coalesce(
        data.edge_index, None, num_ids, num_ids
    )

    # add parameters to attribute
    data.n_x = num_nodes
    data.train_percent = train_percent
    data.num_hyperedges = num_he
    return data