Source code for easygraph.readwrite.ucinet

"""
**************
UCINET DL
**************
Read and write graphs in UCINET DL format.
Writing currently supports only the 'fullmatrix' data format; reading also handles 'nodelist1', 'nodelist1b' and 'edgelist1'.
Format
------
The UCINET DL format is the most common file format used by the UCINET package.
Basic example:
DL N = 5
Data:
0 1 1 1 1
1 0 1 0 0
1 1 0 0 1
1 0 0 0 0
1 0 1 0 0
References
----------
    See UCINET User Guide or http://www.analytictech.com/ucinet/help/hs5000.htm
    for full format information. Short version on http://www.analytictech.com/networks/dataentry.htm
"""


import re
import shlex

import easygraph as eg
import numpy as np

from easygraph.utils import open_file


__all__ = ["generate_ucinet", "read_ucinet", "parse_ucinet", "write_ucinet"]


def generate_ucinet(G):
    """Generate lines in UCINET graph format.

    Parameters
    ----------
    G : graph
        An EasyGraph graph.

    Yields
    ------
    str
        In order: the ``dl n=... format=fullmatrix`` header, an optional
        ``labels:`` block (emitted only when the first node in sorted order
        cannot be converted with ``int``), the ``data:`` marker, and finally
        the whole adjacency matrix as a single multi-line string.

    Notes
    -----
    The default format 'fullmatrix' is used (for UCINET DL format).

    The matrix text is produced by NumPy's array printer, so very wide rows
    may be wrapped over several physical lines.

    References
    ----------
    See UCINET User Guide or http://www.analytictech.com/ucinet/help/hs5000.htm
    for full format information. Short version on
    http://www.analytictech.com/networks/dataentry.htm
    """
    import sys

    n = G.number_of_nodes()
    nodes = sorted(G.nodes)
    yield "dl n=%i format=fullmatrix" % n

    # Labels are only needed when the node names are not integer-like.
    # Guard against an empty graph, where there is no first node to probe.
    if nodes:
        try:
            int(nodes[0])
        except ValueError:
            yield "labels:\n" + "".join(label + " " for label in nodes)

    yield "data:"

    adjacency = np.asmatrix(eg.to_numpy_array(G, nodelist=nodes, dtype=int))
    # NumPy abbreviates large arrays with "..." when printing under its default
    # threshold, which would silently corrupt the output; lift the threshold
    # so every entry is written.
    with np.printoptions(threshold=sys.maxsize):
        matrix_text = str(adjacency)
    yield matrix_text.replace("[", " ").replace("]", " ").strip()
@open_file(0, mode="rb")
def read_ucinet(path, encoding="UTF-8"):
    """Read graph in UCINET format from path.

    Every line read from `path` is decoded with `encoding` and the decoded
    lines are handed to :func:`parse_ucinet`.

    Parameters
    ----------
    path : file or string
        File or filename to read.
        Filenames ending in .gz or .bz2 will be uncompressed.
    encoding : string, optional
        Text encoding used to decode the lines read from `path`.

    Returns
    -------
    G : EasyGraph MultiGraph or MultiDiGraph.
        The graph returned by :func:`parse_ucinet`.

    Examples
    --------
    >>> G=eg.path_graph(4)
    >>> eg.write_ucinet(G, "test.dl")
    >>> G=eg.read_ucinet("test.dl")

    To create a Graph instead of a MultiGraph use

    >>> G1=eg.Graph(G)

    See Also
    --------
    parse_ucinet()

    References
    ----------
    See UCINET User Guide or http://www.analytictech.com/ucinet/help/hs5000.htm
    for full format information. Short version on
    http://www.analytictech.com/networks/dataentry.htm
    """
    return parse_ucinet(raw_line.decode(encoding) for raw_line in path)
@open_file(1, mode="wb")
def write_ucinet(G, path, encoding="UTF-8"):
    """Write graph in UCINET format to path.

    Each chunk produced by :func:`generate_ucinet` is terminated with a
    newline, encoded with `encoding` and written to `path`.

    Parameters
    ----------
    G : graph
        An EasyGraph graph.
    path : file or string
        File or filename to write.
        Filenames ending in .gz or .bz2 will be compressed.
    encoding : string, optional
        Text encoding used for the bytes written to `path`.

    Examples
    --------
    >>> G=eg.path_graph(4)
    >>> eg.write_ucinet(G, "test.net")

    References
    ----------
    See UCINET User Guide or http://www.analytictech.com/ucinet/help/hs5000.htm
    for full format information. Short version on
    http://www.analytictech.com/networks/dataentry.htm
    """
    for chunk in generate_ucinet(G):
        path.write((chunk + "\n").encode(encoding))
def parse_ucinet(lines):
    """Parse UCINET format graph from string or iterable.

    Edges are read for the 'fullmatrix', 'nodelist1', 'nodelist1b' and
    'edgelist1' formats. For any other recognised ``format=`` value no edges
    are read.

    Parameters
    ----------
    lines : string or iterable
        Data in UCINET format. When an iterable is given its items are
        concatenated as-is (``bytes`` items are decoded as UTF-8), so each
        item is expected to carry its own line terminator.

    Returns
    -------
    G : EasyGraph graph
        Built with ``eg.MultiDiGraph`` as the graph class.

    Raises
    ------
    IndexError
        If the input contains no ``data:`` section.
    Exception
        Raised by :func:`get_param` when a header parameter has no
        recognisable value.

    Notes
    -----
    The whole input is lower-cased before parsing, so node labels come back
    lower-cased as well.

    See Also
    --------
    read_ucinet()

    References
    ----------
    See UCINET User Guide or http://www.analytictech.com/ucinet/help/hs5000.htm
    for full format information. Short version on
    http://www.analytictech.com/networks/dataentry.htm
    """
    from numpy import genfromtxt
    from numpy import isnan
    from numpy import reshape

    # Returned unchanged (apart from label handling below) when the declared
    # format has no reader.
    G = eg.MultiDiGraph()

    if not isinstance(lines, str):
        lines = "".join(
            line.decode("utf-8") if isinstance(line, bytes) else line
            for line in lines
        )

    lexer = shlex.shlex(lines.lower())
    lexer.whitespace += ",="  # "n=5, format=..." splits into bare tokens
    lexer.whitespace_split = True

    number_of_nodes = 0
    nr = 0  # number of rows (rectangular matrix)
    nc = 0  # number of columns (rectangular matrix)
    ucinet_format = "fullmatrix"  # Format by default
    labels = {}  # Contains labels of nodes
    row_labels_embedded = False  # Whether labels are embedded in data or not
    cols_labels_embedded = False
    KEYWORDS = ("format", "data:", "labels:")  # TODO remove ':' in keywords

    # Header scan. The lexer is a shared iterator: get_param() and the label
    # loop below pull further tokens from it while this loop is running.
    for token in lexer:
        if token.startswith("n"):
            if token.startswith("nr"):
                nr = int(get_param(r"\d+", token, lexer))
                number_of_nodes = max(nr, nc)
            elif token.startswith("nc"):
                nc = int(get_param(r"\d+", token, lexer))
                number_of_nodes = max(nr, nc)
            elif token.startswith("nm"):
                # The matrix count is not used; consume its value so it is
                # not interpreted as another header token.
                get_param(r"\d+", token, lexer)
            else:
                number_of_nodes = int(get_param(r"\d+", token, lexer))
                nr = number_of_nodes
                nc = number_of_nodes
        elif token.startswith("diagonal"):
            # The diagonal setting is not used; consume its value.
            get_param("present|absent", token, lexer)
        elif token.startswith("format"):
            ucinet_format = get_param(
                r"^(fullmatrix|upperhalf|lowerhalf|nodelist1|nodelist2|nodelist1b|"
                r"edgelist1|edgelist2|blockmatrix|partition)$",
                token,
                lexer,
            )
        elif token.startswith(("row", "column")):
            # TODO : row and columns labels
            pass
        elif token.startswith("labels"):
            token = next(lexer)
            i = 0
            while token not in KEYWORDS:
                if token.startswith("embedded"):
                    row_labels_embedded = True
                    cols_labels_embedded = True
                    break
                # Strip quotes used for labels with embedded spaces.
                labels[i] = token.replace('"', "")
                i += 1
                try:
                    token = next(lexer)
                except StopIteration:
                    break
        elif token.startswith("data"):
            break

    data_lines = lines.lower().split("data:", 1)[1]

    # Generate edges
    if cols_labels_embedded:
        # The first line after the "data:" line holds the column labels.
        labels = dict(zip(range(nc), data_lines.splitlines()[1].split()))
    # TODO rectangular case : labels can differ from rows to columns

    if ucinet_format == "fullmatrix":
        # Do not use splitlines() because it is not necessarily written as a
        # square matrix: feed everything as one row and reshape afterwards.
        data = genfromtxt([data_lines.encode("utf-8")], case_sensitive=False)
        if cols_labels_embedded or row_labels_embedded:
            # Embedded labels are parsed as NaN; drop them.
            data = data[~isnan(data)]
        mat = reshape(data, (max(number_of_nodes, nr), -1))
        G = eg.from_numpy_array(mat, create_using=eg.MultiDiGraph())

    elif ucinet_format in ("nodelist1", "nodelist1b"):
        # Expand every "source neighbor neighbor ..." row into edge lines.
        edge_lines = []
        for i, line in enumerate(data_lines.splitlines()):
            row = line.split()
            if not row:
                continue
            if ucinet_format == "nodelist1b" and row[0] == "0":
                continue
            # nodelist1 names the source explicitly; nodelist1b uses the
            # position of the line instead.
            source = row[0] if ucinet_format == "nodelist1" else str(i)
            edge_lines.extend(source + " " + neighbor for neighbor in row[1:])
        G = eg.parse_edgelist(
            edge_lines,
            nodetype=str if row_labels_embedded and cols_labels_embedded else int,
            create_using=eg.MultiDiGraph(),
        )
        if not row_labels_embedded or not cols_labels_embedded:
            # Shift the 1-based node numbers of the file to 0-based ones.
            G = eg.relabel_nodes(G, {node: node - 1 for node in G.nodes})

    elif ucinet_format == "edgelist1":
        G = eg.parse_edgelist(
            data_lines.splitlines(),
            nodetype=str if row_labels_embedded and cols_labels_embedded else int,
            create_using=eg.MultiDiGraph(),
        )
        if not row_labels_embedded or not cols_labels_embedded:
            # Shift the 1-based node numbers of the file to 0-based ones.
            G = eg.relabel_nodes(G, {node: node - 1 for node in G.nodes})

    # Relabel nodes
    if labels:
        try:
            if len(list(G.nodes)) < number_of_nodes:
                G.add_nodes_from(labels.values())
            G = eg.relabel_nodes(G, labels)
        except KeyError:
            pass  # Nodes already labelled
    return G
def get_param(regex, token, lines):
    """Get a parameter value in a UCINET DL file.

    The regex is searched in `token` first; while nothing matches, further
    tokens are pulled from `lines` and searched in turn.

    Parameters
    ----------
    regex : string
        Pattern matching the parameter value.
    token : string
        Token in which the search for the value starts.
    lines : iterator
        Source of the following tokens; it is advanced past every token that
        is examined.

    Returns
    -------
    string
        The text of the first match found.

    Raises
    ------
    Exception
        If `lines` is exhausted before any token matches.
    """
    search = re.compile(regex).search
    candidate = token
    while True:
        found = search(candidate)
        if found is not None:
            return found.group()
        try:
            candidate = next(lines)
        except StopIteration:
            raise Exception("Parameter %s value not recognized" % token)