From 145a5b2a69fa67570ab0f28358e0c5883200a9ae Mon Sep 17 00:00:00 2001 From: wyatt-joyner-pometry Date: Fri, 22 Nov 2024 17:46:35 +0000 Subject: [PATCH] add fast_rp algorithm Added Rust implementation; Rust test; Python integration; Python test --- python/python/raphtory/__init__.pyi | 39 ++ .../python/raphtory/algorithms/__init__.pyi | 23 + python/python/raphtory/graphql/__init__.pyi | 22 +- .../tests/graphql/edit_graph/test_graphql.py | 133 ++++-- python/tests/test_algorithms.py | 170 +++++++ python/tests/test_graphdb/test_graphdb.py | 181 ++++---- raphtory/src/algorithms/embeddings/fast_rp.rs | 427 ++++++++++++++++++ raphtory/src/algorithms/embeddings/mod.rs | 3 + raphtory/src/algorithms/mod.rs | 1 + raphtory/src/python/graph/algorithm_result.rs | 6 + raphtory/src/python/packages/algorithms.rs | 33 ++ raphtory/src/python/packages/base_modules.rs | 1 + 12 files changed, 921 insertions(+), 118 deletions(-) create mode 100644 raphtory/src/algorithms/embeddings/fast_rp.rs create mode 100644 raphtory/src/algorithms/embeddings/mod.rs diff --git a/python/python/raphtory/__init__.pyi b/python/python/raphtory/__init__.pyi index 7a6ad44fc..d000b539d 100644 --- a/python/python/raphtory/__init__.pyi +++ b/python/python/raphtory/__init__.pyi @@ -1235,6 +1235,25 @@ class Graph(GraphView): path (str): The path to the cache file """ + def create_node( + self, + timestamp: TimeInput, + id: str | int, + properties: Optional[PropInput] = None, + node_type: Optional[str] = None, + ) -> MutableNode: + """ + Creates a new node with the given id and properties to the graph. It fails if the node already exists. + + Arguments: + timestamp (TimeInput): The timestamp of the node. + id (str|int): The id of the node. + properties (PropInput, optional): The properties of the node. + node_type (str, optional): The optional string which will be used as a node type + Returns: + MutableNode: The created node + """ + @staticmethod def deserialise(bytes: bytes): """ @@ -3506,6 +3525,26 @@ class PersistentGraph(GraphView): path (str): The path to the cache file """ + def create_node( + self, + timestamp: TimeInput, + id: str | int, + properties: dict = None, + node_type: str = None, + ): + """ + Creates a new node with the given id and properties to the graph. It fails if the node already exists. + + Arguments: + timestamp (TimeInput): The timestamp of the node. + id (str | int): The id of the node. + properties (dict): The properties of the node. + node_type (str) : The optional string which will be used as a node type + + Returns: + MutableNode + """ + def delete_edge( self, timestamp: int, src: str | int, dst: str | int, layer: str = None ): diff --git a/python/python/raphtory/algorithms/__init__.pyi b/python/python/raphtory/algorithms/__init__.pyi index 880a5c91e..9ec8c261d 100644 --- a/python/python/raphtory/algorithms/__init__.pyi +++ b/python/python/raphtory/algorithms/__init__.pyi @@ -212,6 +212,29 @@ def directed_graph_density(g: GraphView): float : Directed graph density of G. """ +def fast_rp( + g: GraphView, + embedding_dim: int, + normalization_strength: float, + iter_weights: list[float], + seed: Optional[int] = None, + threads: Optional[int] = None, +) -> AlgorithmResult: + """ + Computes embedding vectors for each vertex of an undirected/bidirectional graph according to the Fast RP algorithm. + Original Paper: https://doi.org/10.48550/arXiv.1908.11512 + Arguments: + g (GraphView): The graph view on which embeddings are generated. + embedding_dim (int): The size (dimension) of the generated embeddings. + normalization_strength (float): The extent to which high-degree vertices should be discounted (range: 1-0) + iter_weights (list[float]): The scalar weights to apply to the results of each iteration + seed (int, optional): The seed for initialisation of random vectors + threads (int, optional): The number of threads to be used for parallel execution. + + Returns: + AlgorithmResult: Vertices mapped to their corresponding embedding vectors + """ + def fruchterman_reingold( graph: GraphView, iterations: int | None = 100, diff --git a/python/python/raphtory/graphql/__init__.pyi b/python/python/raphtory/graphql/__init__.pyi index 28827d487..c66635a92 100644 --- a/python/python/raphtory/graphql/__init__.pyi +++ b/python/python/raphtory/graphql/__init__.pyi @@ -329,9 +329,6 @@ class RemoteEdgeAddition(object): """Create and return a new object. See help(type) for accurate signature.""" class RemoteGraph(object): - def __new__(self, path, client) -> RemoteGraph: - """Create and return a new object. See help(type) for accurate signature.""" - def add_constant_properties(self, properties: dict): """ Adds constant properties to the remote graph. @@ -406,6 +403,25 @@ class RemoteGraph(object): properties (dict): The temporal properties of the graph. """ + def create_node( + self, + timestamp: int | str | datetime, + id: str | int, + properties: Optional[dict] = None, + node_type: Optional[str] = None, + ): + """ + Create a new node with the given id and properties to the remote graph and fail if the node already exists. + + Arguments: + timestamp (int|str|datetime): The timestamp of the node. + id (str|int): The id of the node. + properties (dict, optional): The properties of the node. + node_type (str, optional): The optional string which will be used as a node type + Returns: + RemoteNode + """ + def delete_edge( self, timestamp: int, diff --git a/python/tests/graphql/edit_graph/test_graphql.py b/python/tests/graphql/edit_graph/test_graphql.py index 28b36f959..a75d6259f 100644 --- a/python/tests/graphql/edit_graph/test_graphql.py +++ b/python/tests/graphql/edit_graph/test_graphql.py @@ -21,8 +21,8 @@ def test_encode_graph(): encoded = encode_graph(g) assert ( - encoded - == "EgxaCgoIX2RlZmF1bHQSDBIKCghfZGVmYXVsdBoFCgNiZW4aCQoFaGFtemEYARoLCgdoYWFyb29uGAIiAhABIgYIAhABGAEiBBACGAIqAhoAKgQSAhABKgQSAhADKgIKACoGEgQIARABKgYSBAgBEAIqBAoCCAEqBhIECAIQAioGEgQIAhADKgQKAggCKgQ6AhABKgIyACoIOgYIARACGAEqBDICCAEqCDoGCAIQAxgCKgQyAggC" + encoded + == "EgxaCgoIX2RlZmF1bHQSDBIKCghfZGVmYXVsdBoFCgNiZW4aCQoFaGFtemEYARoLCgdoYWFyb29uGAIiAhABIgYIAhABGAEiBBACGAIqAhoAKgQSAhABKgQSAhADKgIKACoGEgQIARABKgYSBAgBEAIqBAoCCAEqBhIECAIQAioGEgQIAhADKgQKAggCKgQ6AhABKgIyACoIOgYIARACGAEqBDICCAEqCDoGCAIQAxgCKgQyAggC" ) @@ -42,8 +42,8 @@ def test_wrong_url(): with pytest.raises(Exception) as excinfo: client = RaphtoryClient("http://broken_url.com") assert ( - str(excinfo.value) - == "Could not connect to the given server - no response --error sending request for url (http://broken_url.com/)" + str(excinfo.value) + == "Could not connect to the given server - no response --error sending request for url (http://broken_url.com/)" ) @@ -393,16 +393,14 @@ def test_create_node(): query_nodes = """{graph(path: "g") {nodes {list {name}}}}""" assert client.query(query_nodes) == { - "graph": { - "nodes": { - "list": [{"name": "ben"}, {"name": "shivam"}] - } - } + "graph": {"nodes": {"list": [{"name": "ben"}, {"name": "shivam"}]}} } create_node_query = """{updateGraph(path: "g") { createNode(time: 0, name: "oogway") { success } }}""" - assert client.query(create_node_query) == {"updateGraph": {"createNode": {"success": True}}} + assert client.query(create_node_query) == { + "updateGraph": {"createNode": {"success": True}} + } assert client.query(query_nodes) == { "graph": { "nodes": { @@ -428,11 +426,7 @@ def test_create_node_using_client(): query_nodes = """{graph(path: "g") {nodes {list {name}}}}""" assert client.query(query_nodes) == { - "graph": { - "nodes": { - "list": [{"name": "ben"}, {"name": "shivam"}] - } - } + "graph": {"nodes": {"list": [{"name": "ben"}, {"name": "shivam"}]}} } remote_graph = client.remote_graph(path="g") @@ -460,23 +454,56 @@ def test_create_node_using_client_with_properties(): client = RaphtoryClient("http://localhost:1737") client.send_graph(path="g", graph=g) - query_nodes = """{graph(path: "g") {nodes {list {name, properties { keys }}}}}""" + query_nodes = ( + """{graph(path: "g") {nodes {list {name, properties { keys }}}}}""" + ) assert client.query(query_nodes) == { "graph": { "nodes": { - "list": [{"name": "ben", 'properties': {'keys': []}}, {"name": "shivam", 'properties': {'keys': []}}] + "list": [ + {"name": "ben", "properties": {"keys": []}}, + {"name": "shivam", "properties": {"keys": []}}, + ] } } } remote_graph = client.remote_graph(path="g") - remote_graph.create_node(timestamp=0, id="oogway", properties={"prop1": 60, "prop2": 31.3, "prop3": "abc123", "prop4": True, "prop5": [1, 2, 3]}) - nodes = json.loads(json.dumps(client.query(query_nodes)))['graph']['nodes']['list'] - node_oogway = next(node for node in nodes if node['name'] == 'oogway') - assert sorted(node_oogway['properties']['keys']) == ['prop1', 'prop2', 'prop3', 'prop4', 'prop5'] + remote_graph.create_node( + timestamp=0, + id="oogway", + properties={ + "prop1": 60, + "prop2": 31.3, + "prop3": "abc123", + "prop4": True, + "prop5": [1, 2, 3], + }, + ) + nodes = json.loads(json.dumps(client.query(query_nodes)))["graph"]["nodes"][ + "list" + ] + node_oogway = next(node for node in nodes if node["name"] == "oogway") + assert sorted(node_oogway["properties"]["keys"]) == [ + "prop1", + "prop2", + "prop3", + "prop4", + "prop5", + ] with pytest.raises(Exception) as excinfo: - remote_graph.create_node(timestamp=0, id="oogway", properties={"prop1": 60, "prop2": 31.3, "prop3": "abc123", "prop4": True, "prop5": [1, 2, 3]}) + remote_graph.create_node( + timestamp=0, + id="oogway", + properties={ + "prop1": 60, + "prop2": 31.3, + "prop3": "abc123", + "prop4": True, + "prop5": [1, 2, 3], + }, + ) assert "Node already exists" in str(excinfo.value) @@ -494,20 +521,57 @@ def test_create_node_using_client_with_properties_node_type(): assert client.query(query_nodes) == { "graph": { "nodes": { - "list": [{"name": "ben", 'nodeType': None, 'properties': {'keys': []}}, {"name": "shivam", 'nodeType': None, 'properties': {'keys': []}}] + "list": [ + {"name": "ben", "nodeType": None, "properties": {"keys": []}}, + { + "name": "shivam", + "nodeType": None, + "properties": {"keys": []}, + }, + ] } } } remote_graph = client.remote_graph(path="g") - remote_graph.create_node(timestamp=0, id="oogway", properties={"prop1": 60, "prop2": 31.3, "prop3": "abc123", "prop4": True, "prop5": [1, 2, 3]}, node_type="master") - nodes = json.loads(json.dumps(client.query(query_nodes)))['graph']['nodes']['list'] - node_oogway = next(node for node in nodes if node['name'] == 'oogway') - assert node_oogway['nodeType'] == 'master' - assert sorted(node_oogway['properties']['keys']) == ['prop1', 'prop2', 'prop3', 'prop4', 'prop5'] + remote_graph.create_node( + timestamp=0, + id="oogway", + properties={ + "prop1": 60, + "prop2": 31.3, + "prop3": "abc123", + "prop4": True, + "prop5": [1, 2, 3], + }, + node_type="master", + ) + nodes = json.loads(json.dumps(client.query(query_nodes)))["graph"]["nodes"][ + "list" + ] + node_oogway = next(node for node in nodes if node["name"] == "oogway") + assert node_oogway["nodeType"] == "master" + assert sorted(node_oogway["properties"]["keys"]) == [ + "prop1", + "prop2", + "prop3", + "prop4", + "prop5", + ] with pytest.raises(Exception) as excinfo: - remote_graph.create_node(timestamp=0, id="oogway", properties={"prop1": 60, "prop2": 31.3, "prop3": "abc123", "prop4": True, "prop5": [1, 2, 3]}, node_type="master") + remote_graph.create_node( + timestamp=0, + id="oogway", + properties={ + "prop1": 60, + "prop2": 31.3, + "prop3": "abc123", + "prop4": True, + "prop5": [1, 2, 3], + }, + node_type="master", + ) assert "Node already exists" in str(excinfo.value) @@ -525,7 +589,10 @@ def test_create_node_using_client_with_node_type(): assert client.query(query_nodes) == { "graph": { "nodes": { - "list": [{"name": "ben", 'nodeType': None}, {"name": "shivam", 'nodeType': None}] + "list": [ + {"name": "ben", "nodeType": None}, + {"name": "shivam", "nodeType": None}, + ] } } } @@ -535,7 +602,11 @@ def test_create_node_using_client_with_node_type(): assert client.query(query_nodes) == { "graph": { "nodes": { - "list": [{"name": "ben", 'nodeType': None}, {"name": "shivam", 'nodeType': None}, {"name": "oogway", 'nodeType': "master"}] + "list": [ + {"name": "ben", "nodeType": None}, + {"name": "shivam", "nodeType": None}, + {"name": "oogway", "nodeType": "master"}, + ] } } } diff --git a/python/tests/test_algorithms.py b/python/tests/test_algorithms.py index 47112b169..e75aed3fb 100644 --- a/python/tests/test_algorithms.py +++ b/python/tests/test_algorithms.py @@ -1,7 +1,9 @@ import pytest import pandas as pd import pandas.core.frame +from numpy.ma.testutils import assert_equal +import raphtory.algorithms from raphtory import Graph, PersistentGraph from raphtory import algorithms from raphtory import graph_loader @@ -545,3 +547,171 @@ def test_max_weight_matching(): assert max_weight.dst(2).id == 3 assert max_weight.dst(3) is None + + +def test_fast_rp(): + g = Graph() + edges = [ + (1, 2, 1), + (1, 3, 1), + (2, 3, 1), + (4, 5, 1), + (4, 6, 1), + (4, 7, 1), + (5, 6, 1), + (5, 7, 1), + (6, 7, 1), + (6, 8, 1), + ] + for src, dst, ts in edges: + g.add_edge(ts, src, dst) + + result = algorithms.fast_rp(g, 16, 1.0, [1.0, 1.0], 42).get_all_with_names() + baseline = { + "7": [ + 0.0, + 3.3635856610148585, + -1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + 0.0, + -1.6817928305074292, + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + ], + "6": [ + -1.6817928305074292, + 5.045378491522287, + -1.6817928305074292, + 0.0, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + -1.6817928305074292, + 0.0, + 0.0, + -3.3635856610148585, + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + ], + "5": [ + 0.0, + 3.3635856610148585, + -1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + 0.0, + -1.6817928305074292, + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + ], + "2": [ + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + 3.3635856610148585, + 1.6817928305074292, + 1.6817928305074292, + 3.3635856610148585, + -3.3635856610148585, + 0.0, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + -3.3635856610148585, + ], + "8": [ + -1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + 0.0, + -1.6817928305074292, + 1.6817928305074292, + 0.0, + 0.0, + 0.0, + ], + "3": [ + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + 3.3635856610148585, + 1.6817928305074292, + 1.6817928305074292, + 3.3635856610148585, + -3.3635856610148585, + 0.0, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + -3.3635856610148585, + ], + "1": [ + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + 3.3635856610148585, + 1.6817928305074292, + 1.6817928305074292, + 3.3635856610148585, + -3.3635856610148585, + 0.0, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + -3.3635856610148585, + ], + "4": [ + 0.0, + 3.3635856610148585, + -1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + 0.0, + -1.6817928305074292, + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + ], + } + + assert result == baseline diff --git a/python/tests/test_graphdb/test_graphdb.py b/python/tests/test_graphdb/test_graphdb.py index eb1c43b6b..bed24790c 100644 --- a/python/tests/test_graphdb/test_graphdb.py +++ b/python/tests/test_graphdb/test_graphdb.py @@ -353,8 +353,8 @@ def test_getitem(): @with_disk_graph def check(g): assert ( - g.node(1).properties.temporal.get("cost") - == g.node(1).properties.temporal["cost"] + g.node(1).properties.temporal.get("cost") + == g.node(1).properties.temporal["cost"] ) check(g) @@ -607,7 +607,7 @@ def time_history_test(time, key, value): assert g.at(time).node(1).properties.temporal.get(key) is None assert g.at(time).nodes.properties.temporal.get(key) is None assert ( - g.at(time).nodes.out_neighbours.properties.temporal.get(key) is None + g.at(time).nodes.out_neighbours.properties.temporal.get(key) is None ) else: assert g.at(time).node(1).properties.temporal.get(key).items() == value @@ -812,22 +812,22 @@ def no_static_property_test(key, value): assert sorted(g.node(1).properties.temporal.keys()) == expected_names_no_static assert sorted(g.nodes.properties.temporal.keys()) == expected_names_no_static assert ( - sorted(g.nodes.out_neighbours.properties.temporal.keys()) - == expected_names_no_static + sorted(g.nodes.out_neighbours.properties.temporal.keys()) + == expected_names_no_static ) expected_names_no_static_at_1 = sorted(["prop 4", "prop 1", "prop 3"]) assert ( - sorted(g.at(1).node(1).properties.temporal.keys()) - == expected_names_no_static_at_1 + sorted(g.at(1).node(1).properties.temporal.keys()) + == expected_names_no_static_at_1 ) assert ( - sorted(g.at(1).nodes.properties.temporal.keys()) - == expected_names_no_static_at_1 + sorted(g.at(1).nodes.properties.temporal.keys()) + == expected_names_no_static_at_1 ) assert ( - sorted(g.at(1).nodes.out_neighbours.properties.temporal.keys()) - == expected_names_no_static_at_1 + sorted(g.at(1).nodes.out_neighbours.properties.temporal.keys()) + == expected_names_no_static_at_1 ) # testing has_property @@ -1325,11 +1325,11 @@ def test_constant_property_update(): def updates(v): v.update_constant_properties({"prop1": "value1", "prop2": 123}) assert ( - v.properties.get("prop1") == "value1" and v.properties.get("prop2") == 123 + v.properties.get("prop1") == "value1" and v.properties.get("prop2") == 123 ) v.update_constant_properties({"prop1": "value2", "prop2": 345}) assert ( - v.properties.get("prop1") == "value2" and v.properties.get("prop2") == 345 + v.properties.get("prop1") == "value2" and v.properties.get("prop2") == 345 ) v.add_constant_properties({"name": "value1"}) @@ -1666,18 +1666,18 @@ def check(g): assert g.exclude_layer("layer2").count_edges() == 4 with pytest.raises( - Exception, - match=re.escape( - "Invalid layer: test_layer. Valid layers: _default, layer1, layer2" - ), + Exception, + match=re.escape( + "Invalid layer: test_layer. Valid layers: _default, layer1, layer2" + ), ): g.layers(["test_layer"]) with pytest.raises( - Exception, - match=re.escape( - "Invalid layer: test_layer. Valid layers: _default, layer1, layer2" - ), + Exception, + match=re.escape( + "Invalid layer: test_layer. Valid layers: _default, layer1, layer2" + ), ): g.edge(1, 2).layers(["test_layer"]) @@ -1754,20 +1754,20 @@ def test_layer_name(): assert str(e.value) == error_msg assert [ - list(iterator) for iterator in g.nodes.neighbours.edges.explode().layer_name - ] == [ - ["_default", "awesome layer"], - ["_default", "awesome layer"], - ["_default", "awesome layer"], - ] + list(iterator) for iterator in g.nodes.neighbours.edges.explode().layer_name + ] == [ + ["_default", "awesome layer"], + ["_default", "awesome layer"], + ["_default", "awesome layer"], + ] assert [ - list(iterator) - for iterator in g.nodes.neighbours.edges.explode_layers().layer_name - ] == [ - ["_default", "awesome layer"], - ["_default", "awesome layer"], - ["_default", "awesome layer"], - ] + list(iterator) + for iterator in g.nodes.neighbours.edges.explode_layers().layer_name + ] == [ + ["_default", "awesome layer"], + ["_default", "awesome layer"], + ["_default", "awesome layer"], + ] def test_time(): @@ -1801,12 +1801,12 @@ def check(g): # assert str(e.value) == error_msg assert [ - list(iterator) for iterator in g.nodes.neighbours.edges.explode().time - ] == [ - [0, 0, 1], - [0, 0, 1], - [0, 0, 1], - ] + list(iterator) for iterator in g.nodes.neighbours.edges.explode().time + ] == [ + [0, 0, 1], + [0, 0, 1], + [0, 0, 1], + ] check(g) @@ -2371,8 +2371,8 @@ def test_weird_windows(): @with_disk_graph def check(g): with pytest.raises( - Exception, - match="'ddd' is not a valid datetime, valid formats are RFC3339, RFC2822, %Y-%m-%d, %Y-%m-%dT%H:%M:%S%.3f, %Y-%m-%dT%H:%M:%S%, %Y-%m-%d %H:%M:%S%.3f and %Y-%m-%d %H:%M:%S%", + Exception, + match="'ddd' is not a valid datetime, valid formats are RFC3339, RFC2822, %Y-%m-%d, %Y-%m-%dT%H:%M:%S%.3f, %Y-%m-%dT%H:%M:%S%, %Y-%m-%d %H:%M:%S%.3f and %Y-%m-%d %H:%M:%S%", ): g.window("ddd", "aaa") @@ -2569,9 +2569,9 @@ def check(g): assert g.nodes.type_filter(["a"]).neighbours.type_filter( ["c"] ).name.collect() == [ - [], - ["5"], - ] + [], + ["5"], + ] assert g.nodes.type_filter(["a"]).neighbours.type_filter([]).name.collect() == [ [], [], @@ -2582,9 +2582,9 @@ def check(g): assert g.nodes.type_filter(["a"]).neighbours.type_filter( ["d"] ).name.collect() == [ - [], - [], - ] + [], + [], + ] assert g.nodes.type_filter(["a"]).neighbours.neighbours.name.collect() == [ ["1", "3", "4"], ["1", "3", "4", "4", "6"], @@ -2639,20 +2639,20 @@ def check(g): for edge in edges: time_nested.append(edge.time) assert [ - item - for sublist in g.nodes.edges.explode().time.collect() - for item in sublist - ] == time_nested + item + for sublist in g.nodes.edges.explode().time.collect() + for item in sublist + ] == time_nested date_time_nested = [] for edges in g.nodes.edges.explode(): for edge in edges: date_time_nested.append(edge.date_time) assert [ - item - for sublist in g.nodes.edges.explode().date_time.collect() - for item in sublist - ] == date_time_nested + item + for sublist in g.nodes.edges.explode().date_time.collect() + for item in sublist + ] == date_time_nested check(g) @@ -2966,44 +2966,44 @@ def check(g): assert len(index.fuzzy_search_nodes("name:habza", levenshtein_distance=1)) == 1 assert ( - len( - index.fuzzy_search_nodes( - "name:haa", levenshtein_distance=1, prefix=True - ) + len( + index.fuzzy_search_nodes( + "name:haa", levenshtein_distance=1, prefix=True ) - == 2 + ) + == 2 ) assert ( - len( - index.fuzzy_search_nodes( - "value_str:abc123", levenshtein_distance=2, prefix=True - ) + len( + index.fuzzy_search_nodes( + "value_str:abc123", levenshtein_distance=2, prefix=True ) - == 2 + ) + == 2 ) assert ( - len( - index.fuzzy_search_nodes( - "value_str:dsss312", levenshtein_distance=2, prefix=False - ) + len( + index.fuzzy_search_nodes( + "value_str:dsss312", levenshtein_distance=2, prefix=False ) - == 1 + ) + == 1 ) assert len(index.fuzzy_search_edges("from:bon", levenshtein_distance=1)) == 2 assert ( - len( - index.fuzzy_search_edges("from:bo", levenshtein_distance=1, prefix=True) - ) - == 2 + len( + index.fuzzy_search_edges("from:bo", levenshtein_distance=1, prefix=True) + ) + == 2 ) assert ( - len( - index.fuzzy_search_edges( - "from:eon", levenshtein_distance=2, prefix=True - ) + len( + index.fuzzy_search_edges( + "from:eon", levenshtein_distance=2, prefix=True ) - == 2 + ) + == 2 ) check(g) @@ -3011,26 +3011,38 @@ def check(g): def test_create_node_graph(): g = Graph() - g.create_node(1, "shivam", properties={"value": 60, "value_f": 31.3, "value_str": "abc123"}) + g.create_node( + 1, "shivam", properties={"value": 60, "value_f": 31.3, "value_str": "abc123"} + ) node = g.node("shivam") assert node.name == "shivam" assert node.properties == {"value": 60, "value_f": 31.3, "value_str": "abc123"} with pytest.raises(Exception) as excinfo: - g.create_node(1, "shivam", properties={"value": 60, "value_f": 31.3, "value_str": "abc123"}) + g.create_node( + 1, + "shivam", + properties={"value": 60, "value_f": 31.3, "value_str": "abc123"}, + ) assert "Node already exists" in str(excinfo.value) def test_create_node_graph_with_deletion(): g = PersistentGraph() - g.create_node(1, "shivam", properties={"value": 60, "value_f": 31.3, "value_str": "abc123"}) + g.create_node( + 1, "shivam", properties={"value": 60, "value_f": 31.3, "value_str": "abc123"} + ) node = g.node("shivam") assert node.name == "shivam" assert node.properties == {"value": 60, "value_f": 31.3, "value_str": "abc123"} with pytest.raises(Exception) as excinfo: - g.create_node(1, "shivam", properties={"value": 60, "value_f": 31.3, "value_str": "abc123"}) + g.create_node( + 1, + "shivam", + properties={"value": 60, "value_f": 31.3, "value_str": "abc123"}, + ) assert "Node already exists" in str(excinfo.value) @@ -3047,6 +3059,7 @@ def datadir(tmpdir, request): raise e return tmpdir + # def currently_broken_fuzzy_search(): #TODO: Fix fuzzy searching for properties # g = Graph() # g.add_edge(2,"haaroon","hamza", properties={"value":60,"value_f":31.3,"value_str":"abc123"}) diff --git a/raphtory/src/algorithms/embeddings/fast_rp.rs b/raphtory/src/algorithms/embeddings/fast_rp.rs new file mode 100644 index 000000000..515a3162b --- /dev/null +++ b/raphtory/src/algorithms/embeddings/fast_rp.rs @@ -0,0 +1,427 @@ +use crate::{ + algorithms::algorithm_result::AlgorithmResult, + core::{entities::VID, state::compute_state::ComputeStateVec}, + db::{ + api::view::{NodeViewOps, StaticGraphViewOps}, + task::{ + context::Context, + node::eval_node::EvalNodeView, + task::{ATask, Job, Step}, + task_runner::TaskRunner, + }, + }, +}; +use rand::prelude::*; +use rayon::prelude::*; +use std::sync::Arc; + +#[derive(Clone, Debug, Default)] +struct FastRPState { + embedding_state: Vec, +} + +/// Computes the embeddings of each vertex of a graph using the Fast RP algorithm +/// +/// # Arguments +/// +/// * `graph` - A reference to the graph +/// * `embedding_dim` - The size of the generated embeddings +/// * `normalization_strength` - The extent to which high-degree vertices should be discounted (range: 1-0) +/// * `iter_weights` - The scalar weights to apply to the results of each iteration +/// * `seed` - The seed for initialisation of random vectors +/// * `threads` - Number of threads to use +/// +/// Returns: +/// +/// An AlgorithmResult containing the mapping from the node to its embedding +/// +pub fn fast_rp( + graph: &G, + embedding_dim: usize, + normalization_strength: f64, + iter_weights: Vec, + seed: Option, + threads: Option, +) -> AlgorithmResult, Vec> +where + G: StaticGraphViewOps, +{ + let ctx: Context = graph.into(); + let m = graph.count_nodes() as f64; + let s = m.sqrt(); + let beta = normalization_strength - 1.0; + let num_iters = iter_weights.len() - 1; + let weights = Arc::new(iter_weights); + let seed = seed.unwrap_or(rand::thread_rng().gen()); + + // initialize each vertex with a random vector according to FastRP's construction rules + let step1 = { + let weights = Arc::clone(&weights); + ATask::new(move |vv| { + let l = ((vv.degree() as f64) / (m * 2.0)).powf(beta); + let choices = [ + (l * s.sqrt(), 1.0 / (s * 2.0)), + (0.0, 1.0 - (1.0 / s)), + (-l * s.sqrt(), 1.0 / (s * 2.0)), + ]; + let mut rng = SmallRng::seed_from_u64(vv.node.0 as u64 ^ seed); + let state: &mut FastRPState = vv.get_mut(); + state.embedding_state = (0..embedding_dim) + .map(|_| choices.choose_weighted(&mut rng, |item| item.1).unwrap().0 * weights[0]) + .collect(); + Step::Continue + }) + }; + + // sum each vector from neighbours and scale + let step2 = ATask::new(move |vv: &mut EvalNodeView| { + // for neighbor, for i, add neighbors.prev[i] to current state + // scale state by iteration weight + let weights = Arc::clone(&weights); + let ss = vv.eval_graph.ss; + // TODO: rewrite using iters? + for neighbour in vv.neighbours() { + for i in 0..embedding_dim { + vv.get_mut().embedding_state[i] += neighbour.prev().embedding_state[i]; + } + } + for value in vv.get_mut().embedding_state.iter_mut() { + *value *= weights[ss]; + } + + Step::Continue + }); + + let mut runner: TaskRunner = TaskRunner::new(ctx); + let results_type = std::any::type_name::(); + + let res = runner.run( + vec![Job::new(step1)], + vec![Job::read_only(step2)], + None, + |_, _, _, local: Vec| { + graph + .nodes() + .par_iter() + .map(|node| { + let VID(id) = node.node; + let embedding = local[id].embedding_state.clone(); + (id, embedding) + }) + .collect() + }, + threads, + num_iters, + None, + None, + ); + + // TODO: add flag to optionally normalize results + + AlgorithmResult::new(graph.clone(), "Fast RP", results_type, res) +} + +#[cfg(test)] +mod fast_rp_test { + use super::*; + use crate::{db::api::mutation::AdditionOps, prelude::*, test_storage}; + use std::collections::HashMap; + + #[test] + fn simple_fast_rp_test() { + let graph = Graph::new(); + + let edges = vec![ + (1, 2, 1), + (1, 3, 1), + (2, 3, 1), + (4, 5, 1), + (4, 6, 1), + (4, 7, 1), + (5, 6, 1), + (5, 7, 1), + (6, 7, 1), + (6, 8, 1), + ]; + + for (src, dst, ts) in edges { + graph.add_edge(ts, src, dst, NO_PROPS, None).unwrap(); + } + + let baseline = HashMap::from([ + ( + "7", + [ + 0.0, + 3.3635856610148585, + -1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + 0.0, + -1.6817928305074292, + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + ], + ), + ( + "6", + [ + -1.6817928305074292, + 5.045378491522287, + -1.6817928305074292, + 0.0, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + -1.6817928305074292, + 0.0, + 0.0, + -3.3635856610148585, + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + ], + ), + ( + "5", + [ + 0.0, + 3.3635856610148585, + -1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + 0.0, + -1.6817928305074292, + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + ], + ), + ( + "2", + [ + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + 3.3635856610148585, + 1.6817928305074292, + 1.6817928305074292, + 3.3635856610148585, + -3.3635856610148585, + 0.0, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + -3.3635856610148585, + ], + ), + ( + "8", + [ + -1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + 0.0, + -1.6817928305074292, + 1.6817928305074292, + 0.0, + 0.0, + 0.0, + ], + ), + ( + "3", + [ + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + 3.3635856610148585, + 1.6817928305074292, + 1.6817928305074292, + 3.3635856610148585, + -3.3635856610148585, + 0.0, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + -3.3635856610148585, + ], + ), + ( + "1", + [ + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 0.0, + 0.0, + 3.3635856610148585, + 1.6817928305074292, + 1.6817928305074292, + 3.3635856610148585, + -3.3635856610148585, + 0.0, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + -3.3635856610148585, + ], + ), + ( + "4", + [ + 0.0, + 3.3635856610148585, + -1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + 1.6817928305074292, + 0.0, + -1.6817928305074292, + 0.0, + 0.0, + -1.6817928305074292, + 1.6817928305074292, + 1.6817928305074292, + -1.6817928305074292, + -1.6817928305074292, + ], + ), + ]); + + test_storage!(&graph, |graph| { + let results = + fast_rp(graph, 16, 1.0, vec![1.0, 1.0], Some(42), None).get_all_with_names(); + for (v_id, embedding) in results { + assert_eq!(embedding, *baseline.get(v_id.as_str()).unwrap()); + } + }); + } + + // NOTE(Wyatt): the simple fast_rp test is more of a validation of idempotency than correctness (although the results are expected) + // This test-- in progress-- is going to validate that the algorithm preserves the pairwise topological distances + /* + use crate::io::csv_loader::CsvLoader; + use serde::{Deserialize, Serialize}; + use std::path::PathBuf; + + fn print_samples(map: &HashMap>, n: usize) { + let mut count = 0; + + for (key, value) in map { + println!("Key: {}, Value: {:#?}", key, value); + + count += 1; + if count >= n { + break; + } + } + } + + fn top_k_neighbors( + data: &HashMap>, + k: usize, + ) -> HashMap> { + let mut neighbors: HashMap> = HashMap::new(); + + // Iterate over each ID to find its top K neighbors + for (id, vector) in data { + // Collect distances to all other IDs + let mut distances: Vec<(&String, f64)> = Vec::new(); + for (other_id, other_vector) in data { + if id == other_id { + continue; // Skip self + } + // Compute Euclidean distance + let distance = euclidean_distance(vector, other_vector); + distances.push((other_id, distance)); + } + // Sort the distances in ascending order + distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + // Collect top K neighbor IDs + let top_k: Vec = distances + .iter() + .take(k) + .map(|(other_id, _)| (*other_id).clone()) + .collect(); + // Insert into the neighbors map + neighbors.insert(id.clone(), top_k); + } + + neighbors + } + + fn euclidean_distance(a: &Vec, b: &Vec) -> f64 { + assert_eq!(a.len(), b.len(), "Vectors must be of the same length"); + a.iter() + .zip(b.iter()) + .map(|(&x, &y)| (x - y).powi(2)) + .sum::() + .sqrt() + } + + #[test] + fn big_fast_rp_test() { + let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + d.push("resources/test"); + let loader = CsvLoader::new(d.join("test.csv")).set_delimiter(","); + let graph = Graph::new(); + + #[derive(Deserialize, Serialize, Debug)] + struct CsvEdge { + src: u64, + dst: u64, + } + + loader + .load_into_graph(&graph, |e: CsvEdge, g| { + g.add_edge(1, e.src, e.dst, NO_PROPS, None).unwrap(); + g.add_edge(1, e.dst, e.src, NO_PROPS, None).unwrap(); + }) + .unwrap(); + + test_storage!(&graph, |graph| { + let results = fast_rp( + graph, + 32, + 1.0, + vec![1.0, 1.0, 0.5], + Some(42), + None, + ).get_all_with_names(); + // println!("Result: {:#?}", results); + print_samples(&results, 10); + }); + } + */ +} diff --git a/raphtory/src/algorithms/embeddings/mod.rs b/raphtory/src/algorithms/embeddings/mod.rs new file mode 100644 index 000000000..98e406485 --- /dev/null +++ b/raphtory/src/algorithms/embeddings/mod.rs @@ -0,0 +1,3 @@ +mod fast_rp; + +pub use fast_rp::fast_rp; diff --git a/raphtory/src/algorithms/mod.rs b/raphtory/src/algorithms/mod.rs index 8f21dd870..91710fc55 100644 --- a/raphtory/src/algorithms/mod.rs +++ b/raphtory/src/algorithms/mod.rs @@ -34,6 +34,7 @@ pub mod bipartite; pub mod components; pub mod cores; pub mod dynamics; +pub mod embeddings; pub mod layout; pub mod metrics; pub mod motifs; diff --git a/raphtory/src/python/graph/algorithm_result.rs b/raphtory/src/python/graph/algorithm_result.rs index 6e3402ee3..7dfb276f4 100644 --- a/raphtory/src/python/graph/algorithm_result.rs +++ b/raphtory/src/python/graph/algorithm_result.rs @@ -312,6 +312,12 @@ py_algorithm_result_new_ord_hash_eq!( Vec<(i64, String)> ); +py_algorithm_result!(AlgorithmResultVecF64, DynamicGraph, Vec, Vec); +// NOTE(Wyatt): This is currently just used by FastRP and it doesn't really make sense to implement max, min, median, etc. methods on the output. +// Nonetheless, I will convert it to a Vec> at a later point. For now, it just needs the base functions. +//py_algorithm_result_new_ord_hash_eq!(AlgorithmResultVecF64, DynamicGraph, Vec, Vec); +py_algorithm_result_base!(AlgorithmResultVecF64, DynamicGraph, Vec, Vec); + py_algorithm_result!(AlgorithmResultUsize, DynamicGraph, usize, usize); py_algorithm_result_new_ord_hash_eq!(AlgorithmResultUsize, DynamicGraph, usize, usize); diff --git a/raphtory/src/python/packages/algorithms.rs b/raphtory/src/python/packages/algorithms.rs index 7aef68bd3..eed3251ae 100644 --- a/raphtory/src/python/packages/algorithms.rs +++ b/raphtory/src/python/packages/algorithms.rs @@ -15,6 +15,7 @@ use crate::{ }, components, dynamics::temporal::epidemics::{temporal_SEIR as temporal_SEIR_rs, Infected, SeedError}, + embeddings::fast_rp as fast_rp_rs, layout::{ cohesive_fruchterman_reingold::cohesive_fruchterman_reingold as cohesive_fruchterman_reingold_rs, fruchterman_reingold::fruchterman_reingold_unbounded as fruchterman_reingold_rs, @@ -911,3 +912,35 @@ pub fn max_weight_matching( verify_optimum_flag, ) } + +/// Computes embedding vectors for each vertex of an undirected/bidirectional graph according to the Fast RP algorithm. +/// Original Paper: https://doi.org/10.48550/arXiv.1908.11512 +/// Arguments: +/// g (GraphView): The graph view on which embeddings are generated. +/// embedding_dim (int): The size (dimension) of the generated embeddings. +/// normalization_strength (float): The extent to which high-degree vertices should be discounted (range: 1-0) +/// iter_weights (list[float]): The scalar weights to apply to the results of each iteration +/// seed (int, optional): The seed for initialisation of random vectors +/// threads (int, optional): The number of threads to be used for parallel execution. +/// +/// Returns: +/// AlgorithmResult: Vertices mapped to their corresponding embedding vectors +#[pyfunction] +#[pyo3[signature = (g, embedding_dim, normalization_strength, iter_weights, seed=None, threads=None)]] +pub fn fast_rp( + g: &PyGraphView, + embedding_dim: usize, + normalization_strength: f64, + iter_weights: Vec, + seed: Option, + threads: Option, +) -> AlgorithmResult, Vec> { + fast_rp_rs( + &g.graph, + embedding_dim, + normalization_strength, + iter_weights, + seed, + threads, + ) +} diff --git a/raphtory/src/python/packages/base_modules.rs b/raphtory/src/python/packages/base_modules.rs index 46f8814a0..404c69d2a 100644 --- a/raphtory/src/python/packages/base_modules.rs +++ b/raphtory/src/python/packages/base_modules.rs @@ -93,6 +93,7 @@ pub fn base_algorithm_module(py: Python<'_>) -> Result, PyErr> { in_component, out_components, out_component, + fast_rp, global_temporal_three_node_motif, global_temporal_three_node_motif_multi, local_temporal_three_node_motifs,