From 419db768f112d72ebd56a15d102f7cb1de006d58 Mon Sep 17 00:00:00 2001
From: ljeub-pometry <97447091+ljeub-pometry@users.noreply.github.com>
Date: Wed, 4 Dec 2024 18:09:51 +0100
Subject: [PATCH] add support for loading node ids from node property parquet
 (#1886)

add support for loading node ids from node property parquet in disk graph
---
 pometry-storage-private                     |  2 +-
 raphtory-cypher/examples/raphtory_cypher.rs |  6 +++++-
 raphtory-cypher/src/lib.rs                  |  1 +
 raphtory/src/disk_graph/graph_impl/mod.rs   |  1 +
 raphtory/src/disk_graph/mod.rs              |  2 ++
 raphtory/src/io/arrow/df_loaders.rs         |  1 +
 raphtory/src/python/graph/disk_graph.rs     |  4 +++-
 7 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/pometry-storage-private b/pometry-storage-private
index 89a99e15e..fb201c805 160000
--- a/pometry-storage-private
+++ b/pometry-storage-private
@@ -1 +1 @@
-Subproject commit 89a99e15e665e01d3304ca2202eecc6a62a2a9d0
+Subproject commit fb201c805c39d46d82faef6e84ebd5efe04fcf5c
diff --git a/raphtory-cypher/examples/raphtory_cypher.rs b/raphtory-cypher/examples/raphtory_cypher.rs
index 612ec68d5..c45965561 100644
--- a/raphtory-cypher/examples/raphtory_cypher.rs
+++ b/raphtory-cypher/examples/raphtory_cypher.rs
@@ -65,10 +65,13 @@ mod cypher {
         #[arg(short, long)]
         node_props: Option<PathBuf>,
 
-        /// Node properties to load
+        /// Node properties column to load as node type
        #[arg(short, long)]
         node_type_col: Option<String>,
 
+        /// Node properties column to load as node
+        #[arg(short, long)]
+        node_id_col: Option<String>,
         /// Edge list parquet files to load as layers
         #[arg(short='l', last = true, value_parser = parse_key_val::<String, ArgLayer>)]
         layers: Vec<(String, ArgLayer)>,
@@ -196,6 +199,7 @@ mod cypher {
                 args.t_prop_chunk_size,
                 args.num_threads,
                 args.node_type_col.as_deref(),
+                args.node_id_col.as_deref(),
             )
             .expect("Failed to load graph");
         }
diff --git a/raphtory-cypher/src/lib.rs b/raphtory-cypher/src/lib.rs
index f1ff97ee7..a3e7222e7 100644
--- a/raphtory-cypher/src/lib.rs
+++ b/raphtory-cypher/src/lib.rs
@@ -356,6 +356,7 @@ mod cypher {
             100,
             1,
             None,
+            None,
         )
         .unwrap();
 
diff --git a/raphtory/src/disk_graph/graph_impl/mod.rs b/raphtory/src/disk_graph/graph_impl/mod.rs
index 08222302a..d1f728bfa 100644
--- a/raphtory/src/disk_graph/graph_impl/mod.rs
+++ b/raphtory/src/disk_graph/graph_impl/mod.rs
@@ -489,6 +489,7 @@ mod test {
             t_props_chunk_size,
             num_threads,
             node_type_col,
+            None,
         )
         .unwrap()
         .into_graph();
diff --git a/raphtory/src/disk_graph/mod.rs b/raphtory/src/disk_graph/mod.rs
index 9f5b98861..edd04ee1d 100644
--- a/raphtory/src/disk_graph/mod.rs
+++ b/raphtory/src/disk_graph/mod.rs
@@ -303,6 +303,7 @@ impl DiskGraphStorage {
         t_props_chunk_size: usize,
         num_threads: usize,
         node_type_col: Option<&str>,
+        node_id_col: Option<&str>,
     ) -> Result<Self, RAError> {
         let edge_lists: Vec<ExternalEdgeList<&Path>> = layer_parquet_cols
             .into_iter()
@@ -337,6 +338,7 @@ impl DiskGraphStorage {
             &[],
             node_properties.as_ref().map(|p| p.as_ref()),
             node_type_col,
+            node_id_col,
         )?;
         Ok(Self::new(t_graph))
     }
diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs
index b009e03c1..09e6beac4 100644
--- a/raphtory/src/io/arrow/df_loaders.rs
+++ b/raphtory/src/io/arrow/df_loaders.rs
@@ -733,6 +733,7 @@ mod tests {
             &[],
             None,
             None,
+            None,
         )
         .unwrap();
         let actual =
diff --git a/raphtory/src/python/graph/disk_graph.rs b/raphtory/src/python/graph/disk_graph.rs
index 546bcd01d..0c80c1e81 100644
--- a/raphtory/src/python/graph/disk_graph.rs
+++ b/raphtory/src/python/graph/disk_graph.rs
@@ -218,7 +218,7 @@ impl PyDiskGraph {
 
     #[staticmethod]
     #[pyo3(
-        signature = (graph_dir, layer_parquet_cols, node_properties, chunk_size, t_props_chunk_size, num_threads, node_type_col)
+        signature = (graph_dir, layer_parquet_cols, node_properties=None, chunk_size=10_000_000, t_props_chunk_size=10_000_000, num_threads=4, node_type_col=None, node_id_col=None)
     )]
     fn load_from_parquets(
         graph_dir: PathBuf,
@@ -228,6 +228,7 @@ impl PyDiskGraph {
         t_props_chunk_size: usize,
         num_threads: usize,
         node_type_col: Option<&str>,
+        node_id_col: Option<&str>,
     ) -> Result<Self, GraphError> {
         let layer_cols = layer_parquet_cols
             .iter()
@@ -241,6 +242,7 @@ impl PyDiskGraph {
             t_props_chunk_size,
             num_threads,
             node_type_col,
+            node_id_col,
         )
         .map_err(|err| {
             GraphError::LoadFailure(format!("Failed to load graph from parquet files: {err:?}"))
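
Usage note (not part of the patch): the sketch below shows how a caller might pass the new trailing node_id_col argument to DiskGraphStorage::load_from_parquets. It is a minimal fragment, assuming graph_dir, layer_parquet_cols, node_properties and the chunking settings are prepared exactly as before this change; the column names "node_type" and "id" are illustrative only.

    // A minimal sketch, not a verbatim call from the repository. Only the final
    // node_id_col argument is new; every other argument keeps its previous meaning.
    let storage = DiskGraphStorage::load_from_parquets(
        graph_dir,           // directory where the on-disk graph is written
        layer_parquet_cols,  // per-layer edge parquet files and column mappings
        node_properties,     // optional node property parquet file
        chunk_size,
        t_props_chunk_size,
        num_threads,
        Some("node_type"),   // node_type_col: column holding the node type
        Some("id"),          // node_id_col (new): column holding the node id
    )?;

The same value reaches this call from Python through the added node_id_col keyword on load_from_parquets, and from the raphtory_cypher example through its new node_id_col option.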