From 168dbb1e4f02d2671c46e8432c115dad879fed10 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Tue, 18 Jun 2024 21:16:39 +0100 Subject: [PATCH 01/33] rename graph/pandas to graph/io --- raphtory/src/python/graph/disk_graph.rs | 2 +- raphtory/src/python/graph/graph_with_deletions.rs | 2 +- raphtory/src/python/graph/{pandas => io}/dataframe.rs | 0 raphtory/src/python/graph/{pandas => io}/loaders.rs | 2 +- raphtory/src/python/graph/{pandas => io}/mod.rs | 2 +- raphtory/src/python/graph/{pandas => io}/prop_handler.rs | 2 +- raphtory/src/python/graph/mod.rs | 2 +- raphtory/src/python/graph/utils.rs | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) rename raphtory/src/python/graph/{pandas => io}/dataframe.rs (100%) rename raphtory/src/python/graph/{pandas => io}/loaders.rs (99%) rename raphtory/src/python/graph/{pandas => io}/mod.rs (99%) rename raphtory/src/python/graph/{pandas => io}/prop_handler.rs (99%) diff --git a/raphtory/src/python/graph/disk_graph.rs b/raphtory/src/python/graph/disk_graph.rs index 826076477f..ab668ec804 100644 --- a/raphtory/src/python/graph/disk_graph.rs +++ b/raphtory/src/python/graph/disk_graph.rs @@ -33,7 +33,7 @@ use pyo3::{ types::{IntoPyDict, PyDict, PyList, PyString}, }; -use super::pandas::dataframe::{process_pandas_py_df, PretendDF}; +use super::io::dataframe::{process_pandas_py_df, PretendDF}; impl From for PyErr { fn from(value: Error) -> Self { diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index a53c0cebee..6652d90c88 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -32,7 +32,7 @@ use std::{ use super::{ graph::PyGraph, - pandas::{ + io::{ dataframe::{process_pandas_py_df, GraphLoadException}, loaders::load_edges_deletions_from_df, }, diff --git a/raphtory/src/python/graph/pandas/dataframe.rs b/raphtory/src/python/graph/io/dataframe.rs similarity index 100% rename from raphtory/src/python/graph/pandas/dataframe.rs rename to raphtory/src/python/graph/io/dataframe.rs diff --git a/raphtory/src/python/graph/pandas/loaders.rs b/raphtory/src/python/graph/io/loaders.rs similarity index 99% rename from raphtory/src/python/graph/pandas/loaders.rs rename to raphtory/src/python/graph/io/loaders.rs index 88b3e74437..dd906d12f5 100644 --- a/raphtory/src/python/graph/pandas/loaders.rs +++ b/raphtory/src/python/graph/io/loaders.rs @@ -2,7 +2,7 @@ use crate::{ core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError}, db::api::mutation::AdditionOps, prelude::*, - python::graph::pandas::{ + python::graph::io::{ dataframe::PretendDF, prop_handler::{get_prop_rows, lift_layer}, }, diff --git a/raphtory/src/python/graph/pandas/mod.rs b/raphtory/src/python/graph/io/mod.rs similarity index 99% rename from raphtory/src/python/graph/pandas/mod.rs rename to raphtory/src/python/graph/io/mod.rs index a000426a75..28706addeb 100644 --- a/raphtory/src/python/graph/pandas/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -7,7 +7,7 @@ mod test { use crate::{ core::ArcStr, prelude::*, - python::graph::pandas::{ + python::graph::io::{ dataframe::PretendDF, loaders::{load_edges_from_df, load_nodes_from_df}, }, diff --git a/raphtory/src/python/graph/pandas/prop_handler.rs b/raphtory/src/python/graph/io/prop_handler.rs similarity index 99% rename from raphtory/src/python/graph/pandas/prop_handler.rs rename to raphtory/src/python/graph/io/prop_handler.rs index 5b672c642f..e4e5ce72ac 
100644 --- a/raphtory/src/python/graph/pandas/prop_handler.rs +++ b/raphtory/src/python/graph/io/prop_handler.rs @@ -7,7 +7,7 @@ use polars_arrow::{ use crate::{ core::{utils::errors::GraphError, IntoPropList}, prelude::Prop, - python::graph::pandas::dataframe::PretendDF, + python::graph::io::dataframe::PretendDF, }; pub struct PropIter<'a> { diff --git a/raphtory/src/python/graph/mod.rs b/raphtory/src/python/graph/mod.rs index 7b457b6a1b..fbc78150d3 100644 --- a/raphtory/src/python/graph/mod.rs +++ b/raphtory/src/python/graph/mod.rs @@ -11,7 +11,7 @@ pub mod edges; #[cfg(feature = "search")] pub mod index; pub mod node; -pub mod pandas; +pub mod io; pub mod properties; pub mod utils; pub mod views; diff --git a/raphtory/src/python/graph/utils.rs b/raphtory/src/python/graph/utils.rs index 98701b92cb..c52ddbf77e 100644 --- a/raphtory/src/python/graph/utils.rs +++ b/raphtory/src/python/graph/utils.rs @@ -2,7 +2,7 @@ use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphEr use pyo3::{prelude::*, types::IntoPyDict}; use std::collections::HashMap; -use super::pandas::{ +use super::io::{ dataframe::{process_pandas_py_df, GraphLoadException}, loaders::{ load_edges_from_df, load_edges_props_from_df, load_node_props_from_df, load_nodes_from_df, From 1826f12cad643c303815d4f67c059d30503136f8 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Tue, 18 Jun 2024 21:36:07 +0100 Subject: [PATCH 02/33] rename utils to panda_loaders --- raphtory/src/python/graph/graph.rs | 10 +++++----- raphtory/src/python/graph/graph_with_deletions.rs | 14 +++++++------- .../python/graph/io/{loaders.rs => df_loaders.rs} | 0 raphtory/src/python/graph/io/mod.rs | 5 +++-- .../python/graph/{utils.rs => io/panda_loaders.rs} | 4 ++-- raphtory/src/python/graph/mod.rs | 1 - 6 files changed, 17 insertions(+), 17 deletions(-) rename raphtory/src/python/graph/io/{loaders.rs => df_loaders.rs} (100%) rename raphtory/src/python/graph/{utils.rs => io/panda_loaders.rs} (99%) diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 4347b87ad8..96017d4ac9 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -3,7 +3,6 @@ //! This is the base class used to create a temporal graph, add nodes and edges, //! create windows, and query the graph with a variety of algorithms. //! In Python, this class wraps around the rust graph. -use super::utils; use crate::{ algorithms::components::LargestConnectedComponent, core::{entities::nodes::node_ref::NodeRef, utils::errors::GraphError, ArcStr}, @@ -26,6 +25,7 @@ use std::{ fmt::{Debug, Formatter}, path::Path, }; +use crate::python::graph::io::panda_loaders::*; /// A temporal graph. 
#[derive(Clone)] @@ -483,7 +483,7 @@ impl PyGraph { const_properties: Option>, shared_const_properties: Option>, ) -> Result<(), GraphError> { - utils::load_nodes_from_pandas( + load_nodes_from_pandas( &self.graph.0, df, id, @@ -524,7 +524,7 @@ impl PyGraph { layer: Option<&str>, layer_in_df: Option, ) -> Result<(), GraphError> { - utils::load_edges_from_pandas( + load_edges_from_pandas( &self.graph.0, df, src, @@ -556,7 +556,7 @@ impl PyGraph { const_properties: Option>, shared_const_properties: Option>, ) -> Result<(), GraphError> { - utils::load_node_props_from_pandas( + load_node_props_from_pandas( &self.graph.0, df, id, @@ -589,7 +589,7 @@ impl PyGraph { layer: Option<&str>, layer_in_df: Option, ) -> Result<(), GraphError> { - utils::load_edge_props_from_pandas( + load_edge_props_from_pandas( &self.graph.0, df, src, diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 6652d90c88..2d2077f075 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -34,9 +34,9 @@ use super::{ graph::PyGraph, io::{ dataframe::{process_pandas_py_df, GraphLoadException}, - loaders::load_edges_deletions_from_df, + panda_loaders::*, + df_loaders::load_edges_deletions_from_df }, - utils, }; /// A temporal graph that allows edges and nodes to be deleted. @@ -480,7 +480,7 @@ impl PyPersistentGraph { const_properties: Option>, shared_const_properties: Option>, ) -> Result<(), GraphError> { - utils::load_nodes_from_pandas( + load_nodes_from_pandas( &self.graph.0, df, id, @@ -521,7 +521,7 @@ impl PyPersistentGraph { layer: Option<&str>, layer_in_df: Option, ) -> Result<(), GraphError> { - utils::load_edges_from_pandas( + load_edges_from_pandas( &self.graph.0, df, src, @@ -556,7 +556,7 @@ impl PyPersistentGraph { time: &str, layer: Option<&str>, layer_in_df: Option, - ) -> Result<(), GraphError> { + ) -> Result<(), GraphError> { // TODO: move this to panda_loaders let graph = &self.graph.0; Python::with_gil(|py| { let size: usize = py @@ -613,7 +613,7 @@ impl PyPersistentGraph { const_properties: Option>, shared_const_properties: Option>, ) -> Result<(), GraphError> { - utils::load_node_props_from_pandas( + load_node_props_from_pandas( &self.graph.0, df, id, @@ -646,7 +646,7 @@ impl PyPersistentGraph { layer: Option<&str>, layer_in_df: Option, ) -> Result<(), GraphError> { - utils::load_edge_props_from_pandas( + load_edge_props_from_pandas( &self.graph.0, df, src, diff --git a/raphtory/src/python/graph/io/loaders.rs b/raphtory/src/python/graph/io/df_loaders.rs similarity index 100% rename from raphtory/src/python/graph/io/loaders.rs rename to raphtory/src/python/graph/io/df_loaders.rs diff --git a/raphtory/src/python/graph/io/mod.rs b/raphtory/src/python/graph/io/mod.rs index 28706addeb..0fdf06accb 100644 --- a/raphtory/src/python/graph/io/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -1,6 +1,7 @@ pub mod dataframe; -pub mod loaders; +pub mod df_loaders; mod prop_handler; +pub mod panda_loaders; #[cfg(test)] mod test { @@ -9,7 +10,7 @@ mod test { prelude::*, python::graph::io::{ dataframe::PretendDF, - loaders::{load_edges_from_df, load_nodes_from_df}, + df_loaders::{load_edges_from_df, load_nodes_from_df}, }, }; use polars_arrow::array::{PrimitiveArray, Utf8Array}; diff --git a/raphtory/src/python/graph/utils.rs b/raphtory/src/python/graph/io/panda_loaders.rs similarity index 99% rename from raphtory/src/python/graph/utils.rs rename to 
raphtory/src/python/graph/io/panda_loaders.rs index c52ddbf77e..f7a42bb2ad 100644 --- a/raphtory/src/python/graph/utils.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -2,9 +2,9 @@ use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphEr use pyo3::{prelude::*, types::IntoPyDict}; use std::collections::HashMap; -use super::io::{ +use crate::python::graph::io::{ dataframe::{process_pandas_py_df, GraphLoadException}, - loaders::{ + df_loaders::{ load_edges_from_df, load_edges_props_from_df, load_node_props_from_df, load_nodes_from_df, }, }; diff --git a/raphtory/src/python/graph/mod.rs b/raphtory/src/python/graph/mod.rs index fbc78150d3..eec1e2b894 100644 --- a/raphtory/src/python/graph/mod.rs +++ b/raphtory/src/python/graph/mod.rs @@ -13,5 +13,4 @@ pub mod index; pub mod node; pub mod io; pub mod properties; -pub mod utils; pub mod views; From d433bbce24605bfecc98266ccfc1db108c8050fc Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Tue, 18 Jun 2024 21:58:02 +0100 Subject: [PATCH 03/33] refactor panda loader related functions to panda_loader from df_loaders --- raphtory/src/python/graph/disk_graph.rs | 3 +- .../src/python/graph/graph_with_deletions.rs | 2 +- raphtory/src/python/graph/io/dataframe.rs | 128 +--------------- raphtory/src/python/graph/io/mod.rs | 1 + raphtory/src/python/graph/io/panda_loaders.rs | 141 ++++++++++++++++-- .../src/python/graph/io/parquet_loaders.rs | 0 6 files changed, 135 insertions(+), 140 deletions(-) create mode 100644 raphtory/src/python/graph/io/parquet_loaders.rs diff --git a/raphtory/src/python/graph/disk_graph.rs b/raphtory/src/python/graph/disk_graph.rs index ab668ec804..edfd3be259 100644 --- a/raphtory/src/python/graph/disk_graph.rs +++ b/raphtory/src/python/graph/disk_graph.rs @@ -33,7 +33,8 @@ use pyo3::{ types::{IntoPyDict, PyDict, PyList, PyString}, }; -use super::io::dataframe::{process_pandas_py_df, PretendDF}; +use super::io::dataframe::PretendDF; +use super::io::panda_loaders::*; impl From for PyErr { fn from(value: Error) -> Self { diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 2d2077f075..84cd1c5163 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -33,7 +33,7 @@ use std::{ use super::{ graph::PyGraph, io::{ - dataframe::{process_pandas_py_df, GraphLoadException}, + dataframe::GraphLoadException, panda_loaders::*, df_loaders::load_edges_deletions_from_df }, diff --git a/raphtory/src/python/graph/io/dataframe.rs b/raphtory/src/python/graph/io/dataframe.rs index a38fd90e19..a3dae3d9c0 100644 --- a/raphtory/src/python/graph/io/dataframe.rs +++ b/raphtory/src/python/graph/io/dataframe.rs @@ -4,16 +4,12 @@ use polars_arrow::{ array::{Array, PrimitiveArray, Utf8Array}, compute::cast::{self, CastOptions}, datatypes::{ArrowDataType as DataType, TimeUnit}, - ffi, offset::Offset, types::NativeType, }; use itertools::Itertools; -use pyo3::{ - create_exception, exceptions::PyException, ffi::Py_uintptr_t, types::IntoPyDict, PyAny, PyErr, - PyResult, Python, -}; +use pyo3::{create_exception, exceptions::PyException}; #[derive(Debug)] pub(crate) struct PretendDF { @@ -37,7 +33,7 @@ impl PretendDF { pub(crate) fn iter_col( &self, name: &str, - ) -> Option> + '_> { + ) -> Option> + '_> { let idx = self.names.iter().position(|n| n == name)?; let _ = (&self.arrays[0])[idx] @@ -53,7 +49,7 @@ impl PretendDF { Some(iter) } - pub fn utf8(&self, 
name: &str) -> Option> + '_> { + pub fn utf8(&self, name: &str) -> Option> + '_> { let idx = self.names.iter().position(|n| n == name)?; // test that it's actually a utf8 array let _ = (&self.arrays[0])[idx] @@ -69,7 +65,7 @@ impl PretendDF { Some(iter) } - pub fn time_iter_col(&self, name: &str) -> Option> + '_> { + pub fn time_iter_col(&self, name: &str) -> Option> + '_> { let idx = self.names.iter().position(|n| n == name)?; let _ = (&self.arrays[0])[idx] @@ -84,7 +80,7 @@ impl PretendDF { &DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), CastOptions::default(), ) - .unwrap(); + .unwrap(); array } else { arr.clone() @@ -98,120 +94,6 @@ impl PretendDF { } } -fn is_jupyter(py: Python) { - let code = r#" -try: - shell = get_ipython().__class__.__name__ - if shell == 'ZMQInteractiveShell': - result = True # Jupyter notebook or qtconsole - elif shell == 'TerminalInteractiveShell': - result = False # Terminal running IPython - else: - result = False # Other type, assuming not a Jupyter environment -except NameError: - result = False # Probably standard Python interpreter -"#; - - if let Err(e) = py.run(code, None, None) { - println!("Error checking if running in a jupyter notebook: {}", e); - return; - } - - match py.eval("result", None, None) { - Ok(x) => { - if let Ok(x) = x.extract() { - kdam::set_notebook(x); - } - } - Err(e) => { - println!("Error checking if running in a jupyter notebook: {}", e); - } - }; -} - -pub(crate) fn process_pandas_py_df( - df: &PyAny, - py: Python, - _size: usize, - col_names: Vec<&str>, -) -> PyResult { - is_jupyter(py); - py.import("pandas")?; - let module = py.import("pyarrow")?; - let pa_table = module.getattr("Table")?; - - let df_columns: Vec = df.getattr("columns")?.extract()?; - - let cols_to_drop: Vec = df_columns - .into_iter() - .filter(|x| !col_names.contains(&x.as_str())) - .collect(); - - let dropped_df = if !cols_to_drop.is_empty() { - let drop_method = df.getattr("drop")?; - drop_method.call((cols_to_drop,), Some(vec![("axis", 1)].into_py_dict(py)))? - } else { - df - }; - - let _df_columns: Vec = dropped_df.getattr("columns")?.extract()?; - - let table = pa_table.call_method("from_pandas", (dropped_df,), None)?; - - let rb = table.call_method0("to_batches")?.extract::>()?; - let names: Vec = if let Some(batch0) = rb.get(0) { - let schema = batch0.getattr("schema")?; - schema.getattr("names")?.extract::>()? - } else { - vec![] - } - .into_iter() - .filter(|x| col_names.contains(&x.as_str())) - .collect(); - - let arrays = rb - .iter() - .map(|rb| { - (0..names.len()) - .map(|i| { - let array = rb.call_method1("column", (i,))?; - let arr = array_to_rust(array)?; - Ok::, PyErr>(arr) - }) - .collect::, PyErr>>() - }) - .collect::, PyErr>>()?; - - let df = PretendDF { names, arrays }; - Ok(df) -} - -pub fn array_to_rust(obj: &PyAny) -> PyResult { - // prepare a pointer to receive the Array struct - let array = Box::new(ffi::ArrowArray::empty()); - let schema = Box::new(ffi::ArrowSchema::empty()); - - let array_ptr = &*array as *const ffi::ArrowArray; - let schema_ptr = &*schema as *const ffi::ArrowSchema; - - // make the conversion through PyArrow's private API - // this changes the pointer's memory and is thus unsafe. 
In particular, `_export_to_c` can go out of bounds - obj.call_method1( - "_export_to_c", - (array_ptr as Py_uintptr_t, schema_ptr as Py_uintptr_t), - )?; - - unsafe { - let field = ffi::import_field_from_c(schema.as_ref()) - .map_err(|e| ArrowErrorException::new_err(format!("{:?}", e)))?; - - let array = ffi::import_array_from_c(*array, field.data_type) - .map_err(|e| ArrowErrorException::new_err(format!("{:?}", e)))?; - - Ok(array) - } -} - pub type ArrayRef = Box; create_exception!(exceptions, ArrowErrorException, PyException); diff --git a/raphtory/src/python/graph/io/mod.rs b/raphtory/src/python/graph/io/mod.rs index 0fdf06accb..f4a7590a3f 100644 --- a/raphtory/src/python/graph/io/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -2,6 +2,7 @@ pub mod dataframe; pub mod df_loaders; mod prop_handler; pub mod panda_loaders; +mod parquet_loaders; #[cfg(test)] mod test { diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/panda_loaders.rs index f7a42bb2ad..f4a8679a17 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -1,13 +1,10 @@ use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; use pyo3::{prelude::*, types::IntoPyDict}; use std::collections::HashMap; - -use crate::python::graph::io::{ - dataframe::{process_pandas_py_df, GraphLoadException}, - df_loaders::{ - load_edges_from_df, load_edges_props_from_df, load_node_props_from_df, load_nodes_from_df, - }, -}; +use polars_arrow::array::Array; +use polars_arrow::ffi; +use pyo3::ffi::Py_uintptr_t; +use crate::python::graph::io::{dataframe::*, df_loaders::*}; pub fn load_nodes_from_pandas( graph: &InternalGraph, @@ -53,10 +50,10 @@ pub fn load_nodes_from_pandas( node_type_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -106,11 +103,11 @@ pub fn load_edges_from_pandas( layer_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -142,11 +139,11 @@ pub fn load_node_props_from_pandas( shared_const_properties, graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -188,10 +185,124 @@ pub fn load_edge_props_from_pandas( layer_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; df.check_cols_exist(&cols_to_check)?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } + +pub(crate) fn process_pandas_py_df( + df: &PyAny, + py: Python, + _size: usize, + col_names: Vec<&str>, +) 
-> PyResult { + is_jupyter(py); + py.import("pandas")?; + let module = py.import("pyarrow")?; + let pa_table = module.getattr("Table")?; + + let df_columns: Vec = df.getattr("columns")?.extract()?; + + let cols_to_drop: Vec = df_columns + .into_iter() + .filter(|x| !col_names.contains(&x.as_str())) + .collect(); + + let dropped_df = if !cols_to_drop.is_empty() { + let drop_method = df.getattr("drop")?; + drop_method.call((cols_to_drop, ), Some(vec![("axis", 1)].into_py_dict(py)))? + } else { + df + }; + + let _df_columns: Vec = dropped_df.getattr("columns")?.extract()?; + + let table = pa_table.call_method("from_pandas", (dropped_df, ), None)?; + + let rb = table.call_method0("to_batches")?.extract::>()?; + let names: Vec = if let Some(batch0) = rb.get(0) { + let schema = batch0.getattr("schema")?; + schema.getattr("names")?.extract::>()? + } else { + vec![] + } + .into_iter() + .filter(|x| col_names.contains(&x.as_str())) + .collect(); + + let arrays = rb + .iter() + .map(|rb| { + (0..names.len()) + .map(|i| { + let array = rb.call_method1("column", (i, ))?; + let arr = array_to_rust(array)?; + Ok::, PyErr>(arr) + }) + .collect::, PyErr>>() + }) + .collect::, PyErr>>()?; + + let df = PretendDF { names, arrays }; + Ok(df) +} + +pub fn array_to_rust(obj: &PyAny) -> PyResult { + // prepare a pointer to receive the Array struct + let array = Box::new(ffi::ArrowArray::empty()); + let schema = Box::new(ffi::ArrowSchema::empty()); + + let array_ptr = &*array as *const ffi::ArrowArray; + let schema_ptr = &*schema as *const ffi::ArrowSchema; + + // make the conversion through PyArrow's private API + // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds + obj.call_method1( + "_export_to_c", + (array_ptr as Py_uintptr_t, schema_ptr as Py_uintptr_t), + )?; + + unsafe { + let field = ffi::import_field_from_c(schema.as_ref()) + .map_err(|e| ArrowErrorException::new_err(format!("{:?}", e)))?; + + let array = ffi::import_array_from_c(*array, field.data_type) + .map_err(|e| ArrowErrorException::new_err(format!("{:?}", e)))?; + + Ok(array) + } +} + +fn is_jupyter(py: Python) { + let code = r#" +try: + shell = get_ipython().__class__.__name__ + if shell == 'ZMQInteractiveShell': + result = True # Jupyter notebook or qtconsole + elif shell == 'TerminalInteractiveShell': + result = False # Terminal running IPython + else: + result = False # Other type, assuming not a Jupyter environment +except NameError: + result = False # Probably standard Python interpreter +"#; + + if let Err(e) = py.run(code, None, None) { + println!("Error checking if running in a jupyter notebook: {}", e); + return; + } + + match py.eval("result", None, None) { + Ok(x) => { + if let Ok(x) = x.extract() { + kdam::set_notebook(x); + } + } + Err(e) => { + println!("Error checking if running in a jupyter notebook: {}", e); + } + }; +} diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs new file mode 100644 index 0000000000..e69de29bb2 From f0da7b281a2f25971851bdf52a57030d1f3b4475 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Tue, 18 Jun 2024 22:03:51 +0100 Subject: [PATCH 04/33] init parquet loader --- .../src/python/graph/io/parquet_loaders.rs | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index e69de29bb2..a06b36af01 100644 --- 
a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -0,0 +1,63 @@ +use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; +use std::collections::HashMap; +use std::path::Path; +use crate::python::graph::io::{dataframe::*, df_loaders::*}; + +pub fn load_nodes_from_parquet( + graph: &InternalGraph, + parquet_file_path: &Path, + id: &str, + time: &str, + node_type: Option<&str>, + node_type_in_df: Option, + properties: Option>, + const_properties: Option>, + shared_const_properties: Option>, +) -> Result<(), GraphError> { + todo!() +} + +pub fn load_edges_from_parquet( + graph: &InternalGraph, + parquet_file_path: &Path, + src: &str, + dst: &str, + time: &str, + properties: Option>, + const_properties: Option>, + shared_const_properties: Option>, + layer: Option<&str>, + layer_in_df: Option, +) -> Result<(), GraphError> { + todo!() +} + +pub fn load_node_props_from_parquet( + graph: &InternalGraph, + parquet_file_path: &Path, + id: &str, + const_properties: Option>, + shared_const_properties: Option>, +) -> Result<(), GraphError> { + todo!() +} + +pub fn load_edge_props_from_parquet( + graph: &InternalGraph, + parquet_file_path: &Path, + src: &str, + dst: &str, + const_properties: Option>, + shared_const_properties: Option>, + layer: Option<&str>, + layer_in_df: Option, +) -> Result<(), GraphError> { + todo!() +} + +pub(crate) fn process_parquet_file_to_df( + parquet_file_path: &Path, + col_names: Vec<&str>, +) -> Result { + todo!() +} From d3ad3d8a599590d181f4f7542ebd05ad6fe7f35a Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Wed, 19 Jun 2024 13:03:18 +0100 Subject: [PATCH 05/33] impl and test process_parquet_file_to_df --- raphtory/resources/test/test_data.parquet | Bin 0 -> 2130 bytes raphtory/src/core/utils/errors.rs | 3 + raphtory/src/python/graph/disk_graph.rs | 2 +- .../src/python/graph/graph_with_deletions.rs | 2 +- raphtory/src/python/graph/io/panda_loaders.rs | 9 +- .../src/python/graph/io/parquet_loaders.rs | 121 +++++++++++++++++- 6 files changed, 127 insertions(+), 10 deletions(-) create mode 100644 raphtory/resources/test/test_data.parquet diff --git a/raphtory/resources/test/test_data.parquet b/raphtory/resources/test/test_data.parquet new file mode 100644 index 0000000000000000000000000000000000000000..59f63d062bc458aebb25cdc362f2cdd5ec3655a5 GIT binary patch literal 2130 zcmcguy>HV{5I@IG;+Pa7)MweqLkDVQXiG{}gb-5m9Qr{&Agu~5QkOKb(ty(hCneB{ zkqI#}FflVRcj&+$(2 z4v~EPSi&;YP-3W3prb?fj9V05$;NQiBw2np z2lg+?Vv0EpiHvE;Ex*iSx~Bh@^kd1c*y?3Gn_~z9SPXE<;0DkODn_7W0;M3RFy%y$ za6hO}VyNVeh(PtB&+!PTN~;|K^PYXGm&kwt#G==zR2jycPlhj+uoi_omllVCOuxw$Px@KDD)<7rT9_-}V z1D*hT;GJmytM1SbQ@UoW`p48TfN2?NzP9#w<6gH}j6?YrQLgH6&R6jxTH?IJohqN> zyvhqzap$1I3ULL_AJUHJaL3aV{2mC+1_|dowB~3b$9#4jgWM%%ZkEoDkPDWMoR{%) z+Rru@Yuocr_bC;$6Y_EqO-X%qMdT-fo}`La6m_BEl{PAk3bIJ^lsDevE~vnlPU083 z@kOeU_~hEjl{O|lVX^o;$d@}J=%ZeHaT~Fx(l{^5Q>*-Sm@kzN36-!oBA*4SE0O?5 z3`&E1dg~$l?U(B-RBt?>eWzP*VMgT+2#k?t$$z=$w#UoO=6K2X8=a9?uHULPc1I?s WCvp?F%`Kej=pRhsFIo}*oA?9hwl=N+ literal 0 HcmV?d00001 diff --git a/raphtory/src/core/utils/errors.rs b/raphtory/src/core/utils/errors.rs index ce5fe09555..9a87c84006 100644 --- a/raphtory/src/core/utils/errors.rs +++ b/raphtory/src/core/utils/errors.rs @@ -1,3 +1,4 @@ +use polars_arrow::legacy::error; use crate::core::{utils::time::error::ParseTimeError, ArcStr, Prop, PropType}; #[cfg(feature = "search")] use tantivy; @@ -6,6 +7,8 @@ use tantivy::query::QueryParserError; 
#[derive(thiserror::Error, Debug)] pub enum GraphError { + #[error("Arrow error: {0}")] + Arrow(#[from] error::PolarsError), #[error("Graph error occurred")] UnsupportedDataType, #[error("Graph already exists by name = {name}")] diff --git a/raphtory/src/python/graph/disk_graph.rs b/raphtory/src/python/graph/disk_graph.rs index edfd3be259..c9f20c627d 100644 --- a/raphtory/src/python/graph/disk_graph.rs +++ b/raphtory/src/python/graph/disk_graph.rs @@ -173,7 +173,7 @@ impl PyDiskGraph { let df_columns: Vec = edge_df.getattr("columns")?.extract()?; let df_columns: Vec<&str> = df_columns.iter().map(|x| x.as_str()).collect(); - let df = process_pandas_py_df(edge_df, py, size, df_columns)?; + let df = process_pandas_py_df(edge_df, py, df_columns)?; df.check_cols_exist(&cols_to_check)?; let graph = Self::from_pandas(graph_dir, df, src_col, dst_col, time_col)?; diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 84cd1c5163..3dd743fef0 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -574,7 +574,7 @@ impl PyPersistentGraph { } } - let df = process_pandas_py_df(df, py, size, cols_to_check.clone())?; + let df = process_pandas_py_df(df, py, cols_to_check.clone())?; df.check_cols_exist(&cols_to_check)?; load_edges_deletions_from_df( diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/panda_loaders.rs index f4a8679a17..274eea197b 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -35,7 +35,7 @@ pub fn load_nodes_from_pandas( } } - let df = process_pandas_py_df(df, py, size, cols_to_check.clone())?; + let df = process_pandas_py_df(df, py, cols_to_check.clone())?; df.check_cols_exist(&cols_to_check)?; load_nodes_from_df( @@ -87,7 +87,7 @@ pub fn load_edges_from_pandas( } } - let df = process_pandas_py_df(df, py, size, cols_to_check.clone())?; + let df = process_pandas_py_df(df, py, cols_to_check.clone())?; df.check_cols_exist(&cols_to_check)?; load_edges_from_df( @@ -128,7 +128,7 @@ pub fn load_node_props_from_pandas( .extract()?; let mut cols_to_check = vec![id]; cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new())); - let df = process_pandas_py_df(df, py, size, cols_to_check.clone())?; + let df = process_pandas_py_df(df, py, cols_to_check.clone())?; df.check_cols_exist(&cols_to_check)?; load_node_props_from_df( @@ -172,7 +172,7 @@ pub fn load_edge_props_from_pandas( } } cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new())); - let df = process_pandas_py_df(df, py, size, cols_to_check.clone())?; + let df = process_pandas_py_df(df, py, cols_to_check.clone())?; df.check_cols_exist(&cols_to_check)?; load_edges_props_from_df( &df, @@ -196,7 +196,6 @@ pub fn load_edge_props_from_pandas( pub(crate) fn process_pandas_py_df( df: &PyAny, py: Python, - _size: usize, col_names: Vec<&str>, ) -> PyResult { is_jupyter(py); diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index a06b36af01..f947b54a73 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -1,7 +1,18 @@ use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; use std::collections::HashMap; -use std::path::Path; +use std::num::TryFromIntError; +use std::path::{Path, PathBuf}; +use itertools::Itertools; +use 
polars_arrow::array::Array; +use polars_arrow::datatypes::{ArrowSchema, Field}; +use polars_arrow::legacy::error; +use polars_arrow::legacy::error::PolarsResult; +use polars_parquet::read; +use polars_parquet::read::{FileMetaData, FileReader, read_metadata}; use crate::python::graph::io::{dataframe::*, df_loaders::*}; +use polars_arrow::datatypes::ArrowDataType as DataType; +use polars_arrow::datatypes::ArrowSchema as Schema; +use polars_arrow::record_batch::RecordBatch as Chunk; pub fn load_nodes_from_parquet( graph: &InternalGraph, @@ -29,7 +40,34 @@ pub fn load_edges_from_parquet( layer: Option<&str>, layer_in_df: Option, ) -> Result<(), GraphError> { - todo!() + let mut cols_to_check = vec![src, dst, time]; + cols_to_check.extend(properties.as_ref().unwrap_or(&Vec::new())); + cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new())); + if layer_in_df.unwrap_or(false) { + if let Some(ref layer) = layer { + cols_to_check.push(layer.as_ref()); + } + } + + let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; + + // df.check_cols_exist(&cols_to_check)?; + // load_edges_from_df( + // &df, + // size, + // src, + // dst, + // time, + // properties, + // const_properties, + // shared_const_properties, + // layer, + // layer_in_df.unwrap_or(true), + // graph, + // ) + // .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + + Ok(()) } pub fn load_node_props_from_parquet( @@ -59,5 +97,82 @@ pub(crate) fn process_parquet_file_to_df( parquet_file_path: &Path, col_names: Vec<&str>, ) -> Result { - todo!() + let iter = read_parquet_file(parquet_file_path)?; + + let names = col_names.iter().map(|s| s.to_string()).collect(); + let arrays = iter.map_ok(|r| { + r.into_iter().map(|boxed| boxed.clone()).collect_vec() + }).collect::, _>>()?; + + Ok(PretendDF { + names, + arrays, + }) +} + +fn read_parquet_file( + path: impl AsRef, +) -> Result>, error::PolarsError>>, GraphError> { + fn read_schema(metadata: &FileMetaData) -> Result { + let schema = read::infer_schema(metadata)?; + let fields = schema + .fields + .iter() + .map(|f| { + if f.data_type == DataType::Utf8View { + Field::new(f.name.clone(), DataType::LargeUtf8, f.is_nullable) + } else { + f.clone() + } + }) + .collect::>(); + + Ok(ArrowSchema::from(fields).with_metadata(schema.metadata)) + } + + let mut file = std::fs::File::open(&path)?; + let metadata = read_metadata(&mut file)?; + let row_groups = metadata.clone().row_groups; + let schema = read_schema(&metadata)?; + let reader = FileReader::new(file, row_groups, schema, None, None, None); + Ok(reader) +} + +#[cfg(test)] +mod test { + use polars_arrow::array::{PrimitiveArray, Utf8Array}; + use super::*; + + #[test] + fn test_process_parquet_file_to_df() { + // let parquet_file_path = Path::new("/tmp/parquet/test_data.parquet"); + let parquet_file_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .map(|p| p.join("raphtory/resources/test/test_data.parquet")) + .unwrap(); + + let col_names = vec!["src", "dst", "time", "weight", "marbles"]; + let df = process_parquet_file_to_df( + parquet_file_path.as_path(), + col_names, + ).unwrap(); + + let df1 = PretendDF { + names: vec!["src", "dst", "time", "weight", "marbles"] + .iter() + .map(|s| s.to_string()) + .collect(), + arrays: vec![ + vec![ + Box::new(PrimitiveArray::::from_values(vec![1, 2, 3, 4, 5])), + Box::new(PrimitiveArray::::from_values(vec![2, 3, 4, 5, 6])), + Box::new(PrimitiveArray::::from_values(vec![1, 2, 3, 4, 5])), + 
Box::new(PrimitiveArray::::from_values(vec![1f64, 2f64, 3f64, 4f64, 5f64])), + Box::new(Utf8Array::::from_iter_values(vec!["red", "blue", "green", "yellow", "purple"].into_iter())), + ]], + }; + + assert_eq!(df.names, df1.names); + assert_eq!(df.arrays, df1.arrays); + } } From 4280dbacfdfbc5122d8480ea2893ce1625c16924 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Wed, 19 Jun 2024 13:32:37 +0100 Subject: [PATCH 06/33] impl load edges from parquet for graph --- python/tests/data/parquet/edges.parquet | Bin 0 -> 2130 bytes python/tests/test_load_from_parquet.py | 68 ++++++++++++++++++ raphtory/src/python/graph/graph.rs | 44 ++++++++++++ raphtory/src/python/graph/io/mod.rs | 2 +- .../src/python/graph/io/parquet_loaders.rs | 35 ++++----- 5 files changed, 131 insertions(+), 18 deletions(-) create mode 100644 python/tests/data/parquet/edges.parquet create mode 100644 python/tests/test_load_from_parquet.py diff --git a/python/tests/data/parquet/edges.parquet b/python/tests/data/parquet/edges.parquet new file mode 100644 index 0000000000000000000000000000000000000000..59f63d062bc458aebb25cdc362f2cdd5ec3655a5 GIT binary patch literal 2130 zcmcguy>HV{5I@IG;+Pa7)MweqLkDVQXiG{}gb-5m9Qr{&Agu~5QkOKb(ty(hCneB{ zkqI#}FflVRcj&+$(2 z4v~EPSi&;YP-3W3prb?fj9V05$;NQiBw2np z2lg+?Vv0EpiHvE;Ex*iSx~Bh@^kd1c*y?3Gn_~z9SPXE<;0DkODn_7W0;M3RFy%y$ za6hO}VyNVeh(PtB&+!PTN~;|K^PYXGm&kwt#G==zR2jycPlhj+uoi_omllVCOuxw$Px@KDD)<7rT9_-}V z1D*hT;GJmytM1SbQ@UoW`p48TfN2?NzP9#w<6gH}j6?YrQLgH6&R6jxTH?IJohqN> zyvhqzap$1I3ULL_AJUHJaL3aV{2mC+1_|dowB~3b$9#4jgWM%%ZkEoDkPDWMoR{%) z+Rru@Yuocr_bC;$6Y_EqO-X%qMdT-fo}`La6m_BEl{PAk3bIJ^lsDevE~vnlPU083 z@kOeU_~hEjl{O|lVX^o;$d@}J=%ZeHaT~Fx(l{^5Q>*-Sm@kzN36-!oBA*4SE0O?5 z3`&E1dg~$l?U(B-RBt?>eWzP*VMgT+2#k?t$$z=$w#UoO=6K2X8=a9?uHULPc1I?s WCvp?F%`Kej=pRhsFIo}*oA?9hwl=N+ literal 0 HcmV?d00001 diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py new file mode 100644 index 0000000000..1c6ead003b --- /dev/null +++ b/python/tests/test_load_from_parquet.py @@ -0,0 +1,68 @@ +import os + +import pyarrow as pa +import pyarrow.parquet as pq +from raphtory import Graph, PersistentGraph + + +def test_load_from_parquet(): + data = { + "src": [1, 2, 3, 4, 5], + "dst": [2, 3, 4, 5, 6], + "time": [1, 2, 3, 4, 5], + "weight": [1.0, 2.0, 3.0, 4.0, 5.0], + "marbles": ["red", "blue", "green", "yellow", "purple"], + } + + table = pa.table(data) + pq.write_table(table, '/tmp/parquet/test_data.parquet') + + +# +# expected_nodes = [1, 2, 3, 4, 5, 6] +# expected_edges = [ +# (1, 2, 1.0, "red"), +# (2, 3, 2.0, "blue"), +# (3, 4, 3.0, "green"), +# (4, 5, 4.0, "yellow"), +# (5, 6, 5.0, "purple"), +# ] +# +# def assertions(g): +# edges = [] +# for e in g.edges: +# weight = e["weight"] +# marbles = e["marbles"] +# edges.append((e.src.id, e.dst.id, weight, marbles)) +# +# assert g.nodes.id.collect() == expected_nodes +# assert edges == expected_edges +# +# g = Graph.load_from_parquet('test_data.parquet', "src", "dst", "time", ["weight", "marbles"]) +# assertions(g) +# +# # g = PersistentGraph.load_from_parquet( +# # 'test_data.parquet', "src", "dst", "time", ["weight", "marbles"] +# # ) +# # assertions(g) + +def test_load_edges_from_parquet(): + file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') + expected_edges = [ + (1, 2, 1.0, "red"), + (2, 3, 2.0, "blue"), + (3, 4, 3.0, "green"), + (4, 5, 4.0, "yellow"), + (5, 6, 5.0, "purple"), + ] + + g = Graph() + g.load_edges_from_parquet(file_path, "src", "dst", "time", 
["weight", "marbles"]) + + edges = [] + for e in g.edges: + weight = e["weight"] + marbles = e["marbles"] + edges.append((e.src.id, e.dst.id, weight, marbles)) + + assert edges == expected_edges diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 96017d4ac9..57843494dc 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -25,7 +25,9 @@ use std::{ fmt::{Debug, Formatter}, path::Path, }; +use std::path::PathBuf; use crate::python::graph::io::panda_loaders::*; +use crate::python::graph::io::parquet_loaders::load_edges_from_parquet; /// A temporal graph. #[derive(Clone)] @@ -538,6 +540,48 @@ impl PyGraph { ) } + /// Load edges from a Parquet file into the graph. + /// + /// Arguments: + /// parquet_file_path: Parquet file path + /// src (str): The column name for the source node ids. + /// dst (str): The column name for the destination node ids. + /// time (str): The column name for the update timestamps. + /// properties (List): List of edge property column names. Defaults to None. (optional) + /// const_properties (List): List of constant edge property column names. Defaults to None. (optional) + /// shared_const_properties (dict): A dictionary of constant properties that will be added to every edge. Defaults to None. (optional) + /// layer (str): The edge layer name (optional) Defaults to None. + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dataframe or if it should be used directly as the layer for all edges (optional) defaults to True. + /// + /// Returns: + /// Result<(), GraphError>: Result of the operation. + #[pyo3(signature = (parquet_file_path, src, dst, time, properties = None, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))] + fn load_edges_from_parquet( + &self, + parquet_file_path: PathBuf, + src: &str, + dst: &str, + time: &str, + properties: Option>, + const_properties: Option>, + shared_const_properties: Option>, + layer: Option<&str>, + layer_in_df: Option, + ) -> Result<(), GraphError> { + load_edges_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + src, + dst, + time, + properties, + const_properties, + shared_const_properties, + layer, + layer_in_df, + ) + } + /// Load node properties from a Pandas DataFrame. 
/// /// Arguments: diff --git a/raphtory/src/python/graph/io/mod.rs b/raphtory/src/python/graph/io/mod.rs index f4a7590a3f..f5d3931c81 100644 --- a/raphtory/src/python/graph/io/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -2,7 +2,7 @@ pub mod dataframe; pub mod df_loaders; mod prop_handler; pub mod panda_loaders; -mod parquet_loaders; +pub mod parquet_loaders; #[cfg(test)] mod test { diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index f947b54a73..20c29851fe 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -50,22 +50,23 @@ pub fn load_edges_from_parquet( } let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; - - // df.check_cols_exist(&cols_to_check)?; - // load_edges_from_df( - // &df, - // size, - // src, - // dst, - // time, - // properties, - // const_properties, - // shared_const_properties, - // layer, - // layer_in_df.unwrap_or(true), - // graph, - // ) - // .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + df.check_cols_exist(&cols_to_check)?; + let size = cols_to_check.len(); + + load_edges_from_df( + &df, + size, + src, + dst, + time, + properties, + const_properties, + shared_const_properties, + layer, + layer_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -150,7 +151,7 @@ mod test { .parent() .map(|p| p.join("raphtory/resources/test/test_data.parquet")) .unwrap(); - + let col_names = vec!["src", "dst", "time", "weight", "marbles"]; let df = process_parquet_file_to_df( parquet_file_path.as_path(), From fc9f2ed6b10227ca8bfe5fb21953596f9f2e1604 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Wed, 19 Jun 2024 16:21:52 +0100 Subject: [PATCH 07/33] impl/test load nodes from parquet and fix names order issue --- python/tests/data/parquet/nodes.parquet | Bin 0 -> 1615 bytes python/tests/test_load_from_parquet.py | 46 ++++++++++++-- raphtory/src/python/graph/graph.rs | 40 +++++++++++- raphtory/src/python/graph/io/dataframe.rs | 7 ++ raphtory/src/python/graph/io/df_loaders.rs | 2 +- .../src/python/graph/io/parquet_loaders.rs | 60 ++++++++++++++---- 6 files changed, 134 insertions(+), 21 deletions(-) create mode 100644 python/tests/data/parquet/nodes.parquet diff --git a/python/tests/data/parquet/nodes.parquet b/python/tests/data/parquet/nodes.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e52c475e71d9d934cf39a12c56e9cdef835bbf7b GIT binary patch literal 1615 zcmc&#Pmj_-6rYyTE!Z_=<20@5fy*`|vg=}k2jd=IVF9xkfdn^fOm+)Y78hvkBIwN+ z4<0;v@aTu|;Ms#m&&GK0W9-p4(*hB>Y;@yG`)1zXncsV_P=iK}O7xgks#K+9gOD9b zAPIqpi8YDD))gYHD|h20Bsp3sp&GrO#fTJ@sch=X%%UqoWDy6UB0-kIg%Fh6^mv9V$5Ftb-*IDT zw7s^F$`+i!8%QV?ok?8J?oC_dYmUgzu%F(iWm5O}M;wd&MK9+fd zYKu|A8VSBG_-IR{!$A>LVtq|6ikldDUH54G!VfDv(k9lcr1dSq>Kn$cI@@@ZC%Syr zIdO*$Rd6^qMK*ARs<`3(+9XvIkL4B)KeR25iySo7FpcVEPOkM&X34tbe*Se+w4bu3 zT8j2rzD9^)>MArE4-W2xJ<`v`n~a`=4FGj~8SMj1!R|o`pa+E>*V*WBpX~|&_t=W^ zY(PVn@ZJKpN}{+1I2-ZVXNgR{Oj YO6T(1JKMJm_NmMta)dvrHT*021): List of node property column names. Defaults to None. (optional) + /// const_properties (List): List of constant node property column names. Defaults to None. (optional) + /// shared_const_properties (Dictionary/Hashmap of properties): A dictionary of constant properties that will be added to every node. Defaults to None. 
(optional) + /// Returns: + /// Result<(), GraphError>: Result of the operation. + #[pyo3(signature = (parquet_file_path, id, time, node_type = None, node_type_in_df = true, properties = None, const_properties = None, shared_const_properties = None))] + fn load_nodes_from_parquet( + &self, + parquet_file_path: PathBuf, + id: &str, + time: &str, + node_type: Option<&str>, + node_type_in_df: Option, + properties: Option>, + const_properties: Option>, + shared_const_properties: Option>, + ) -> Result<(), GraphError> { + load_nodes_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + id, + time, + node_type, + node_type_in_df, + properties, + const_properties, + shared_const_properties, + ) + } + /// Load edges from a Pandas DataFrame into the graph. /// /// Arguments: diff --git a/raphtory/src/python/graph/io/dataframe.rs b/raphtory/src/python/graph/io/dataframe.rs index a3dae3d9c0..3762015ae8 100644 --- a/raphtory/src/python/graph/io/dataframe.rs +++ b/raphtory/src/python/graph/io/dataframe.rs @@ -18,6 +18,13 @@ pub(crate) struct PretendDF { } impl PretendDF { + pub(crate) fn get_inner_size(&self) -> usize { + if self.arrays.is_empty() || self.arrays[0].is_empty() { + return 0; + } + self.arrays[0][0].len() + } + pub fn check_cols_exist(&self, cols: &[&str]) -> Result<(), GraphError> { let non_cols: Vec<&&str> = cols .iter() diff --git a/raphtory/src/python/graph/io/df_loaders.rs b/raphtory/src/python/graph/io/df_loaders.rs index dd906d12f5..bd33ced02e 100644 --- a/raphtory/src/python/graph/io/df_loaders.rs +++ b/raphtory/src/python/graph/io/df_loaders.rs @@ -39,7 +39,7 @@ pub(crate) fn load_nodes_from_df<'a>( }; iter_res? } else { - Box::new(std::iter::repeat(Some(node_type))) + Box::new(iter::repeat(Some(node_type))) } } None => Box::new(iter::repeat(None)), diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index 20c29851fe..b4a2d087ae 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -1,17 +1,14 @@ use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; use std::collections::HashMap; -use std::num::TryFromIntError; use std::path::{Path, PathBuf}; use itertools::Itertools; use polars_arrow::array::Array; use polars_arrow::datatypes::{ArrowSchema, Field}; use polars_arrow::legacy::error; -use polars_arrow::legacy::error::PolarsResult; use polars_parquet::read; use polars_parquet::read::{FileMetaData, FileReader, read_metadata}; use crate::python::graph::io::{dataframe::*, df_loaders::*}; use polars_arrow::datatypes::ArrowDataType as DataType; -use polars_arrow::datatypes::ArrowSchema as Schema; use polars_arrow::record_batch::RecordBatch as Chunk; pub fn load_nodes_from_parquet( @@ -25,7 +22,34 @@ pub fn load_nodes_from_parquet( const_properties: Option>, shared_const_properties: Option>, ) -> Result<(), GraphError> { - todo!() + let mut cols_to_check = vec![id, time]; + cols_to_check.extend(properties.as_ref().unwrap_or(&Vec::new())); + cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new())); + if node_type_in_df.unwrap_or(true) { + if let Some(ref node_type) = node_type { + cols_to_check.push(node_type.as_ref()); + } + } + + let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = df.get_inner_size(); + + load_nodes_from_df( + &df, + size, + id, + time, + properties, + const_properties, + shared_const_properties, + node_type, 
+ node_type_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + + Ok(()) } pub fn load_edges_from_parquet( @@ -52,7 +76,7 @@ pub fn load_edges_from_parquet( let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; df.check_cols_exist(&cols_to_check)?; let size = cols_to_check.len(); - + load_edges_from_df( &df, size, @@ -98,10 +122,12 @@ pub(crate) fn process_parquet_file_to_df( parquet_file_path: &Path, col_names: Vec<&str>, ) -> Result { - let iter = read_parquet_file(parquet_file_path)?; + let (names, arrays) = read_parquet_file(parquet_file_path, &col_names)?; - let names = col_names.iter().map(|s| s.to_string()).collect(); - let arrays = iter.map_ok(|r| { + let names = names.into_iter() + .filter(|x| col_names.contains(&x.as_str())) + .collect(); + let arrays = arrays.map_ok(|r| { r.into_iter().map(|boxed| boxed.clone()).collect_vec() }).collect::, _>>()?; @@ -113,8 +139,9 @@ pub(crate) fn process_parquet_file_to_df( fn read_parquet_file( path: impl AsRef, -) -> Result>, error::PolarsError>>, GraphError> { - fn read_schema(metadata: &FileMetaData) -> Result { + col_names: &Vec<&str>, +) -> Result<(Vec, impl Iterator>, error::PolarsError>>), GraphError> { + let read_schema = |metadata: &FileMetaData| -> Result { let schema = read::infer_schema(metadata)?; let fields = schema .fields @@ -126,17 +153,25 @@ fn read_parquet_file( f.clone() } }) + .filter(|f| { // Filtered fields to avoid loading data that is not needed + col_names.contains(&f.name.as_str()) + }) .collect::>(); Ok(ArrowSchema::from(fields).with_metadata(schema.metadata)) - } + }; let mut file = std::fs::File::open(&path)?; let metadata = read_metadata(&mut file)?; let row_groups = metadata.clone().row_groups; let schema = read_schema(&metadata)?; + + // Although fields are already filtered by col_names, we need names in the order as it appears + // in the schema to create PretendDF + let names = schema.fields.iter().map(|f| f.name.clone()).collect_vec(); + let reader = FileReader::new(file, row_groups, schema, None, None, None); - Ok(reader) + Ok((names, reader)) } #[cfg(test)] @@ -146,7 +181,6 @@ mod test { #[test] fn test_process_parquet_file_to_df() { - // let parquet_file_path = Path::new("/tmp/parquet/test_data.parquet"); let parquet_file_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .parent() .map(|p| p.join("raphtory/resources/test/test_data.parquet")) From db8c036ec877c6ba94b0d87ebdd7a7602c2da93e Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:32:11 +0100 Subject: [PATCH 08/33] impl/test load from parquet. 
load node/edge props from parquet --- python/tests/test_load_from_parquet.py | 99 ++++++------ raphtory/src/python/graph/graph.rs | 143 +++++++++++++++++- .../src/python/graph/io/parquet_loaders.rs | 48 +++++- 3 files changed, 240 insertions(+), 50 deletions(-) diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py index 4d0bb432ae..525ab459ab 100644 --- a/python/tests/test_load_from_parquet.py +++ b/python/tests/test_load_from_parquet.py @@ -6,51 +6,60 @@ def test_load_from_parquet(): - # data = { - # "src": [1, 2, 3, 4, 5], - # "dst": [2, 3, 4, 5, 6], - # "time": [1, 2, 3, 4, 5], - # "weight": [1.0, 2.0, 3.0, 4.0, 5.0], - # "marbles": ["red", "blue", "green", "yellow", "purple"], - # } - # - # table = pa.table(data) - # pq.write_table(table, '/tmp/parquet/test_data.parquet') - - data = { - "id": [1, 2, 3, 4, 5, 6], - "name": ["Alice", "Bob", "Carol", "Dave", "Eve", "Frank"], - "time": [1, 2, 3, 4, 5, 6], - "node_type": ["p", "p", "p", "p", "p", "p"], - } - - table = pa.table(data) - pq.write_table(table, '/tmp/parquet/nodes.parquet') - - -# -# expected_nodes = [1, 2, 3, 4, 5, 6] -# expected_edges = [ -# (1, 2, 1.0, "red"), -# (2, 3, 2.0, "blue"), -# (3, 4, 3.0, "green"), -# (4, 5, 4.0, "yellow"), -# (5, 6, 5.0, "purple"), -# ] -# -# def assertions(g): -# edges = [] -# for e in g.edges: -# weight = e["weight"] -# marbles = e["marbles"] -# edges.append((e.src.id, e.dst.id, weight, marbles)) -# -# assert g.nodes.id.collect() == expected_nodes -# assert edges == expected_edges -# -# g = Graph.load_from_parquet('test_data.parquet', "src", "dst", "time", ["weight", "marbles"]) -# assertions(g) -# + nodes_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'nodes.parquet') + edges_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') + + expected_node_ids = [1, 2, 3, 4, 5, 6] + expected_nodes = [ + (1, "Alice"), + (2, "Bob"), + (3, "Carol"), + (4, "Dave"), + (5, "Eve"), + (6, "Frank"), + ] + expected_edges = [ + (1, 2, 1.0, "red"), + (2, 3, 2.0, "blue"), + (3, 4, 3.0, "green"), + (4, 5, 4.0, "yellow"), + (5, 6, 5.0, "purple"), + ] + + def assertions(g): + nodes = [] + for v in g.nodes: + name = v["name"] + nodes.append((v.id, name)) + assert g.nodes.id.collect() == expected_node_ids + assert nodes == expected_nodes + + edges = [] + for e in g.edges: + weight = e["weight"] + marbles = e["marbles"] + edges.append((e.src.id, e.dst.id, weight, marbles)) + assert edges == expected_edges + + g = Graph() + g.load_nodes_from_parquet(nodes_parquet_file_path, "id", "time", "node_type", properties=["name"]) + g.load_edges_from_parquet(edges_parquet_file_path, "src", "dst", "time", ["weight", "marbles"]) + assertions(g) + + g = Graph.load_from_parquet( + edge_parquet_file_path=edges_parquet_file_path, + edge_src="src", + edge_dst="dst", + edge_time="time", + edge_properties=["weight", "marbles"], + node_parquet_file_path=nodes_parquet_file_path, + node_id="id", + node_time="time", + node_properties=["name"], + node_type="node_type", + ) + assertions(g) + # # g = PersistentGraph.load_from_parquet( # # 'test_data.parquet', "src", "dst", "time", ["weight", "marbles"] # # ) diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 490c0a2f30..90145af146 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -460,6 +460,81 @@ impl PyGraph { Ok(graph.graph) } + /// Load a graph from Parquet files.
+ /// + /// Args: + /// edge_parquet_file_path (str): Parquet file containing the edges. + /// edge_src (str): The column name for the source node ids. + /// edge_dst (str): The column name for the destination node ids. + /// edge_time (str): The column name for the timestamps. + /// edge_properties (list): The column names for the temporal properties (optional) Defaults to None. + /// edge_const_properties (list): The column names for the constant properties (optional) Defaults to None. + /// edge_shared_const_properties (dict): A dictionary of constant properties that will be added to every edge (optional) Defaults to None. + /// edge_layer (str): The edge layer name (optional) Defaults to None. + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the edge_df or if it should be used directly as the layer for all edges (optional) defaults to True. + /// node_parquet_file_path (str): Parquet file containing the nodes (optional) Defaults to None. + /// node_id (str): The column name for the node ids (optional) Defaults to None. + /// node_time (str): The column name for the node timestamps (optional) Defaults to None. + /// node_properties (list): The column names for the node temporal properties (optional) Defaults to None. + /// node_const_properties (list): The column names for the node constant properties (optional) Defaults to None. + /// node_shared_const_properties (dict): A dictionary of constant properties that will be added to every node (optional) Defaults to None. + /// node_type (str): the column name for the node type + /// node_type_in_df (bool): whether the node type should be used to look up the values in a column of the df or if it should be used directly as the node type + /// + /// Returns: + /// Graph: The loaded Graph object. + #[staticmethod] + #[pyo3(signature = (edge_parquet_file_path, edge_src, edge_dst, edge_time, edge_properties = None, edge_const_properties = None, edge_shared_const_properties = None, + edge_layer = None, layer_in_df = true, node_parquet_file_path = None, node_id = None, node_time = None, node_properties = None, + node_const_properties = None, node_shared_const_properties = None, node_type = None, node_type_in_df = true))] + fn load_from_parquet( + edge_parquet_file_path: PathBuf, + edge_src: &str, + edge_dst: &str, + edge_time: &str, + edge_properties: Option>, + edge_const_properties: Option>, + edge_shared_const_properties: Option>, + edge_layer: Option<&str>, + layer_in_df: Option, + node_parquet_file_path: Option, + node_id: Option<&str>, + node_time: Option<&str>, + node_properties: Option>, + node_const_properties: Option>, + node_shared_const_properties: Option>, + node_type: Option<&str>, + node_type_in_df: Option, + ) -> Result { + let graph = PyGraph { + graph: Graph::new(), + }; + if let (Some(node_parquet_file_path), Some(node_id), Some(node_time)) = (node_parquet_file_path, node_id, node_time) { + graph.load_nodes_from_parquet( + node_parquet_file_path, + node_id, + node_time, + node_type, + node_type_in_df, + node_properties, + node_const_properties, + node_shared_const_properties, + )?; + } + graph.load_edges_from_parquet( + edge_parquet_file_path, + edge_src, + edge_dst, + edge_time, + edge_properties, + edge_const_properties, + edge_shared_const_properties, + edge_layer, + layer_in_df, + )?; + Ok(graph.graph) + } + /// Load nodes from a Pandas DataFrame into the graph. /// /// Arguments: @@ -501,7 +576,7 @@ impl PyGraph { /// Load nodes from a Parquet file into the graph. 
+ /// Load nodes from a Pandas DataFrame into the graph. /// /// Arguments: @@ -501,7 +576,7 @@ impl PyGraph { /// Load nodes from a Parquet file into the graph. /// /// Arguments: - /// parquet_file_path: Parquet file path + /// parquet_file_path (str): Parquet file path containing the nodes /// id (str): The column name for the node IDs. /// time (str): The column name for the timestamps. /// node_type (str): the column name for the node type @@ -581,7 +656,7 @@ impl PyGraph { /// Load edges from a Parquet file into the graph. /// /// Arguments: - /// parquet_file_path: Parquet file path + /// parquet_file_path (str): Parquet file path containing edges /// src (str): The column name for the source node ids. /// dst (str): The column name for the destination node ids. /// time (str): The column name for the update timestamps. @@ -647,6 +722,34 @@ impl PyGraph { ) } + /// Load node properties from a parquet file. + /// + /// Arguments: + /// parquet_file_path (str): Parquet file path containing node information. + /// id (str): The column name for the node IDs. + /// const_properties (List): List of constant node property column names. Defaults to None. (optional) + /// shared_const_properties (<HashMap<String, Prop>>): A dictionary of constant properties that will be added to every node. Defaults to None. (optional) + /// + /// Returns: + /// Result<(), GraphError>: Result of the operation. + #[pyo3(signature = (parquet_file_path, id, const_properties = None, shared_const_properties = None))] + fn load_node_props_from_parquet( + &self, + parquet_file_path: PathBuf, + id: &str, + const_properties: Option<Vec<&str>>, + shared_const_properties: Option<HashMap<String, Prop>>, + ) -> Result<(), GraphError> { + load_node_props_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + id, + const_properties, + shared_const_properties, + ) + }
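From Python, this constant-property loader is driven as in the tests later in the series; a small sketch (the path and tag value are illustrative):

    # attach constant properties to already-loaded nodes
    g.load_node_props_from_parquet(
        "data/parquet/nodes.parquet",
        "id",
        const_properties=["type"],
        shared_const_properties={"tag": "test_tag"},
    )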
+ + /// Load edge properties from a Pandas DataFrame. /// /// Arguments: @@ -682,4 +785,40 @@ impl PyGraph { layer_in_df, ) } + + /// Load edge properties from parquet file + /// + /// Arguments: + /// parquet_file_path (str): Parquet file path containing edge information. + /// src (str): The column name for the source node. + /// dst (str): The column name for the destination node. + /// const_properties (List): List of constant edge property column names. Defaults to None. (optional) + /// shared_const_properties (dict): A dictionary of constant properties that will be added to every edge. Defaults to None. (optional) + /// layer (str): Layer name. Defaults to None. (optional) + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the data frame or if it should be used directly as the layer for all edges (optional) defaults to True. + /// + /// Returns: + /// Result<(), GraphError>: Result of the operation. + #[pyo3(signature = (parquet_file_path, src, dst, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))] + fn load_edge_props_from_parquet( + &self, + parquet_file_path: PathBuf, + src: &str, + dst: &str, + const_properties: Option<Vec<&str>>, + shared_const_properties: Option<HashMap<String, Prop>>, + layer: Option<&str>, + layer_in_df: Option<bool>, + ) -> Result<(), GraphError> { + load_edge_props_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + src, + dst, + const_properties, + shared_const_properties, + layer, + layer_in_df, + ) + } } diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index b4a2d087ae..074fc6ce74 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -1,6 +1,6 @@ use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; use std::collections::HashMap; -use std::path::{Path, PathBuf}; +use std::path::Path; use itertools::Itertools; use polars_arrow::array::Array; use polars_arrow::datatypes::{ArrowSchema, Field}; @@ -102,7 +102,24 @@ pub fn load_node_props_from_parquet( const_properties: Option<Vec<&str>>, shared_const_properties: Option<HashMap<String, Prop>>, ) -> Result<(), GraphError> { - todo!() + let mut cols_to_check = vec![id]; + cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new())); + + let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = cols_to_check.len(); + + load_node_props_from_df( + &df, + size, + id, + const_properties, + shared_const_properties, + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + + Ok(()) } pub fn load_edge_props_from_parquet( @@ -115,7 +132,32 @@ pub fn load_edge_props_from_parquet( layer: Option<&str>, layer_in_df: Option<bool>, ) -> Result<(), GraphError> { - todo!() + let mut cols_to_check = vec![src, dst]; + if layer_in_df.unwrap_or(false) { + if let Some(ref layer) = layer { + cols_to_check.push(layer.as_ref()); + } + } + cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new())); + + let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = cols_to_check.len(); + + load_edges_props_from_df( + &df, + size, + src, + dst, + const_properties, + shared_const_properties, + layer, + layer_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + + Ok(()) } pub(crate) fn process_parquet_file_to_df( From 69ffe8982a758f7beac375dfffcb431cad94185c Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 10:19:08 +0100 Subject: [PATCH 09/33] add tests for props --- python/tests/data/parquet/edges.parquet | Bin 2130 -> 2829 bytes python/tests/data/parquet/nodes.parquet | Bin 1615 -> 1950 bytes python/tests/test_load_from_parquet.py | 175 ++++++++++++++++++++---- 3 files changed, 147 insertions(+), 28 deletions(-) diff --git a/python/tests/data/parquet/edges.parquet b/python/tests/data/parquet/edges.parquet index 59f63d062bc458aebb25cdc362f2cdd5ec3655a5..98b34238ff32a9a537c0392e0a6900ce7cf605af 100644 GIT binary patch delta 649 zcmca4&?~khna5zQ0z@Db$_We{RuFxOybU|`5etV}IZFqGtGFaly@Np2kzgs`b7 zgQ%FOlnzjbfGC?Nn~WKQ20KKDsRWc^2sDISg9%7-F)}lExG|`G5J9+#4XnBt>>P!6 jACaP)jCAfmIz}tX|g}FD4%4HDT4%CadJj#ZlWmPWOWutV^JPeCN%%4GWf^+2m+8hx%kkM@J_|$3P(E=<4X`TMne1LqWVK5a9wMKoX9Q<^ksV`XK4R
zP_SxukRU`OnB|lXq}(eqQp3Y4lY#0A1AvxPga8Q(poFU%*bry1QZN(DQgk!`(oQ9j u5nh=|?qO9CMt)^Ud43>4MWtvER&H8)X|Yw{FUM>a7oNg2tQi8ZmCQ`xc@)x-{PigBrN zh|S;>VPKGyk>GI*3i1!Pf@qOAz^o4f29qanI4FcVIy$bNez<`Vxj9C~M0vv-30l4%b*#H0l diff --git a/python/tests/data/parquet/nodes.parquet b/python/tests/data/parquet/nodes.parquet index e52c475e71d9d934cf39a12c56e9cdef835bbf7b..4da22cc3d7efab4750c9ce90a52fb44715e84253 100644 GIT binary patch literal 1950 zcmc&#Pfrs;6rbI0%N7>X5NES%mZTRZ94fY88)Nh^TUx53mIi_v5?RVxE2Uiuw9%6X z4<0;tFvcSvz@ssK0OP@fXJd?C!o+a&z1dw_p&p13H=Q?c_Wk?4KeH|KN`@+Qm6qnI zM#&fA4hGBxsDrEJFzc{zBIGCDvf-aU7c|$b(Rs07<|`2%OjGYM56> zWWgKtsv7~J-CC0psl3x}H553`I-RJIDs)^-wb*u=8;5huk;$kK7*XK_^#07`Rn($d zkcrZCHeHd+H%b34v3Cml5V4N97>N1*_2P}%>xHsY4>It(@ob3g9jnm;b9*V##jqiz zlnOO0yY1~(GnpCJ$TX}OSQp2&bhb~H8_)_hu*(@CmpRgq_al7`*?H{w4S6l=KP2{f zn0<|7LqhX@M+2b`(=+o_7FQ&iz#;bl))>`$w*b6Jo*{W4izodVmo&mYj`m>0o2_;C z{sESz-eLP1`=TKd-*!-Jvyml6Y`gl<-E`%Z;PYLFPy_#^(1!CZ z)Xyyf5%KbVJds3SJhZpd&T>Y;<<}nt`rHiE^mPNk=&%@w%2{Xg;({-Mh7&-@$obUGdY delta 457 zcmbQof1YPUKGS3^W=p1XT$96DWm%iJ)NXK2u4I)F5M_~!F=dcoD^AWx%}o^Ln7o-; zd-8i`F-D(>4Kb6GSeBz}kyU%ZCdQ-2A$EvOgn>a)MuNvND9AtD3Sxl74<>yOsBoOT zht7X(=8# zL^^?)U;=D@ihE8;6xe=8=SZjqN5@c*0+&#j0Zu_+ z4ap!9-GHQAw-T_WU Date: Thu, 20 Jun 2024 10:33:18 +0100 Subject: [PATCH 10/33] ref tests --- python/tests/test_load_from_parquet.py | 106 ++----------------------- 1 file changed, 6 insertions(+), 100 deletions(-) diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py index d5550508ba..6e35d8235e 100644 --- a/python/tests/test_load_from_parquet.py +++ b/python/tests/test_load_from_parquet.py @@ -84,59 +84,10 @@ def assertions(g): ) assertions(g) - -def test_load_nodes_from_parquet(): - nodes_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'nodes.parquet') - expected_node_ids = [1, 2, 3, 4, 5, 6] - expected_nodes = [ - (1, "Alice"), - (2, "Bob"), - (3, "Carol"), - (4, "Dave"), - (5, "Eve"), - (6, "Frank"), - ] - g = Graph() g.load_nodes_from_parquet(nodes_parquet_file_path, "id", "time", "node_type", properties=["name"]) - - nodes = [] - for v in g.nodes: - name = v["name"] - nodes.append((v.id, name)) - - assert g.nodes.id.collect() == expected_node_ids - assert nodes == expected_nodes - - -def test_load_node_props_from_parquet(): - nodes_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'nodes.parquet') - expected_node_ids = [1, 2, 3, 4, 5, 6] - expected_nodes = [ - (1, "Alice"), - (2, "Bob"), - (3, "Carol"), - (4, "Dave"), - (5, "Eve"), - (6, "Frank"), - ] - - g = Graph() - g.load_nodes_from_parquet( - nodes_parquet_file_path, - "id", - "time", - "node_type", - properties=["name"] - ) - - nodes = [] - for v in g.nodes: - name = v["name"] - nodes.append((v.id, name)) - - assert g.nodes.id.collect() == expected_node_ids - assert nodes == expected_nodes + g.load_edges_from_parquet(edges_parquet_file_path, "src", "dst", "time", ["weight", "marbles"], layer="layers") + assertions(g) g.load_node_props_from_parquet( nodes_parquet_file_path, @@ -144,7 +95,6 @@ def test_load_node_props_from_parquet(): const_properties=["type"], shared_const_properties={"tag": "test_tag"}, ) - assert g.nodes.properties.constant.get("type").collect() == [ "Person 1", "Person 2", @@ -162,50 +112,6 @@ def test_load_node_props_from_parquet(): "test_tag", ] - -def test_load_edges_from_parquet(): - edges_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') - expected_edges = [ - (1, 2, 1.0, "red"), - 
(2, 3, 2.0, "blue"), - (3, 4, 3.0, "green"), - (4, 5, 4.0, "yellow"), - (5, 6, 5.0, "purple"), - ] - - g = Graph() - g.load_edges_from_parquet(edges_parquet_file_path, "src", "dst", "time", ["weight", "marbles"]) - - edges = [] - for e in g.edges: - weight = e["weight"] - marbles = e["marbles"] - edges.append((e.src.id, e.dst.id, weight, marbles)) - - assert edges == expected_edges - - -def test_load_edge_props_from_parquet(): - edges_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') - expected_edges = [ - (1, 2, 1.0, "red"), - (2, 3, 2.0, "blue"), - (3, 4, 3.0, "green"), - (4, 5, 4.0, "yellow"), - (5, 6, 5.0, "purple"), - ] - - g = Graph() - g.load_edges_from_parquet(edges_parquet_file_path, "src", "dst", "time", ["weight", "marbles"], layer="layers") - - edges = [] - for e in g.edges: - weight = e["weight"] - marbles = e["marbles"] - edges.append((e.src.id, e.dst.id, weight, marbles)) - - assert edges == expected_edges - g.load_edge_props_from_parquet( edges_parquet_file_path, "src", @@ -217,10 +123,10 @@ def test_load_edge_props_from_parquet(): assert g.layers( ["layer 1", "layer 2", "layer 3"] ).edges.properties.constant.get("marbles_const").collect() == [ - {"layer 1": "red"}, - {"layer 2": "blue"}, - {"layer 3": "green"}, - ] + {"layer 1": "red"}, + {"layer 2": "blue"}, + {"layer 3": "green"}, + ] assert g.edges.properties.constant.get("tag").collect() == [ {"layer 1": "test_tag"}, {"layer 2": "test_tag"}, From e09f3878232381fbba69b09a547a438a88d1580b Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 10:58:27 +0100 Subject: [PATCH 11/33] simplify tests --- python/tests/test_load_from_parquet.py | 128 +++++++++++++++---------- 1 file changed, 77 insertions(+), 51 deletions(-) diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py index 6e35d8235e..dc2892232e 100644 --- a/python/tests/test_load_from_parquet.py +++ b/python/tests/test_load_from_parquet.py @@ -1,12 +1,12 @@ import os -import pandas as pd import pyarrow as pa import pyarrow.parquet as pq import pytest from raphtory import Graph, PersistentGraph + @pytest.mark.skip(reason="Prepares data for debugging purposes") def test_prepare_data(): data = { @@ -34,10 +34,11 @@ def test_prepare_data(): pq.write_table(table, '/tmp/parquet/edges.parquet') -def test_load_from_parquet(): - edges_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') - nodes_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'nodes.parquet') +edges_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') +nodes_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'nodes.parquet') + +def assert_expected_nodes(g): expected_node_ids = [1, 2, 3, 4, 5, 6] expected_nodes = [ (1, "Alice"), @@ -47,6 +48,15 @@ def test_load_from_parquet(): (5, "Eve"), (6, "Frank"), ] + nodes = [] + for v in g.nodes: + name = v["name"] + nodes.append((v.id, name)) + assert g.nodes.id.collect() == expected_node_ids + assert nodes == expected_nodes + + +def assert_expected_edges(g): expected_edges = [ (1, 2, 1.0, "red"), (2, 3, 2.0, "blue"), @@ -54,47 +64,15 @@ def test_load_from_parquet(): (4, 5, 4.0, "yellow"), (5, 6, 5.0, "purple"), ] + edges = [] + for e in g.edges: + weight = e["weight"] + marbles = e["marbles"] + edges.append((e.src.id, e.dst.id, weight, marbles)) + assert edges == expected_edges - def 
assertions(g): - nodes = [] - for v in g.nodes: - name = v["name"] - nodes.append((v.id, name)) - assert g.nodes.id.collect() == expected_node_ids - assert nodes == expected_nodes - - edges = [] - for e in g.edges: - weight = e["weight"] - marbles = e["marbles"] - edges.append((e.src.id, e.dst.id, weight, marbles)) - assert edges == expected_edges - - g = Graph.load_from_parquet( - edge_parquet_file_path=edges_parquet_file_path, - edge_src="src", - edge_dst="dst", - edge_time="time", - edge_properties=["weight", "marbles"], - node_parquet_file_path=nodes_parquet_file_path, - node_id="id", - node_time="time", - node_properties=["name"], - node_type="node_type", - ) - assertions(g) - - g = Graph() - g.load_nodes_from_parquet(nodes_parquet_file_path, "id", "time", "node_type", properties=["name"]) - g.load_edges_from_parquet(edges_parquet_file_path, "src", "dst", "time", ["weight", "marbles"], layer="layers") - assertions(g) - g.load_node_props_from_parquet( - nodes_parquet_file_path, - "id", - const_properties=["type"], - shared_const_properties={"tag": "test_tag"}, - ) +def assert_expected_node_properties(g): assert g.nodes.properties.constant.get("type").collect() == [ "Person 1", "Person 2", @@ -112,14 +90,8 @@ def assertions(g): "test_tag", ] - g.load_edge_props_from_parquet( - edges_parquet_file_path, - "src", - "dst", - const_properties=["marbles_const"], - shared_const_properties={"tag": "test_tag"}, - layer="layers", - ) + +def assert_expected_edge_properties(g): assert g.layers( ["layer 1", "layer 2", "layer 3"] ).edges.properties.constant.get("marbles_const").collect() == [ @@ -134,3 +106,57 @@ def assertions(g): {"layer 4": "test_tag"}, {"layer 5": "test_tag"}, ] + + +def test_load_from_parquet(): + g = Graph.load_from_parquet( + edge_parquet_file_path=edges_parquet_file_path, + edge_src="src", + edge_dst="dst", + edge_time="time", + edge_properties=["weight", "marbles"], + node_parquet_file_path=nodes_parquet_file_path, + node_id="id", + node_time="time", + node_properties=["name"], + node_type="node_type", + ) + assert_expected_nodes(g) + assert_expected_edges(g) + + g = Graph() + g.load_nodes_from_parquet( + nodes_parquet_file_path, + "id", + "time", + "node_type", + properties=["name"] + ) + g.load_edges_from_parquet( + edges_parquet_file_path, + "src", + "dst", + "time", + ["weight", "marbles"], + layer="layers" + ) + assert_expected_nodes(g) + assert_expected_edges(g) + + g.load_node_props_from_parquet( + nodes_parquet_file_path, + "id", + const_properties=["type"], + shared_const_properties={"tag": "test_tag"}, + ) + assert_expected_node_properties(g) + + g.load_edge_props_from_parquet( + edges_parquet_file_path, + "src", + "dst", + const_properties=["marbles_const"], + shared_const_properties={"tag": "test_tag"}, + layer="layers", + ) + assert_expected_edge_properties(g) From 7744a323aafb68afff8003d012376bd0ada6add6 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 11:27:08 +0100 Subject: [PATCH 12/33] more tests --- python/tests/test_load_from_parquet.py | 148 +++++++++++++++++++++++-- 1 file changed, 139 insertions(+), 9 deletions(-) diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py index dc2892232e..512ffc8b00 100644 --- a/python/tests/test_load_from_parquet.py +++ b/python/tests/test_load_from_parquet.py @@ -1,4 +1,5 @@ import os +import re import pyarrow as pa import pyarrow.parquet as pq @@ -72,15 +73,18 @@ def assert_expected_edges(g): assert edges == 
expected_edges -def assert_expected_node_properties(g): - assert g.nodes.properties.constant.get("type").collect() == [ - "Person 1", - "Person 2", - "Person 3", - "Person 4", - "Person 5", - "Person 6", +def assert_expected_node_types(g): + assert g.nodes.node_type == [ + "p", + "p", + "p", + "p", + "p", + "p", ] + + +def assert_expected_node_property_tag(g): assert g.nodes.properties.constant.get("tag").collect() == [ "test_tag", "test_tag", @@ -91,6 +95,28 @@ def assert_expected_node_properties(g): ] +def assert_expected_node_property_type(g): + assert g.nodes.properties.constant.get("type").collect() == [ + "Person 1", + "Person 2", + "Person 3", + "Person 4", + "Person 5", + "Person 6", + ] + + +def assert_expected_node_property_dept(g): + assert g.nodes.properties.constant.get("dept").collect() == [ + "Sales", + "Sales", + "Sales", + "Sales", + "Sales", + "Sales", + ] + + def assert_expected_edge_properties(g): assert g.layers( ["layer 1", "layer 2", "layer 3"] @@ -108,6 +134,48 @@ def assert_expected_edge_properties(g): ] +def assert_expected_edge_properties_test_layer(g): + assert g.edges.properties.constant.get("type").collect() == [ + {"test_layer": "Edge"}, + {"test_layer": "Edge"}, + {"test_layer": "Edge"}, + {"test_layer": "Edge"}, + {"test_layer": "Edge"}, + ] + assert g.edges.properties.constant.get("tag").collect() == [ + {"test_layer": "test_tag"}, + {"test_layer": "test_tag"}, + {"test_layer": "test_tag"}, + {"test_layer": "test_tag"}, + {"test_layer": "test_tag"}, + ] + assert g.edges.properties.constant.get("tag").collect() == [ + {"test_layer": "test_tag"}, + {"test_layer": "test_tag"}, + {"test_layer": "test_tag"}, + {"test_layer": "test_tag"}, + {"test_layer": "test_tag"}, + ] + + +def assert_expected_layers(g): + assert g.unique_layers == ["_default", "layer 1", "layer 2", "layer 3", "layer 4", "layer 5"] + assert g.layers(["layer 1"]).edges.src.id.collect() == [1] + assert g.layers(["layer 1", "layer 2"]).edges.src.id.collect() == [1, 2] + assert g.layers(["layer 1", "layer 2", "layer 3"]).edges.src.id.collect() == [1, 2, 3] + assert g.layers(["layer 1", "layer 4", "layer 5"]).edges.src.id.collect() == [1, 4, 5] + with pytest.raises( + Exception, + match=re.escape("Invalid layer test_layer."), + ): + g.layers(["test_layer"]) + + +def assert_expected_test_layer(g): + assert g.unique_layers == ["_default", "test_layer"] + assert g.layers(["test_layer"]).edges.src.id.collect() == [1, 2, 3, 4, 5] + + def test_load_from_parquet(): g = Graph.load_from_parquet( edge_parquet_file_path=edges_parquet_file_path, @@ -142,6 +210,7 @@ def test_load_from_parquet(): ) assert_expected_nodes(g) assert_expected_edges(g) + assert_expected_layers(g) g.load_node_props_from_parquet( nodes_parquet_file_path, @@ -149,7 +218,8 @@ def test_load_from_parquet(): const_properties=["type"], shared_const_properties={"tag": "test_tag"}, ) - assert_expected_node_properties(g) + assert_expected_node_property_tag(g) + assert_expected_node_property_type(g) g.load_edge_props_from_parquet( edges_parquet_file_path, @@ -160,3 +230,63 @@ def test_load_from_parquet(): layer="layers", ) assert_expected_edge_properties(g) + assert_expected_layers(g) + + g = Graph() + g.load_nodes_from_parquet( + nodes_parquet_file_path, + "id", + "time", + "node_type", + properties=["name"], + shared_const_properties={"tag": "test_tag"}, + ) + assert_expected_node_types(g) + assert_expected_node_property_tag(g) + + g = Graph() + g.load_edges_from_parquet( + edges_parquet_file_path, + "src", + "dst", + "time", + 
properties=["weight", "marbles"], + const_properties=["marbles_const"], + shared_const_properties={"type": "Edge", "tag": "test_tag"}, + layer="test_layer", + layer_in_df=False, + ) + assert_expected_edge_properties_test_layer(g) + assert_expected_test_layer(g) + + g = Graph.load_from_parquet( + edge_parquet_file_path=edges_parquet_file_path, + edge_src="src", + edge_dst="dst", + edge_time="time", + edge_layer="test_layer", + layer_in_df=False, + node_parquet_file_path=nodes_parquet_file_path, + node_id="id", + node_time="time", + node_properties=["name"], + node_shared_const_properties={"dept": "Sales"}, + ) + assert_expected_test_layer(g) + assert_expected_node_property_dept(g) + + g = Graph.load_from_parquet( + edge_parquet_file_path=edges_parquet_file_path, + edge_src="src", + edge_dst="dst", + edge_time="time", + edge_layer="layers", + node_parquet_file_path=nodes_parquet_file_path, + node_id="id", + node_time="time", + node_properties=["name"], + node_const_properties=["type"], + ) + assert_expected_node_property_type(g) + assert_expected_layers(g) + From 76238f1127cfa6ea9592e12c722d17842129df45 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 11:42:45 +0100 Subject: [PATCH 13/33] impl loaders for persistent graphs --- raphtory/src/python/graph/graph.rs | 2 +- .../src/python/graph/graph_with_deletions.rs | 219 ++++++++++++++++++ 2 files changed, 220 insertions(+), 1 deletion(-) diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 90145af146..f2be26a3a9 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -460,7 +460,7 @@ impl PyGraph { Ok(graph.graph) } - /// Load a graph from a Pandas DataFrame. + /// Load a graph from Parquet file. /// /// Args: /// edge_parquet_file_path (str): Parquet file containing the edges. diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 3dd743fef0..9c209c36f5 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -29,6 +29,7 @@ use std::{ fmt::{Debug, Formatter}, path::{Path, PathBuf}, }; +use crate::python::graph::io::parquet_loaders::{load_edge_props_from_parquet, load_edges_from_parquet, load_node_props_from_parquet, load_nodes_from_parquet}; use super::{ graph::PyGraph, @@ -455,6 +456,81 @@ impl PyPersistentGraph { Ok(graph.graph) } + /// Load a graph from Parquet file. + /// + /// Args: + /// edge_parquet_file_path (str): Parquet file containing the edges. + /// edge_src (str): The column name for the source node ids. + /// edge_dst (str): The column name for the destination node ids. + /// edge_time (str): The column name for the timestamps. + /// edge_properties (list): The column names for the temporal properties (optional) Defaults to None. + /// edge_const_properties (list): The column names for the constant properties (optional) Defaults to None. + /// edge_shared_const_properties (dict): A dictionary of constant properties that will be added to every edge (optional) Defaults to None. + /// edge_layer (str): The edge layer name (optional) Defaults to None. + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the edge_df or if it should be used directly as the layer for all edges (optional) defaults to True. + /// node_parquet_file_path (str): Parquet file containing the nodes (optional) Defaults to None. 
+ /// node_id (str): The column name for the node ids (optional) Defaults to None. + /// node_time (str): The column name for the node timestamps (optional) Defaults to None. + /// node_properties (list): The column names for the node temporal properties (optional) Defaults to None. + /// node_const_properties (list): The column names for the node constant properties (optional) Defaults to None. + /// node_shared_const_properties (dict): A dictionary of constant properties that will be added to every node (optional) Defaults to None. + /// node_type (str): the column name for the node type + /// node_type_in_df (bool): whether the node type should be used to look up the values in a column of the df or if it should be used directly as the node type + /// + /// Returns: + /// PersistentGraph: The loaded PersistentGraph object. + #[staticmethod] + #[pyo3(signature = (edge_parquet_file_path, edge_src, edge_dst, edge_time, edge_properties = None, edge_const_properties = None, edge_shared_const_properties = None, + edge_layer = None, layer_in_df = true, node_parquet_file_path = None, node_id = None, node_time = None, node_properties = None, + node_const_properties = None, node_shared_const_properties = None, node_type = None, node_type_in_df = true))] + fn load_from_parquet( + edge_parquet_file_path: PathBuf, + edge_src: &str, + edge_dst: &str, + edge_time: &str, + edge_properties: Option<Vec<&str>>, + edge_const_properties: Option<Vec<&str>>, + edge_shared_const_properties: Option<HashMap<String, Prop>>, + edge_layer: Option<&str>, + layer_in_df: Option<bool>, + node_parquet_file_path: Option<PathBuf>, + node_id: Option<&str>, + node_time: Option<&str>, + node_properties: Option<Vec<&str>>, + node_const_properties: Option<Vec<&str>>, + node_shared_const_properties: Option<HashMap<String, Prop>>, + node_type: Option<&str>, + node_type_in_df: Option<bool>, + ) -> Result<PersistentGraph, GraphError> { + let graph = PyPersistentGraph { + graph: PersistentGraph::new(), + }; + if let (Some(node_parquet_file_path), Some(node_id), Some(node_time)) = (node_parquet_file_path, node_id, node_time) { + graph.load_nodes_from_parquet( + node_parquet_file_path, + node_id, + node_time, + node_type, + node_type_in_df, + node_properties, + node_const_properties, + node_shared_const_properties, + )?; + } + graph.load_edges_from_parquet( + edge_parquet_file_path, + edge_src, + edge_dst, + edge_time, + edge_properties, + edge_const_properties, + edge_shared_const_properties, + edge_layer, + layer_in_df, + )?; + Ok(graph.graph) + }
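The call mirrors the Graph variant exactly; a sketch based on the persistent-graph tests added later in the series (paths illustrative):

    from raphtory import PersistentGraph

    g = PersistentGraph.load_from_parquet(
        edge_parquet_file_path="data/parquet/edges.parquet",
        edge_src="src",
        edge_dst="dst",
        edge_time="time",
        edge_properties=["weight", "marbles"],
        node_parquet_file_path="data/parquet/nodes.parquet",
        node_id="id",
        node_time="time",
        node_properties=["name"],
        node_type="node_type",
    )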
+ /// Load nodes from a Pandas DataFrame into the graph. /// /// Arguments: @@ -493,6 +569,44 @@ impl PyPersistentGraph { ) } + /// Load nodes from a Parquet file into the graph. + /// + /// Arguments: + /// parquet_file_path (str): Parquet file path containing the nodes + /// id (str): The column name for the node IDs. + /// time (str): The column name for the timestamps. + /// node_type (str): the column name for the node type + /// node_type_in_df (bool): whether the node type should be used to look up the values in a column of the df or if it should be used directly as the node type + /// properties (List): List of node property column names. Defaults to None. (optional) + /// const_properties (List): List of constant node property column names. Defaults to None. (optional) + /// shared_const_properties (Dictionary/Hashmap of properties): A dictionary of constant properties that will be added to every node. Defaults to None. (optional) + /// Returns: + /// Result<(), GraphError>: Result of the operation. + #[pyo3(signature = (parquet_file_path, id, time, node_type = None, node_type_in_df = true, properties = None, const_properties = None, shared_const_properties = None))] + fn load_nodes_from_parquet( + &self, + parquet_file_path: PathBuf, + id: &str, + time: &str, + node_type: Option<&str>, + node_type_in_df: Option<bool>, + properties: Option<Vec<&str>>, + const_properties: Option<Vec<&str>>, + shared_const_properties: Option<HashMap<String, Prop>>, + ) -> Result<(), GraphError> { + load_nodes_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + id, + time, + node_type, + node_type_in_df, + properties, + const_properties, + shared_const_properties, + ) + } + /// Load edges from a Pandas DataFrame into the graph. /// /// Arguments: @@ -535,6 +649,48 @@ impl PyPersistentGraph { ) } + /// Load edges from a Parquet file into the graph. + /// + /// Arguments: + /// parquet_file_path (str): Parquet file path containing edges + /// src (str): The column name for the source node ids. + /// dst (str): The column name for the destination node ids. + /// time (str): The column name for the update timestamps. + /// properties (List): List of edge property column names. Defaults to None. (optional) + /// const_properties (List): List of constant edge property column names. Defaults to None. (optional) + /// shared_const_properties (dict): A dictionary of constant properties that will be added to every edge. Defaults to None. (optional) + /// layer (str): The edge layer name (optional) Defaults to None. + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dateframe or if it should be used directly as the layer for all edges (optional) defaults to True. + /// + /// Returns: + /// Result<(), GraphError>: Result of the operation. + #[pyo3(signature = (parquet_file_path, src, dst, time, properties = None, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))] + fn load_edges_from_parquet( + &self, + parquet_file_path: PathBuf, + src: &str, + dst: &str, + time: &str, + properties: Option<Vec<&str>>, + const_properties: Option<Vec<&str>>, + shared_const_properties: Option<HashMap<String, Prop>>, + layer: Option<&str>, + layer_in_df: Option<bool>, + ) -> Result<(), GraphError> { + load_edges_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + src, + dst, + time, + properties, + const_properties, + shared_const_properties, + layer, + layer_in_df, + ) + }
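These incremental loaders compose the same way as in the tests; a small Python sketch, assuming illustrative paths and the fixture column names used throughout this series:

    from raphtory import PersistentGraph

    g = PersistentGraph()
    # nodes first, then edges; "layers" is a column holding per-row layer names
    g.load_nodes_from_parquet(
        "data/parquet/nodes.parquet", "id", "time", "node_type",
        properties=["name"],
    )
    g.load_edges_from_parquet(
        "data/parquet/edges.parquet", "src", "dst", "time",
        ["weight", "marbles"],
        layer="layers",
    )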
+ /// Load edges deletions from a Pandas DataFrame into the graph. /// /// Arguments: @@ -622,6 +778,33 @@ impl PyPersistentGraph { ) } + /// Load node properties from a parquet file. + /// + /// Arguments: + /// parquet_file_path (str): Parquet file path containing node information. + /// id (str): The column name for the node IDs. + /// const_properties (List): List of constant node property column names. Defaults to None. (optional) + /// shared_const_properties (<HashMap<String, Prop>>): A dictionary of constant properties that will be added to every node. Defaults to None. (optional) + /// + /// Returns: + /// Result<(), GraphError>: Result of the operation. + #[pyo3(signature = (parquet_file_path, id, const_properties = None, shared_const_properties = None))] + fn load_node_props_from_parquet( + &self, + parquet_file_path: PathBuf, + id: &str, + const_properties: Option<Vec<&str>>, + shared_const_properties: Option<HashMap<String, Prop>>, + ) -> Result<(), GraphError> { + load_node_props_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + id, + const_properties, + shared_const_properties, + ) + } + /// Load edge properties from a Pandas DataFrame. /// /// Arguments: @@ -657,4 +840,40 @@ impl PyPersistentGraph { layer_in_df, ) } + + /// Load edge properties from parquet file + /// + /// Arguments: + /// parquet_file_path (str): Parquet file path containing edge information. + /// src (str): The column name for the source node. + /// dst (str): The column name for the destination node. + /// const_properties (List): List of constant edge property column names. Defaults to None. (optional) + /// shared_const_properties (dict): A dictionary of constant properties that will be added to every edge. Defaults to None. (optional) + /// layer (str): Layer name. Defaults to None. (optional) + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the data frame or if it should be used directly as the layer for all edges (optional) defaults to True. + /// + /// Returns: + /// Result<(), GraphError>: Result of the operation. + #[pyo3(signature = (parquet_file_path, src, dst, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))] + fn load_edge_props_from_parquet( + &self, + parquet_file_path: PathBuf, + src: &str, + dst: &str, + const_properties: Option<Vec<&str>>, + shared_const_properties: Option<HashMap<String, Prop>>, + layer: Option<&str>, + layer_in_df: Option<bool>, + ) -> Result<(), GraphError> { + load_edge_props_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + src, + dst, + const_properties, + shared_const_properties, + layer, + layer_in_df, + ) + } } From ff0e6ebf56af2ce3057d7ce26f17f0fcfa493162 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 11:44:28 +0100 Subject: [PATCH 14/33] impl loader tests for persistent graphs --- python/tests/test_load_from_parquet.py | 117 ++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py index 512ffc8b00..1ba07575b3 100644 --- a/python/tests/test_load_from_parquet.py +++ b/python/tests/test_load_from_parquet.py @@ -176,7 +176,7 @@ def assert_expected_test_layer(g): assert g.layers(["test_layer"]).edges.src.id.collect() == [1, 2, 3, 4, 5] -def test_load_from_parquet(): +def test_load_from_parquet_graphs(): g = Graph.load_from_parquet( edge_parquet_file_path=edges_parquet_file_path, @@ -290,3 +290,118 @@ def test_load_from_parquet_graphs(): assert_expected_node_property_type(g) assert_expected_layers(g) + +def test_load_from_parquet_persistent_graphs(): + g = PersistentGraph.load_from_parquet( + edge_parquet_file_path=edges_parquet_file_path, + edge_src="src", + edge_dst="dst", + edge_time="time", + edge_properties=["weight", "marbles"], + node_parquet_file_path=nodes_parquet_file_path, + node_id="id", + node_time="time", + node_properties=["name"], + node_type="node_type", + ) + assert_expected_nodes(g) + assert_expected_edges(g) + + g = PersistentGraph() + g.load_nodes_from_parquet( + nodes_parquet_file_path, + "id", + "time", + "node_type", + properties=["name"] + ) + g.load_edges_from_parquet( + edges_parquet_file_path, + "src", + "dst", + "time", + ["weight", "marbles"], + layer="layers" + ) + assert_expected_nodes(g) + assert_expected_edges(g) + assert_expected_layers(g) + + g.load_node_props_from_parquet( + nodes_parquet_file_path, + "id", + const_properties=["type"], + shared_const_properties={"tag": "test_tag"}, + ) + assert_expected_node_property_tag(g) + assert_expected_node_property_type(g) + + g.load_edge_props_from_parquet( + edges_parquet_file_path,
+ "src", + "dst", + const_properties=["marbles_const"], + shared_const_properties={"tag": "test_tag"}, + layer="layers", + ) + assert_expected_edge_properties(g) + assert_expected_layers(g) + + g = PersistentGraph() + g.load_nodes_from_parquet( + nodes_parquet_file_path, + "id", + "time", + "node_type", + properties=["name"], + shared_const_properties={"tag": "test_tag"}, + ) + assert_expected_node_types(g) + assert_expected_node_property_tag(g) + + g = PersistentGraph() + g.load_edges_from_parquet( + edges_parquet_file_path, + "src", + "dst", + "time", + properties=["weight", "marbles"], + const_properties=["marbles_const"], + shared_const_properties={"type": "Edge", "tag": "test_tag"}, + layer="test_layer", + layer_in_df=False, + ) + assert_expected_edge_properties_test_layer(g) + assert_expected_test_layer(g) + + g = Graph.load_from_parquet( + edge_parquet_file_path=edges_parquet_file_path, + edge_src="src", + edge_dst="dst", + edge_time="time", + edge_layer="test_layer", + layer_in_df=False, + node_parquet_file_path=nodes_parquet_file_path, + node_id="id", + node_time="time", + node_properties=["name"], + node_shared_const_properties={"dept": "Sales"}, + ) + assert_expected_test_layer(g) + assert_expected_node_property_dept(g) + + g = PersistentGraph.load_from_parquet( + edge_parquet_file_path=edges_parquet_file_path, + edge_src="src", + edge_dst="dst", + edge_time="time", + edge_layer="layers", + node_parquet_file_path=nodes_parquet_file_path, + node_id="id", + node_time="time", + node_properties=["name"], + node_const_properties=["type"], + ) + assert_expected_node_property_type(g) + assert_expected_layers(g) + From ec1e9abba41e3394a992d2169e5d5be49c660dd3 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 11:50:10 +0100 Subject: [PATCH 15/33] move load_edges_deletions_from_pandas to panda_loaders --- .../src/python/graph/graph_with_deletions.rs | 53 +++++-------------- raphtory/src/python/graph/io/panda_loaders.rs | 46 ++++++++++++++++ 2 files changed, 59 insertions(+), 40 deletions(-) diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 9c209c36f5..4b6ef19382 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -36,7 +36,7 @@ use super::{ io::{ dataframe::GraphLoadException, panda_loaders::*, - df_loaders::load_edges_deletions_from_df + df_loaders::load_edges_deletions_from_df, }, }; @@ -68,8 +68,8 @@ impl IntoPy for PersistentGraph { PyGraphView::from(self), ), ) - .unwrap() // I think this only fails if we are out of memory? Seems to be unavoidable if we want to create an actual graph. - .into_py(py) + .unwrap() // I think this only fails if we are out of memory? Seems to be unavoidable if we want to create an actual graph. + .into_py(py) } } @@ -712,43 +712,16 @@ impl PyPersistentGraph { time: &str, layer: Option<&str>, layer_in_df: Option, - ) -> Result<(), GraphError> { // TODO: move this to panda_loaders - let graph = &self.graph.0; - Python::with_gil(|py| { - let size: usize = py - .eval( - "index.__len__()", - Some([("index", df.getattr("index")?)].into_py_dict(py)), - None, - )? 
- .extract()?; - - let mut cols_to_check = vec![src, dst, time]; - if layer_in_df.unwrap_or(true) { - if let Some(ref layer) = layer { - cols_to_check.push(layer.as_ref()); - } - } - - let df = process_pandas_py_df(df, py, cols_to_check.clone())?; - - df.check_cols_exist(&cols_to_check)?; - load_edges_deletions_from_df( - &df, - size, - src, - dst, - time, - layer, - layer_in_df.unwrap_or(true), - graph, - ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; - - Ok::<(), PyErr>(()) - }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; - Ok(()) + ) -> Result<(), GraphError> { + load_edges_deletions_from_pandas( + &self.graph.0, + df, + src, + dst, + time, + layer, + layer_in_df, + ) } /// Load node properties from a Pandas DataFrame. diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/panda_loaders.rs index 274eea197b..3a4928c614 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -193,6 +193,52 @@ pub fn load_edge_props_from_pandas( Ok(()) } +pub fn load_edges_deletions_from_pandas( + graph: &InternalGraph, + df: &PyAny, + src: &str, + dst: &str, + time: &str, + layer: Option<&str>, + layer_in_df: Option<bool>, +) -> Result<(), GraphError> { + Python::with_gil(|py| { + let size: usize = py + .eval( + "index.__len__()", + Some([("index", df.getattr("index")?)].into_py_dict(py)), + None, + )? + .extract()?; + + let mut cols_to_check = vec![src, dst, time]; + if layer_in_df.unwrap_or(true) { + if let Some(ref layer) = layer { + cols_to_check.push(layer.as_ref()); + } + } + + let df = process_pandas_py_df(df, py, cols_to_check.clone())?; + + df.check_cols_exist(&cols_to_check)?; + load_edges_deletions_from_df( + &df, + size, + src, + dst, + time, + layer, + layer_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + + Ok::<(), PyErr>(()) + }) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + Ok(()) +} + pub(crate) fn process_pandas_py_df( df: &PyAny, py: Python, From 38a4fb6bab45aa75a014a72f622a48a4200af96e Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 11:56:21 +0100 Subject: [PATCH 16/33] impl load_edges_deletions_from_parquet --- .../src/python/graph/graph_with_deletions.rs | 49 +++++++++++++++---- .../src/python/graph/io/parquet_loaders.rs | 39 +++++++++++++++ 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 4b6ef19382..29721fd6c1 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -22,22 +22,18 @@ use crate::{ }; use pyo3::{ prelude::*, - types::{IntoPyDict, PyBytes}, + types::PyBytes, }; use std::{ collections::HashMap, fmt::{Debug, Formatter}, path::{Path, PathBuf}, }; -use crate::python::graph::io::parquet_loaders::{load_edge_props_from_parquet, load_edges_from_parquet, load_node_props_from_parquet, load_nodes_from_parquet}; +use crate::python::graph::io::parquet_loaders::*; use super::{ graph::PyGraph, - io::{ - dataframe::GraphLoadException, - panda_loaders::*, - df_loaders::load_edges_deletions_from_df, - }, + io::panda_loaders::*, }; /// A temporal graph that allows edges and nodes to be deleted.
@@ -618,7 +614,7 @@ impl PyPersistentGraph { /// const_properties (List): List of constant edge property column names. Defaults to None. (optional) /// shared_const_properties (dict): A dictionary of constant properties that will be added to every edge. Defaults to None. (optional) /// layer (str): The edge layer name (optional) Defaults to None. - /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dateframe or if it should be used directly as the layer for all edges (optional) defaults to True. + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dataframe or if it should be used directly as the layer for all edges (optional) defaults to True. /// /// Returns: /// Result<(), GraphError>: Result of the operation. @@ -660,7 +656,7 @@ impl PyPersistentGraph { /// const_properties (List): List of constant edge property column names. Defaults to None. (optional) /// shared_const_properties (dict): A dictionary of constant properties that will be added to every edge. Defaults to None. (optional) /// layer (str): The edge layer name (optional) Defaults to None. - /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dateframe or if it should be used directly as the layer for all edges (optional) defaults to True. + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dataframe or if it should be used directly as the layer for all edges (optional) defaults to True. /// /// Returns: /// Result<(), GraphError>: Result of the operation. @@ -699,7 +695,7 @@ impl PyPersistentGraph { /// dst (str): The column name for the destination node ids. /// time (str): The column name for the update timestamps. /// layer (str): The edge layer name (optional) Defaults to None. - /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dateframe or if it should be used directly as the layer for all edges (optional) defaults to True. + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dataframe or if it should be used directly as the layer for all edges (optional) defaults to True. /// /// Returns: /// Result<(), GraphError>: Result of the operation. @@ -724,6 +720,39 @@ impl PyPersistentGraph { ) } + /// Load edges deletions from a Parquet file into the graph. + /// + /// Arguments: + /// parquet_file_path (str): Parquet file path containing edges + /// src (str): The column name for the source node ids. + /// dst (str): The column name for the destination node ids. + /// time (str): The column name for the update timestamps. + /// layer (str): The edge layer name (optional) Defaults to None. + /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the dataframe or if it should be used directly as the layer for all edges (optional) defaults to True. + /// + /// Returns: + /// Result<(), GraphError>: Result of the operation. 
+ #[pyo3(signature = (parquet_file_path, src, dst, time, layer = None, layer_in_df = true))] + fn load_edges_deletions_from_parquet( + &self, + parquet_file_path: PathBuf, + src: &str, + dst: &str, + time: &str, + layer: Option<&str>, + layer_in_df: Option<bool>, + ) -> Result<(), GraphError> { + load_edges_deletions_from_parquet( + &self.graph.0, + parquet_file_path.as_path(), + src, + dst, + time, + layer, + layer_in_df, + ) + } + /// Load node properties from a Pandas DataFrame. /// /// Arguments: diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index 074fc6ce74..edee7992b1 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -10,6 +10,9 @@ use polars_parquet::read::{FileMetaData, FileReader, read_metadata}; use crate::python::graph::io::{dataframe::*, df_loaders::*}; use polars_arrow::datatypes::ArrowDataType as DataType; use polars_arrow::record_batch::RecordBatch as Chunk; +use pyo3::{PyAny, PyErr, Python}; +use pyo3::types::IntoPyDict; +use crate::python::graph::io::panda_loaders::process_pandas_py_df; pub fn load_nodes_from_parquet( graph: &InternalGraph, @@ -160,6 +163,41 @@ pub fn load_edge_props_from_parquet( Ok(()) } +pub fn load_edges_deletions_from_parquet( + graph: &InternalGraph, + parquet_file_path: &Path, + src: &str, + dst: &str, + time: &str, + layer: Option<&str>, + layer_in_df: Option<bool>, +) -> Result<(), GraphError> { + let mut cols_to_check = vec![src, dst, time]; + if layer_in_df.unwrap_or(true) { + if let Some(ref layer) = layer { + cols_to_check.push(layer.as_ref()); + } + } + + let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = cols_to_check.len(); + + load_edges_deletions_from_df( + &df, + size, + src, + dst, + time, + layer, + layer_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + + Ok(()) +} + pub(crate) fn process_parquet_file_to_df( parquet_file_path: &Path, col_names: Vec<&str>, From 5e1dd62ce80f232d1189a3621d002793c4e6efa8 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 12:17:17 +0100 Subject: [PATCH 17/33] impl edge deletions tests --- .../data/parquet/edges_deletions.parquet | Bin 0 -> 1337 bytes python/tests/test_load_from_parquet.py | 29 +++++++++++++++++- raphtory/src/python/graph/io/panda_loaders.rs | 2 +- .../src/python/graph/io/parquet_loaders.rs | 2 +- 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 python/tests/data/parquet/edges_deletions.parquet diff --git a/python/tests/data/parquet/edges_deletions.parquet b/python/tests/data/parquet/edges_deletions.parquet new file mode 100644 index 0000000000000000000000000000000000000000..89fa9488f6ee3f9fb82447a931befda989026a01 GIT binary patch literal 1337 zcmcIk%}(1u5FXoG$CY}B5WBJ^AAD#nBnk!ENC<_)IJBh&3XO_TxJ4!ke}U9VoA>Aw z^fCGfJyt!@Th%Av5jwN>LX;384jEZ9JAdEKw`=Vl)|aJ_k$k#M24(=tRR?krg_R&@ zqr`R&Ep^wpzq=lY*)Z{d+Bkj)08gV&of(J zdaQw)bc)B+iPx(aEl~5w~)U}ue2txk}*6X_mYxx2DKkOGQ^T_g1 zYs4+Y0VZvHgV@GeTSt=wafeqm#CL4THJj%jv#~$YucJf#8(ej``C$onK2hl^yNQOPA{w6NYhW_S=cdgJKQm9N z8(8I{@zR&?PnkD4)SoinLJ09j*4M$0y3OqKmsWov?RFQMS#~*yzP7U7$>l|~Tv=RN STnVZnK>ooWdTjRS(fkkMP_B#s literal 0 HcmV?d00001 diff --git
a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py index 1ba07575b3..ae5e61c5b8 100644 --- a/python/tests/test_load_from_parquet.py +++ b/python/tests/test_load_from_parquet.py @@ -34,9 +34,20 @@ def test_prepare_data(): table = pa.table(data) pq.write_table(table, '/tmp/parquet/edges.parquet') + data = { + "src": [3, 4], + "dst": [4, 5], + "time": [6, 7], + } + + table = pa.table(data) + pq.write_table(table, '/tmp/parquet/edges_deletions.parquet') + -edges_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') nodes_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'nodes.parquet') +edges_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') +edges_deletions_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', + 'edges_deletions.parquet') def assert_expected_nodes(g): @@ -405,3 +416,19 @@ def test_load_from_parquet_persistent_graphs(): assert_expected_node_property_type(g) assert_expected_layers(g) + g = PersistentGraph() + g.load_edges_from_parquet( + edges_parquet_file_path, + "src", + "dst", + "time", + ) + assert g.window(10, 12).edges.src.id.collect() == [1, 2, 3, 4, 5] + g.load_edges_deletions_from_parquet( + parquet_file_path=edges_deletions_parquet_file_path, + src="src", + dst="dst", + time="time" + ) + assert g.window(10, 12).edges.src.id.collect() == [1, 2, 5] + diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/panda_loaders.rs index 3a4928c614..c9ceda0c9e 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -219,8 +219,8 @@ pub fn load_edges_deletions_from_pandas( } let df = process_pandas_py_df(df, py, cols_to_check.clone())?; - df.check_cols_exist(&cols_to_check)?; + load_edges_deletions_from_df( &df, size, diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index edee7992b1..07234741ba 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -194,7 +194,7 @@ pub fn load_edges_deletions_from_parquet( graph, ) .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; - + Ok(()) } From 5326733b0084763741b53e431f2020d974112655 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 12:18:21 +0100 Subject: [PATCH 18/33] fmt --- raphtory/src/core/utils/errors.rs | 2 +- raphtory/src/python/graph/disk_graph.rs | 3 +- raphtory/src/python/graph/graph.rs | 15 +-- .../src/python/graph/graph_with_deletions.rs | 33 +++---- raphtory/src/python/graph/io/dataframe.rs | 10 +- raphtory/src/python/graph/io/mod.rs | 2 +- raphtory/src/python/graph/io/panda_loaders.rs | 46 ++++----- .../src/python/graph/io/parquet_loaders.rs | 95 ++++++++++--------- raphtory/src/python/graph/mod.rs | 2 +- 9 files changed, 102 insertions(+), 106 deletions(-) diff --git a/raphtory/src/core/utils/errors.rs b/raphtory/src/core/utils/errors.rs index 9a87c84006..c17d619f0b 100644 --- a/raphtory/src/core/utils/errors.rs +++ b/raphtory/src/core/utils/errors.rs @@ -1,5 +1,5 @@ -use polars_arrow::legacy::error; use crate::core::{utils::time::error::ParseTimeError, ArcStr, Prop, PropType}; +use polars_arrow::legacy::error; #[cfg(feature = "search")] use tantivy; #[cfg(feature = "search")] diff --git a/raphtory/src/python/graph/disk_graph.rs 
b/raphtory/src/python/graph/disk_graph.rs index c9f20c627d..2f8001c5a9 100644 --- a/raphtory/src/python/graph/disk_graph.rs +++ b/raphtory/src/python/graph/disk_graph.rs @@ -33,8 +33,7 @@ use pyo3::{ types::{IntoPyDict, PyDict, PyList, PyString}, }; -use super::io::dataframe::PretendDF; -use super::io::panda_loaders::*; +use super::io::{dataframe::PretendDF, panda_loaders::*}; impl From<Error> for PyErr { fn from(value: Error) -> Self { diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index f2be26a3a9..d3549d8310 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -13,7 +13,10 @@ use crate::{ prelude::*, python::{ graph::{ - edge::PyEdge, graph_with_deletions::PyPersistentGraph, node::PyNode, + edge::PyEdge, + graph_with_deletions::PyPersistentGraph, + io::{panda_loaders::*, parquet_loaders::*}, + node::PyNode, views::graph_view::PyGraphView, }, utils::{PyInputNode, PyTime}, @@ -26,11 +26,8 @@ use pyo3::{prelude::*, types::PyBytes}; use std::{ collections::HashMap, fmt::{Debug, Formatter}, - path::Path, + path::{Path, PathBuf}, }; -use std::path::PathBuf; -use crate::python::graph::io::panda_loaders::*; -use crate::python::graph::io::parquet_loaders::*; /// A temporal graph. #[derive(Clone)] @@ -460,7 +460,7 @@ impl PyGraph { Ok(graph.graph) } - /// Load a graph from a Pandas DataFrame. + /// Load a graph from Parquet file. /// /// Args: /// edge_parquet_file_path (str): Parquet file containing the edges. @@ -509,7 +509,9 @@ impl PyGraph { let graph = PyGraph { graph: Graph::new(), }; - if let (Some(node_parquet_file_path), Some(node_id), Some(node_time)) = (node_parquet_file_path, node_id, node_time) { + if let (Some(node_parquet_file_path), Some(node_id), Some(node_time)) = + (node_parquet_file_path, node_id, node_time) + { graph.load_nodes_from_parquet( node_parquet_file_path, node_id, @@ -749,7 +751,6 @@ impl PyGraph { ) } - /// Load edge properties from a Pandas DataFrame. /// /// Arguments: diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 29721fd6c1..c4182809e2 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -16,25 +16,20 @@ use crate::{ }, prelude::{DeletionOps, GraphViewOps, ImportOps}, python::{ - graph::{edge::PyEdge, node::PyNode, views::graph_view::PyGraphView}, + graph::{ + edge::PyEdge, io::parquet_loaders::*, node::PyNode, views::graph_view::PyGraphView, + }, utils::{PyInputNode, PyTime}, }, }; -use pyo3::{ - prelude::*, - types::PyBytes, -}; +use pyo3::{prelude::*, types::PyBytes}; use std::{ collections::HashMap, fmt::{Debug, Formatter}, path::{Path, PathBuf}, }; -use crate::python::graph::io::parquet_loaders::*; -use super::{ - graph::PyGraph, - io::panda_loaders::*, -}; +use super::{graph::PyGraph, io::panda_loaders::*}; /// A temporal graph that allows edges and nodes to be deleted. #[derive(Clone)] @@ -64,8 +59,8 @@ impl IntoPy<PyObject> for PersistentGraph { PyGraphView::from(self), ), ) - .unwrap() // I think this only fails if we are out of memory? Seems to be unavoidable if we want to create an actual graph.
+ .into_py(py) } } @@ -496,7 +496,9 @@ impl PyPersistentGraph { let graph = PyPersistentGraph { graph: PersistentGraph::new(), }; - if let (Some(node_parquet_file_path), Some(node_id), Some(node_time)) = (node_parquet_file_path, node_id, node_time) { + if let (Some(node_parquet_file_path), Some(node_id), Some(node_time)) = + (node_parquet_file_path, node_id, node_time) + { @@ -709,15 +706,7 @@ impl PyPersistentGraph { layer: Option<&str>, layer_in_df: Option<bool>, ) -> Result<(), GraphError> { - load_edges_deletions_from_pandas( - &self.graph.0, - df, - src, - dst, - time, - layer, - layer_in_df, - ) + load_edges_deletions_from_pandas(&self.graph.0, df, src, dst, time, layer, layer_in_df) } /// Load edges deletions from a Parquet file into the graph. diff --git a/raphtory/src/python/graph/io/dataframe.rs b/raphtory/src/python/graph/io/dataframe.rs index 3762015ae8..d24057a733 100644 --- a/raphtory/src/python/graph/io/dataframe.rs +++ b/raphtory/src/python/graph/io/dataframe.rs @@ -24,7 +24,7 @@ impl PretendDF { } self.arrays[0][0].len() } - + pub fn check_cols_exist(&self, cols: &[&str]) -> Result<(), GraphError> { let non_cols: Vec<&&str> = cols .iter() @@ -40,7 +40,7 @@ impl PretendDF { pub(crate) fn iter_col<T: NativeType>( &self, name: &str, - ) -> Option<impl Iterator<Item = Option<&T>> + '_> { + ) -> Option<impl Iterator<Item = Option<&T>> + '_> { let idx = self.names.iter().position(|n| n == name)?; let _ = (&self.arrays[0])[idx] @@ -56,7 +56,7 @@ impl PretendDF { Some(iter) } - pub fn utf8<O: Offset>(&self, name: &str) -> Option<impl Iterator<Item = Option<&str>> + '_> { + pub fn utf8<O: Offset>(&self, name: &str) -> Option<impl Iterator<Item = Option<&str>> + '_> { let idx = self.names.iter().position(|n| n == name)?; // test that it's actually a utf8 array let _ = (&self.arrays[0])[idx] @@ -72,7 +72,7 @@ impl PretendDF { Some(iter) } - pub fn time_iter_col(&self, name: &str) -> Option<impl Iterator<Item = Option<i64>> + '_> { + pub fn time_iter_col(&self, name: &str) -> Option<impl Iterator<Item = Option<i64>> + '_> { let idx = self.names.iter().position(|n| n == name)?; let _ = (&self.arrays[0])[idx] @@ -87,7 +87,7 @@ impl PretendDF { &DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), CastOptions::default(), ) - .unwrap(); + .unwrap(); array } else { arr.clone() diff --git a/raphtory/src/python/graph/io/mod.rs b/raphtory/src/python/graph/io/mod.rs index f5d3931c81..ea13e56fc6 100644 --- a/raphtory/src/python/graph/io/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -1,8 +1,8 @@ pub mod dataframe; pub mod df_loaders; -mod prop_handler; pub mod panda_loaders; pub mod parquet_loaders; +mod prop_handler; #[cfg(test)] mod test { diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/panda_loaders.rs index c9ceda0c9e..71933045e3 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -1,10 +1,10 @@ -use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; -use pyo3::{prelude::*, types::IntoPyDict}; +use crate::{ + core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, + python::graph::io::{dataframe::*, df_loaders::*}, +}; +use polars_arrow::{array::Array, ffi}; +use pyo3::{ffi::Py_uintptr_t, prelude::*, types::IntoPyDict}; use std::collections::HashMap; -use polars_arrow::array::Array; -use polars_arrow::ffi; -use pyo3::ffi::Py_uintptr_t; -use crate::python::graph::io::{dataframe::*, df_loaders::*}; pub fn load_nodes_from_pandas( graph: &InternalGraph, @@ -50,10 +50,10 @@ pub fn load_nodes_from_pandas( node_type_in_df.unwrap_or(true), graph, ) - .map_err(|e|
GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -103,11 +103,11 @@ pub fn load_edges_from_pandas( layer_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -139,11 +139,11 @@ pub fn load_node_props_from_pandas( shared_const_properties, graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -185,11 +185,11 @@ pub fn load_edge_props_from_pandas( layer_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; df.check_cols_exist(&cols_to_check)?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -220,7 +220,7 @@ pub fn load_edges_deletions_from_pandas( let df = process_pandas_py_df(df, py, cols_to_check.clone())?; df.check_cols_exist(&cols_to_check)?; - + load_edges_deletions_from_df( &df, size, @@ -231,11 +231,11 @@ pub fn load_edges_deletions_from_pandas( layer_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; + .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -258,14 +258,14 @@ pub(crate) fn process_pandas_py_df( let dropped_df = if !cols_to_drop.is_empty() { let drop_method = df.getattr("drop")?; - drop_method.call((cols_to_drop, ), Some(vec![("axis", 1)].into_py_dict(py)))? + drop_method.call((cols_to_drop,), Some(vec![("axis", 1)].into_py_dict(py)))? 
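(Aside on the conversion path above: `process_pandas_py_df` drops unrequested columns, calls pyarrow's `Table.from_pandas`/`to_batches`, and then pulls each column into Rust over the Arrow C data interface. Below is a minimal sketch of that last step, assuming the `polars_arrow::ffi` and pyo3 APIs already imported in this patch; the crate's own `array_to_rust` helper is the real implementation and may differ in details.)

```rust
use polars_arrow::{array::Array, ffi};
use pyo3::{ffi::Py_uintptr_t, prelude::*};

// Sketch: import a pyarrow array into Rust via the Arrow C data interface.
// Error handling is abbreviated; this is not verbatim from the patch.
fn array_to_rust_sketch(obj: &PyAny) -> PyResult<Box<dyn Array>> {
    // Allocate empty C structs for pyarrow to fill in.
    let array = Box::new(ffi::ArrowArray::empty());
    let schema = Box::new(ffi::ArrowSchema::empty());
    let array_ptr = &*array as *const ffi::ArrowArray;
    let schema_ptr = &*schema as *const ffi::ArrowSchema;

    // pyarrow writes the array and its schema through the raw pointers.
    obj.call_method1(
        "_export_to_c",
        (array_ptr as Py_uintptr_t, schema_ptr as Py_uintptr_t),
    )?;

    unsafe {
        let field = ffi::import_field_from_c(schema.as_ref())
            .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
        let arr = ffi::import_array_from_c(*array, field.data_type)
            .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
        Ok(arr)
    }
}
```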
} else { df }; let _df_columns: Vec<String> = dropped_df.getattr("columns")?.extract()?; - let table = pa_table.call_method("from_pandas", (dropped_df, ), None)?; + let table = pa_table.call_method("from_pandas", (dropped_df,), None)?; let rb = table.call_method0("to_batches")?.extract::<Vec<&PyAny>>()?; let names: Vec<String> = if let Some(batch0) = rb.get(0) { @@ -274,16 +274,16 @@ pub(crate) fn process_pandas_py_df( } else { vec![] } - .into_iter() - .filter(|x| col_names.contains(&x.as_str())) - .collect(); + .into_iter() + .filter(|x| col_names.contains(&x.as_str())) + .collect(); let arrays = rb .iter() .map(|rb| { (0..names.len()) .map(|i| { - let array = rb.call_method1("column", (i, ))?; + let array = rb.call_method1("column", (i,))?; let arr = array_to_rust(array)?; Ok::<Box<dyn Array>, PyErr>(arr) })
diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index 07234741ba..61fd4d2d18 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -1,18 +1,20 @@ -use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; -use std::collections::HashMap; -use std::path::Path; +use crate::{ + core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, + python::graph::io::{dataframe::*, df_loaders::*, panda_loaders::process_pandas_py_df}, +}; use itertools::Itertools; -use polars_arrow::array::Array; -use polars_arrow::datatypes::{ArrowSchema, Field}; -use polars_arrow::legacy::error; -use polars_parquet::read; -use polars_parquet::read::{FileMetaData, FileReader, read_metadata}; -use crate::python::graph::io::{dataframe::*, df_loaders::*}; -use polars_arrow::datatypes::ArrowDataType as DataType; -use polars_arrow::record_batch::RecordBatch as Chunk; -use pyo3::{PyAny, PyErr, Python}; -use pyo3::types::IntoPyDict; -use crate::python::graph::io::panda_loaders::process_pandas_py_df; +use polars_arrow::{ + array::Array, + datatypes::{ArrowDataType as DataType, ArrowSchema, Field}, + legacy::error, + record_batch::RecordBatch as Chunk, +}; +use polars_parquet::{ + read, + read::{read_metadata, FileMetaData, FileReader}, +}; +use pyo3::{types::IntoPyDict, PyAny, PyErr, Python}; +use std::{collections::HashMap, path::Path}; pub fn load_nodes_from_parquet( graph: &InternalGraph, @@ -50,7 +52,7 @@ pub fn load_nodes_from_parquet( node_type_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -93,7 +95,7 @@ pub fn load_edges_from_parquet( layer_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -120,7 +122,7 @@ pub fn load_node_props_from_parquet( shared_const_properties, graph, ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -158,7 +160,7 @@ pub fn load_edge_props_from_parquet( layer_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -193,7 +195,7 @@ pub fn load_edges_deletions_from_parquet( layer_in_df.unwrap_or(true), graph, ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load
graph {e:?}")))?; + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; Ok(()) } @@ -204,23 +206,27 @@ pub(crate) fn process_parquet_file_to_df( ) -> Result { let (names, arrays) = read_parquet_file(parquet_file_path, &col_names)?; - let names = names.into_iter() + let names = names + .into_iter() .filter(|x| col_names.contains(&x.as_str())) .collect(); - let arrays = arrays.map_ok(|r| { - r.into_iter().map(|boxed| boxed.clone()).collect_vec() - }).collect::, _>>()?; - - Ok(PretendDF { - names, - arrays, - }) + let arrays = arrays + .map_ok(|r| r.into_iter().map(|boxed| boxed.clone()).collect_vec()) + .collect::, _>>()?; + + Ok(PretendDF { names, arrays }) } fn read_parquet_file( path: impl AsRef, col_names: &Vec<&str>, -) -> Result<(Vec, impl Iterator>, error::PolarsError>>), GraphError> { +) -> Result< + ( + Vec, + impl Iterator>, error::PolarsError>>, + ), + GraphError, +> { let read_schema = |metadata: &FileMetaData| -> Result { let schema = read::infer_schema(metadata)?; let fields = schema @@ -233,7 +239,8 @@ fn read_parquet_file( f.clone() } }) - .filter(|f| { // Filtered fields to avoid loading data that is not needed + .filter(|f| { + // Filtered fields to avoid loading data that is not needed col_names.contains(&f.name.as_str()) }) .collect::>(); @@ -256,9 +263,9 @@ fn read_parquet_file( #[cfg(test)] mod test { - use std::path::PathBuf; - use polars_arrow::array::{PrimitiveArray, Utf8Array}; use super::*; + use polars_arrow::array::{PrimitiveArray, Utf8Array}; + use std::path::PathBuf; #[test] fn test_process_parquet_file_to_df() { @@ -268,24 +275,24 @@ mod test { .unwrap(); let col_names = vec!["src", "dst", "time", "weight", "marbles"]; - let df = process_parquet_file_to_df( - parquet_file_path.as_path(), - col_names, - ).unwrap(); + let df = process_parquet_file_to_df(parquet_file_path.as_path(), col_names).unwrap(); let df1 = PretendDF { names: vec!["src", "dst", "time", "weight", "marbles"] .iter() .map(|s| s.to_string()) .collect(), - arrays: vec![ - vec![ - Box::new(PrimitiveArray::::from_values(vec![1, 2, 3, 4, 5])), - Box::new(PrimitiveArray::::from_values(vec![2, 3, 4, 5, 6])), - Box::new(PrimitiveArray::::from_values(vec![1, 2, 3, 4, 5])), - Box::new(PrimitiveArray::::from_values(vec![1f64, 2f64, 3f64, 4f64, 5f64])), - Box::new(Utf8Array::::from_iter_values(vec!["red", "blue", "green", "yellow", "purple"].into_iter())), - ]], + arrays: vec![vec![ + Box::new(PrimitiveArray::::from_values(vec![1, 2, 3, 4, 5])), + Box::new(PrimitiveArray::::from_values(vec![2, 3, 4, 5, 6])), + Box::new(PrimitiveArray::::from_values(vec![1, 2, 3, 4, 5])), + Box::new(PrimitiveArray::::from_values(vec![ + 1f64, 2f64, 3f64, 4f64, 5f64, + ])), + Box::new(Utf8Array::::from_iter_values( + vec!["red", "blue", "green", "yellow", "purple"].into_iter(), + )), + ]], }; assert_eq!(df.names, df1.names); diff --git a/raphtory/src/python/graph/mod.rs b/raphtory/src/python/graph/mod.rs index eec1e2b894..e32a05c680 100644 --- a/raphtory/src/python/graph/mod.rs +++ b/raphtory/src/python/graph/mod.rs @@ -10,7 +10,7 @@ pub mod graph_with_deletions; pub mod edges; #[cfg(feature = "search")] pub mod index; -pub mod node; pub mod io; +pub mod node; pub mod properties; pub mod views; From 96267dc4f2448325edb1146f12b345b2cc92ef5b Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 12:31:36 +0100 Subject: [PATCH 19/33] add py dep --- raphtory/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/raphtory/Cargo.toml 
b/raphtory/Cargo.toml index eb2607d685..3e0c488ae0 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -109,6 +109,7 @@ python = [ "dep:display-error-chain", "dep:polars-arrow", "polars-arrow?/compute", + "dep:polars-parquet", "dep:kdam", "dep:rpds", ] From 1323b3235790a41d2aa6c07e2c115f8ee42070b8 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 12:46:56 +0100 Subject: [PATCH 20/33] add deps --- raphtory/Cargo.toml | 10 ++++++---- raphtory/src/core/utils/errors.rs | 2 ++ raphtory/src/python/graph/io/parquet_loaders.rs | 3 +-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index 3e0c488ae0..0773b4ee6f 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -104,12 +104,11 @@ io = [ # Enables generating the pyo3 python bindings python = [ "io", + "arrow", "dep:pyo3", "dep:num", "dep:display-error-chain", - "dep:polars-arrow", "polars-arrow?/compute", - "dep:polars-parquet", "dep:kdam", "dep:rpds", ] @@ -119,10 +118,9 @@ search = ["dep:tantivy"] vectors = ["dep:futures-util", "dep:async-trait", "dep:async-openai"] # storage storage = [ + "arrow", "pometry-storage", - "dep:polars-arrow", "dep:polars-utils", - "dep:polars-parquet", "dep:memmap2", "dep:ahash", "dep:tempfile", @@ -133,3 +131,7 @@ storage = [ "polars-arrow?/arrow_rs", "polars-parquet?/compression", ] +arrow = [ + "dep:polars-arrow", + "dep:polars-parquet", +] diff --git a/raphtory/src/core/utils/errors.rs b/raphtory/src/core/utils/errors.rs index c17d619f0b..7b840883f5 100644 --- a/raphtory/src/core/utils/errors.rs +++ b/raphtory/src/core/utils/errors.rs @@ -1,4 +1,5 @@ use crate::core::{utils::time::error::ParseTimeError, ArcStr, Prop, PropType}; +#[cfg(feature = "arrow")] use polars_arrow::legacy::error; #[cfg(feature = "search")] use tantivy; @@ -7,6 +8,7 @@ use tantivy::query::QueryParserError; #[derive(thiserror::Error, Debug)] pub enum GraphError { + #[cfg(feature = "arrow")] #[error("Arrow error: {0}")] Arrow(#[from] error::PolarsError), #[error("Graph error occurred")] diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/python/graph/io/parquet_loaders.rs index 61fd4d2d18..0877aa47b7 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/python/graph/io/parquet_loaders.rs @@ -1,6 +1,6 @@ use crate::{ core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, - python::graph::io::{dataframe::*, df_loaders::*, panda_loaders::process_pandas_py_df}, + python::graph::io::{dataframe::*, df_loaders::*}, }; use itertools::Itertools; use polars_arrow::{ @@ -13,7 +13,6 @@ use polars_parquet::{ read, read::{read_metadata, FileMetaData, FileReader}, }; -use pyo3::{types::IntoPyDict, PyAny, PyErr, Python}; use std::{collections::HashMap, path::Path}; pub fn load_nodes_from_parquet( From 330890cd08fbbda0d38afea42b01697befde35e2 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Thu, 20 Jun 2024 13:01:48 +0100 Subject: [PATCH 21/33] move compression to arrow dep --- raphtory/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index 0773b4ee6f..4306a5f619 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -129,9 +129,9 @@ storage = [ "dep:thread_local", "polars-arrow?/io_ipc", "polars-arrow?/arrow_rs", - "polars-parquet?/compression", ] arrow = [ "dep:polars-arrow", "dep:polars-parquet", + "polars-parquet?/compression" 
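# note: with this grouping, `python` and `storage` simply pull "arrow" in via
# the entries above, and parquet support can presumably also be built on its
# own, e.g. `cargo build -p raphtory --features arrow`.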
] From 116d26c43bfbdd32c13f41a8cd274c8bc26c3423 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Fri, 21 Jun 2024 13:28:13 +0100 Subject: [PATCH 22/33] restruct io --- examples/rust/src/bin/bench/main.rs | 2 +- examples/rust/src/bin/btc/main.rs | 2 +- examples/rust/src/bin/crypto/main.rs | 2 +- examples/rust/src/bin/hulongbay/main.rs | 2 +- examples/rust/src/bin/lotr/main.rs | 2 +- examples/rust/src/bin/pokec/main.rs | 2 +- raphtory-benchmark/benches/algobench.rs | 6 ++--- raphtory-benchmark/benches/arrow_algobench.rs | 2 +- raphtory-benchmark/benches/base.rs | 2 +- raphtory-benchmark/src/main.rs | 11 ++++---- .../algorithms/community_detection/louvain.rs | 2 +- .../{example => }/company_house.rs | 2 +- raphtory/src/graph_loader/example/mod.rs | 7 ----- .../graph_loader/{example => }/karate_club.rs | 0 .../graph_loader/{example => }/lotr_graph.rs | 5 ++-- raphtory/src/graph_loader/mod.rs | 27 +++++++++++-------- .../{example => }/neo4j_examples.rs | 2 +- .../{example => }/reddit_hyperlinks.rs | 4 +-- .../{example => }/stable_coins.rs | 3 ++- .../{example => }/sx_superuser_graph.rs | 7 ++--- .../{graph_loader/source => io}/csv_loader.rs | 16 +++++------ .../source => io}/json_loader.rs | 0 .../src/{graph_loader/source => io}/mod.rs | 2 +- .../source => io}/neo4j_loader.rs | 2 +- raphtory/src/lib.rs | 2 ++ raphtory/src/python/packages/graph_loader.rs | 12 ++++----- raphtory/src/python/utils/errors.rs | 2 +- 27 files changed, 66 insertions(+), 62 deletions(-) rename raphtory/src/graph_loader/{example => }/company_house.rs (98%) delete mode 100644 raphtory/src/graph_loader/example/mod.rs rename raphtory/src/graph_loader/{example => }/karate_club.rs (100%) rename raphtory/src/graph_loader/{example => }/lotr_graph.rs (95%) rename raphtory/src/graph_loader/{example => }/neo4j_examples.rs (96%) rename raphtory/src/graph_loader/{example => }/reddit_hyperlinks.rs (97%) rename raphtory/src/graph_loader/{example => }/stable_coins.rs (98%) rename raphtory/src/graph_loader/{example => }/sx_superuser_graph.rs (93%) rename raphtory/src/{graph_loader/source => io}/csv_loader.rs (97%) rename raphtory/src/{graph_loader/source => io}/json_loader.rs (100%) rename raphtory/src/{graph_loader/source => io}/mod.rs (65%) rename raphtory/src/{graph_loader/source => io}/neo4j_loader.rs (98%) diff --git a/examples/rust/src/bin/bench/main.rs b/examples/rust/src/bin/bench/main.rs index bf90573a71..12d262ad0d 100644 --- a/examples/rust/src/bin/bench/main.rs +++ b/examples/rust/src/bin/bench/main.rs @@ -1,6 +1,6 @@ use raphtory::{ algorithms::centrality::pagerank::unweighted_page_rank, - graph_loader::source::csv_loader::CsvLoader, prelude::*, + io::csv_loader::CsvLoader, prelude::*, }; use serde::Deserialize; use std::{ diff --git a/examples/rust/src/bin/btc/main.rs b/examples/rust/src/bin/btc/main.rs index 0a7d2a27c4..759786c123 100644 --- a/examples/rust/src/bin/btc/main.rs +++ b/examples/rust/src/bin/btc/main.rs @@ -2,7 +2,7 @@ #![allow(dead_code)] use chrono::{DateTime, Utc}; -use raphtory::{core::utils::hashing, graph_loader::source::csv_loader::CsvLoader, prelude::*}; +use raphtory::{core::utils::hashing, io::csv_loader::CsvLoader, prelude::*}; use regex::Regex; use serde::Deserialize; use std::{ diff --git a/examples/rust/src/bin/crypto/main.rs b/examples/rust/src/bin/crypto/main.rs index a0d9725908..77d0a47091 100644 --- a/examples/rust/src/bin/crypto/main.rs +++ b/examples/rust/src/bin/crypto/main.rs @@ -5,7 +5,7 @@ use raphtory::{ 
pathing::temporal_reachability::temporally_reachable_nodes, }, db::api::view::*, - graph_loader::example::stable_coins::stable_coin_graph, + graph_loader::stable_coins::stable_coin_graph, }; use std::{env, time::Instant}; diff --git a/examples/rust/src/bin/hulongbay/main.rs b/examples/rust/src/bin/hulongbay/main.rs index 9ec0207363..1b44ecffb1 100644 --- a/examples/rust/src/bin/hulongbay/main.rs +++ b/examples/rust/src/bin/hulongbay/main.rs @@ -9,7 +9,7 @@ use raphtory::{ triangle_count::triangle_count, }, }, - graph_loader::source::csv_loader::CsvLoader, + io::csv_loader::CsvLoader, prelude::*, }; use regex::Regex; diff --git a/examples/rust/src/bin/lotr/main.rs b/examples/rust/src/bin/lotr/main.rs index e06552363a..65ef968056 100644 --- a/examples/rust/src/bin/lotr/main.rs +++ b/examples/rust/src/bin/lotr/main.rs @@ -1,6 +1,6 @@ use raphtory::{ algorithms::pathing::temporal_reachability::temporally_reachable_nodes, core::utils::hashing, - graph_loader::source::csv_loader::CsvLoader, prelude::*, + io::csv_loader::CsvLoader, prelude::*, }; use serde::Deserialize; use std::{ diff --git a/examples/rust/src/bin/pokec/main.rs b/examples/rust/src/bin/pokec/main.rs index 63158b0d27..2be1a7737d 100644 --- a/examples/rust/src/bin/pokec/main.rs +++ b/examples/rust/src/bin/pokec/main.rs @@ -3,7 +3,7 @@ use raphtory::{ centrality::pagerank::unweighted_page_rank, components::weakly_connected_components, }, db::{api::mutation::AdditionOps, graph::graph::Graph}, - graph_loader::source::csv_loader::CsvLoader, + io::csv_loader::CsvLoader, prelude::*, }; use serde::Deserialize; diff --git a/raphtory-benchmark/benches/algobench.rs b/raphtory-benchmark/benches/algobench.rs index b879875321..0d469c5053 100644 --- a/raphtory-benchmark/benches/algobench.rs +++ b/raphtory-benchmark/benches/algobench.rs @@ -23,7 +23,7 @@ pub fn local_triangle_count_analysis(c: &mut Criterion) { let mut group = c.benchmark_group("local_triangle_count"); group.sample_size(10); bench(&mut group, "local_triangle_count", None, |b| { - let g = raphtory::graph_loader::example::lotr_graph::lotr_graph(); + let g = raphtory::graph_loader::lotr_graph::lotr_graph(); let windowed_graph = g.window(i64::MIN, i64::MAX); b.iter(|| { @@ -42,7 +42,7 @@ pub fn local_clustering_coefficient_analysis(c: &mut Criterion) { let mut group = c.benchmark_group("local_clustering_coefficient"); bench(&mut group, "local_clustering_coefficient", None, |b| { - let g: Graph = raphtory::graph_loader::example::lotr_graph::lotr_graph(); + let g: Graph = raphtory::graph_loader::lotr_graph::lotr_graph(); b.iter(|| local_clustering_coefficient(&g, "Gandalf")) }); @@ -123,7 +123,7 @@ pub fn temporal_motifs(c: &mut Criterion) { let mut group = c.benchmark_group("temporal_motifs"); bench(&mut group, "temporal_motifs", None, |b| { - let g: Graph = raphtory::graph_loader::example::lotr_graph::lotr_graph(); + let g: Graph = raphtory::graph_loader::lotr_graph::lotr_graph(); b.iter(|| global_temporal_three_node_motif(&g, 100, None)) }); diff --git a/raphtory-benchmark/benches/arrow_algobench.rs b/raphtory-benchmark/benches/arrow_algobench.rs index 3f9b7439c9..a1b7d913ad 100644 --- a/raphtory-benchmark/benches/arrow_algobench.rs +++ b/raphtory-benchmark/benches/arrow_algobench.rs @@ -37,7 +37,7 @@ pub fn local_triangle_count_analysis(c: &mut Criterion) { let mut group = c.benchmark_group("local_triangle_count"); group.sample_size(10); bench(&mut group, "local_triangle_count", None, |b| { - let g = raphtory::graph_loader::example::lotr_graph::lotr_graph(); + let g = 
raphtory::graph_loader::lotr_graph::lotr_graph(); let test_dir = TempDir::new().unwrap(); let g = g.persist_as_disk_graph(test_dir.path()).unwrap(); let windowed_graph = g.window(i64::MIN, i64::MAX); diff --git a/raphtory-benchmark/benches/base.rs b/raphtory-benchmark/benches/base.rs index fb178a534b..01a9cbb894 100644 --- a/raphtory-benchmark/benches/base.rs +++ b/raphtory-benchmark/benches/base.rs @@ -1,6 +1,6 @@ use crate::common::{bootstrap_graph, run_analysis_benchmarks, run_large_ingestion_benchmarks}; use criterion::{criterion_group, criterion_main, Criterion, Throughput}; -use raphtory::{graph_loader::example::lotr_graph::lotr_graph, prelude::*}; +use raphtory::{graph_loader::lotr_graph::lotr_graph, prelude::*}; mod common; diff --git a/raphtory-benchmark/src/main.rs b/raphtory-benchmark/src/main.rs index ca535c5748..df489da776 100644 --- a/raphtory-benchmark/src/main.rs +++ b/raphtory-benchmark/src/main.rs @@ -5,7 +5,8 @@ use raphtory::{ algorithms::{ centrality::pagerank::unweighted_page_rank, components::weakly_connected_components, }, - graph_loader::{fetch_file, source::csv_loader::CsvLoader}, + graph_loader::fetch_file, + io::csv_loader::CsvLoader, prelude::{AdditionOps, Graph, GraphViewOps, NodeViewOps, NO_PROPS}, }; use std::{ @@ -16,10 +17,10 @@ use std::{ }; #[derive(Parser, Debug)] -#[command(author, version, about, long_about = None )] +#[command(author, version, about, long_about = None)] struct Args { /// Set if the file has a header, default is False - #[arg(long, action=ArgAction::SetTrue)] + #[arg(long, action = ArgAction::SetTrue)] header: bool, /// Delimiter of the csv file @@ -43,11 +44,11 @@ struct Args { time_column: i32, /// Download default files - #[arg(long, action=ArgAction::SetTrue)] + #[arg(long, action = ArgAction::SetTrue)] download: bool, /// Debug to print more info to the screen - #[arg(long, action=ArgAction::SetTrue)] + #[arg(long, action = ArgAction::SetTrue)] debug: bool, /// Set the number of locks for the node and edge storage diff --git a/raphtory/src/algorithms/community_detection/louvain.rs b/raphtory/src/algorithms/community_detection/louvain.rs index 7383f114d9..a2c7c39724 100644 --- a/raphtory/src/algorithms/community_detection/louvain.rs +++ b/raphtory/src/algorithms/community_detection/louvain.rs @@ -133,7 +133,7 @@ mod test { #[cfg(feature = "io")] #[test] fn lfr_test() { - use crate::graph_loader::source::csv_loader::CsvLoader; + use crate::io::csv_loader::CsvLoader; use serde::{Deserialize, Serialize}; use std::path::PathBuf; diff --git a/raphtory/src/graph_loader/example/company_house.rs b/raphtory/src/graph_loader/company_house.rs similarity index 98% rename from raphtory/src/graph_loader/example/company_house.rs rename to raphtory/src/graph_loader/company_house.rs index 0a585c9908..6167a5d76c 100644 --- a/raphtory/src/graph_loader/example/company_house.rs +++ b/raphtory/src/graph_loader/company_house.rs @@ -1,4 +1,4 @@ -use crate::{graph_loader::source::csv_loader::CsvLoader, prelude::*}; +use crate::{io::csv_loader::CsvLoader, prelude::*}; use chrono::DateTime; use serde::Deserialize; use std::{fs, path::PathBuf, time::Instant}; diff --git a/raphtory/src/graph_loader/example/mod.rs b/raphtory/src/graph_loader/example/mod.rs deleted file mode 100644 index 6a3b542df1..0000000000 --- a/raphtory/src/graph_loader/example/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub mod company_house; -pub mod karate_club; -pub mod lotr_graph; -pub mod neo4j_examples; -pub mod reddit_hyperlinks; -pub mod stable_coins; -pub mod sx_superuser_graph; diff 
--git a/raphtory/src/graph_loader/example/karate_club.rs b/raphtory/src/graph_loader/karate_club.rs similarity index 100% rename from raphtory/src/graph_loader/example/karate_club.rs rename to raphtory/src/graph_loader/karate_club.rs diff --git a/raphtory/src/graph_loader/example/lotr_graph.rs b/raphtory/src/graph_loader/lotr_graph.rs similarity index 95% rename from raphtory/src/graph_loader/example/lotr_graph.rs rename to raphtory/src/graph_loader/lotr_graph.rs index b923d0a09a..ed9dc55860 100644 --- a/raphtory/src/graph_loader/example/lotr_graph.rs +++ b/raphtory/src/graph_loader/lotr_graph.rs @@ -13,7 +13,7 @@ //! //! Example: //! ```rust -//! use raphtory::graph_loader::example::lotr_graph::lotr_graph; +//! use raphtory::graph_loader::lotr_graph::lotr_graph; //! use raphtory::prelude::*; //! //! let graph = lotr_graph(); @@ -22,11 +22,12 @@ //! println!("The graph has {:?} edges", graph.count_edges()); //! ``` use crate::{ - graph_loader::{fetch_file, source::csv_loader::CsvLoader}, + graph_loader::fetch_file, prelude::*, }; use serde::Deserialize; use std::path::PathBuf; +use crate::io::csv_loader::CsvLoader; #[derive(Deserialize, std::fmt::Debug)] pub struct Lotr { diff --git a/raphtory/src/graph_loader/mod.rs b/raphtory/src/graph_loader/mod.rs index 9069fb46b7..d22a2a7c5b 100644 --- a/raphtory/src/graph_loader/mod.rs +++ b/raphtory/src/graph_loader/mod.rs @@ -10,7 +10,7 @@ //! ```rust //! use raphtory::algorithms::metrics::degree::average_degree; //! use raphtory::prelude::*; -//! use raphtory::graph_loader::example::lotr_graph::lotr_graph; +//! use raphtory::graph_loader::lotr_graph::lotr_graph; //! //! let graph = lotr_graph(); //! @@ -32,7 +32,7 @@ //! ```no_run //! use std::time::Instant; //! use serde::Deserialize; -//! use raphtory::graph_loader::source::csv_loader::CsvLoader; +//! use raphtory::io::csv_loader::CsvLoader; //! use raphtory::prelude::*; //! //! 
let data_dir = "/tmp/lotr.csv"; @@ -105,8 +105,13 @@ use std::{ }; use zip::read::ZipArchive; -pub mod example; -pub mod source; +pub mod company_house; +pub mod karate_club; +pub mod lotr_graph; +pub mod neo4j_examples; +pub mod reddit_hyperlinks; +pub mod stable_coins; +pub mod sx_superuser_graph; pub fn fetch_file( name: &str, @@ -177,13 +182,13 @@ mod graph_loader_test { #[test] fn test_lotr_load_graph() { - let g = crate::graph_loader::example::lotr_graph::lotr_graph(); + let g = crate::graph_loader::lotr_graph::lotr_graph(); assert_eq!(g.count_edges(), 701); } #[test] fn test_graph_at() { - let g = crate::graph_loader::example::lotr_graph::lotr_graph(); + let g = crate::graph_loader::lotr_graph::lotr_graph(); let g_at_empty = g.at(1); let g_astart = g.at(7059); @@ -196,7 +201,7 @@ mod graph_loader_test { #[test] fn test_karate_graph() { - let g = crate::graph_loader::example::karate_club::karate_club_graph(); + let g = crate::graph_loader::karate_club::karate_club_graph(); assert_eq!(g.count_nodes(), 34); assert_eq!(g.count_edges(), 155); } @@ -205,7 +210,7 @@ mod graph_loader_test { fn db_lotr() { let g = Graph::new(); - let data_dir = crate::graph_loader::example::lotr_graph::lotr_file() + let data_dir = crate::graph_loader::lotr_graph::lotr_file() .expect("Failed to get lotr.csv file"); fn parse_record(rec: &StringRecord) -> Option<(String, String, i64)> { @@ -244,7 +249,7 @@ mod graph_loader_test { #[test] fn test_all_degrees_window() { - let g = crate::graph_loader::example::lotr_graph::lotr_graph(); + let g = crate::graph_loader::lotr_graph::lotr_graph(); assert_eq!(g.count_edges(), 701); assert_eq!(g.node("Gandalf").unwrap().degree(), 49); @@ -263,7 +268,7 @@ mod graph_loader_test { #[test] fn test_all_neighbours_window() { - let g = crate::graph_loader::example::lotr_graph::lotr_graph(); + let g = crate::graph_loader::lotr_graph::lotr_graph(); assert_eq!(g.count_edges(), 701); assert_eq!(g.node("Gandalf").unwrap().neighbours().iter().count(), 49); @@ -316,7 +321,7 @@ mod graph_loader_test { #[test] fn test_all_edges_window() { - let g = crate::graph_loader::example::lotr_graph::lotr_graph(); + let g = crate::graph_loader::lotr_graph::lotr_graph(); assert_eq!(g.count_edges(), 701); assert_eq!(g.node("Gandalf").unwrap().edges().iter().count(), 59); diff --git a/raphtory/src/graph_loader/example/neo4j_examples.rs b/raphtory/src/graph_loader/neo4j_examples.rs similarity index 96% rename from raphtory/src/graph_loader/example/neo4j_examples.rs rename to raphtory/src/graph_loader/neo4j_examples.rs index 1e661fbc81..112aaef70c 100644 --- a/raphtory/src/graph_loader/example/neo4j_examples.rs +++ b/raphtory/src/graph_loader/neo4j_examples.rs @@ -1,6 +1,6 @@ use crate::{ db::{api::mutation::AdditionOps, graph::graph as rap}, - graph_loader::source::neo4j_loader::Neo4JConnection, + io::neo4j_loader::Neo4JConnection, prelude::{IntoProp, NO_PROPS}, }; use neo4rs::*; diff --git a/raphtory/src/graph_loader/example/reddit_hyperlinks.rs b/raphtory/src/graph_loader/reddit_hyperlinks.rs similarity index 97% rename from raphtory/src/graph_loader/example/reddit_hyperlinks.rs rename to raphtory/src/graph_loader/reddit_hyperlinks.rs index 9ac2d78258..350f3600d0 100644 --- a/raphtory/src/graph_loader/example/reddit_hyperlinks.rs +++ b/raphtory/src/graph_loader/reddit_hyperlinks.rs @@ -29,7 +29,7 @@ //! //! Example: //! ```no_run -//! use raphtory::graph_loader::example::reddit_hyperlinks::reddit_graph; +//! use raphtory::graph_loader::reddit_hyperlinks::reddit_graph; //! 
use raphtory::prelude::*; //! //! let graph = reddit_graph(120, false); @@ -158,7 +158,7 @@ pub fn generate_reddit_graph(path: PathBuf) -> Graph { mod reddit_test { use crate::{ db::api::view::*, - graph_loader::example::reddit_hyperlinks::{reddit_file, reddit_graph}, + graph_loader::reddit_hyperlinks::{reddit_file, reddit_graph}, }; #[test] diff --git a/raphtory/src/graph_loader/example/stable_coins.rs b/raphtory/src/graph_loader/stable_coins.rs similarity index 98% rename from raphtory/src/graph_loader/example/stable_coins.rs rename to raphtory/src/graph_loader/stable_coins.rs index d221770392..8d60732f37 100644 --- a/raphtory/src/graph_loader/example/stable_coins.rs +++ b/raphtory/src/graph_loader/stable_coins.rs @@ -1,11 +1,12 @@ use crate::{ - graph_loader::{fetch_file, source::csv_loader::CsvLoader, unzip_file}, + graph_loader::{fetch_file, unzip_file}, prelude::*, }; use chrono::DateTime; use regex::Regex; use serde::Deserialize; use std::{collections::HashMap, fs, path::PathBuf, time::Instant}; +use crate::io::csv_loader::CsvLoader; #[allow(dead_code)] #[derive(Deserialize, std::fmt::Debug)] diff --git a/raphtory/src/graph_loader/example/sx_superuser_graph.rs b/raphtory/src/graph_loader/sx_superuser_graph.rs similarity index 93% rename from raphtory/src/graph_loader/example/sx_superuser_graph.rs rename to raphtory/src/graph_loader/sx_superuser_graph.rs index 503046f336..b093075395 100644 --- a/raphtory/src/graph_loader/example/sx_superuser_graph.rs +++ b/raphtory/src/graph_loader/sx_superuser_graph.rs @@ -36,7 +36,7 @@ //! //! Example: //! ```no_run -//! use raphtory::graph_loader::example::sx_superuser_graph::sx_superuser_graph; +//! use raphtory::graph_loader::sx_superuser_graph::sx_superuser_graph; //! use raphtory::prelude::*; //! //! let graph = sx_superuser_graph().unwrap(); @@ -46,11 +46,12 @@ //! ``` use crate::{ - graph_loader::{fetch_file, source::csv_loader::CsvLoader}, + graph_loader::fetch_file, prelude::*, }; use serde::Deserialize; use std::path::PathBuf; +use crate::io::csv_loader::CsvLoader; #[derive(Deserialize, std::fmt::Debug)] pub struct TEdge { @@ -90,7 +91,7 @@ pub fn sx_superuser_graph() -> Result> { #[cfg(test)] mod sx_superuser_test { - use crate::graph_loader::example::sx_superuser_graph::{sx_superuser_file, sx_superuser_graph}; + use crate::graph_loader::sx_superuser_graph::{sx_superuser_file, sx_superuser_graph}; #[test] #[ignore] // don't hit SNAP by default diff --git a/raphtory/src/graph_loader/source/csv_loader.rs b/raphtory/src/io/csv_loader.rs similarity index 97% rename from raphtory/src/graph_loader/source/csv_loader.rs rename to raphtory/src/io/csv_loader.rs index 0c0f4ee2b0..ff80231f58 100644 --- a/raphtory/src/graph_loader/source/csv_loader.rs +++ b/raphtory/src/io/csv_loader.rs @@ -5,8 +5,8 @@ //! use std::path::{Path, PathBuf}; //! use regex::Regex; //! use raphtory::core::utils::hashing::calculate_hash; -//! use raphtory::graph_loader::source::csv_loader::CsvLoader; -//! use raphtory::graph_loader::example::lotr_graph::Lotr; +//! use raphtory::io::csv_loader::CsvLoader; +//! use raphtory::graph_loader::lotr_graph::Lotr; //! use raphtory::prelude::*; //! //! 
let g = Graph::new(); @@ -143,7 +143,7 @@ impl CsvLoader { /// /// ```no_run /// - /// use raphtory::graph_loader::source::csv_loader::CsvLoader; + /// use raphtory::io::csv_loader::CsvLoader; /// let loader = CsvLoader::new("/path/to/csv_file.csv"); /// ``` pub fn new>(p: P) -> Self { @@ -165,7 +165,7 @@ impl CsvLoader { /// # Example /// /// ```no_run - /// use raphtory::graph_loader::source::csv_loader::CsvLoader; + /// use raphtory::io::csv_loader::CsvLoader; /// let loader = CsvLoader::new("/path/to/csv_file.csv").set_header(true); /// ``` pub fn set_header(mut self, h: bool) -> Self { @@ -181,7 +181,7 @@ impl CsvLoader { /// /// # Example /// ```no_run - /// use raphtory::graph_loader::source::csv_loader::CsvLoader; + /// use raphtory::io::csv_loader::CsvLoader; /// let loader = CsvLoader::new("/path/to/csv_file.csv").set_print_file_name(true); /// ``` pub fn set_print_file_name(mut self, p: bool) -> Self { @@ -198,7 +198,7 @@ impl CsvLoader { /// # Example /// /// ```no_run - /// use raphtory::graph_loader::source::csv_loader::CsvLoader; + /// use raphtory::io::csv_loader::CsvLoader; /// let loader = CsvLoader::new("/path/to/csv_file.csv").set_delimiter("|"); /// ``` pub fn set_delimiter(mut self, d: &str) -> Self { @@ -216,7 +216,7 @@ impl CsvLoader { /// /// ```no_run /// use regex::Regex; - /// use raphtory::graph_loader::source::csv_loader::CsvLoader; + /// use raphtory::io::csv_loader::CsvLoader; /// /// let loader = CsvLoader::new("/path/to/csv_files") /// .with_filter(Regex::new(r"file_name_pattern").unwrap()); @@ -475,7 +475,7 @@ impl CsvLoader { #[cfg(test)] mod csv_loader_test { use crate::{ - core::utils::hashing::calculate_hash, graph_loader::source::csv_loader::CsvLoader, + core::utils::hashing::calculate_hash, io::csv_loader::CsvLoader, prelude::*, }; use csv::StringRecord; diff --git a/raphtory/src/graph_loader/source/json_loader.rs b/raphtory/src/io/json_loader.rs similarity index 100% rename from raphtory/src/graph_loader/source/json_loader.rs rename to raphtory/src/io/json_loader.rs diff --git a/raphtory/src/graph_loader/source/mod.rs b/raphtory/src/io/mod.rs similarity index 65% rename from raphtory/src/graph_loader/source/mod.rs rename to raphtory/src/io/mod.rs index ea9fc4dc61..2e2e495419 100644 --- a/raphtory/src/graph_loader/source/mod.rs +++ b/raphtory/src/io/mod.rs @@ -1,3 +1,3 @@ pub mod csv_loader; pub mod json_loader; -pub mod neo4j_loader; +pub mod neo4j_loader; \ No newline at end of file diff --git a/raphtory/src/graph_loader/source/neo4j_loader.rs b/raphtory/src/io/neo4j_loader.rs similarity index 98% rename from raphtory/src/graph_loader/source/neo4j_loader.rs rename to raphtory/src/io/neo4j_loader.rs index c437821727..882d771afd 100644 --- a/raphtory/src/graph_loader/source/neo4j_loader.rs +++ b/raphtory/src/io/neo4j_loader.rs @@ -50,7 +50,7 @@ impl Neo4JConnection { mod neo_loader_test { use crate::{ db::{api::mutation::AdditionOps, graph::graph as rap}, - graph_loader::source::neo4j_loader::Neo4JConnection, + io::neo4j_loader::Neo4JConnection, prelude::*, }; use neo4rs::*; diff --git a/raphtory/src/lib.rs b/raphtory/src/lib.rs index 1359bd29e4..40a0e33304 100644 --- a/raphtory/src/lib.rs +++ b/raphtory/src/lib.rs @@ -104,6 +104,8 @@ pub mod search; #[cfg(feature = "vectors")] pub mod vectors; +pub mod io; + pub mod prelude { pub const NO_PROPS: [(&str, Prop); 0] = []; pub use crate::{ diff --git a/raphtory/src/python/packages/graph_loader.rs b/raphtory/src/python/packages/graph_loader.rs index c7784a9891..3851e5e778 100644 --- 
a/raphtory/src/python/packages/graph_loader.rs +++ b/raphtory/src/python/packages/graph_loader.rs @@ -26,7 +26,7 @@ use tokio::runtime::Runtime; /// A Graph containing the LOTR dataset #[pyfunction] pub fn lotr_graph() -> PyResult<Py<PyGraph>> { - PyGraph::py_from_db_graph(crate::graph_loader::example::lotr_graph::lotr_graph()) + PyGraph::py_from_db_graph(crate::graph_loader::lotr_graph::lotr_graph()) } /// Load (a subset of) Reddit hyperlinks dataset into a graph. @@ -68,7 +68,7 @@ pub fn lotr_graph() -> PyResult<Py<PyGraph>> { #[pyo3(signature = (timeout_seconds=600))] pub fn reddit_hyperlink_graph(timeout_seconds: u64) -> PyResult<Py<PyGraph>> { PyGraph::py_from_db_graph( - crate::graph_loader::example::reddit_hyperlinks::reddit_graph(timeout_seconds, false), + crate::graph_loader::reddit_hyperlinks::reddit_graph(timeout_seconds, false), ) } @@ -77,7 +77,7 @@ pub fn reddit_hyperlink_graph_local(file_path: &str) -> PyResult<Py<PyGraph>> { let file_path_buf = PathBuf::from(file_path); PyGraph::py_from_db_graph( - crate::graph_loader::example::reddit_hyperlinks::generate_reddit_graph(file_path_buf), + crate::graph_loader::reddit_hyperlinks::generate_reddit_graph(file_path_buf), ) } @@ -85,7 +85,7 @@ #[pyo3(signature = (path=None,subset=None))] pub fn stable_coin_graph(path: Option<String>, subset: Option<bool>) -> PyResult<Py<PyGraph>> { PyGraph::py_from_db_graph( - crate::graph_loader::example::stable_coins::stable_coin_graph( + crate::graph_loader::stable_coins::stable_coin_graph( path, subset.unwrap_or(false), ), @@ -101,7 +101,7 @@ pub fn neo4j_movie_graph( database: String, ) -> PyResult<Py<PyGraph>> { let g = Runtime::new().unwrap().block_on( - crate::graph_loader::example::neo4j_examples::neo4j_movie_graph( + crate::graph_loader::neo4j_examples::neo4j_movie_graph( uri, username, password, database, ), ); @@ -131,5 +131,5 @@ #[pyfunction] #[pyo3(signature = ())] pub fn karate_club_graph() -> PyResult<Py<PyGraph>> { - PyGraph::py_from_db_graph(crate::graph_loader::example::karate_club::karate_club_graph()) + PyGraph::py_from_db_graph(crate::graph_loader::karate_club::karate_club_graph()) }
diff --git a/raphtory/src/python/utils/errors.rs b/raphtory/src/python/utils/errors.rs index 7a67443639..f242220f10 100644 --- a/raphtory/src/python/utils/errors.rs +++ b/raphtory/src/python/utils/errors.rs @@ -1,6 +1,6 @@ use crate::{ core::utils::{errors::GraphError, time::error::ParseTimeError}, - graph_loader::source::csv_loader::CsvErr, + io::csv_loader::CsvErr, }; use pyo3::{exceptions::PyException, PyErr}; use std::error::Error;
From c48c750af43fc7461b376d5c946671b780707cdf Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Fri, 21 Jun 2024 14:08:40 +0100 Subject: [PATCH 23/33] restruct arrow --- .../graph/io => io/arrow}/dataframe.rs | 0 .../graph/io => io/arrow}/df_loaders.rs | 5 +- raphtory/src/io/arrow/mod.rs | 159 +++++++++++++++++ .../graph/io => io/arrow}/prop_handler.rs | 2 +- raphtory/src/io/mod.rs | 5 +- .../{python/graph => }/io/parquet_loaders.rs | 6 +- raphtory/src/python/graph/disk_graph.rs | 3 +- raphtory/src/python/graph/graph.rs | 3 +- .../src/python/graph/graph_with_deletions.rs | 3 +- raphtory/src/python/graph/io/mod.rs | 162 ------------------ raphtory/src/python/graph/io/panda_loaders.rs | 6 +- 11 files changed, 175 insertions(+), 179 deletions(-) rename raphtory/src/{python/graph/io => io/arrow}/dataframe.rs (100%) rename raphtory/src/{python/graph/io =>
io/arrow}/df_loaders.rs (99%) create mode 100644 raphtory/src/io/arrow/mod.rs rename raphtory/src/{python/graph/io => io/arrow}/prop_handler.rs (99%) rename raphtory/src/{python/graph => }/io/parquet_loaders.rs (98%)
diff --git a/raphtory/src/python/graph/io/dataframe.rs b/raphtory/src/io/arrow/dataframe.rs similarity index 100% rename from raphtory/src/python/graph/io/dataframe.rs rename to raphtory/src/io/arrow/dataframe.rs
diff --git a/raphtory/src/python/graph/io/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs similarity index 99% rename from raphtory/src/python/graph/io/df_loaders.rs rename to raphtory/src/io/arrow/df_loaders.rs index bd33ced02e..4401c3e72c 100644 --- a/raphtory/src/python/graph/io/df_loaders.rs +++ b/raphtory/src/io/arrow/df_loaders.rs @@ -2,13 +2,10 @@ use crate::{ core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError}, db::api::mutation::AdditionOps, prelude::*, - python::graph::io::{ - dataframe::PretendDF, - prop_handler::{get_prop_rows, lift_layer}, - }, }; use kdam::tqdm; use std::{collections::HashMap, iter}; +use crate::io::arrow::{dataframe::PretendDF, prop_handler::*}; pub(crate) fn load_nodes_from_df<'a>( df: &'a PretendDF, size: usize, node_id: &str,
diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/io/arrow/mod.rs new file mode 100644 index 0000000000..25f0222cd0 --- /dev/null +++ b/raphtory/src/io/arrow/mod.rs @@ -0,0 +1,159 @@ +pub mod dataframe; +pub mod df_loaders; +mod prop_handler; + +#[cfg(test)] +mod test { + use crate::{ + core::ArcStr, + prelude::*, + }; + use polars_arrow::array::{PrimitiveArray, Utf8Array}; + use crate::io::arrow::dataframe::PretendDF; + use crate::io::arrow::df_loaders::*; + + #[test] + fn load_edges_from_pretend_df() { + let df = PretendDF { + names: vec!["src", "dst", "time", "prop1", "prop2"] + .iter() + .map(|s| s.to_string()) + .collect(), + arrays: vec![ + vec![ + Box::new(PrimitiveArray::<u64>::from(vec![Some(1)])), + Box::new(PrimitiveArray::<u64>::from(vec![Some(2)])), + Box::new(PrimitiveArray::<i64>::from(vec![Some(1)])), + Box::new(PrimitiveArray::<f64>::from(vec![Some(1.0)])), + Box::new(Utf8Array::<i32>::from(vec![Some("a")])), + ], + vec![ + Box::new(PrimitiveArray::<u64>::from(vec![Some(2), Some(3)])), + Box::new(PrimitiveArray::<u64>::from(vec![Some(3), Some(4)])), + Box::new(PrimitiveArray::<i64>::from(vec![Some(2), Some(3)])), + Box::new(PrimitiveArray::<f64>::from(vec![Some(2.0), Some(3.0)])), + Box::new(Utf8Array::<i32>::from(vec![Some("b"), Some("c")])), + ], + ], + }; + let graph = Graph::new(); + let layer: Option<&str> = None; + let layer_in_df: bool = true; + load_edges_from_df( + &df, + 5, + "src", + "dst", + "time", + Some(vec!["prop1", "prop2"]), + None, + None, + layer, + layer_in_df, + &graph.0, + ) + .expect("failed to load edges from pretend df"); + + let actual = graph + .edges() + .iter() + .map(|e| { + ( + e.src().id(), + e.dst().id(), + e.latest_time(), + e.properties() + .temporal() + .get("prop1") + .and_then(|v| v.latest()), + e.properties() + .temporal() + .get("prop2") + .and_then(|v| v.latest()), + ) + }) + .collect::<Vec<_>>(); + + assert_eq!( + actual, + vec![ + (1, 2, Some(1), Some(Prop::F64(1.0)), Some(Prop::str("a"))), + (2, 3, Some(2), Some(Prop::F64(2.0)), Some(Prop::str("b"))), + (3, 4, Some(3), Some(Prop::F64(3.0)), Some(Prop::str("c"))), + ] + ); + } + + #[test] + fn load_nodes_from_pretend_df() { + let df = PretendDF { + names: vec!["id", "name", "time", "node_type"] + .iter() + .map(|s| s.to_string()) + .collect(), + arrays: vec![ + vec![ + Box::new(PrimitiveArray::<u64>::from(vec![Some(1)])),
+ Box::new(Utf8Array::<i32>::from(vec![Some("a")])), + Box::new(PrimitiveArray::<i64>::from(vec![Some(1)])), + Box::new(Utf8Array::<i32>::from(vec![Some("atype")])), + ], + vec![ + Box::new(PrimitiveArray::<u64>::from(vec![Some(2)])), + Box::new(Utf8Array::<i32>::from(vec![Some("b")])), + Box::new(PrimitiveArray::<i64>::from(vec![Some(2)])), + Box::new(Utf8Array::<i32>::from(vec![Some("btype")])), + ], + ], + }; + let graph = Graph::new(); + + load_nodes_from_df( + &df, + 3, + "id", + "time", + Some(vec!["name"]), + None, + None, + Some("node_type"), + false, + &graph.0, + ) + .expect("failed to load nodes from pretend df"); + + let actual = graph + .nodes() + .iter() + .map(|v| { + ( + v.id(), + v.latest_time(), + v.properties() + .temporal() + .get("name") + .and_then(|v| v.latest()), + v.node_type(), + ) + }) + .collect::<Vec<_>>(); + + assert_eq!( + actual, + vec![ + ( + 1, + Some(1), + Some(Prop::str("a")), + Some(ArcStr::from("node_type")) + ), + ( + 2, + Some(2), + Some(Prop::str("b")), + Some(ArcStr::from("node_type")) + ), + ] + ); + } +}
diff --git a/raphtory/src/python/graph/io/prop_handler.rs b/raphtory/src/io/arrow/prop_handler.rs similarity index 99% rename from raphtory/src/python/graph/io/prop_handler.rs rename to raphtory/src/io/arrow/prop_handler.rs index e4e5ce72ac..e02e59bf75 100644 --- a/raphtory/src/python/graph/io/prop_handler.rs +++ b/raphtory/src/io/arrow/prop_handler.rs @@ -7,8 +7,8 @@ use polars_arrow::{ use crate::{ core::{utils::errors::GraphError, IntoPropList}, prelude::Prop, - python::graph::io::dataframe::PretendDF, }; +use crate::io::arrow::dataframe::PretendDF; pub struct PropIter<'a> { inner: Box<dyn Iterator<Item = Vec<(&'a str, Prop)>> + 'a>,
diff --git a/raphtory/src/io/mod.rs b/raphtory/src/io/mod.rs index 2e2e495419..7c0b20aba1 100644 --- a/raphtory/src/io/mod.rs +++ b/raphtory/src/io/mod.rs @@ -1,3 +1,6 @@ pub mod csv_loader; pub mod json_loader; -pub mod neo4j_loader; \ No newline at end of file +pub mod neo4j_loader; +#[cfg(feature = "arrow")] +pub(crate) mod arrow; +pub mod parquet_loaders;
diff --git a/raphtory/src/python/graph/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs similarity index 98% rename from raphtory/src/python/graph/io/parquet_loaders.rs rename to raphtory/src/io/parquet_loaders.rs index 0877aa47b7..cda07ceb40 100644 --- a/raphtory/src/python/graph/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -1,7 +1,4 @@ -use crate::{ - core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, - python::graph::io::{dataframe::*, df_loaders::*}, -}; +use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; use itertools::Itertools; use polars_arrow::{ array::Array, @@ -14,6 +11,7 @@ use polars_parquet::{ read::{read_metadata, FileMetaData, FileReader}, }; use std::{collections::HashMap, path::Path}; +use crate::io::arrow::{dataframe::*, df_loaders::*}; pub fn load_nodes_from_parquet( graph: &InternalGraph,
diff --git a/raphtory/src/python/graph/disk_graph.rs b/raphtory/src/python/graph/disk_graph.rs index 2f8001c5a9..4c6db906bc 100644 --- a/raphtory/src/python/graph/disk_graph.rs +++ b/raphtory/src/python/graph/disk_graph.rs @@ -33,7 +33,8 @@ use pyo3::{ types::{IntoPyDict, PyDict, PyList, PyString}, }; -use super::io::{dataframe::PretendDF, panda_loaders::*}; +use super::io::panda_loaders::*; +use crate::io::arrow::dataframe::PretendDF; impl From<Error> for PyErr { fn from(value: Error) -> Self {
diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index d3549d8310..6adf27d167 100644 ---
a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -15,7 +15,7 @@ use crate::{ graph::{ edge::PyEdge, graph_with_deletions::PyPersistentGraph, - io::{panda_loaders::*, parquet_loaders::*}, + io::panda_loaders::*, node::PyNode, views::graph_view::PyGraphView, }, @@ -28,6 +28,7 @@ use std::{ fmt::{Debug, Formatter}, path::{Path, PathBuf}, }; +use crate::io::parquet_loaders::*; /// A temporal graph. #[derive(Clone)] diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index c4182809e2..6a992204b9 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -17,7 +17,7 @@ use crate::{ prelude::{DeletionOps, GraphViewOps, ImportOps}, python::{ graph::{ - edge::PyEdge, io::parquet_loaders::*, node::PyNode, views::graph_view::PyGraphView, + edge::PyEdge, node::PyNode, views::graph_view::PyGraphView, }, utils::{PyInputNode, PyTime}, }, @@ -30,6 +30,7 @@ use std::{ }; use super::{graph::PyGraph, io::panda_loaders::*}; +use crate::io::parquet_loaders::*; /// A temporal graph that allows edges and nodes to be deleted. #[derive(Clone)] diff --git a/raphtory/src/python/graph/io/mod.rs b/raphtory/src/python/graph/io/mod.rs index ea13e56fc6..2f56bd4b1b 100644 --- a/raphtory/src/python/graph/io/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -1,163 +1 @@ -pub mod dataframe; -pub mod df_loaders; pub mod panda_loaders; -pub mod parquet_loaders; -mod prop_handler; - -#[cfg(test)] -mod test { - use crate::{ - core::ArcStr, - prelude::*, - python::graph::io::{ - dataframe::PretendDF, - df_loaders::{load_edges_from_df, load_nodes_from_df}, - }, - }; - use polars_arrow::array::{PrimitiveArray, Utf8Array}; - - #[test] - fn load_edges_from_pretend_df() { - let df = PretendDF { - names: vec!["src", "dst", "time", "prop1", "prop2"] - .iter() - .map(|s| s.to_string()) - .collect(), - arrays: vec![ - vec![ - Box::new(PrimitiveArray::::from(vec![Some(1)])), - Box::new(PrimitiveArray::::from(vec![Some(2)])), - Box::new(PrimitiveArray::::from(vec![Some(1)])), - Box::new(PrimitiveArray::::from(vec![Some(1.0)])), - Box::new(Utf8Array::::from(vec![Some("a")])), - ], - vec![ - Box::new(PrimitiveArray::::from(vec![Some(2), Some(3)])), - Box::new(PrimitiveArray::::from(vec![Some(3), Some(4)])), - Box::new(PrimitiveArray::::from(vec![Some(2), Some(3)])), - Box::new(PrimitiveArray::::from(vec![Some(2.0), Some(3.0)])), - Box::new(Utf8Array::::from(vec![Some("b"), Some("c")])), - ], - ], - }; - let graph = Graph::new(); - let layer: Option<&str> = None; - let layer_in_df: bool = true; - load_edges_from_df( - &df, - 5, - "src", - "dst", - "time", - Some(vec!["prop1", "prop2"]), - None, - None, - layer, - layer_in_df, - &graph.0, - ) - .expect("failed to load edges from pretend df"); - - let actual = graph - .edges() - .iter() - .map(|e| { - ( - e.src().id(), - e.dst().id(), - e.latest_time(), - e.properties() - .temporal() - .get("prop1") - .and_then(|v| v.latest()), - e.properties() - .temporal() - .get("prop2") - .and_then(|v| v.latest()), - ) - }) - .collect::>(); - - assert_eq!( - actual, - vec![ - (1, 2, Some(1), Some(Prop::F64(1.0)), Some(Prop::str("a"))), - (2, 3, Some(2), Some(Prop::F64(2.0)), Some(Prop::str("b"))), - (3, 4, Some(3), Some(Prop::F64(3.0)), Some(Prop::str("c"))), - ] - ); - } - - #[test] - fn load_nodes_from_pretend_df() { - let df = PretendDF { - names: vec!["id", "name", "time", "node_type"] - .iter() - .map(|s| s.to_string()) - .collect(), - arrays: 
vec![ - vec![ - Box::new(PrimitiveArray::::from(vec![Some(1)])), - Box::new(Utf8Array::::from(vec![Some("a")])), - Box::new(PrimitiveArray::::from(vec![Some(1)])), - Box::new(Utf8Array::::from(vec![Some("atype")])), - ], - vec![ - Box::new(PrimitiveArray::::from(vec![Some(2)])), - Box::new(Utf8Array::::from(vec![Some("b")])), - Box::new(PrimitiveArray::::from(vec![Some(2)])), - Box::new(Utf8Array::::from(vec![Some("btype")])), - ], - ], - }; - let graph = Graph::new(); - - load_nodes_from_df( - &df, - 3, - "id", - "time", - Some(vec!["name"]), - None, - None, - Some("node_type"), - false, - &graph.0, - ) - .expect("failed to load nodes from pretend df"); - - let actual = graph - .nodes() - .iter() - .map(|v| { - ( - v.id(), - v.latest_time(), - v.properties() - .temporal() - .get("name") - .and_then(|v| v.latest()), - v.node_type(), - ) - }) - .collect::>(); - - assert_eq!( - actual, - vec![ - ( - 1, - Some(1), - Some(Prop::str("a")), - Some(ArcStr::from("node_type")) - ), - ( - 2, - Some(2), - Some(Prop::str("b")), - Some(ArcStr::from("node_type")) - ), - ] - ); - } -} diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/panda_loaders.rs index 71933045e3..dacbdfeb2f 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -1,10 +1,8 @@ -use crate::{ - core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, - python::graph::io::{dataframe::*, df_loaders::*}, -}; +use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; use polars_arrow::{array::Array, ffi}; use pyo3::{ffi::Py_uintptr_t, prelude::*, types::IntoPyDict}; use std::collections::HashMap; +use crate::io::arrow::{dataframe::*, df_loaders::*}; pub fn load_nodes_from_pandas( graph: &InternalGraph, From 4e632292fbf4f13bb328a4b385ad0a8f06435b9f Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Fri, 21 Jun 2024 14:20:55 +0100 Subject: [PATCH 24/33] tmpdir impl --- python/tests/test_load_from_parquet.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py index ae5e61c5b8..651156f0ff 100644 --- a/python/tests/test_load_from_parquet.py +++ b/python/tests/test_load_from_parquet.py @@ -1,5 +1,6 @@ import os import re +import tempfile import pyarrow as pa import pyarrow.parquet as pq @@ -10,6 +11,13 @@ @pytest.mark.skip(reason="Prepares data for debugging purposes") def test_prepare_data(): + dirname = tempfile.TemporaryDirectory() + nodes_parquet_fp = os.path.join(dirname.name, "parquet", "nodes.parquet") + edges_parquet_fp = os.path.join(dirname.name, "parquet", "edges.parquet") + edge_deletions_parquet_fp = os.path.join(dirname.name, "parquet", "edges_deletions.parquet") + + os.makedirs(os.path.dirname(nodes_parquet_fp), exist_ok=True) + data = { "id": [1, 2, 3, 4, 5, 6], "name": ["Alice", "Bob", "Carol", "Dave", "Eve", "Frank"], @@ -19,7 +27,8 @@ def test_prepare_data(): } table = pa.table(data) - pq.write_table(table, '/tmp/parquet/nodes.parquet') + pq.write_table(table, nodes_parquet_fp) + print("""Created nodes.parquet at loc = {}""".format(nodes_parquet_fp)) data = { "src": [1, 2, 3, 4, 5], @@ -32,7 +41,8 @@ def test_prepare_data(): } table = pa.table(data) - pq.write_table(table, '/tmp/parquet/edges.parquet') + pq.write_table(table, edges_parquet_fp) + print("""Created edges.parquet at loc = {}""".format(edges_parquet_fp)) data 
= { "src": [3, 4], @@ -41,7 +51,8 @@ def test_prepare_data(): } table = pa.table(data) - pq.write_table(table, '/tmp/parquet/edges_deletions.parquet') + pq.write_table(table, edge_deletions_parquet_fp) + print("""Created edges_deletions.parquet at loc = {}""".format(edge_deletions_parquet_fp)) nodes_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'nodes.parquet') From c32bf86273036f197e4beeb9c3c3db09340e1de1 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Fri, 21 Jun 2024 15:41:23 +0100 Subject: [PATCH 25/33] add feature gates --- raphtory/src/io/mod.rs | 1 + raphtory/src/lib.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/raphtory/src/io/mod.rs b/raphtory/src/io/mod.rs index 7c0b20aba1..5f45fc823c 100644 --- a/raphtory/src/io/mod.rs +++ b/raphtory/src/io/mod.rs @@ -3,4 +3,5 @@ pub mod json_loader; pub mod neo4j_loader; #[cfg(feature = "arrow")] pub(crate) mod arrow; +#[cfg(feature = "arrow")] pub mod parquet_loaders; diff --git a/raphtory/src/lib.rs b/raphtory/src/lib.rs index 40a0e33304..3f43a5a01e 100644 --- a/raphtory/src/lib.rs +++ b/raphtory/src/lib.rs @@ -104,6 +104,7 @@ pub mod search; #[cfg(feature = "vectors")] pub mod vectors; +#[cfg(feature = "io")] pub mod io; pub mod prelude { From 7178d01602f61c53070949243001ba5f5d4552cd Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:19:13 +0100 Subject: [PATCH 26/33] make parquet loader compatible for rust --- examples/rust/src/bin/bench/main.rs | 3 +- raphtory/src/graph_loader/lotr_graph.rs | 6 +- raphtory/src/graph_loader/mod.rs | 4 +- raphtory/src/graph_loader/stable_coins.rs | 2 +- .../src/graph_loader/sx_superuser_graph.rs | 6 +- raphtory/src/io/arrow/df_loaders.rs | 57 +++++++++++++------ raphtory/src/io/arrow/mod.rs | 11 ++-- raphtory/src/io/arrow/prop_handler.rs | 2 +- raphtory/src/io/csv_loader.rs | 5 +- raphtory/src/io/mod.rs | 4 +- raphtory/src/io/parquet_loaders.rs | 41 +++++++++---- raphtory/src/python/graph/graph.rs | 17 +++--- .../src/python/graph/graph_with_deletions.rs | 14 ++--- raphtory/src/python/graph/io/panda_loaders.rs | 6 +- raphtory/src/python/packages/graph_loader.rs | 28 ++++----- 15 files changed, 115 insertions(+), 91 deletions(-) diff --git a/examples/rust/src/bin/bench/main.rs b/examples/rust/src/bin/bench/main.rs index 12d262ad0d..783c6fb065 100644 --- a/examples/rust/src/bin/bench/main.rs +++ b/examples/rust/src/bin/bench/main.rs @@ -1,6 +1,5 @@ use raphtory::{ - algorithms::centrality::pagerank::unweighted_page_rank, - io::csv_loader::CsvLoader, prelude::*, + algorithms::centrality::pagerank::unweighted_page_rank, io::csv_loader::CsvLoader, prelude::*, }; use serde::Deserialize; use std::{ diff --git a/raphtory/src/graph_loader/lotr_graph.rs b/raphtory/src/graph_loader/lotr_graph.rs index ed9dc55860..8f5eafd98a 100644 --- a/raphtory/src/graph_loader/lotr_graph.rs +++ b/raphtory/src/graph_loader/lotr_graph.rs @@ -21,13 +21,9 @@ //! println!("The graph has {:?} nodes", graph.count_nodes()); //! println!("The graph has {:?} edges", graph.count_edges()); //! 
``` -use crate::{ - graph_loader::fetch_file, - prelude::*, -}; +use crate::{graph_loader::fetch_file, io::csv_loader::CsvLoader, prelude::*}; use serde::Deserialize; use std::path::PathBuf; -use crate::io::csv_loader::CsvLoader; #[derive(Deserialize, std::fmt::Debug)] pub struct Lotr { diff --git a/raphtory/src/graph_loader/mod.rs b/raphtory/src/graph_loader/mod.rs index d22a2a7c5b..82db95bed3 100644 --- a/raphtory/src/graph_loader/mod.rs +++ b/raphtory/src/graph_loader/mod.rs @@ -210,8 +210,8 @@ mod graph_loader_test { fn db_lotr() { let g = Graph::new(); - let data_dir = crate::graph_loader::lotr_graph::lotr_file() - .expect("Failed to get lotr.csv file"); + let data_dir = + crate::graph_loader::lotr_graph::lotr_file().expect("Failed to get lotr.csv file"); fn parse_record(rec: &StringRecord) -> Option<(String, String, i64)> { let src = rec.get(0).and_then(|s| s.parse::().ok())?; diff --git a/raphtory/src/graph_loader/stable_coins.rs b/raphtory/src/graph_loader/stable_coins.rs index 8d60732f37..ccb15899bc 100644 --- a/raphtory/src/graph_loader/stable_coins.rs +++ b/raphtory/src/graph_loader/stable_coins.rs @@ -1,12 +1,12 @@ use crate::{ graph_loader::{fetch_file, unzip_file}, + io::csv_loader::CsvLoader, prelude::*, }; use chrono::DateTime; use regex::Regex; use serde::Deserialize; use std::{collections::HashMap, fs, path::PathBuf, time::Instant}; -use crate::io::csv_loader::CsvLoader; #[allow(dead_code)] #[derive(Deserialize, std::fmt::Debug)] diff --git a/raphtory/src/graph_loader/sx_superuser_graph.rs b/raphtory/src/graph_loader/sx_superuser_graph.rs index b093075395..4b7887c4d3 100644 --- a/raphtory/src/graph_loader/sx_superuser_graph.rs +++ b/raphtory/src/graph_loader/sx_superuser_graph.rs @@ -45,13 +45,9 @@ //! println!("The graph has {:?} edges", graph.count_edges()); //! 
``` -use crate::{ - graph_loader::fetch_file, - prelude::*, -}; +use crate::{graph_loader::fetch_file, io::csv_loader::CsvLoader, prelude::*}; use serde::Deserialize; use std::path::PathBuf; -use crate::io::csv_loader::CsvLoader; #[derive(Deserialize, std::fmt::Debug)] pub struct TEdge { diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs index 4401c3e72c..37c5434c52 100644 --- a/raphtory/src/io/arrow/df_loaders.rs +++ b/raphtory/src/io/arrow/df_loaders.rs @@ -1,13 +1,19 @@ use crate::{ - core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError}, - db::api::mutation::AdditionOps, + core::utils::errors::GraphError, + db::api::{ + mutation::{internal::*, AdditionOps}, + view::StaticGraphViewOps, + }, + io::arrow::{dataframe::PretendDF, prop_handler::*}, prelude::*, }; use kdam::tqdm; use std::{collections::HashMap, iter}; -use crate::io::arrow::{dataframe::PretendDF, prop_handler::*}; -pub(crate) fn load_nodes_from_df<'a>( +pub(crate) fn load_nodes_from_df< + 'a, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( df: &'a PretendDF, size: usize, node_id: &str, @@ -17,7 +23,7 @@ pub(crate) fn load_nodes_from_df<'a>( shared_const_properties: Option>, node_type: Option<&str>, node_type_in_df: bool, - graph: &InternalGraph, + graph: &G, ) -> Result<(), GraphError> { let (prop_iter, const_prop_iter) = get_prop_rows(df, properties, const_properties)?; @@ -133,7 +139,11 @@ fn extract_out_default_type(n_t: Option<&str>) -> Option<&str> { } } -pub(crate) fn load_edges_from_df<'a, S: AsRef>( +pub(crate) fn load_edges_from_df< + 'a, + S: AsRef, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( df: &'a PretendDF, size: usize, src: &str, @@ -144,7 +154,7 @@ pub(crate) fn load_edges_from_df<'a, S: AsRef>( shared_const_properties: Option>, layer: Option, layer_in_df: bool, - graph: &InternalGraph, + graph: &G, ) -> Result<(), GraphError> { let (prop_iter, const_prop_iter) = get_prop_rows(df, properties, const_properties)?; let layer = lift_layer(layer, layer_in_df, df); @@ -159,7 +169,7 @@ pub(crate) fn load_edges_from_df<'a, S: AsRef>( .zip(dst.map(|i| i.copied())) .zip(time); load_edges_from_num_iter( - &graph, + graph, size, triplets, prop_iter, @@ -177,7 +187,7 @@ pub(crate) fn load_edges_from_df<'a, S: AsRef>( .zip(dst.map(i64_opt_into_u64_opt)) .zip(time); load_edges_from_num_iter( - &graph, + graph, size, triplets, prop_iter, @@ -237,7 +247,11 @@ pub(crate) fn load_edges_from_df<'a, S: AsRef>( Ok(()) } -pub(crate) fn load_edges_deletions_from_df<'a, S: AsRef>( +pub(crate) fn load_edges_deletions_from_df< + 'a, + S: AsRef, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + DeletionOps, +>( df: &'a PretendDF, size: usize, src: &str, @@ -245,7 +259,7 @@ pub(crate) fn load_edges_deletions_from_df<'a, S: AsRef>( time: &str, layer: Option, layer_in_df: bool, - graph: &InternalGraph, + graph: &G, ) -> Result<(), GraphError> { let layer = lift_layer(layer, layer_in_df, df); @@ -332,13 +346,16 @@ pub(crate) fn load_edges_deletions_from_df<'a, S: AsRef>( Ok(()) } -pub(crate) fn load_node_props_from_df<'a>( +pub(crate) fn load_node_props_from_df< + 'a, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( df: &'a PretendDF, size: usize, node_id: &str, const_properties: Option>, shared_const_properties: Option>, - graph: &InternalGraph, + graph: &G, ) -> Result<(), GraphError> { let (_, const_prop_iter) = get_prop_rows(df, None, const_properties)?; 
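
The loaders in this file are now generic over any graph type `G` satisfying `StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps`, bounds that both the event `Graph` and the `PersistentGraph` are intended to satisfy, which is why the Python bindings below can pass `&self.graph` instead of reaching into `&self.graph.0`. A minimal sketch of the resulting Rust-side call path through the public parquet wrapper introduced in this patch (assuming the `io` and `arrow` features are enabled; the file name is hypothetical):

```rust
use raphtory::{io::parquet_loaders::load_edges_from_parquet, prelude::*};
use std::path::Path;

fn main() {
    // A plain Rust `Graph` now satisfies the loader bounds directly; no
    // Python-specific wrapper type is needed.
    let g = Graph::new();
    load_edges_from_parquet(
        &g,
        Path::new("edges.parquet"), // hypothetical input file
        "src",
        "dst",
        "time",
        Some(vec!["weight", "marbles"]), // temporal property columns
        None,                            // constant property columns
        None,                            // shared constant properties
        None,                            // layer
        None,                            // layer_in_df
    )
    .expect("failed to load edges");
    println!("loaded {} edges", g.count_edges());
}
```
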
@@ -426,7 +443,11 @@ pub(crate) fn load_node_props_from_df<'a>( Ok(()) } -pub(crate) fn load_edges_props_from_df<'a, S: AsRef>( +pub(crate) fn load_edges_props_from_df< + 'a, + S: AsRef, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( df: &'a PretendDF, size: usize, src: &str, @@ -435,7 +456,7 @@ pub(crate) fn load_edges_props_from_df<'a, S: AsRef>( shared_const_properties: Option>, layer: Option, layer_in_df: bool, - graph: &InternalGraph, + graph: &G, ) -> Result<(), GraphError> { let (_, const_prop_iter) = get_prop_rows(df, None, const_properties)?; let layer = lift_layer(layer, layer_in_df, df); @@ -544,8 +565,9 @@ fn load_edges_from_num_iter< I: Iterator, Option), Option)>, PI: Iterator>, IL: Iterator>, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( - graph: &InternalGraph, + graph: &G, size: usize, edges: I, properties: PI, @@ -576,8 +598,9 @@ fn load_nodes_from_num_iter< S: AsRef, I: Iterator, Option, Option<&'a str>)>, PI: Iterator>, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( - graph: &InternalGraph, + graph: &G, size: usize, nodes: I, properties: PI, diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/io/arrow/mod.rs index 25f0222cd0..b85ef6f475 100644 --- a/raphtory/src/io/arrow/mod.rs +++ b/raphtory/src/io/arrow/mod.rs @@ -6,11 +6,10 @@ mod prop_handler; mod test { use crate::{ core::ArcStr, + io::arrow::{dataframe::PretendDF, df_loaders::*}, prelude::*, }; use polars_arrow::array::{PrimitiveArray, Utf8Array}; - use crate::io::arrow::dataframe::PretendDF; - use crate::io::arrow::df_loaders::*; #[test] fn load_edges_from_pretend_df() { @@ -50,9 +49,9 @@ mod test { None, layer, layer_in_df, - &graph.0, + &graph, ) - .expect("failed to load edges from pretend df"); + .expect("failed to load edges from pretend df"); let actual = graph .edges() @@ -118,9 +117,9 @@ mod test { None, Some("node_type"), false, - &graph.0, + &graph, ) - .expect("failed to load nodes from pretend df"); + .expect("failed to load nodes from pretend df"); let actual = graph .nodes() diff --git a/raphtory/src/io/arrow/prop_handler.rs b/raphtory/src/io/arrow/prop_handler.rs index e02e59bf75..fae4562cfb 100644 --- a/raphtory/src/io/arrow/prop_handler.rs +++ b/raphtory/src/io/arrow/prop_handler.rs @@ -6,9 +6,9 @@ use polars_arrow::{ use crate::{ core::{utils::errors::GraphError, IntoPropList}, + io::arrow::dataframe::PretendDF, prelude::Prop, }; -use crate::io::arrow::dataframe::PretendDF; pub struct PropIter<'a> { inner: Box> + 'a>, diff --git a/raphtory/src/io/csv_loader.rs b/raphtory/src/io/csv_loader.rs index ff80231f58..3b6fcbff1d 100644 --- a/raphtory/src/io/csv_loader.rs +++ b/raphtory/src/io/csv_loader.rs @@ -474,10 +474,7 @@ impl CsvLoader { #[cfg(test)] mod csv_loader_test { - use crate::{ - core::utils::hashing::calculate_hash, io::csv_loader::CsvLoader, - prelude::*, - }; + use crate::{core::utils::hashing::calculate_hash, io::csv_loader::CsvLoader, prelude::*}; use csv::StringRecord; use regex::Regex; use serde::Deserialize; diff --git a/raphtory/src/io/mod.rs b/raphtory/src/io/mod.rs index 5f45fc823c..327e9b42c4 100644 --- a/raphtory/src/io/mod.rs +++ b/raphtory/src/io/mod.rs @@ -1,7 +1,7 @@ +#[cfg(feature = "arrow")] +pub(crate) mod arrow; pub mod csv_loader; pub mod json_loader; pub mod neo4j_loader; #[cfg(feature = "arrow")] -pub(crate) mod arrow; -#[cfg(feature = "arrow")] pub mod parquet_loaders; diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index 
cda07ceb40..683c96c792 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -1,4 +1,12 @@ -use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; +use crate::{ + core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, + db::api::{ + mutation::internal::{InternalAdditionOps, InternalPropertyAdditionOps}, + view::StaticGraphViewOps, + }, + io::arrow::{dataframe::*, df_loaders::*}, + prelude::DeletionOps, +}; use itertools::Itertools; use polars_arrow::{ array::Array, @@ -11,10 +19,11 @@ use polars_parquet::{ read::{read_metadata, FileMetaData, FileReader}, }; use std::{collections::HashMap, path::Path}; -use crate::io::arrow::{dataframe::*, df_loaders::*}; -pub fn load_nodes_from_parquet( - graph: &InternalGraph, +pub fn load_nodes_from_parquet< + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( + graph: &G, parquet_file_path: &Path, id: &str, time: &str, @@ -54,8 +63,10 @@ pub fn load_nodes_from_parquet( Ok(()) } -pub fn load_edges_from_parquet( - graph: &InternalGraph, +pub fn load_edges_from_parquet< + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( + graph: &G, parquet_file_path: &Path, src: &str, dst: &str, @@ -97,8 +108,10 @@ pub fn load_edges_from_parquet( Ok(()) } -pub fn load_node_props_from_parquet( - graph: &InternalGraph, +pub fn load_node_props_from_parquet< + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( + graph: &G, parquet_file_path: &Path, id: &str, const_properties: Option>, @@ -124,8 +137,10 @@ pub fn load_node_props_from_parquet( Ok(()) } -pub fn load_edge_props_from_parquet( - graph: &InternalGraph, +pub fn load_edge_props_from_parquet< + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( + graph: &G, parquet_file_path: &Path, src: &str, dst: &str, @@ -162,8 +177,10 @@ pub fn load_edge_props_from_parquet( Ok(()) } -pub fn load_edges_deletions_from_parquet( - graph: &InternalGraph, +pub fn load_edges_deletions_from_parquet< + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + DeletionOps, +>( + graph: &G, parquet_file_path: &Path, src: &str, dst: &str, diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 6adf27d167..29d2d7b4d8 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -10,14 +10,12 @@ use crate::{ api::view::internal::{CoreGraphOps, DynamicGraph, IntoDynamic, MaterializedGraph}, graph::{edge::EdgeView, node::NodeView, views::node_subgraph::NodeSubgraph}, }, + io::parquet_loaders::*, prelude::*, python::{ graph::{ - edge::PyEdge, - graph_with_deletions::PyPersistentGraph, - io::panda_loaders::*, - node::PyNode, - views::graph_view::PyGraphView, + edge::PyEdge, graph_with_deletions::PyPersistentGraph, io::panda_loaders::*, + node::PyNode, views::graph_view::PyGraphView, }, utils::{PyInputNode, PyTime}, }, @@ -28,7 +26,6 @@ use std::{ fmt::{Debug, Formatter}, path::{Path, PathBuf}, }; -use crate::io::parquet_loaders::*; /// A temporal graph. 
#[derive(Clone)] @@ -602,7 +599,7 @@ impl PyGraph { shared_const_properties: Option>, ) -> Result<(), GraphError> { load_nodes_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), id, time, @@ -685,7 +682,7 @@ impl PyGraph { layer_in_df: Option, ) -> Result<(), GraphError> { load_edges_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), src, dst, @@ -744,7 +741,7 @@ impl PyGraph { shared_const_properties: Option>, ) -> Result<(), GraphError> { load_node_props_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), id, const_properties, @@ -813,7 +810,7 @@ impl PyGraph { layer_in_df: Option, ) -> Result<(), GraphError> { load_edge_props_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), src, dst, diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 6a992204b9..135eb2173e 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -16,9 +16,7 @@ use crate::{ }, prelude::{DeletionOps, GraphViewOps, ImportOps}, python::{ - graph::{ - edge::PyEdge, node::PyNode, views::graph_view::PyGraphView, - }, + graph::{edge::PyEdge, node::PyNode, views::graph_view::PyGraphView}, utils::{PyInputNode, PyTime}, }, }; @@ -589,7 +587,7 @@ impl PyPersistentGraph { shared_const_properties: Option>, ) -> Result<(), GraphError> { load_nodes_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), id, time, @@ -672,7 +670,7 @@ impl PyPersistentGraph { layer_in_df: Option, ) -> Result<(), GraphError> { load_edges_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), src, dst, @@ -733,7 +731,7 @@ impl PyPersistentGraph { layer_in_df: Option, ) -> Result<(), GraphError> { load_edges_deletions_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), src, dst, @@ -789,7 +787,7 @@ impl PyPersistentGraph { shared_const_properties: Option>, ) -> Result<(), GraphError> { load_node_props_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), id, const_properties, @@ -858,7 +856,7 @@ impl PyPersistentGraph { layer_in_df: Option, ) -> Result<(), GraphError> { load_edge_props_from_parquet( - &self.graph.0, + &self.graph, parquet_file_path.as_path(), src, dst, diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/panda_loaders.rs index dacbdfeb2f..93fc5d337c 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -1,8 +1,10 @@ -use crate::core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}; +use crate::{ + core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, + io::arrow::{dataframe::*, df_loaders::*}, +}; use polars_arrow::{array::Array, ffi}; use pyo3::{ffi::Py_uintptr_t, prelude::*, types::IntoPyDict}; use std::collections::HashMap; -use crate::io::arrow::{dataframe::*, df_loaders::*}; pub fn load_nodes_from_pandas( graph: &InternalGraph, diff --git a/raphtory/src/python/packages/graph_loader.rs b/raphtory/src/python/packages/graph_loader.rs index 3851e5e778..3ed52dae8f 100644 --- a/raphtory/src/python/packages/graph_loader.rs +++ b/raphtory/src/python/packages/graph_loader.rs @@ -67,9 +67,10 @@ pub fn lotr_graph() -> PyResult> { #[pyfunction] #[pyo3(signature = (timeout_seconds=600))] pub fn reddit_hyperlink_graph(timeout_seconds: u64) -> PyResult> { - PyGraph::py_from_db_graph( - 
crate::graph_loader::reddit_hyperlinks::reddit_graph(timeout_seconds, false), - ) + PyGraph::py_from_db_graph(crate::graph_loader::reddit_hyperlinks::reddit_graph( + timeout_seconds, + false, + )) } #[pyfunction] @@ -84,12 +85,10 @@ pub fn reddit_hyperlink_graph_local(file_path: &str) -> PyResult> { #[pyfunction] #[pyo3(signature = (path=None,subset=None))] pub fn stable_coin_graph(path: Option, subset: Option) -> PyResult> { - PyGraph::py_from_db_graph( - crate::graph_loader::stable_coins::stable_coin_graph( - path, - subset.unwrap_or(false), - ), - ) + PyGraph::py_from_db_graph(crate::graph_loader::stable_coins::stable_coin_graph( + path, + subset.unwrap_or(false), + )) } #[pyfunction] @@ -100,11 +99,12 @@ pub fn neo4j_movie_graph( password: String, database: String, ) -> PyResult> { - let g = Runtime::new().unwrap().block_on( - crate::graph_loader::neo4j_examples::neo4j_movie_graph( - uri, username, password, database, - ), - ); + let g = + Runtime::new() + .unwrap() + .block_on(crate::graph_loader::neo4j_examples::neo4j_movie_graph( + uri, username, password, database, + )); PyGraph::py_from_db_graph(g) } From c3048c8ac904aaa4676193904ebd42db260af54e Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Sat, 22 Jun 2024 04:47:28 +0100 Subject: [PATCH 27/33] move py impls to python package --- raphtory/src/io/arrow/dataframe.rs | 6 ------ raphtory/src/python/graph/graph.rs | 1 + raphtory/src/python/graph/io/mod.rs | 8 ++++++++ raphtory/src/python/graph/io/panda_loaders.rs | 1 + 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/raphtory/src/io/arrow/dataframe.rs b/raphtory/src/io/arrow/dataframe.rs index d24057a733..a233294840 100644 --- a/raphtory/src/io/arrow/dataframe.rs +++ b/raphtory/src/io/arrow/dataframe.rs @@ -9,7 +9,6 @@ use polars_arrow::{ }; use itertools::Itertools; -use pyo3::{create_exception, exceptions::PyException}; #[derive(Debug)] pub(crate) struct PretendDF { @@ -100,8 +99,3 @@ impl PretendDF { Some(iter) } } - -pub type ArrayRef = Box; - -create_exception!(exceptions, ArrowErrorException, PyException); -create_exception!(exceptions, GraphLoadException, PyException); diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 29d2d7b4d8..f4d47e169f 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -24,6 +24,7 @@ use pyo3::{prelude::*, types::PyBytes}; use std::{ collections::HashMap, fmt::{Debug, Formatter}, + ops::Deref, path::{Path, PathBuf}, }; diff --git a/raphtory/src/python/graph/io/mod.rs b/raphtory/src/python/graph/io/mod.rs index 2f56bd4b1b..2320a35de3 100644 --- a/raphtory/src/python/graph/io/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -1 +1,9 @@ +use polars_arrow::array::Array; +use pyo3::{create_exception, exceptions::PyException}; + pub mod panda_loaders; + +pub type ArrayRef = Box; + +create_exception!(exceptions, ArrowErrorException, PyException); +create_exception!(exceptions, GraphLoadException, PyException); diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/panda_loaders.rs index 93fc5d337c..ecc82372ce 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/panda_loaders.rs @@ -1,6 +1,7 @@ use crate::{ core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, io::arrow::{dataframe::*, df_loaders::*}, + python::graph::io::*, }; use polars_arrow::{array::Array, ffi}; use pyo3::{ffi::Py_uintptr_t, prelude::*, 
types::IntoPyDict}; From dcf50de466d60d8985a7a280bb25df40e732574c Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Mon, 24 Jun 2024 12:02:30 +0100 Subject: [PATCH 28/33] rename PretendDF to DFView, panda_loaders to pandas_loaders --- raphtory/src/io/arrow/dataframe.rs | 4 ++-- raphtory/src/io/arrow/df_loaders.rs | 12 ++++++------ raphtory/src/io/arrow/mod.rs | 6 +++--- raphtory/src/io/arrow/prop_handler.rs | 10 +++++----- raphtory/src/io/parquet_loaders.rs | 6 +++--- raphtory/src/python/graph/disk_graph.rs | 6 +++--- raphtory/src/python/graph/graph.rs | 2 +- raphtory/src/python/graph/graph_with_deletions.rs | 2 +- raphtory/src/python/graph/io/mod.rs | 2 +- .../graph/io/{panda_loaders.rs => pandas_loaders.rs} | 4 ++-- 10 files changed, 27 insertions(+), 27 deletions(-) rename raphtory/src/python/graph/io/{panda_loaders.rs => pandas_loaders.rs} (99%) diff --git a/raphtory/src/io/arrow/dataframe.rs b/raphtory/src/io/arrow/dataframe.rs index a233294840..2b8173f5e1 100644 --- a/raphtory/src/io/arrow/dataframe.rs +++ b/raphtory/src/io/arrow/dataframe.rs @@ -11,12 +11,12 @@ use polars_arrow::{ use itertools::Itertools; #[derive(Debug)] -pub(crate) struct PretendDF { +pub(crate) struct DFView { pub(crate) names: Vec, pub(crate) arrays: Vec>>, } -impl PretendDF { +impl DFView { pub(crate) fn get_inner_size(&self) -> usize { if self.arrays.is_empty() || self.arrays[0].is_empty() { return 0; diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs index 37c5434c52..ee2003953f 100644 --- a/raphtory/src/io/arrow/df_loaders.rs +++ b/raphtory/src/io/arrow/df_loaders.rs @@ -4,7 +4,7 @@ use crate::{ mutation::{internal::*, AdditionOps}, view::StaticGraphViewOps, }, - io::arrow::{dataframe::PretendDF, prop_handler::*}, + io::arrow::{dataframe::DFView, prop_handler::*}, prelude::*, }; use kdam::tqdm; @@ -14,7 +14,7 @@ pub(crate) fn load_nodes_from_df< 'a, G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( - df: &'a PretendDF, + df: &'a DFView, size: usize, node_id: &str, time: &str, @@ -144,7 +144,7 @@ pub(crate) fn load_edges_from_df< S: AsRef, G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( - df: &'a PretendDF, + df: &'a DFView, size: usize, src: &str, dst: &str, @@ -252,7 +252,7 @@ pub(crate) fn load_edges_deletions_from_df< S: AsRef, G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + DeletionOps, >( - df: &'a PretendDF, + df: &'a DFView, size: usize, src: &str, dst: &str, @@ -350,7 +350,7 @@ pub(crate) fn load_node_props_from_df< 'a, G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( - df: &'a PretendDF, + df: &'a DFView, size: usize, node_id: &str, const_properties: Option>, @@ -448,7 +448,7 @@ pub(crate) fn load_edges_props_from_df< S: AsRef, G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( - df: &'a PretendDF, + df: &'a DFView, size: usize, src: &str, dst: &str, diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/io/arrow/mod.rs index b85ef6f475..649cf0a294 100644 --- a/raphtory/src/io/arrow/mod.rs +++ b/raphtory/src/io/arrow/mod.rs @@ -6,14 +6,14 @@ mod prop_handler; mod test { use crate::{ core::ArcStr, - io::arrow::{dataframe::PretendDF, df_loaders::*}, + io::arrow::{dataframe::DFView, df_loaders::*}, prelude::*, }; use polars_arrow::array::{PrimitiveArray, Utf8Array}; #[test] fn load_edges_from_pretend_df() { - let df = PretendDF { + let df = DFView { names: vec!["src", "dst", 
"time", "prop1", "prop2"] .iter() .map(|s| s.to_string()) @@ -85,7 +85,7 @@ mod test { #[test] fn load_nodes_from_pretend_df() { - let df = PretendDF { + let df = DFView { names: vec!["id", "name", "time", "node_type"] .iter() .map(|s| s.to_string()) diff --git a/raphtory/src/io/arrow/prop_handler.rs b/raphtory/src/io/arrow/prop_handler.rs index fae4562cfb..c3d07979c4 100644 --- a/raphtory/src/io/arrow/prop_handler.rs +++ b/raphtory/src/io/arrow/prop_handler.rs @@ -6,7 +6,7 @@ use polars_arrow::{ use crate::{ core::{utils::errors::GraphError, IntoPropList}, - io::arrow::dataframe::PretendDF, + io::arrow::dataframe::DFView, prelude::Prop, }; @@ -23,7 +23,7 @@ impl<'a> Iterator for PropIter<'a> { } pub(crate) fn get_prop_rows<'a>( - df: &'a PretendDF, + df: &'a DFView, props: Option>, const_props: Option>, ) -> Result<(PropIter<'a>, PropIter<'a>), GraphError> { @@ -34,7 +34,7 @@ pub(crate) fn get_prop_rows<'a>( fn combine_properties<'a>( props: Option>, - df: &'a PretendDF, + df: &'a DFView, ) -> Result, GraphError> { let iter = props .unwrap_or_default() @@ -148,7 +148,7 @@ fn validate_data_types(dt: &DataType) -> Result<(), GraphError> { pub(crate) fn lift_property<'a: 'b, 'b>( name: &'a str, - df: &'b PretendDF, + df: &'b DFView, ) -> Result> + 'b>, GraphError> { let idx = df .names @@ -383,7 +383,7 @@ pub(crate) fn lift_property<'a: 'b, 'b>( pub(crate) fn lift_layer<'a, S: AsRef>( layer: Option, layer_in_df: bool, - df: &'a PretendDF, + df: &'a DFView, ) -> Box> + 'a> { if let Some(layer) = layer { if layer_in_df { diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index 683c96c792..6519ecaa36 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -217,7 +217,7 @@ pub fn load_edges_deletions_from_parquet< pub(crate) fn process_parquet_file_to_df( parquet_file_path: &Path, col_names: Vec<&str>, -) -> Result { +) -> Result { let (names, arrays) = read_parquet_file(parquet_file_path, &col_names)?; let names = names @@ -228,7 +228,7 @@ pub(crate) fn process_parquet_file_to_df( .map_ok(|r| r.into_iter().map(|boxed| boxed.clone()).collect_vec()) .collect::, _>>()?; - Ok(PretendDF { names, arrays }) + Ok(DFView { names, arrays }) } fn read_parquet_file( @@ -291,7 +291,7 @@ mod test { let col_names = vec!["src", "dst", "time", "weight", "marbles"]; let df = process_parquet_file_to_df(parquet_file_path.as_path(), col_names).unwrap(); - let df1 = PretendDF { + let df1 = DFView { names: vec!["src", "dst", "time", "weight", "marbles"] .iter() .map(|s| s.to_string()) diff --git a/raphtory/src/python/graph/disk_graph.rs b/raphtory/src/python/graph/disk_graph.rs index 4c6db906bc..f6aafe9dea 100644 --- a/raphtory/src/python/graph/disk_graph.rs +++ b/raphtory/src/python/graph/disk_graph.rs @@ -33,8 +33,8 @@ use pyo3::{ types::{IntoPyDict, PyDict, PyList, PyString}, }; -use super::io::panda_loaders::*; -use crate::io::arrow::dataframe::PretendDF; +use super::io::pandas_loaders::*; +use crate::io::arrow::dataframe::DFView; impl From for PyErr { fn from(value: Error) -> Self { @@ -238,7 +238,7 @@ impl PyDiskGraph { impl PyDiskGraph { fn from_pandas( graph_dir: &str, - df: PretendDF, + df: DFView, src: &str, dst: &str, time: &str, diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index f4d47e169f..904f579d29 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -14,7 +14,7 @@ use crate::{ prelude::*, python::{ graph::{ - edge::PyEdge, 
graph_with_deletions::PyPersistentGraph, io::panda_loaders::*, + edge::PyEdge, graph_with_deletions::PyPersistentGraph, io::pandas_loaders::*, node::PyNode, views::graph_view::PyGraphView, }, utils::{PyInputNode, PyTime}, diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 135eb2173e..8a73a8a675 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -27,7 +27,7 @@ use std::{ path::{Path, PathBuf}, }; -use super::{graph::PyGraph, io::panda_loaders::*}; +use super::{graph::PyGraph, io::pandas_loaders::*}; use crate::io::parquet_loaders::*; /// A temporal graph that allows edges and nodes to be deleted. diff --git a/raphtory/src/python/graph/io/mod.rs b/raphtory/src/python/graph/io/mod.rs index 2320a35de3..a25698fdfa 100644 --- a/raphtory/src/python/graph/io/mod.rs +++ b/raphtory/src/python/graph/io/mod.rs @@ -1,7 +1,7 @@ use polars_arrow::array::Array; use pyo3::{create_exception, exceptions::PyException}; -pub mod panda_loaders; +pub mod pandas_loaders; pub type ArrayRef = Box; diff --git a/raphtory/src/python/graph/io/panda_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs similarity index 99% rename from raphtory/src/python/graph/io/panda_loaders.rs rename to raphtory/src/python/graph/io/pandas_loaders.rs index ecc82372ce..20856e3f5c 100644 --- a/raphtory/src/python/graph/io/panda_loaders.rs +++ b/raphtory/src/python/graph/io/pandas_loaders.rs @@ -244,7 +244,7 @@ pub(crate) fn process_pandas_py_df( df: &PyAny, py: Python, col_names: Vec<&str>, -) -> PyResult { +) -> PyResult { is_jupyter(py); py.import("pandas")?; let module = py.import("pyarrow")?; @@ -292,7 +292,7 @@ pub(crate) fn process_pandas_py_df( }) .collect::, PyErr>>()?; - let df = PretendDF { names, arrays }; + let df = DFView { names, arrays }; Ok(df) } From 8d57736d6ca532dea399f0dd197ff4529fb5cd24 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Mon, 24 Jun 2024 12:20:00 +0100 Subject: [PATCH 29/33] rid parent --- raphtory/src/io/parquet_loaders.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index 6519ecaa36..852fd762ed 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -284,9 +284,7 @@ mod test { #[test] fn test_process_parquet_file_to_df() { let parquet_file_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .parent() - .map(|p| p.join("raphtory/resources/test/test_data.parquet")) - .unwrap(); + .join("resources/test/test_data.parquet"); let col_names = vec!["src", "dst", "time", "weight", "marbles"]; let df = process_parquet_file_to_df(parquet_file_path.as_path(), col_names).unwrap(); From bbb065fb2687495080a9fa080ca47919f01beaa5 Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Mon, 24 Jun 2024 12:37:30 +0100 Subject: [PATCH 30/33] make test create parquet --- python/tests/data/parquet/edges.parquet | Bin 2829 -> 0 bytes .../data/parquet/edges_deletions.parquet | Bin 1337 -> 0 bytes python/tests/data/parquet/nodes.parquet | Bin 1950 -> 0 bytes python/tests/test_load_from_parquet.py | 39 ++++++++++-------- 4 files changed, 21 insertions(+), 18 deletions(-) delete mode 100644 python/tests/data/parquet/edges.parquet delete mode 100644 python/tests/data/parquet/edges_deletions.parquet delete mode 100644 python/tests/data/parquet/nodes.parquet diff --git 
a/python/tests/data/parquet/edges.parquet b/python/tests/data/parquet/edges.parquet
deleted file mode 100644
index 98b34238ff32a9a537c0392e0a6900ce7cf605af..0000000000000000000000000000000000000000
Binary files a/python/tests/data/parquet/edges.parquet and /dev/null differ
diff --git a/python/tests/data/parquet/edges_deletions.parquet b/python/tests/data/parquet/edges_deletions.parquet
deleted file mode 100644
index 89fa9488f6ee3f9fb82447a931befda989026a01..0000000000000000000000000000000000000000
Binary files a/python/tests/data/parquet/edges_deletions.parquet and /dev/null differ
diff --git a/python/tests/data/parquet/nodes.parquet b/python/tests/data/parquet/nodes.parquet
deleted file mode 100644
index 4da22cc3d7efab4750c9ce90a52fb44715e84253..0000000000000000000000000000000000000000
Binary files a/python/tests/data/parquet/nodes.parquet and /dev/null differ
diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py
index 651156f0ff..20df621fb2 100644
--- a/python/tests/test_load_from_parquet.py
+++ b/python/tests/test_load_from_parquet.py
@@ -9,14 +9,14 @@ from raphtory import Graph, PersistentGraph
 
-@pytest.mark.skip(reason="Prepares data for debugging purposes")
-def test_prepare_data():
+@pytest.fixture(scope="session")
+def parquet_files():
     dirname = tempfile.TemporaryDirectory()
-    nodes_parquet_fp = os.path.join(dirname.name, "parquet", "nodes.parquet")
-    edges_parquet_fp = os.path.join(dirname.name, "parquet", "edges.parquet")
-    edge_deletions_parquet_fp = os.path.join(dirname.name, "parquet", "edges_deletions.parquet")
+    nodes_parquet_file_path = os.path.join(dirname.name, "parquet", "nodes.parquet")
+    edges_parquet_file_path = 
os.path.join(dirname.name, "parquet", "edges.parquet") + edge_deletions_parquet_file_path = os.path.join(dirname.name, "parquet", "edges_deletions.parquet") - os.makedirs(os.path.dirname(nodes_parquet_fp), exist_ok=True) + os.makedirs(os.path.dirname(nodes_parquet_file_path), exist_ok=True) data = { "id": [1, 2, 3, 4, 5, 6], @@ -27,8 +27,8 @@ def test_prepare_data(): } table = pa.table(data) - pq.write_table(table, nodes_parquet_fp) - print("""Created nodes.parquet at loc = {}""".format(nodes_parquet_fp)) + pq.write_table(table, nodes_parquet_file_path) + print("""Created nodes.parquet at loc = {}""".format(nodes_parquet_file_path)) data = { "src": [1, 2, 3, 4, 5], @@ -41,8 +41,8 @@ def test_prepare_data(): } table = pa.table(data) - pq.write_table(table, edges_parquet_fp) - print("""Created edges.parquet at loc = {}""".format(edges_parquet_fp)) + pq.write_table(table, edges_parquet_file_path) + print("""Created edges.parquet at loc = {}""".format(edges_parquet_file_path)) data = { "src": [3, 4], @@ -51,14 +51,13 @@ def test_prepare_data(): } table = pa.table(data) - pq.write_table(table, edge_deletions_parquet_fp) - print("""Created edges_deletions.parquet at loc = {}""".format(edge_deletions_parquet_fp)) + pq.write_table(table, edge_deletions_parquet_file_path) + print("""Created edges_deletions.parquet at loc = {}""".format(edge_deletions_parquet_file_path)) + yield nodes_parquet_file_path, edges_parquet_file_path, edge_deletions_parquet_file_path -nodes_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'nodes.parquet') -edges_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', 'edges.parquet') -edges_deletions_parquet_file_path = os.path.join(os.path.dirname(__file__), 'data', 'parquet', - 'edges_deletions.parquet') + # Cleanup the temporary directory after tests + dirname.cleanup() def assert_expected_nodes(g): @@ -198,7 +197,9 @@ def assert_expected_test_layer(g): assert g.layers(["test_layer"]).edges.src.id.collect() == [1, 2, 3, 4, 5] -def test_load_from_parquet_graphs(): +def test_load_from_parquet_graphs(parquet_files): + nodes_parquet_file_path, edges_parquet_file_path, edges_deletions_parquet_file_path = parquet_files + g = Graph.load_from_parquet( edge_parquet_file_path=edges_parquet_file_path, edge_src="src", @@ -313,7 +314,9 @@ def test_load_from_parquet_graphs(): assert_expected_layers(g) -def test_load_from_parquet_persistent_graphs(): +def test_load_from_parquet_persistent_graphs(parquet_files): + nodes_parquet_file_path, edges_parquet_file_path, edges_deletions_parquet_file_path = parquet_files + g = PersistentGraph.load_from_parquet( edge_parquet_file_path=edges_parquet_file_path, edge_src="src", From f27c9a07bf4ff54d71a9433948f2cb4cc37bcc9f Mon Sep 17 00:00:00 2001 From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com> Date: Mon, 24 Jun 2024 14:04:59 +0100 Subject: [PATCH 31/33] load parquet from dir or file --- python/tests/test_load_from_parquet.py | 122 +++++------ raphtory/src/io/parquet_loaders.rs | 204 ++++++++++-------- raphtory/src/python/graph/graph.rs | 52 ++--- .../src/python/graph/graph_with_deletions.rs | 56 ++--- 4 files changed, 231 insertions(+), 203 deletions(-) diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py index 20df621fb2..f67634d4d8 100644 --- a/python/tests/test_load_from_parquet.py +++ b/python/tests/test_load_from_parquet.py @@ -201,12 +201,12 @@ def test_load_from_parquet_graphs(parquet_files): nodes_parquet_file_path, 
edges_parquet_file_path, edges_deletions_parquet_file_path = parquet_files g = Graph.load_from_parquet( - edge_parquet_file_path=edges_parquet_file_path, + edge_parquet_path=edges_parquet_file_path, edge_src="src", edge_dst="dst", edge_time="time", edge_properties=["weight", "marbles"], - node_parquet_file_path=nodes_parquet_file_path, + node_parquet_path=nodes_parquet_file_path, node_id="id", node_time="time", node_properties=["name"], @@ -217,18 +217,18 @@ def test_load_from_parquet_graphs(parquet_files): g = Graph() g.load_nodes_from_parquet( - nodes_parquet_file_path, - "id", - "time", - "node_type", + parquet_path=nodes_parquet_file_path, + id="id", + time="time", + node_type="node_type", properties=["name"] ) g.load_edges_from_parquet( - edges_parquet_file_path, - "src", - "dst", - "time", - ["weight", "marbles"], + parquet_path=edges_parquet_file_path, + src="src", + dst="dst", + time="time", + properties=["weight", "marbles"], layer="layers" ) assert_expected_nodes(g) @@ -236,8 +236,8 @@ def test_load_from_parquet_graphs(parquet_files): assert_expected_layers(g) g.load_node_props_from_parquet( - nodes_parquet_file_path, - "id", + parquet_path=nodes_parquet_file_path, + id="id", const_properties=["type"], shared_const_properties={"tag": "test_tag"}, ) @@ -245,9 +245,9 @@ def test_load_from_parquet_graphs(parquet_files): assert_expected_node_property_type(g) g.load_edge_props_from_parquet( - edges_parquet_file_path, - "src", - "dst", + parquet_path=edges_parquet_file_path, + src="src", + dst="dst", const_properties=["marbles_const"], shared_const_properties={"tag": "test_tag"}, layer="layers", @@ -257,10 +257,10 @@ def test_load_from_parquet_graphs(parquet_files): g = Graph() g.load_nodes_from_parquet( - nodes_parquet_file_path, - "id", - "time", - "node_type", + parquet_path=nodes_parquet_file_path, + id="id", + time="time", + node_type="node_type", properties=["name"], shared_const_properties={"tag": "test_tag"}, ) @@ -269,10 +269,10 @@ def test_load_from_parquet_graphs(parquet_files): g = Graph() g.load_edges_from_parquet( - edges_parquet_file_path, - "src", - "dst", - "time", + parquet_path=edges_parquet_file_path, + src="src", + dst="dst", + time="time", properties=["weight", "marbles"], const_properties=["marbles_const"], shared_const_properties={"type": "Edge", "tag": "test_tag"}, @@ -283,13 +283,13 @@ def test_load_from_parquet_graphs(parquet_files): assert_expected_test_layer(g) g = Graph.load_from_parquet( - edge_parquet_file_path=edges_parquet_file_path, + edge_parquet_path=edges_parquet_file_path, edge_src="src", edge_dst="dst", edge_time="time", edge_layer="test_layer", layer_in_df=False, - node_parquet_file_path=nodes_parquet_file_path, + node_parquet_path=nodes_parquet_file_path, node_id="id", node_time="time", node_properties=["name"], @@ -299,12 +299,12 @@ def test_load_from_parquet_graphs(parquet_files): assert_expected_node_property_dept(g) g = Graph.load_from_parquet( - edge_parquet_file_path=edges_parquet_file_path, + edge_parquet_path=edges_parquet_file_path, edge_src="src", edge_dst="dst", edge_time="time", edge_layer="layers", - node_parquet_file_path=nodes_parquet_file_path, + node_parquet_path=nodes_parquet_file_path, node_id="id", node_time="time", node_properties=["name"], @@ -318,12 +318,12 @@ def test_load_from_parquet_persistent_graphs(parquet_files): nodes_parquet_file_path, edges_parquet_file_path, edges_deletions_parquet_file_path = parquet_files g = PersistentGraph.load_from_parquet( - edge_parquet_file_path=edges_parquet_file_path, + 
edge_parquet_path=edges_parquet_file_path, edge_src="src", edge_dst="dst", edge_time="time", edge_properties=["weight", "marbles"], - node_parquet_file_path=nodes_parquet_file_path, + node_parquet_path=nodes_parquet_file_path, node_id="id", node_time="time", node_properties=["name"], @@ -334,18 +334,18 @@ def test_load_from_parquet_persistent_graphs(parquet_files): g = PersistentGraph() g.load_nodes_from_parquet( - nodes_parquet_file_path, - "id", - "time", - "node_type", + parquet_path=nodes_parquet_file_path, + id="id", + time="time", + node_type="node_type", properties=["name"] ) g.load_edges_from_parquet( - edges_parquet_file_path, - "src", - "dst", - "time", - ["weight", "marbles"], + parquet_path=edges_parquet_file_path, + src="src", + dst="dst", + time="time", + properties=["weight", "marbles"], layer="layers" ) assert_expected_nodes(g) @@ -353,8 +353,8 @@ def test_load_from_parquet_persistent_graphs(parquet_files): assert_expected_layers(g) g.load_node_props_from_parquet( - nodes_parquet_file_path, - "id", + parquet_path=nodes_parquet_file_path, + id="id", const_properties=["type"], shared_const_properties={"tag": "test_tag"}, ) @@ -362,9 +362,9 @@ def test_load_from_parquet_persistent_graphs(parquet_files): assert_expected_node_property_type(g) g.load_edge_props_from_parquet( - edges_parquet_file_path, - "src", - "dst", + parquet_path=edges_parquet_file_path, + src="src", + dst="dst", const_properties=["marbles_const"], shared_const_properties={"tag": "test_tag"}, layer="layers", @@ -374,10 +374,10 @@ def test_load_from_parquet_persistent_graphs(parquet_files): g = PersistentGraph() g.load_nodes_from_parquet( - nodes_parquet_file_path, - "id", - "time", - "node_type", + parquet_path=nodes_parquet_file_path, + id="id", + time="time", + node_type="node_type", properties=["name"], shared_const_properties={"tag": "test_tag"}, ) @@ -386,10 +386,10 @@ def test_load_from_parquet_persistent_graphs(parquet_files): g = PersistentGraph() g.load_edges_from_parquet( - edges_parquet_file_path, - "src", - "dst", - "time", + parquet_path=edges_parquet_file_path, + src="src", + dst="dst", + time="time", properties=["weight", "marbles"], const_properties=["marbles_const"], shared_const_properties={"type": "Edge", "tag": "test_tag"}, @@ -400,13 +400,13 @@ def test_load_from_parquet_persistent_graphs(parquet_files): assert_expected_test_layer(g) g = Graph.load_from_parquet( - edge_parquet_file_path=edges_parquet_file_path, + edge_parquet_path=edges_parquet_file_path, edge_src="src", edge_dst="dst", edge_time="time", edge_layer="test_layer", layer_in_df=False, - node_parquet_file_path=nodes_parquet_file_path, + node_parquet_path=nodes_parquet_file_path, node_id="id", node_time="time", node_properties=["name"], @@ -416,12 +416,12 @@ def test_load_from_parquet_persistent_graphs(parquet_files): assert_expected_node_property_dept(g) g = PersistentGraph.load_from_parquet( - edge_parquet_file_path=edges_parquet_file_path, + edge_parquet_path=edges_parquet_file_path, edge_src="src", edge_dst="dst", edge_time="time", edge_layer="layers", - node_parquet_file_path=nodes_parquet_file_path, + node_parquet_path=nodes_parquet_file_path, node_id="id", node_time="time", node_properties=["name"], @@ -432,14 +432,14 @@ def test_load_from_parquet_persistent_graphs(parquet_files): g = PersistentGraph() g.load_edges_from_parquet( - edges_parquet_file_path, - "src", - "dst", - "time", + parquet_path=edges_parquet_file_path, + src="src", + dst="dst", + time="time", ) assert g.window(10, 12).edges.src.id.collect() == [1, 2, 
3, 4, 5] g.load_edges_deletions_from_parquet( - parquet_file_path=edges_deletions_parquet_file_path, + parquet_path=edges_deletions_parquet_file_path, src="src", dst="dst", time="time" diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index 852fd762ed..c0d278710d 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -1,5 +1,5 @@ use crate::{ - core::{entities::graph::tgraph::InternalGraph, utils::errors::GraphError, Prop}, + core::{utils::errors::GraphError, Prop}, db::api::{ mutation::internal::{InternalAdditionOps, InternalPropertyAdditionOps}, view::StaticGraphViewOps, @@ -18,13 +18,17 @@ use polars_parquet::{ read, read::{read_metadata, FileMetaData, FileReader}, }; -use std::{collections::HashMap, path::Path}; +use std::{ + collections::HashMap, + fs, + path::{Path, PathBuf}, +}; pub fn load_nodes_from_parquet< G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( graph: &G, - parquet_file_path: &Path, + parquet_path: &Path, id: &str, time: &str, node_type: Option<&str>, @@ -42,23 +46,24 @@ pub fn load_nodes_from_parquet< } } - let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; - df.check_cols_exist(&cols_to_check)?; - let size = df.get_inner_size(); - - load_nodes_from_df( - &df, - size, - id, - time, - properties, - const_properties, - shared_const_properties, - node_type, - node_type_in_df.unwrap_or(true), - graph, - ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + for path in get_parquet_file_paths(parquet_path) { + let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = df.get_inner_size(); + load_nodes_from_df( + &df, + size, + id, + time, + properties.clone(), + const_properties.clone(), + shared_const_properties.clone(), + node_type, + node_type_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + } Ok(()) } @@ -67,7 +72,7 @@ pub fn load_edges_from_parquet< G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( graph: &G, - parquet_file_path: &Path, + parquet_path: &Path, src: &str, dst: &str, time: &str, @@ -86,24 +91,25 @@ pub fn load_edges_from_parquet< } } - let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; - df.check_cols_exist(&cols_to_check)?; - let size = cols_to_check.len(); - - load_edges_from_df( - &df, - size, - src, - dst, - time, - properties, - const_properties, - shared_const_properties, - layer, - layer_in_df.unwrap_or(true), - graph, - ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + for path in get_parquet_file_paths(parquet_path) { + let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = cols_to_check.len(); + load_edges_from_df( + &df, + size, + src, + dst, + time, + properties.clone(), + const_properties.clone(), + shared_const_properties.clone(), + layer, + layer_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + } Ok(()) } @@ -112,7 +118,7 @@ pub fn load_node_props_from_parquet< G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( graph: &G, - parquet_file_path: &Path, + parquet_path: &Path, id: &str, const_properties: Option>, shared_const_properties: Option>, @@ -120,19 +126,20 @@ pub fn 
load_node_props_from_parquet< let mut cols_to_check = vec![id]; cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new())); - let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; - df.check_cols_exist(&cols_to_check)?; - let size = cols_to_check.len(); - - load_node_props_from_df( - &df, - size, - id, - const_properties, - shared_const_properties, - graph, - ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + for path in get_parquet_file_paths(parquet_path) { + let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = cols_to_check.len(); + load_node_props_from_df( + &df, + size, + id, + const_properties.clone(), + shared_const_properties.clone(), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + } Ok(()) } @@ -141,7 +148,7 @@ pub fn load_edge_props_from_parquet< G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, >( graph: &G, - parquet_file_path: &Path, + parquet_path: &Path, src: &str, dst: &str, const_properties: Option>, @@ -157,22 +164,23 @@ pub fn load_edge_props_from_parquet< } cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new())); - let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; - df.check_cols_exist(&cols_to_check)?; - let size = cols_to_check.len(); - - load_edges_props_from_df( - &df, - size, - src, - dst, - const_properties, - shared_const_properties, - layer, - layer_in_df.unwrap_or(true), - graph, - ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + for path in get_parquet_file_paths(parquet_path) { + let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = cols_to_check.len(); + load_edges_props_from_df( + &df, + size, + src, + dst, + const_properties.clone(), + shared_const_properties.clone(), + layer, + layer_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + } Ok(()) } @@ -181,7 +189,7 @@ pub fn load_edges_deletions_from_parquet< G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + DeletionOps, >( graph: &G, - parquet_file_path: &Path, + parquet_path: &Path, src: &str, dst: &str, time: &str, @@ -195,21 +203,22 @@ pub fn load_edges_deletions_from_parquet< } } - let df = process_parquet_file_to_df(parquet_file_path, cols_to_check.clone())?; - df.check_cols_exist(&cols_to_check)?; - let size = cols_to_check.len(); - - load_edges_deletions_from_df( - &df, - size, - src, - dst, - time, - layer, - layer_in_df.unwrap_or(true), - graph, - ) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + for path in get_parquet_file_paths(parquet_path) { + let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?; + df.check_cols_exist(&cols_to_check)?; + let size = cols_to_check.len(); + load_edges_deletions_from_df( + &df, + size, + src, + dst, + time, + layer, + layer_in_df.unwrap_or(true), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + } Ok(()) } @@ -275,6 +284,25 @@ fn read_parquet_file( Ok((names, reader)) } +fn get_parquet_file_paths(parquet_path: &Path) -> Vec { + let mut parquet_files = Vec::new(); + if parquet_path.is_file() { + parquet_files.push(parquet_path.to_path_buf()); + } else if parquet_path.is_dir() { + for entry in 
fs::read_dir(parquet_path).expect("Directory not found") { + let entry = entry.expect("Unable to read entry"); + let path = entry.path(); + if path.extension().map_or(false, |ext| ext == "parquet") { + parquet_files.push(path); + } + } + } else { + println!("Invalid path provided: {:?}", parquet_path); + } + + parquet_files +} + #[cfg(test)] mod test { use super::*; @@ -283,8 +311,8 @@ mod test { #[test] fn test_process_parquet_file_to_df() { - let parquet_file_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("resources/test/test_data.parquet"); + let parquet_file_path = + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("resources/test/test_data.parquet"); let col_names = vec!["src", "dst", "time", "weight", "marbles"]; let df = process_parquet_file_to_df(parquet_file_path.as_path(), col_names).unwrap(); diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 904f579d29..fe13ab71a8 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -462,7 +462,7 @@ impl PyGraph { /// Load a graph from Parquet file. /// /// Args: - /// edge_parquet_file_path (str): Parquet file containing the edges. + /// edge_parquet_path (str): Parquet file or directory of Parquet files containing the edges. /// edge_src (str): The column name for the source node ids. /// edge_dst (str): The column name for the destination node ids. /// edge_time (str): The column name for the timestamps. @@ -471,7 +471,7 @@ impl PyGraph { /// edge_shared_const_properties (dict): A dictionary of constant properties that will be added to every edge (optional) Defaults to None. /// edge_layer (str): The edge layer name (optional) Defaults to None. /// layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the edge_df or if it should be used directly as the layer for all edges (optional) defaults to True. - /// node_parquet_file_path (str): Parquet file containing the nodes (optional) Defaults to None. + /// node_parquet_path (str): Parquet file or directory of Parquet files containing the nodes (optional) Defaults to None. /// node_id (str): The column name for the node ids (optional) Defaults to None. /// node_time (str): The column name for the node timestamps (optional) Defaults to None. /// node_properties (list): The column names for the node temporal properties (optional) Defaults to None. @@ -483,11 +483,11 @@ impl PyGraph { /// Returns: /// Graph: The loaded Graph object. 
    #[staticmethod]
-    #[pyo3(signature = (edge_parquet_file_path, edge_src, edge_dst, edge_time, edge_properties = None, edge_const_properties = None, edge_shared_const_properties = None,
-    edge_layer = None, layer_in_df = true, node_parquet_file_path = None, node_id = None, node_time = None, node_properties = None,
+    #[pyo3(signature = (edge_parquet_path, edge_src, edge_dst, edge_time, edge_properties = None, edge_const_properties = None, edge_shared_const_properties = None,
+    edge_layer = None, layer_in_df = true, node_parquet_path = None, node_id = None, node_time = None, node_properties = None,
     node_const_properties = None, node_shared_const_properties = None, node_type = None, node_type_in_df = true))]
     fn load_from_parquet(
-        edge_parquet_file_path: PathBuf,
+        edge_parquet_path: PathBuf,
         edge_src: &str,
         edge_dst: &str,
         edge_time: &str,
@@ -496,7 +496,7 @@ impl PyGraph {
         edge_shared_const_properties: Option<HashMap<String, Prop>>,
         edge_layer: Option<&str>,
         layer_in_df: Option<bool>,
-        node_parquet_file_path: Option<PathBuf>,
+        node_parquet_path: Option<PathBuf>,
         node_id: Option<&str>,
         node_time: Option<&str>,
         node_properties: Option<Vec<&str>>,
@@ -508,11 +508,11 @@ impl PyGraph {
         let graph = PyGraph {
             graph: Graph::new(),
         };
-        if let (Some(node_parquet_file_path), Some(node_id), Some(node_time)) =
-            (node_parquet_file_path, node_id, node_time)
+        if let (Some(node_parquet_path), Some(node_id), Some(node_time)) =
+            (node_parquet_path, node_id, node_time)
         {
             graph.load_nodes_from_parquet(
-                node_parquet_file_path,
+                node_parquet_path,
                 node_id,
                 node_time,
                 node_type,
@@ -523,7 +523,7 @@ impl PyGraph {
             )?;
         }
         graph.load_edges_from_parquet(
-            edge_parquet_file_path,
+            edge_parquet_path,
             edge_src,
             edge_dst,
             edge_time,
@@ -577,7 +577,7 @@ impl PyGraph {
     /// Load nodes from a Parquet file into the graph.
     ///
     /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing the nodes
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing the nodes
     ///     id (str): The column name for the node IDs.
     ///     time (str): The column name for the timestamps.
     ///     node_type (str): the column name for the node type
@@ -587,10 +587,10 @@ impl PyGraph {
     ///     shared_const_properties (Dictionary/Hashmap of properties): A dictionary of constant properties that will be added to every node. Defaults to None. (optional)
     /// Returns:
     ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, id, time, node_type = None, node_type_in_df = true, properties = None, const_properties = None, shared_const_properties = None))]
+    #[pyo3(signature = (parquet_path, id, time, node_type = None, node_type_in_df = true, properties = None, const_properties = None, shared_const_properties = None))]
     fn load_nodes_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         id: &str,
         time: &str,
         node_type: Option<&str>,
@@ -601,7 +601,7 @@ impl PyGraph {
     ) -> Result<(), GraphError> {
         load_nodes_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             id,
             time,
             node_type,
@@ -657,7 +657,7 @@ impl PyGraph {
     /// Load edges from a Parquet file into the graph.
     ///
     /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing edges
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing the edges
     ///     src (str): The column name for the source node ids.
     ///     dst (str): The column name for the destination node ids.
     ///     time (str): The column name for the update timestamps.
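
Because `get_parquet_file_paths` accepts either a single file or a directory (scanning it for `*.parquet` entries), the renamed `parquet_path` argument can point at a folder of part-files as well as one file. A sketch under the same assumptions as the earlier example, with a hypothetical `nodes/` directory:

```rust
use raphtory::{io::parquet_loaders::load_nodes_from_parquet, prelude::*};
use std::path::Path;

fn main() {
    let g = Graph::new();
    // Every `*.parquet` file under `nodes/` is read and loaded in turn; a
    // path to a single file behaves exactly as before.
    load_nodes_from_parquet(
        &g,
        Path::new("nodes"), // hypothetical directory of parquet part-files
        "id",
        "time",
        Some("node_type"), // column holding each node's type
        Some(true),        // node_type refers to a column, not a literal type
        Some(vec!["name"]),
        None,
        None,
    )
    .expect("failed to load nodes");
}
```
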
@@ -669,10 +669,10 @@ impl PyGraph {
     ///
     /// Returns:
     ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, src, dst, time, properties = None, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))]
+    #[pyo3(signature = (parquet_path, src, dst, time, properties = None, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))]
     fn load_edges_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         src: &str,
         dst: &str,
         time: &str,
@@ -684,7 +684,7 @@ impl PyGraph {
     ) -> Result<(), GraphError> {
         load_edges_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             src,
             dst,
             time,
@@ -726,24 +726,24 @@ impl PyGraph {
     /// Load node properties from a parquet file.
     ///
     /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing node information.
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing the node information.
     ///     id (str): The column name for the node IDs.
     ///     const_properties (List): List of constant node property column names. Defaults to None. (optional)
     ///     shared_const_properties (<HashMap<String, Prop>>): A dictionary of constant properties that will be added to every node. Defaults to None. (optional)
     ///
     /// Returns:
     ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, id, const_properties = None, shared_const_properties = None))]
+    #[pyo3(signature = (parquet_path, id, const_properties = None, shared_const_properties = None))]
     fn load_node_props_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         id: &str,
         const_properties: Option<Vec<&str>>,
         shared_const_properties: Option<HashMap<String, Prop>>,
     ) -> Result<(), GraphError> {
         load_node_props_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             id,
             const_properties,
             shared_const_properties,
@@ -789,7 +789,7 @@ impl PyGraph {
     /// Load edge properties from parquet file
     ///
     /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing edge information.
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing the edge information.
     ///     src (str): The column name for the source node.
     ///     dst (str): The column name for the destination node.
     ///     const_properties (List): List of constant edge property column names. Defaults to None. (optional)
@@ -799,10 +799,10 @@ impl PyGraph {
     ///
     /// Returns:
     ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, src, dst, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))]
+    #[pyo3(signature = (parquet_path, src, dst, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))]
     fn load_edge_props_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         src: &str,
         dst: &str,
         const_properties: Option<Vec<&str>>,
@@ -812,7 +812,7 @@ impl PyGraph {
     ) -> Result<(), GraphError> {
         load_edge_props_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             src,
             dst,
             const_properties,
diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs
index 8a73a8a675..4769af9c33 100644
--- a/raphtory/src/python/graph/graph_with_deletions.rs
+++ b/raphtory/src/python/graph/graph_with_deletions.rs
@@ -449,7 +449,7 @@ impl PyPersistentGraph {
     /// Load a graph from Parquet file.
    ///
    /// Args:
-    ///     edge_parquet_file_path (str): Parquet file containing the edges.
+    ///     edge_parquet_path (str): Parquet file or directory of Parquet files containing the edges.
    ///     edge_src (str): The column name for the source node ids.
    ///     edge_dst (str): The column name for the destination node ids.
    ///     edge_time (str): The column name for the timestamps.
@@ -458,7 +458,7 @@ impl PyPersistentGraph {
    ///     edge_shared_const_properties (dict): A dictionary of constant properties that will be added to every edge (optional) Defaults to None.
    ///     edge_layer (str): The edge layer name (optional) Defaults to None.
    ///     layer_in_df (bool): Whether the layer name should be used to look up the values in a column of the edge_df or if it should be used directly as the layer for all edges (optional) defaults to True.
-    ///     node_parquet_file_path (str): Parquet file containing the nodes (optional) Defaults to None.
+    ///     node_parquet_path (str): Parquet file or directory of Parquet files containing the nodes (optional) Defaults to None.
    ///     node_id (str): The column name for the node ids (optional) Defaults to None.
    ///     node_time (str): The column name for the node timestamps (optional) Defaults to None.
    ///     node_properties (list): The column names for the node temporal properties (optional) Defaults to None.
@@ -470,11 +470,11 @@ impl PyPersistentGraph {
    /// Returns:
    ///     Graph: The loaded Graph object.
     #[staticmethod]
-    #[pyo3(signature = (edge_parquet_file_path, edge_src, edge_dst, edge_time, edge_properties = None, edge_const_properties = None, edge_shared_const_properties = None,
-    edge_layer = None, layer_in_df = true, node_parquet_file_path = None, node_id = None, node_time = None, node_properties = None,
+    #[pyo3(signature = (edge_parquet_path, edge_src, edge_dst, edge_time, edge_properties = None, edge_const_properties = None, edge_shared_const_properties = None,
+    edge_layer = None, layer_in_df = true, node_parquet_path = None, node_id = None, node_time = None, node_properties = None,
     node_const_properties = None, node_shared_const_properties = None, node_type = None, node_type_in_df = true))]
     fn load_from_parquet(
-        edge_parquet_file_path: PathBuf,
+        edge_parquet_path: PathBuf,
         edge_src: &str,
         edge_dst: &str,
         edge_time: &str,
@@ -483,7 +483,7 @@ impl PyPersistentGraph {
         edge_shared_const_properties: Option<HashMap<String, Prop>>,
         edge_layer: Option<&str>,
         layer_in_df: Option<bool>,
-        node_parquet_file_path: Option<PathBuf>,
+        node_parquet_path: Option<PathBuf>,
         node_id: Option<&str>,
         node_time: Option<&str>,
         node_properties: Option<Vec<String>>,
@@ -496,7 +496,7 @@ impl PyPersistentGraph {
             graph: PersistentGraph::new(),
         };
         if let (Some(node_parquet_file_path), Some(node_id), Some(node_time)) =
-            (node_parquet_file_path, node_id, node_time)
+            (node_parquet_path, node_id, node_time)
         {
             graph.load_nodes_from_parquet(
                 node_parquet_file_path,
@@ -510,7 +510,7 @@ impl PyPersistentGraph {
             )?;
         }
         graph.load_edges_from_parquet(
-            edge_parquet_file_path,
+            edge_parquet_path,
             edge_src,
             edge_dst,
             edge_time,
@@ -564,7 +564,7 @@ impl PyPersistentGraph {
     /// Load nodes from a Parquet file into the graph.
     ///
    /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing the nodes
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing the nodes
    ///     id (str): The column name for the node IDs.
    ///     time (str): The column name for the timestamps.
    ///     node_type (str): the column name for the node type
@@ -574,10 +574,10 @@ impl PyPersistentGraph {
    ///     shared_const_properties (Dictionary/Hashmap of properties): A dictionary of constant properties that will be added to every node. Defaults to None. (optional)
    /// Returns:
    ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, id, time, node_type = None, node_type_in_df = true, properties = None, const_properties = None, shared_const_properties = None))]
+    #[pyo3(signature = (parquet_path, id, time, node_type = None, node_type_in_df = true, properties = None, const_properties = None, shared_const_properties = None))]
     fn load_nodes_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         id: &str,
         time: &str,
         node_type: Option<&str>,
@@ -588,7 +588,7 @@ impl PyPersistentGraph {
     ) -> Result<(), GraphError> {
         load_nodes_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             id,
             time,
             node_type,
@@ -644,7 +644,7 @@ impl PyPersistentGraph {
     /// Load edges from a Parquet file into the graph.
     ///
    /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing edges
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing the edges
    ///     src (str): The column name for the source node ids.
    ///     dst (str): The column name for the destination node ids.
    ///     time (str): The column name for the update timestamps.
@@ -656,10 +656,10 @@ impl PyPersistentGraph {
    ///
    /// Returns:
    ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, src, dst, time, properties = None, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))]
+    #[pyo3(signature = (parquet_path, src, dst, time, properties = None, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))]
     fn load_edges_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         src: &str,
         dst: &str,
         time: &str,
@@ -671,7 +671,7 @@ impl PyPersistentGraph {
     ) -> Result<(), GraphError> {
         load_edges_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             src,
             dst,
             time,
@@ -711,7 +711,7 @@ impl PyPersistentGraph {
     /// Load edges deletions from a Parquet file into the graph.
     ///
    /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing edges
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing the edges
    ///     src (str): The column name for the source node ids.
    ///     dst (str): The column name for the destination node ids.
    ///     time (str): The column name for the update timestamps.
@@ -720,10 +720,10 @@ impl PyPersistentGraph {
    ///
    /// Returns:
    ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, src, dst, time, layer = None, layer_in_df = true))]
+    #[pyo3(signature = (parquet_path, src, dst, time, layer = None, layer_in_df = true))]
     fn load_edges_deletions_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         src: &str,
         dst: &str,
         time: &str,
@@ -732,7 +732,7 @@ impl PyPersistentGraph {
     ) -> Result<(), GraphError> {
         load_edges_deletions_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             src,
             dst,
             time,
@@ -771,24 +771,24 @@ impl PyPersistentGraph {
     /// Load node properties from a parquet file.
     ///
    /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing node information.
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing node information.
    ///     id(str): The column name for the node IDs.
    ///     const_properties (List): List of constant node property column names. Defaults to None. (optional)
    ///     shared_const_properties (HashMap<String, Prop>): A dictionary of constant properties that will be added to every node. Defaults to None. (optional)
    ///
    /// Returns:
    ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, id, const_properties = None, shared_const_properties = None))]
+    #[pyo3(signature = (parquet_path, id, const_properties = None, shared_const_properties = None))]
     fn load_node_props_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         id: &str,
         const_properties: Option<Vec<String>>,
         shared_const_properties: Option<HashMap<String, Prop>>,
     ) -> Result<(), GraphError> {
         load_node_props_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             id,
             const_properties,
             shared_const_properties,
@@ -834,7 +834,7 @@ impl PyPersistentGraph {
     /// Load edge properties from parquet file
     ///
    /// Arguments:
-    ///     parquet_file_path (str): Parquet file path containing edge information.
+    ///     parquet_path (str): Parquet file or directory of Parquet files containing edge information.
    ///     src (str): The column name for the source node.
    ///     dst (str): The column name for the destination node.
    ///     const_properties (List): List of constant edge property column names. Defaults to None. (optional)
@@ -844,10 +844,10 @@ impl PyPersistentGraph {
    ///
    /// Returns:
    ///     Result<(), GraphError>: Result of the operation.
-    #[pyo3(signature = (parquet_file_path, src, dst, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))]
+    #[pyo3(signature = (parquet_path, src, dst, const_properties = None, shared_const_properties = None, layer = None, layer_in_df = true))]
     fn load_edge_props_from_parquet(
         &self,
-        parquet_file_path: PathBuf,
+        parquet_path: PathBuf,
         src: &str,
         dst: &str,
         const_properties: Option<Vec<String>>,
@@ -857,7 +857,7 @@ impl PyPersistentGraph {
     ) -> Result<(), GraphError> {
         load_edge_props_from_parquet(
             &self.graph,
-            parquet_file_path.as_path(),
+            parquet_path.as_path(),
             src,
             dst,
             const_properties,
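For illustration only (not part of the patch): a sketch of the renamed property loader from Python. The file name props.parquet and its columns are hypothetical, and the node is created first because constant properties attach to existing nodes:

    from raphtory import PersistentGraph

    g = PersistentGraph()
    g.add_node(0, "alice")  # node must exist before props are attached

    # Hypothetical file: props.parquet with columns "id" and "team".
    g.load_node_props_from_parquet(
        parquet_path="props.parquet",
        id="id",
        const_properties=["team"],
        shared_const_properties={"source": "parquet"},
    )
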
From f102175956b6b343d7f243885e286bf51304f952 Mon Sep 17 00:00:00 2001
From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com>
Date: Mon, 24 Jun 2024 15:17:35 +0100
Subject: [PATCH 32/33] change invalid layers error message to include valid layers

---
 python/tests/test_graphdb.py                    | 14 ++++++++++++++
 python/tests/test_load_from_pandas.py           |  2 +-
 python/tests/test_load_from_parquet.py          |  2 +-
 raphtory/src/core/entities/graph/tgraph.rs      | 15 +++++++++++----
 raphtory/src/core/utils/errors.rs               | 17 +++++++++++++++--
 raphtory/src/db/graph/edge.rs                   |  9 +++++++--
 raphtory/src/disk_graph/graph_impl/layer_ops.rs | 15 +++++++++++----
 7 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/python/tests/test_graphdb.py b/python/tests/test_graphdb.py
index fe732bd179..402fdd5099 100644
--- a/python/tests/test_graphdb.py
+++ b/python/tests/test_graphdb.py
@@ -2,6 +2,8 @@
 import math
 import sys
 import random
+import re
+
 import pandas as pd
 import pandas.core.frame
 import pytest
@@ -1435,6 +1437,18 @@ def test_layer():
     assert g.exclude_layers(["layer1", "layer2"]).count_edges() == 1
     assert g.exclude_layer("layer2").count_edges() == 4
 
+    with pytest.raises(
+        Exception,
+        match=re.escape("Invalid layer: test_layer. Valid layers: _default, layer1, layer2"),
+    ):
+        g.layers(["test_layer"])
+
+    with pytest.raises(
+        Exception,
+        match=re.escape("Invalid layer: test_layer. Valid layers: _default, layer1, layer2"),
+    ):
+        g.edge(1, 2).layers(["test_layer"])
+
 
 def test_layer_node():
     g = Graph()
diff --git a/python/tests/test_load_from_pandas.py b/python/tests/test_load_from_pandas.py
index 0b2fb64c56..e966d1a21b 100644
--- a/python/tests/test_load_from_pandas.py
+++ b/python/tests/test_load_from_pandas.py
@@ -624,7 +624,7 @@ def assertions_layers_in_df(g):
         assert g.layers(["layer 3"]).edges.src.id.collect() == [3]
         with pytest.raises(
             Exception,
-            match=re.escape("Invalid layer test_layer."),
+            match=re.escape("Invalid layer: test_layer. Valid layers: _default, layer 1, layer 2, layer 3, layer 4, layer 5"),
         ):
             g.layers(["test_layer"])
diff --git a/python/tests/test_load_from_parquet.py b/python/tests/test_load_from_parquet.py
index f67634d4d8..6785df097a 100644
--- a/python/tests/test_load_from_parquet.py
+++ b/python/tests/test_load_from_parquet.py
@@ -187,7 +187,7 @@ def assert_expected_layers(g):
     assert g.layers(["layer 1", "layer 4", "layer 5"]).edges.src.id.collect() == [1, 4, 5]
     with pytest.raises(
         Exception,
-        match=re.escape("Invalid layer test_layer."),
+        match=re.escape("Invalid layer: test_layer. Valid layers: _default, layer 1, layer 2, layer 3, layer 4, layer 5"),
     ):
         g.layers(["test_layer"])
diff --git a/raphtory/src/core/entities/graph/tgraph.rs b/raphtory/src/core/entities/graph/tgraph.rs
index dcb51de815..f25aeff572 100644
--- a/raphtory/src/core/entities/graph/tgraph.rs
+++ b/raphtory/src/core/entities/graph/tgraph.rs
@@ -143,21 +143,28 @@ impl TemporalGraph {
     }
 
     pub(crate) fn layer_ids(&self, key: Layer) -> Result<LayerIds, GraphError> {
+        let valid_layers = self
+            .edge_meta
+            .layer_meta()
+            .get_keys()
+            .iter()
+            .map(|x| x.to_string())
+            .collect::<Vec<String>>();
         match key {
             Layer::None => Ok(LayerIds::None),
             Layer::All => Ok(LayerIds::All),
             Layer::Default => Ok(LayerIds::One(0)),
             Layer::One(id) => match self.edge_meta.get_layer_id(&id) {
                 Some(id) => Ok(LayerIds::One(id)),
-                None => Err(GraphError::InvalidLayer(id.to_string())),
+                None => Err(GraphError::invalid_layer(id.to_string(), valid_layers)),
             },
             Layer::Multiple(ids) => {
                 let mut new_layers = ids
                     .iter()
                     .map(|id| {
-                        self.edge_meta
-                            .get_layer_id(id)
-                            .ok_or_else(|| GraphError::InvalidLayer(id.to_string()))
+                        self.edge_meta.get_layer_id(id).ok_or_else(|| {
+                            GraphError::invalid_layer(id.to_string(), valid_layers.clone())
+                        })
                     })
                     .collect::<Result<Vec<_>, GraphError>>()?;
                 let num_layers = self.num_layers();
diff --git a/raphtory/src/core/utils/errors.rs b/raphtory/src/core/utils/errors.rs
index 7b840883f5..88236bc0bd 100644
--- a/raphtory/src/core/utils/errors.rs
+++ b/raphtory/src/core/utils/errors.rs
@@ -66,8 +66,11 @@ pub enum GraphError {
     // wasm
     #[error("Node is not String or Number")]
     NodeIdNotStringOrNumber,
-    #[error("Invalid layer {0}.")]
-    InvalidLayer(String),
+    #[error("Invalid layer: {invalid_layer}. Valid layers: {valid_layers}")]
+    InvalidLayer {
+        invalid_layer: String,
+        valid_layers: String,
+    },
     #[error("Layer {layer} does not exist for edge ({src}, {dst})")]
     InvalidEdgeLayer {
         layer: String,
@@ -125,6 +128,16 @@ pub enum GraphError {
     TimeAPIError,
 }
 
+impl GraphError {
+    pub fn invalid_layer(invalid_layer: String, valid_layers: Vec<String>) -> Self {
+        let valid_layers = valid_layers.join(", ");
+        GraphError::InvalidLayer {
+            invalid_layer,
+            valid_layers,
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug, PartialEq)]
 pub enum MutateGraphError {
     #[error("Create node '{node_id}' first before adding static properties to it")]
diff --git a/raphtory/src/db/graph/edge.rs b/raphtory/src/db/graph/edge.rs
index 4accc97dcd..3327c94294 100644
--- a/raphtory/src/db/graph/edge.rs
+++ b/raphtory/src/db/graph/edge.rs
@@ -163,20 +163,25 @@ impl<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>> BaseEdgeViewOps<
 impl EdgeView {
     fn resolve_layer(&self, layer: Option<&str>, create: bool) -> Result<usize, GraphError> {
+        let valid_layers = self
+            .graph
+            .unique_layers()
+            .map(|l| l.0.to_string())
+            .collect::<Vec<String>>();
         match layer {
             Some(name) => match self.edge.layer() {
                 Some(l_id) => self
                     .graph
                     .get_layer_id(name)
                     .filter(|id| id == l_id)
-                    .ok_or_else(|| GraphError::InvalidLayer(name.to_owned())),
+                    .ok_or_else(|| GraphError::invalid_layer(name.to_owned(), valid_layers)),
                 None => {
                     if create {
                         Ok(self.graph.resolve_layer(layer))
                     } else {
                         self.graph
                             .get_layer_id(name)
-                            .ok_or(GraphError::InvalidLayer(name.to_owned()))
+                            .ok_or(GraphError::invalid_layer(name.to_owned(), valid_layers))
                     }
                 }
             },
diff --git a/raphtory/src/disk_graph/graph_impl/layer_ops.rs b/raphtory/src/disk_graph/graph_impl/layer_ops.rs
index fe9e9a6cf5..e2f5bbfc24 100644
--- a/raphtory/src/disk_graph/graph_impl/layer_ops.rs
+++ b/raphtory/src/disk_graph/graph_impl/layer_ops.rs
@@ -3,6 +3,7 @@ use crate::{
     db::api::view::internal::InternalLayerOps,
     prelude::Layer,
 };
+use itertools::Itertools;
 
 use super::DiskGraph;
 
@@ -16,6 +17,12 @@ impl InternalLayerOps for DiskGraph {
     }
 
     fn layer_ids_from_names(&self, key: Layer) -> Result<LayerIds, GraphError> {
+        let valid_layers = self
+            .inner
+            .layer_names()
+            .into_iter()
+            .map(|x| x.clone())
+            .collect_vec();
         match key {
             Layer::All => Ok(LayerIds::All),
             Layer::Default => Ok(LayerIds::One(0)),
             Layer::One(name) => {
                 let id = self
                     .inner
                     .find_layer_id(&name)
-                    .ok_or_else(|| GraphError::InvalidLayer(name.to_string()))?;
+                    .ok_or_else(|| GraphError::invalid_layer(name.to_string(), valid_layers))?;
                 Ok(LayerIds::One(id))
             }
             Layer::None => Ok(LayerIds::None),
@@ -31,9 +38,9 @@ impl InternalLayerOps for DiskGraph {
                 let ids = names
                     .iter()
                     .map(|name| {
-                        self.inner
-                            .find_layer_id(name)
-                            .ok_or_else(|| GraphError::InvalidLayer(name.to_string()))
+                        self.inner.find_layer_id(name).ok_or_else(|| {
+                            GraphError::invalid_layer(name.to_string(), valid_layers.clone())
+                        })
                     })
                     .collect::<Result<Vec<_>, _>>()?;
                 Ok(LayerIds::Multiple(ids.into()))
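For illustration only (not part of the patch): how the enriched message surfaces in Python, mirroring the tests above. The layer names come from the edges added; the queried name "nope" is arbitrary:

    import re
    import pytest
    from raphtory import Graph

    g = Graph()
    g.add_edge(0, 1, 2)                  # goes to the _default layer
    g.add_edge(1, 1, 2, layer="layer1")

    # The error now enumerates the valid layers, so tests can match it exactly.
    with pytest.raises(
        Exception,
        match=re.escape("Invalid layer: nope. Valid layers: _default, layer1"),
    ):
        g.layers(["nope"])
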
From 07d12f356ba1d83ca39af3931c6eff00a96617ac Mon Sep 17 00:00:00 2001
From: Shivam Kapoor <4599890+iamsmkr@users.noreply.github.com>
Date: Wed, 26 Jun 2024 16:41:26 +0100
Subject: [PATCH 33/33] fix issue with valid_layers

---
 raphtory/src/core/entities/graph/tgraph.rs | 26 +++++++++++------
 raphtory/src/core/utils/errors.rs          |  2 ++
 raphtory/src/db/graph/edge.rs              | 25 +++++++++++-----
 .../src/disk_graph/graph_impl/layer_ops.rs  | 29 +++++++++++--------
 raphtory/src/io/parquet_loaders.rs         | 16 +++++-----
 5 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/raphtory/src/core/entities/graph/tgraph.rs b/raphtory/src/core/entities/graph/tgraph.rs
index bae74d3673..04f0a88152 100644
--- a/raphtory/src/core/entities/graph/tgraph.rs
+++ b/raphtory/src/core/entities/graph/tgraph.rs
@@ -115,6 +115,15 @@ impl Default for InternalGraph {
 }
 
 impl TemporalGraph {
+    fn get_valid_layers(edge_meta: &Arc<Meta>) -> Vec<String> {
+        edge_meta
+            .layer_meta()
+            .get_keys()
+            .iter()
+            .map(|x| x.to_string())
+            .collect::<Vec<String>>()
+    }
+
     pub(crate) fn num_layers(&self) -> usize {
         self.edge_meta.layer_meta().len()
     }
@@ -139,27 +148,26 @@ impl TemporalGraph {
     }
 
     pub(crate) fn layer_ids(&self, key: Layer) -> Result<LayerIds, GraphError> {
-        let valid_layers = self
-            .edge_meta
-            .layer_meta()
-            .get_keys()
-            .iter()
-            .map(|x| x.to_string())
-            .collect::<Vec<String>>();
         match key {
             Layer::None => Ok(LayerIds::None),
             Layer::All => Ok(LayerIds::All),
             Layer::Default => Ok(LayerIds::One(0)),
             Layer::One(id) => match self.edge_meta.get_layer_id(&id) {
                 Some(id) => Ok(LayerIds::One(id)),
-                None => Err(GraphError::invalid_layer(id.to_string(), valid_layers)),
+                None => Err(GraphError::invalid_layer(
+                    id.to_string(),
+                    Self::get_valid_layers(&self.edge_meta),
+                )),
             },
             Layer::Multiple(ids) => {
                 let mut new_layers = ids
                     .iter()
                     .map(|id| {
                         self.edge_meta.get_layer_id(id).ok_or_else(|| {
-                            GraphError::invalid_layer(id.to_string(), valid_layers.clone())
+                            GraphError::invalid_layer(
+                                id.to_string(),
+                                Self::get_valid_layers(&self.edge_meta),
+                            )
                         })
                     })
                     .collect::<Result<Vec<_>, GraphError>>()?;
diff --git a/raphtory/src/core/utils/errors.rs b/raphtory/src/core/utils/errors.rs
index 0be84df334..d2a6a2dd54 100644
--- a/raphtory/src/core/utils/errors.rs
+++ b/raphtory/src/core/utils/errors.rs
@@ -12,6 +12,8 @@ pub enum GraphError {
     #[cfg(feature = "arrow")]
     #[error("Arrow error: {0}")]
     Arrow(#[from] error::PolarsError),
+    #[error("Invalid path = {0}")]
+    InvalidPath(String),
     #[error("Graph error occurred")]
     UnsupportedDataType,
     #[error("Graph already exists by name = {name}")]
diff --git a/raphtory/src/db/graph/edge.rs b/raphtory/src/db/graph/edge.rs
index f6bc3e38d8..a1777094d4 100644
--- a/raphtory/src/db/graph/edge.rs
+++ b/raphtory/src/db/graph/edge.rs
@@ -117,7 +117,9 @@ impl<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>> BaseEdgeViewOps<
     type BaseGraph = G;
     type Graph = GH;
-    type ValueType<T> = T where T: 'graph;
+    type ValueType<T> = T
+    where
+        T: 'graph;
     type PropType = Self;
     type Nodes = NodeView;
     type Exploded = Edges<'graph, G, GH>;
@@ -162,26 +164,33 @@ impl<'graph, G: GraphViewOps<'graph>, GH: GraphViewOps<'graph>> BaseEdgeViewOps<
 }
 
 impl EdgeView {
+    fn get_valid_layers(graph: &G) -> Vec<String> {
+        graph.unique_layers().map(|l| l.0.to_string()).collect()
+    }
+
     fn resolve_layer(&self, layer: Option<&str>, create: bool) -> Result<usize, GraphError> {
-        let valid_layers = self
-            .graph
-            .unique_layers()
-            .map(|l| l.0.to_string())
-            .collect::<Vec<String>>();
         match layer {
             Some(name) => match self.edge.layer() {
                 Some(l_id) => self
                     .graph
                     .get_layer_id(name)
                     .filter(|id| id == l_id)
-                    .ok_or_else(|| GraphError::invalid_layer(name.to_owned(), valid_layers)),
+                    .ok_or_else(|| {
+                        GraphError::invalid_layer(
+                            name.to_owned(),
+                            Self::get_valid_layers(&self.graph),
+                        )
+                    }),
                 None => {
                     if create {
                         Ok(self.graph.resolve_layer(layer))
                     } else {
                         self.graph
                             .get_layer_id(name)
-                            .ok_or(GraphError::invalid_layer(name.to_owned(), valid_layers))
+                            .ok_or(GraphError::invalid_layer(
+                                name.to_owned(),
+                                Self::get_valid_layers(&self.graph),
+                            ))
                     }
                 }
             },
diff --git a/raphtory/src/disk_graph/graph_impl/layer_ops.rs b/raphtory/src/disk_graph/graph_impl/layer_ops.rs
index e2f5bbfc24..a0a6bf1b4f 100644
--- a/raphtory/src/disk_graph/graph_impl/layer_ops.rs
+++ b/raphtory/src/disk_graph/graph_impl/layer_ops.rs
@@ -1,11 +1,20 @@
+use super::DiskGraph;
 use crate::{
     core::{entities::LayerIds, utils::errors::GraphError},
     db::api::view::internal::InternalLayerOps,
     prelude::Layer,
 };
 use itertools::Itertools;
+use pometry_storage::graph::TemporalGraph;
+use std::sync::Arc;
 
-use super::DiskGraph;
+fn get_valid_layers(graph: &Arc<TemporalGraph>) -> Vec<String> {
+    graph
+        .layer_names()
+        .into_iter()
+        .map(|x| x.clone())
+        .collect_vec()
+}
 
 impl InternalLayerOps for DiskGraph {
     fn layer_ids(&self) -> &LayerIds {
@@ -17,20 +26,13 @@ impl InternalLayerOps for DiskGraph {
     }
 
     fn layer_ids_from_names(&self, key: Layer) -> Result<LayerIds, GraphError> {
-        let valid_layers = self
-            .inner
-            .layer_names()
-            .into_iter()
-            .map(|x| x.clone())
-            .collect_vec();
         match key {
             Layer::All => Ok(LayerIds::All),
             Layer::Default => Ok(LayerIds::One(0)),
             Layer::One(name) => {
-                let id = self
-                    .inner
-                    .find_layer_id(&name)
-                    .ok_or_else(|| GraphError::invalid_layer(name.to_string(), valid_layers))?;
+                let id = self.inner.find_layer_id(&name).ok_or_else(|| {
+                    GraphError::invalid_layer(name.to_string(), get_valid_layers(&self.inner))
+                })?;
                 Ok(LayerIds::One(id))
             }
             Layer::None => Ok(LayerIds::None),
@@ -39,7 +41,10 @@ impl InternalLayerOps for DiskGraph {
                     .iter()
                     .map(|name| {
                         self.inner.find_layer_id(name).ok_or_else(|| {
-                            GraphError::invalid_layer(name.to_string(), valid_layers.clone())
+                            GraphError::invalid_layer(
+                                name.to_string(),
+                                get_valid_layers(&self.inner),
+                            )
                         })
                     })
                     .collect::<Result<Vec<_>, _>>()?;
diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs
index c0d278710d..0cf35b2586 100644
--- a/raphtory/src/io/parquet_loaders.rs
+++ b/raphtory/src/io/parquet_loaders.rs
@@ -46,7 +46,7 @@ pub fn load_nodes_from_parquet<
         }
     }
 
-    for path in get_parquet_file_paths(parquet_path) {
+    for path in get_parquet_file_paths(parquet_path)? {
         let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?;
         df.check_cols_exist(&cols_to_check)?;
         let size = df.get_inner_size();
@@ -91,7 +91,7 @@ pub fn load_edges_from_parquet<
         }
     }
 
-    for path in get_parquet_file_paths(parquet_path) {
+    for path in get_parquet_file_paths(parquet_path)? {
         let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?;
         df.check_cols_exist(&cols_to_check)?;
         let size = cols_to_check.len();
@@ -126,7 +126,7 @@ pub fn load_node_props_from_parquet<
     let mut cols_to_check = vec![id];
     cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new()));
 
-    for path in get_parquet_file_paths(parquet_path) {
+    for path in get_parquet_file_paths(parquet_path)? {
         let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?;
         df.check_cols_exist(&cols_to_check)?;
         let size = cols_to_check.len();
@@ -164,7 +164,7 @@ pub fn load_edge_props_from_parquet<
     }
     cols_to_check.extend(const_properties.as_ref().unwrap_or(&Vec::new()));
 
-    for path in get_parquet_file_paths(parquet_path) {
+    for path in get_parquet_file_paths(parquet_path)? {
         let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?;
         df.check_cols_exist(&cols_to_check)?;
         let size = cols_to_check.len();
@@ -203,7 +203,7 @@ pub fn load_edges_deletions_from_parquet<
         }
     }
 
-    for path in get_parquet_file_paths(parquet_path) {
+    for path in get_parquet_file_paths(parquet_path)? {
         let df = process_parquet_file_to_df(path.as_path(), cols_to_check.clone())?;
         df.check_cols_exist(&cols_to_check)?;
         let size = cols_to_check.len();
@@ -284,7 +284,7 @@ fn read_parquet_file(
     Ok((names, reader))
 }
 
-fn get_parquet_file_paths(parquet_path: &Path) -> Vec<PathBuf> {
+fn get_parquet_file_paths(parquet_path: &Path) -> Result<Vec<PathBuf>, GraphError> {
     let mut parquet_files = Vec::new();
     if parquet_path.is_file() {
         parquet_files.push(parquet_path.to_path_buf());
@@ -297,10 +297,10 @@ fn get_parquet_file_paths(parquet_path: &Path) -> Vec<PathBuf> {
             }
         }
     } else {
-        println!("Invalid path provided: {:?}", parquet_path);
+        return Err(GraphError::InvalidPath(parquet_path.display().to_string()));
    }
 
-    parquet_files
+    Ok(parquet_files)
 }
 
 #[cfg(test)]
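
For illustration only (not part of the patch): with get_parquet_file_paths now returning a Result, a bad parquet_path surfaces as a GraphError in Python instead of a println! on stderr. A quick sketch; the path is deliberately bogus and the column names are placeholders:

    import pytest
    from raphtory import Graph

    g = Graph()
    with pytest.raises(Exception, match="Invalid path"):
        g.load_edges_from_parquet("/no/such/dir", "src", "dst", "time")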