diff --git a/flex/engines/graph_db/runtime/common/context.cc b/flex/engines/graph_db/runtime/common/context.cc index 22dab7b96459..11d6edf5c54b 100644 --- a/flex/engines/graph_db/runtime/common/context.cc +++ b/flex/engines/graph_db/runtime/common/context.cc @@ -161,7 +161,7 @@ const std::shared_ptr Context::get(int alias) const { return head; } assert(static_cast(alias) < columns.size()); - assert(columns[alias] != nullptr); + // return nullptr if the column is not set return columns[alias]; } diff --git a/flex/engines/graph_db/runtime/common/operators/intersect.cc b/flex/engines/graph_db/runtime/common/operators/intersect.cc index e4e7a9c6e9e7..ec98818c5359 100644 --- a/flex/engines/graph_db/runtime/common/operators/intersect.cc +++ b/flex/engines/graph_db/runtime/common/operators/intersect.cc @@ -134,6 +134,7 @@ static Context intersect_impl(std::vector&& ctxs, int key) { return idx_col1.get_value(a) < idx_col1.get_value(b); }); std::vector shuffle_offsets; + std::vector shuffle_offsets_1; size_t idx0 = 0, idx1 = 0; while (idx0 < idx_col0.size() && idx1 < idx_col1.size()) { if (idx_col0.get_value(offsets0[idx0]) < @@ -151,6 +152,7 @@ static Context intersect_impl(std::vector&& ctxs, int key) { auto v1 = vlist1.get_vertex(offsets1[idx1]); if (v0 == v1) { shuffle_offsets.push_back(offsets0[idx0]); + shuffle_offsets_1.push_back(offsets1[idx1]); } else if (v0 < v1) { break; } else { @@ -164,7 +166,13 @@ static Context intersect_impl(std::vector&& ctxs, int key) { } ctxs[0].reshuffle(shuffle_offsets); + ctxs[1].reshuffle(shuffle_offsets_1); ctxs[0].pop_idx_col(); + for (size_t i = 0; i < ctxs[1].col_num(); ++i) { + if (i >= ctxs[0].col_num() || ctxs[0].get(i) == nullptr) { + ctxs[0].set(i, ctxs[1].get(i)); + } + } return ctxs[0]; } } diff --git a/interactive_engine/executor/common/huge_container/.gitignore b/interactive_engine/executor/common/huge_container/.gitignore new file mode 100644 index 000000000000..4fffb2f89cbd --- /dev/null +++ 
b/interactive_engine/executor/common/huge_container/.gitignore @@ -0,0 +1,2 @@ +/target +/Cargo.lock diff --git a/interactive_engine/executor/common/huge_container/Cargo.toml b/interactive_engine/executor/common/huge_container/Cargo.toml new file mode 100644 index 000000000000..3f801755e4ab --- /dev/null +++ b/interactive_engine/executor/common/huge_container/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "huge_container" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +libc = "0.2" +lazy_static = "1.4" diff --git a/interactive_engine/executor/common/huge_container/src/huge_vec.rs b/interactive_engine/executor/common/huge_container/src/huge_vec.rs new file mode 100644 index 000000000000..eaf0b0f45cb3 --- /dev/null +++ b/interactive_engine/executor/common/huge_container/src/huge_vec.rs @@ -0,0 +1,144 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::fmt; +use std::ops; + +pub struct HugeVec { + ptr: *mut T, + cap: usize, + len: usize, +} + +impl HugeVec { + pub fn new() -> Self { + Self { ptr: std::ptr::null_mut(), cap: 0, len: 0 } + } + + pub fn with_capacity(capacity: usize) -> Self { + let cap_in_bytes = capacity * std::mem::size_of::(); + let ptr = crate::hugepage_alloc(cap_in_bytes) as *mut T; + Self { ptr, cap: capacity, len: 0 } + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn capacity(&self) -> usize { + self.cap + } + + pub fn reserve(&mut self, additional: usize) { + let new_cap = self.cap + additional; + let new_cap_in_bytes = new_cap * std::mem::size_of::(); + let new_ptr = crate::hugepage_alloc(new_cap_in_bytes) as *mut T; + + if self.len > 0 { + unsafe { + std::ptr::copy_nonoverlapping(self.ptr, new_ptr, self.len); + } + } + if self.cap > 0 { + crate::hugepage_dealloc(self.ptr as *mut u8, self.cap * std::mem::size_of::()); + } + + self.ptr = new_ptr; + self.cap = new_cap; + } + + pub fn as_ptr(&self) -> *const T { + self.ptr + } + + pub fn as_mut_ptr(&mut self) -> *mut T { + self.ptr + } + + pub fn push(&mut self, value: T) { + if self.len == self.cap { + self.reserve(1); + } + + unsafe { + self.ptr.add(self.len).write(value); + } + + self.len += 1; + } + + pub fn clear(&mut self) { + unsafe { std::ptr::drop_in_place(std::slice::from_raw_parts_mut(self.ptr, self.len)) } + self.len = 0; + } + + pub fn resize(&mut self, new_len: usize, value: T) + where + T: Clone, + { + if new_len > self.len { + if new_len > self.cap { + self.reserve(new_len - self.len); + } + + for i in self.len..new_len { + unsafe { + self.ptr.add(i).write(value.clone()); + } + } + } else { + unsafe { + std::ptr::drop_in_place(std::slice::from_raw_parts_mut( + self.ptr.add(new_len), + self.len - new_len, + )); + } + } + + self.len = new_len; + } +} + +impl Drop for HugeVec { + fn drop(&mut self) { + self.clear(); + if self.cap > 0 { + crate::hugepage_dealloc(self.ptr as *mut u8, self.cap * 
std::mem::size_of::()); + } + } +} + +impl ops::Deref for HugeVec { + type Target = [T]; + + fn deref(&self) -> &Self::Target { + unsafe { std::slice::from_raw_parts(self.ptr, self.len) } + } +} + +impl ops::DerefMut for HugeVec { + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) } + } +} + +impl fmt::Debug for HugeVec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&**self, f) + } +} + +unsafe impl Sync for HugeVec {} +unsafe impl Send for HugeVec {} diff --git a/interactive_engine/executor/common/huge_container/src/lib.rs b/interactive_engine/executor/common/huge_container/src/lib.rs new file mode 100644 index 000000000000..1384642a085b --- /dev/null +++ b/interactive_engine/executor/common/huge_container/src/lib.rs @@ -0,0 +1,57 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +#[cfg(target_os = "linux")] +mod linux_hugepages; +#[cfg(target_os = "linux")] +use linux_hugepages::hugepage_alloc; +#[cfg(target_os = "linux")] +use linux_hugepages::hugepage_dealloc; + +#[cfg(not(target_os = "linux"))] +mod notlinux_hugepages; +#[cfg(not(target_os = "linux"))] +use notlinux_hugepages::hugepage_alloc; +#[cfg(not(target_os = "linux"))] +use notlinux_hugepages::hugepage_dealloc; + +mod huge_vec; + +pub use huge_vec::HugeVec; + +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + + let mut vec = HugeVec::::new(); + vec.push(1); + vec.push(2); + vec.push(3); + + assert_eq!(vec.len(), 3); + assert_eq!(vec[0], 1); + assert_eq!(vec[1], 2); + assert_eq!(vec[2], 3); + } +} diff --git a/interactive_engine/executor/common/huge_container/src/linux_hugepages.rs b/interactive_engine/executor/common/huge_container/src/linux_hugepages.rs new file mode 100644 index 000000000000..230b25b28b28 --- /dev/null +++ b/interactive_engine/executor/common/huge_container/src/linux_hugepages.rs @@ -0,0 +1,75 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use lazy_static::lazy_static; +use std::{ + fs::File, + io::{self, BufRead, BufReader}, +}; + +fn get_hugepage_size() -> io::Result { + let file = File::open("/proc/meminfo")?; + let reader = BufReader::new(file); + + for line in reader.lines() { + let line = line?; + if line.starts_with("Hugepagesize:") { + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + if let Ok(size_kb) = parts[1].parse::() { + match parts[2] { + "kB" => return Ok(size_kb * 1024), + "MB" => return Ok(size_kb * 1024 * 1024), + "GB" => return Ok(size_kb * 1024 * 1024 * 1024), + _ => {} + } + } + } + } + } + + Err(io::Error::new(io::ErrorKind::NotFound, "Hugepagesize info not found")) +} + +lazy_static! { + static ref HUGE_PAGE_SIZE: usize = get_hugepage_size().unwrap(); +} + +fn align_to(size: usize, align: usize) -> usize { + (size + align - 1) & !(align - 1) +} + +pub(crate) fn hugepage_alloc(size: usize) -> *mut u8 { + let len = align_to(size, *HUGE_PAGE_SIZE); + let p = unsafe { + libc::mmap( + std::ptr::null_mut(), + len, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_HUGETLB, + -1, + 0, + ) + }; + p as *mut u8 +} + +pub(crate) fn hugepage_dealloc(ptr: *mut u8, size: usize) { + let len = align_to(size, *HUGE_PAGE_SIZE); + let ret = unsafe { libc::munmap(ptr as *mut libc::c_void, len) }; + if ret != 0 { + panic!("hugepage deallocation failed, {} - {} -> {}", ret, size, len); + } +} diff --git a/interactive_engine/executor/common/huge_container/src/notlinux_hugepages.rs b/interactive_engine/executor/common/huge_container/src/notlinux_hugepages.rs new file mode 100644 index 000000000000..022b4270fbd3 --- /dev/null +++ b/interactive_engine/executor/common/huge_container/src/notlinux_hugepages.rs @@ -0,0 +1,40 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! 
You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +pub(crate) fn hugepage_alloc(size: usize) -> *mut u8 { + let ptr = unsafe { + libc::mmap( + std::ptr::null_mut(), + size, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, + -1, + 0, + ) + }; + + if ptr == libc::MAP_FAILED { + panic!("hugepage allocation failed"); + } + + ptr as *mut u8 +} + +pub(crate) fn hugepage_dealloc(ptr: *mut u8, size: usize) { + let ret = unsafe { libc::munmap(ptr as *mut libc::c_void, size) }; + if ret != 0 { + panic!("hugepage deallocation failed, {}", ret); + } +} diff --git a/interactive_engine/executor/store/Cargo.toml b/interactive_engine/executor/store/Cargo.toml index 12fcffc513dd..4c6b3d89b514 100644 --- a/interactive_engine/executor/store/Cargo.toml +++ b/interactive_engine/executor/store/Cargo.toml @@ -4,7 +4,8 @@ members = [ "mcsr", "global_query", "groot", - "exp_store" + "exp_store", + "bmcsr" ] [profile.release] diff --git a/interactive_engine/executor/store/bmcsr/Cargo.toml b/interactive_engine/executor/store/bmcsr/Cargo.toml new file mode 100644 index 000000000000..14123b9b23ae --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "bmcsr" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_derive = "1.0" +serde_json = "1.0" +pegasus_common = { path = "../../engine/pegasus/common"} +dyn_type = { path = "../../common/dyn_type" } +huge_container = { path = 
"../../common/huge_container" } +log = "0.4" +bincode = "1.0.1" +itertools = "0.9" +csv = "1.1" +abomonation = "0.7.3" +abomonation_derive = "0.5" +env_logger = "0.7.1" +chrono = "0.4.23" +fnv = "1.0.3" +regex = "1.7.1" +rust-htslib = { version = "0.39.5", default-features = false, features = ["bzip2", "lzma"] } +clap = "2.32.0" +byteorder = "1.5.0" +glob = "0.3" +rayon = "1.5.1" +dashmap = "5.5.3" + +[features] +hugepage_csr = [] +hugepage_table = [] \ No newline at end of file diff --git a/interactive_engine/executor/store/bmcsr/README.md b/interactive_engine/executor/store/bmcsr/README.md new file mode 100644 index 000000000000..f2e2796903d1 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/README.md @@ -0,0 +1,34 @@ +# Mutable CSR Store +## Prepare schema for data loading +When loading the graph into storage, two schema files must be provided: the input schema and the graph schema. The input schema specifies the path to the input file, while the graph schema defines the structure of the graph. +### Input schema +The input schema contains the following information for graph loading: +- Mappings from vertex label to file path of raw data +- Mappings from vertex label to column info of raw data +- Mappings from a tuple that includes the labels of the source vertex, edge, and target vertex to file path of raw data +- Mappings from a tuple that includes the labels of the source vertex, edge, and target vertex to column info of raw data + +The schema file is formatted as JSON. We have provided a sample schema file for the modern graph in `data/modern_input.json`. + +### Graph schema +The graph schema contains the following information for graph storage: +- Mapping from vertex label to label id. +- Mapping from edge label to a 3-tuple, which contains edge label id, source vertex label id, and target vertex label id. +- The properties (name and datatype) of each type of vertex/edge. +The schema file is formatted as JSON. 
We have provided a sample schema file for the modern graph in `data/modern_schema.json`. + +## Build Binary Data +```bash +INPUT_PATH=$1 +OUTPUT_PATH=$2 +INPUT_SCHEMA_PATH=$3 +GRAPH_SCHEMA_PATH=$4 +PARTITION_NUM=$5 +PARTITION_ID=$6 + +#USAGE: +# build_bmcsr_partition <raw_data_dir> <graph_data_dir> <input_schema_file> <graph_schema_file> -i <index> -p <partition> [--skip_header] +cmd="./target/release/build_bmcsr_partition $INPUT_PATH $OUTPUT_PATH $INPUT_SCHEMA_PATH $GRAPH_SCHEMA_PATH -p $PARTITION_NUM -i $PARTITION_ID" +echo $cmd +eval $cmd +``` \ No newline at end of file diff --git a/interactive_engine/executor/store/bmcsr/data/modern_graph/created.csv b/interactive_engine/executor/store/bmcsr/data/modern_graph/created.csv new file mode 100644 index 000000000000..162928a61317 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/data/modern_graph/created.csv @@ -0,0 +1,5 @@ +src_id|dst_id|weight +1|3|0.4 +4|5|1.0 +4|3|0.4 +6|3|0.2 diff --git a/interactive_engine/executor/store/bmcsr/data/modern_graph/knows.csv b/interactive_engine/executor/store/bmcsr/data/modern_graph/knows.csv new file mode 100644 index 000000000000..0987e08bb369 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/data/modern_graph/knows.csv @@ -0,0 +1,3 @@ +src_id|dst_id|weight +1|2|0.5 +1|4|1.0 diff --git a/interactive_engine/executor/store/bmcsr/data/modern_graph/person.csv b/interactive_engine/executor/store/bmcsr/data/modern_graph/person.csv new file mode 100644 index 000000000000..1ec20cd59a3e --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/data/modern_graph/person.csv @@ -0,0 +1,5 @@ +id|name|age +2|vadas|27 +6|peter|35 +4|josh|32 +1|marko|29 diff --git a/interactive_engine/executor/store/bmcsr/data/modern_graph/software.csv b/interactive_engine/executor/store/bmcsr/data/modern_graph/software.csv new file mode 100644 index 000000000000..79c6d8f92ed3 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/data/modern_graph/software.csv @@ -0,0 +1,3 @@ +id|name|lang +3|lop|java +5|ripple|java diff --git 
a/interactive_engine/executor/store/bmcsr/data/modern_input.json b/interactive_engine/executor/store/bmcsr/data/modern_input.json new file mode 100644 index 000000000000..fe263a1c3107 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/data/modern_input.json @@ -0,0 +1,90 @@ +{ + "vertex": [ + { + "label": "person", + "columns": [ + { + "name": "id", + "data_type": "ID" + }, + { + "name": "name", + "data_type": "String" + }, + { + "name": "age", + "data_type": "Int32" + } + ], + "files":[ + "person.csv" + ] + }, + { + "label": "software", + "columns": [ + { + "name": "id", + "data_type": "ID" + }, + { + "name": "name", + "data_type": "String" + }, + { + "name": "lang", + "data_type": "String" + } + ], + "files":[ + "software.csv" + ] + } + ], + "edge": [ + { + "src_label": "person", + "dst_label": "person", + "label": "knows", + "columns": [ + { + "name": "start_id", + "data_type": "ID" + }, + { + "name": "end_id", + "data_type": "ID" + }, + { + "name": "weight", + "data_type": "Double" + } + ], + "files": [ + "knows.csv" + ] + }, + { + "src_label": "person", + "dst_label": "software", + "label": "created", + "columns": [ + { + "name": "start_id", + "data_type": "ID" + }, + { + "name": "end_id", + "data_type": "ID" + }, + { + "name": "weight", + "data_type": "Double" + } + ], + "files": [ + "created.csv" + ] + } + ] +} \ No newline at end of file diff --git a/interactive_engine/executor/store/bmcsr/data/modern_schema.json b/interactive_engine/executor/store/bmcsr/data/modern_schema.json new file mode 100644 index 000000000000..847179ddc85b --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/data/modern_schema.json @@ -0,0 +1,56 @@ +{ + "vertex": [ + { + "label": "person", + "partition_type": "Dynamic", + "properties": [ + { + "name": "name", + "data_type": "String" + }, + { + "name": "age", + "data_type": "Int32" + } + ] + }, + { + "label": "software", + "partition_type": "Dynamic", + "properties": [ + { + "name": "name", + "data_type": "String" + 
}, + { + "name": "lang", + "data_type": "String" + } + ] + } + ], + "edge": [ + { + "src_label": "person", + "dst_label": "person", + "label": "knows", + "properties": [ + { + "name": "weight", + "data_type": "Double" + } + ] + }, + { + "src_label": "person", + "dst_label": "software", + "label": "created", + "properties": [ + { + "name": "weight", + "data_type": "Double" + } + ] + } + ] +} \ No newline at end of file diff --git a/interactive_engine/executor/store/bmcsr/src/bin/build_bmcsr_partition.rs b/interactive_engine/executor/store/bmcsr/src/bin/build_bmcsr_partition.rs new file mode 100644 index 000000000000..98335d52cea9 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/bin/build_bmcsr_partition.rs @@ -0,0 +1,154 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::path::PathBuf; + +use bmcsr::graph_loader::GraphLoader; +use bmcsr::schema::CsrGraphSchema; +use bmcsr::types::*; +use clap::{App, Arg}; +use env_logger; + +fn main() { + env_logger::init(); + let matches = App::new(NAME) + .version(VERSION) + .about("Build graph storage on single machine.") + .args(&[ + Arg::with_name("raw_data_dir") + .short("r") + .long_help("The directory to the raw data") + .required(true) + .takes_value(true) + .index(1), + Arg::with_name("graph_data_dir") + .short("g") + .long_help("The directory to graph store") + .required(true) + .takes_value(true) + .index(2), + Arg::with_name("input_schema_file") + .long_help("The input schema file") + .required(true) + .takes_value(true) + .index(3), + Arg::with_name("graph_schema_file") + .long_help("The graph schema file") + .required(true) + .takes_value(true) + .index(4), + Arg::with_name("partition") + .short("p") + .long_help("The number of partitions") + .takes_value(true), + Arg::with_name("index") + .short("i") + .long_help("The index of partitions") + .takes_value(true), + Arg::with_name("delimiter") + .short("t") + .long_help( + "The delimiter of the raw data [comma|semicolon|pipe]. 
pipe (|) is the default option", + ) + .takes_value(true), + Arg::with_name("skip_header") + .long("skip_header") + .long_help("Whether skip the first line in input file") + .takes_value(false), + ]) + .get_matches(); + + let raw_data_dir = matches + .value_of("raw_data_dir") + .unwrap() + .to_string(); + let graph_data_dir = matches + .value_of("graph_data_dir") + .unwrap() + .to_string(); + let input_schema_file = matches + .value_of("input_schema_file") + .unwrap() + .to_string(); + let graph_schema_file = matches + .value_of("graph_schema_file") + .unwrap() + .to_string(); + let partition_num = matches + .value_of("partition") + .unwrap_or("1") + .parse::() + .expect(&format!("Specify invalid partition number")); + let partition_index = matches + .value_of("index") + .unwrap_or("0") + .parse::() + .expect(&format!("Specify invalid partition number")); + + let delimiter_str = matches + .value_of("delimiter") + .unwrap_or("pipe") + .to_uppercase(); + + let skip_header = matches.is_present("skip_header"); + + let delimiter = if delimiter_str.as_str() == "COMMA" { + b',' + } else if delimiter_str.as_str() == "SEMICOLON" { + b';' + } else { + b'|' + }; + + let out_dir = PathBuf::from(format!("{}/{}", graph_data_dir, DIR_GRAPH_SCHEMA)); + if !out_dir.exists() { + std::fs::create_dir_all(&out_dir).expect("Create graph schema directory error"); + } + let graph_schema = + CsrGraphSchema::from_json_file(&graph_schema_file).expect("Read graph schema error!"); + graph_schema + .to_json_file(&out_dir.join(FILE_SCHEMA)) + .expect("Write graph schema error!"); + + let mut handles = Vec::with_capacity(partition_num); + let raw_dir = raw_data_dir.clone(); + let graph_schema_f = graph_schema_file.clone(); + let input_schema_f = input_schema_file.clone(); + + let cur_out_dir = graph_data_dir.clone(); + + let handle = std::thread::spawn(move || { + let mut loader: GraphLoader = GraphLoader::::new( + raw_dir, + cur_out_dir, + input_schema_f, + graph_schema_f, + partition_index, + 
partition_num, + ); + loader = loader.with_delimiter(delimiter); + if skip_header { + loader.skip_header(); + } + + loader.load().expect("Load error"); + }); + + handles.push(handle); + + for handle in handles { + handle.join().unwrap(); + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/bin/traverse.rs b/interactive_engine/executor/store/bmcsr/src/bin/traverse.rs new file mode 100644 index 000000000000..c133ec04a285 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/bin/traverse.rs @@ -0,0 +1,255 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::collections::HashMap; +use std::fs::File; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::str::FromStr; + +use bmcsr::columns::DataType; +use bmcsr::graph_db::GraphDB; +use bmcsr::ldbc_parser::LDBCVertexParser; +use bmcsr::schema::Schema; +use bmcsr::types::{DefaultId, LabelId, DIR_BINARY_DATA, NAME, VERSION}; +use clap::{App, Arg}; + +fn get_partition_num(graph_data_dir: &String) -> usize { + let root_dir = PathBuf::from_str(graph_data_dir.as_str()).unwrap(); + let partitions_dir = root_dir.join(DIR_BINARY_DATA); + let mut index = 0_usize; + loop { + let partition_dir = partitions_dir.join(format!("partition_{}", index)); + let b = Path::new(partition_dir.to_str().unwrap()).is_dir(); + if b { + index += 1; + } else { + return index; + } + } +} + +fn output_vertices(graph: &GraphDB, output_dir: &String, files: &mut HashMap) { + let vertex_label_names = graph.graph_schema.vertex_label_names(); + let output_dir_path = PathBuf::from_str(output_dir.as_str()).unwrap(); + for n in vertex_label_names.iter() { + if let Some(v_label) = graph + .graph_schema + .get_vertex_label_id(n.as_str()) + { + println!("outputing vertex-{}, size {}", n, graph.get_vertices_num(v_label)); + let header = graph + .graph_schema + .get_vertex_header(v_label) + .unwrap(); + if !files.contains_key(&v_label) { + let file = File::create(output_dir_path.join(n.as_str())).unwrap(); + files.insert(v_label.clone(), file); + } + let file = files.get_mut(&v_label).unwrap(); + + let v_labels = vec![v_label]; + for v in graph.get_all_vertices(Some(&v_labels)) { + let id = LDBCVertexParser::::get_original_id(v.get_id()); + write!(file, "\"{}\"", id.to_string()).unwrap(); + for c in header { + if c.1 != DataType::ID { + write!( + file, + "|\"{}\"", + v.get_property(c.0.as_str()) + .unwrap() + .to_string() + ) + .unwrap(); + } + } + writeln!(file).unwrap(); + } + } + } +} + +fn output_edges( + graph: &GraphDB, output_dir: &String, files: &mut HashMap<(LabelId, LabelId, LabelId), 
File>, +) { + let output_dir_path = PathBuf::from_str(output_dir.as_str()).unwrap(); + let vertex_label_num = graph.vertex_label_num; + let edge_label_num = graph.edge_label_num; + for src_label in 0..vertex_label_num { + for edge_label in 0..edge_label_num { + for dst_label in 0..vertex_label_num { + if let Some(header) = graph.graph_schema.get_edge_header( + src_label as LabelId, + edge_label as LabelId, + dst_label as LabelId, + ) { + println!("{}_{}_{}", src_label, edge_label, dst_label); + let src_label_name = + graph.graph_schema.vertex_label_names()[src_label as usize].clone(); + let dst_label_name = + graph.graph_schema.vertex_label_names()[dst_label as usize].clone(); + let edge_label_name = + graph.graph_schema.edge_label_names()[edge_label as usize].clone(); + let filename = src_label_name.clone() + + "_" + + &*edge_label_name.clone() + + "_" + + &*dst_label_name.clone(); + let mut file = File::create(output_dir_path.join(filename.as_str())).unwrap(); + if !graph.graph_schema.is_single_oe( + src_label as LabelId, + edge_label as LabelId, + dst_label as LabelId, + ) { + let subgraph = graph.get_sub_graph( + src_label as LabelId, + edge_label as LabelId, + dst_label as LabelId, + bmcsr::graph::Direction::Outgoing, + ); + for vertex_id in 0..subgraph.get_vertex_num() { + let src_global_id = graph + .get_global_id(vertex_id, src_label as LabelId) + .unwrap(); + let src_oid = + LDBCVertexParser::::get_original_id(src_global_id as usize) as u64; + if let Some(edges) = subgraph.get_adj_list_with_offset(vertex_id) { + for (nbr, offset) in edges { + let dst_global_id = graph + .get_global_id(nbr, dst_label as LabelId) + .unwrap(); + let dst_oid = + LDBCVertexParser::::get_original_id(dst_global_id as usize) + as u64; + write!(file, "\"{}\"|\"{}\"", src_oid, dst_oid).unwrap(); + if let Some(properties) = subgraph.get_properties() { + for c in header { + if c.1 != DataType::ID { + write!( + file, + "|\"{}\"", + properties + .get_column_by_name(c.0.as_str()) + 
.get(offset) + .unwrap() + .to_string() + ) + .unwrap(); + } + } + } + writeln!(file).unwrap(); + } + } + } + } else { + let subgraph = graph.get_single_sub_graph( + src_label as LabelId, + edge_label as LabelId, + dst_label as LabelId, + bmcsr::graph::Direction::Outgoing, + ); + for vertex_id in 0..subgraph.get_vertex_num() { + let src_global_id = graph + .get_global_id(vertex_id, src_label as LabelId) + .unwrap(); + let src_oid = + LDBCVertexParser::::get_original_id(src_global_id as usize) as u64; + if let Some(edges) = subgraph.get_adj_list_with_offset(vertex_id) { + for (nbr, offset) in edges { + let dst_global_id = graph + .get_global_id(nbr, dst_label as LabelId) + .unwrap(); + let dst_oid = + LDBCVertexParser::::get_original_id(dst_global_id as usize) + as u64; + write!(file, "\"{}\"|\"{}\"", src_oid, dst_oid).unwrap(); + if let Some(properties) = subgraph.get_properties() { + for c in header { + if c.1 != DataType::ID { + write!( + file, + "|\"{}\"", + properties + .get_column_by_name(c.0.as_str()) + .get(offset) + .unwrap() + .to_string() + ) + .unwrap(); + } + } + } + writeln!(file).unwrap(); + } + } + } + } + } + } + } + } +} + +fn traverse_partition( + graph_data_dir: &String, output_dir: &String, partition: usize, v_files: &mut HashMap, + e_files: &mut HashMap<(LabelId, LabelId, LabelId), File>, +) { + let graph = GraphDB::deserialize(graph_data_dir.as_str(), partition, None).unwrap(); + + // output_vertices(&graph, output_dir, v_files); + println!("start output edges"); + output_edges(&graph, output_dir, e_files); +} + +fn main() { + env_logger::init(); + let matches = App::new(NAME) + .version(VERSION) + .about("Build graph storage on single machine.") + .args(&[ + Arg::with_name("graph_data_dir") + .short("g") + .long_help("The directory to graph store") + .required(true) + .takes_value(true) + .index(1), + Arg::with_name("output_dir") + .short("o") + .long_help("The directory to place output files") + .required(true) + .takes_value(true) + 
.index(2), + ]) + .get_matches(); + + let graph_data_dir = matches + .value_of("graph_data_dir") + .unwrap() + .to_string(); + let output_dir = matches + .value_of("output_dir") + .unwrap() + .to_string(); + + let partition_num = get_partition_num(&graph_data_dir); + + let mut v_files = HashMap::::new(); + let mut e_files = HashMap::<(LabelId, LabelId, LabelId), File>::new(); + for i in 0..partition_num { + traverse_partition(&graph_data_dir, &output_dir, i, &mut v_files, &mut e_files); + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/bmcsr.rs b/interactive_engine/executor/store/bmcsr/src/bmcsr.rs new file mode 100644 index 000000000000..9e284aa5a74d --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/bmcsr.rs @@ -0,0 +1,792 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::any::Any; +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +#[cfg(feature = "hugepage_csr")] +use huge_container::HugeVec; + +use crate::col_table::ColTable; +use crate::csr::{CsrBuildError, CsrTrait, NbrIter, NbrIterBeta, NbrOffsetIter, SafeMutPtr, SafePtr}; +use crate::graph::IndexType; + +#[cfg(feature = "hugepage_csr")] +type ArrayType = HugeVec; + +#[cfg(not(feature = "hugepage_csr"))] +type ArrayType = Vec; + +pub struct BatchMutableCsr { + pub neighbors: ArrayType, + pub offsets: ArrayType, + pub degree: ArrayType, + + edge_num: usize, +} + +pub struct BatchMutableCsrBuilder { + neighbors: ArrayType, + offsets: ArrayType, + insert_offsets: ArrayType, + + edge_num: usize, +} + +impl BatchMutableCsrBuilder { + pub fn new() -> Self { + BatchMutableCsrBuilder { + neighbors: ArrayType::new(), + offsets: ArrayType::new(), + insert_offsets: ArrayType::new(), + edge_num: 0, + } + } + + pub fn init(&mut self, degree: &Vec, _: f64) { + let vertex_num = degree.len(); + let mut edge_num = 0_usize; + for i in 0..vertex_num { + edge_num += degree[i] as usize; + } + self.edge_num = 0; + + self.neighbors.resize(edge_num, I::new(0)); + self.offsets.resize(vertex_num, 0); + self.insert_offsets.resize(vertex_num, 0); + + let mut offset = 0_usize; + for i in 0..vertex_num { + self.insert_offsets[i] = 0; + self.offsets[i] = offset; + offset += degree[i] as usize; + } + } + + pub fn put_edge(&mut self, src: I, dst: I) -> Result { + let offset = self.offsets[src.index()] + self.insert_offsets[src.index()] as usize; + self.neighbors[offset] = dst; + self.insert_offsets[src.index()] += 1; + self.edge_num += 1; + Ok(offset) + } + + pub fn finish(self) -> Result, CsrBuildError> { + Ok(BatchMutableCsr { + neighbors: self.neighbors, + offsets: self.offsets, + degree: self.insert_offsets, + edge_num:
self.edge_num, + }) + } +} + +impl BatchMutableCsr { + pub fn new() -> Self { + BatchMutableCsr { + neighbors: ArrayType::new(), + offsets: ArrayType::new(), + degree: ArrayType::new(), + edge_num: 0, + } + } +} + +unsafe impl Send for BatchMutableCsr {} +unsafe impl Sync for BatchMutableCsr {} + +impl CsrTrait for BatchMutableCsr { + fn vertex_num(&self) -> I { + I::new(self.offsets.len()) + } + + fn edge_num(&self) -> usize { + self.edge_num + } + + fn max_edge_offset(&self) -> usize { + self.neighbors.len() + } + + fn degree(&self, u: I) -> usize { + let u = u.index(); + if u >= self.degree.len() { + 0 + } else { + self.degree[u] as usize + } + } + + fn serialize(&self, path: &String) { + let file = File::create(path).unwrap(); + let mut writer = BufWriter::new(file); + info!("edge_num = {}", self.edge_num); + writer + .write_u64::(self.edge_num as u64) + .unwrap(); + + info!("neighbor_size = {}", self.neighbors.len()); + writer + .write_u64::(self.neighbors.len() as u64) + .unwrap(); + for i in 0..self.neighbors.len() { + self.neighbors[i].write(&mut writer).unwrap(); + } + info!("offset_size = {}", self.offsets.len()); + writer + .write_u64::(self.offsets.len() as u64) + .unwrap(); + for i in 0..self.offsets.len() { + writer + .write_u64::(self.offsets[i] as u64) + .unwrap(); + } + info!("degree_size = {}", self.degree.len()); + writer + .write_u64::(self.degree.len() as u64) + .unwrap(); + for i in 0..self.degree.len() { + writer + .write_i32::(self.degree[i]) + .unwrap(); + } + writer.flush().unwrap(); + } + + fn deserialize(&mut self, path: &String) { + let file = File::open(path).unwrap(); + let mut reader = BufReader::new(file); + + self.edge_num = reader.read_u64::().unwrap() as usize; + info!("edge_num = {}", self.edge_num); + + let neighbor_size = reader.read_u64::().unwrap() as usize; + info!("neighbor_size = {}", neighbor_size); + self.neighbors = ArrayType::with_capacity(neighbor_size); + for _ in 0..neighbor_size { + self.neighbors +
.push(I::read(&mut reader).unwrap()); + } + + let offset_size = reader.read_u64::().unwrap() as usize; + info!("offset_size = {}", offset_size); + self.offsets = ArrayType::with_capacity(offset_size); + for _ in 0..offset_size { + self.offsets + .push(reader.read_u64::().unwrap() as usize); + } + + let degree_size = reader.read_u64::().unwrap() as usize; + info!("degree_size = {}", degree_size); + let degree_capacity = degree_size + degree_size / 2; + self.degree = ArrayType::with_capacity(degree_capacity); + for _ in 0..degree_size { + self.degree + .push(reader.read_i32::().unwrap()); + } + } + + fn get_edges(&self, u: I) -> Option> { + let u = u.index(); + if u >= self.offsets.len() { + None + } else { + let start = self.offsets[u]; + let end = self.offsets[u] + self.degree[u] as usize; + Some(NbrIter::new(&self.neighbors, start, end)) + } + } + + fn get_edges_beta(&self, u: I) -> NbrIterBeta { + let u = u.index(); + if u >= self.offsets.len() { + NbrIterBeta::new(self.neighbors.as_ptr(), self.neighbors.as_ptr()) + } else { + let start = self.offsets[u]; + let end = self.offsets[u] + self.degree[u] as usize; + let start = unsafe { self.neighbors.as_ptr().add(start) }; + let end = unsafe { self.neighbors.as_ptr().add(end) }; + NbrIterBeta::new(start, end) + } + } + + fn get_edges_with_offset(&self, u: I) -> Option> { + let u = u.index(); + if u >= self.offsets.len() { + None + } else { + let start = self.offsets[u]; + let end = self.offsets[u] + self.degree[u] as usize; + Some(NbrOffsetIter::new(&self.neighbors, start, end)) + } + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn delete_vertices(&mut self, vertices: &HashSet) { + for vertex in vertices { + let vertex = vertex.index(); + if vertex >= self.degree.len() { + continue; + } + self.edge_num -= self.degree[vertex] as usize; + self.degree[vertex] = 0; + } + } + + fn parallel_delete_edges(&mut self, edges: &Vec<(I, I)>, reverse: bool, p: u32) {
+ let mut delete_map: HashMap> = HashMap::new(); + let mut keys = vec![]; + if reverse { + for (src, dst) in edges.iter() { + if let Some(set) = delete_map.get_mut(&dst) { + set.insert(*src); + } else { + let mut set = HashSet::new(); + set.insert(*src); + delete_map.insert(*dst, set); + keys.push(*dst); + } + } + } else { + for (src, dst) in edges.iter() { + if let Some(set) = delete_map.get_mut(&src) { + set.insert(*dst); + } else { + let mut set = HashSet::new(); + set.insert(*dst); + delete_map.insert(*src, set); + keys.push(*src); + } + } + } + keys.sort(); + + let safe_offsets_ptr = SafePtr::new(&self.offsets); + let safe_degree_ptr = SafeMutPtr::new(&mut self.degree); + let safe_neighbors_ptr = SafeMutPtr::new(&mut self.neighbors); + let safe_keys_ptr = SafePtr::new(&keys); + + let keys_size = keys.len(); + let num_threads = p as usize; + let chunk_size = (keys_size + num_threads - 1) / num_threads; + + let mut thread_deleted_edges = vec![0_usize; num_threads]; + + let safe_delete_map_ptr = SafePtr::new(&delete_map); + let safe_tde_ptr = SafeMutPtr::new(&mut thread_deleted_edges); + + rayon::scope(|s| { + for i in 0..num_threads { + let start_idx = i * chunk_size; + let end_idx = keys_size.min(start_idx + chunk_size); + s.spawn(move |_| { + let keys_ref = safe_keys_ptr.get_ref(); + let offsets_ref = safe_offsets_ptr.get_ref(); + let degree_ref = safe_degree_ptr.get_mut(); + let neighbors_ref = safe_neighbors_ptr.get_mut(); + let tde_ref = safe_tde_ptr.get_mut(); + let delete_map_ref = safe_delete_map_ptr.get_ref(); + let mut deleted_edges = 0; + for v_index in start_idx..end_idx { + let v = keys_ref[v_index]; + let mut offset = offsets_ref[v.index()]; + let deg = degree_ref[v.index()]; + + let set = delete_map_ref.get(&v).unwrap(); + let mut end = offset + deg as usize; + while offset + 1 < end { // same test as `offset < end - 1`, but cannot underflow when deg == 0 + let nbr = neighbors_ref[offset]; + if set.contains(&nbr) { + neighbors_ref[offset] = neighbors_ref[end - 1]; + end -= 1; + } else { + offset += 1; + } + } + let
last_hit = end > offset && set.contains(&neighbors_ref[end - 1]); // an empty range (deg == 0) has no last slot to test + if last_hit { + end -= 1; + } + + let new_deg = (end - offsets_ref[v.index()]) as i32; + degree_ref[v.index()] = new_deg; + + deleted_edges += (deg - new_deg) as usize; + } + + tde_ref[i] = deleted_edges; + }); + } + }); + + for v in thread_deleted_edges.iter() { + self.edge_num -= *v; + } + } + + fn parallel_delete_edges_with_props( + &mut self, edges: &Vec<(I, I)>, reverse: bool, table: &mut ColTable, p: u32, + ) { + let mut delete_map: HashMap> = HashMap::new(); + let mut keys = vec![]; + if reverse { + for (src, dst) in edges.iter() { + if let Some(set) = delete_map.get_mut(&dst) { + set.insert(*src); + } else { + let mut set = HashSet::new(); + set.insert(*src); + delete_map.insert(*dst, set); + keys.push(*dst); + } + } + } else { + for (src, dst) in edges.iter() { + if let Some(set) = delete_map.get_mut(&src) { + set.insert(*dst); + } else { + let mut set = HashSet::new(); + set.insert(*dst); + delete_map.insert(*src, set); + keys.push(*src); + } + } + } + keys.sort(); + + let safe_offsets_ptr = SafePtr::new(&self.offsets); + let safe_degree_ptr = SafeMutPtr::new(&mut self.degree); + let safe_neighbors_ptr = SafeMutPtr::new(&mut self.neighbors); + let safe_keys_ptr = SafePtr::new(&keys); + let safe_table_ptr = SafeMutPtr::new(table); + + let keys_size = keys.len(); + let num_threads = p as usize; + let chunk_size = (keys_size + num_threads - 1) / num_threads; + + let mut thread_deleted_edges = vec![0_usize; num_threads]; + + let safe_delete_map_ptr = SafePtr::new(&delete_map); + let safe_tde_ptr = SafeMutPtr::new(&mut thread_deleted_edges); + + rayon::scope(|s| { + for i in 0..num_threads { + let start_idx = i * chunk_size; + let end_idx = keys_size.min(start_idx + chunk_size); + s.spawn(move |_| { + let keys_ref = safe_keys_ptr.get_ref(); + let offsets_ref = safe_offsets_ptr.get_ref(); + let degree_ref = safe_degree_ptr.get_mut(); + let neighbors_ref = safe_neighbors_ptr.get_mut(); + let table_ref =
safe_table_ptr.get_mut(); + let tde_ref = safe_tde_ptr.get_mut(); + let delete_map_ref = safe_delete_map_ptr.get_ref(); + let mut deleted_edges = 0; + for v_index in start_idx..end_idx { + let v = keys_ref[v_index]; + let mut offset = offsets_ref[v.index()]; + let deg = degree_ref[v.index()]; + + let set = delete_map_ref.get(&v).unwrap(); + let mut end = offset + deg as usize; + while offset + 1 < end { // same test as `offset < end - 1`, but cannot underflow when deg == 0 + let nbr = neighbors_ref[offset]; + if set.contains(&nbr) { + neighbors_ref[offset] = neighbors_ref[end - 1]; + table_ref.move_row(end - 1, offset); + end -= 1; + } else { + offset += 1; + } + } + let last_hit = end > offset && set.contains(&neighbors_ref[end - 1]); // an empty range (deg == 0) has no last slot to test + if last_hit { + end -= 1; + } + + let new_deg = (end - offsets_ref[v.index()]) as i32; + degree_ref[v.index()] = new_deg; + + deleted_edges += (deg - new_deg) as usize; + } + + tde_ref[i] = deleted_edges; + }); + } + }); + + for v in thread_deleted_edges.iter() { + self.edge_num -= *v; + } + } + + fn insert_edges(&mut self, vertex_num: usize, edges: &Vec<(I, I)>, reverse: bool, p: u32) { + let mut new_degree = vec![0; vertex_num]; + + if reverse { + for e in edges.iter() { + new_degree[e.1.index()] += 1; + } + } else { + for e in edges.iter() { + new_degree[e.0.index()] += 1; + } + } + + let num_threads = p as usize; + + let old_vertex_num = self.offsets.len(); + // let chunk_size = ((vertex_num + num_threads - 1) / num_threads).min(32768); // + // let chunk_size = ((vertex_num + num_threads - 1) / num_threads); // inf + let chunk_size = ((((vertex_num + num_threads - 1) / num_threads) + 3) / 4).max(1); // at least 1 so chunk_num is well defined when vertex_num == 0 + let chunk_num = (vertex_num + chunk_size - 1) / chunk_size; + + let mut chunk_offset = vec![0_usize; chunk_num]; + let safe_chunk_offset_ptr = SafeMutPtr::new(&mut chunk_offset); + let mut new_offsets = ArrayType::with_capacity(vertex_num); + new_offsets.resize(vertex_num, 0); + let safe_new_offsets_ptr = SafeMutPtr::new(&mut new_offsets); + let safe_new_degree_ptr = SafeMutPtr::new(&mut new_degree); + let safe_degree_ptr =
SafePtr::new(&self.degree); + + let chunk_i = AtomicUsize::new(0); + + rayon::scope(|s| { + for _ in 0..num_threads { + s.spawn(|_| { + let chunk_i_ref = &chunk_i; + let new_offsets_ref = safe_new_offsets_ptr.get_mut(); + let chunk_offset_ref = safe_chunk_offset_ptr.get_mut(); + let degree_ref = safe_degree_ptr.get_ref(); + let new_degree_ref = safe_new_degree_ptr.get_mut(); + loop { + let cur_chunk = chunk_i_ref.fetch_add(1, Ordering::Relaxed); + if cur_chunk >= chunk_num { + break; + } + let mut local_offset = 0_usize; + + let start_idx = cur_chunk * chunk_size; + let end_idx = vertex_num.min(start_idx + chunk_size); + + if end_idx > old_vertex_num { + if start_idx >= old_vertex_num { + for v in start_idx..end_idx { + new_offsets_ref[v] = local_offset; + local_offset += (new_degree[v]) as usize; + } + } else { + for v in start_idx..old_vertex_num { + new_offsets_ref[v] = local_offset; + local_offset += (degree_ref[v] + new_degree_ref[v]) as usize; + } + for v in old_vertex_num..end_idx { + new_offsets_ref[v] = local_offset; + local_offset += (new_degree[v]) as usize; + } + } + } else { + for v in start_idx..end_idx { + new_offsets_ref[v] = local_offset; + local_offset += (degree_ref[v] + new_degree_ref[v]) as usize; + } + } + chunk_offset_ref[cur_chunk] = local_offset; + } + }); + } + }); + let mut cur_offset = 0_usize; + for i in 0..chunk_num { + let tmp = chunk_offset[i] + cur_offset; + chunk_offset[i] = cur_offset; + cur_offset = tmp; + } + + let mut new_neighbors = ArrayType::with_capacity(cur_offset); + new_neighbors.resize(cur_offset, I::new(0)); + + let safe_new_neighbors_ptr = SafeMutPtr::new(&mut new_neighbors); + let safe_neighbors_ptr = SafePtr::new(&self.neighbors); + let safe_offsets_ptr = SafePtr::new(&self.offsets); + + let chunk_i = AtomicUsize::new(0); + + rayon::scope(|s| { + for _ in 0..num_threads { + s.spawn(|_| { + let chunk_i_ref = &chunk_i; + let neighbors_ref = safe_neighbors_ptr.get_ref(); + let new_neighbors_ref =
safe_new_neighbors_ptr.get_mut(); + let new_offsets_ref = safe_new_offsets_ptr.get_mut(); + let chunk_offset_ref = safe_chunk_offset_ptr.get_mut(); + let offsets_ref = safe_offsets_ptr.get_ref(); + let degree_ref = safe_degree_ptr.get_ref(); + + loop { + let cur_chunk = chunk_i_ref.fetch_add(1, Ordering::Relaxed); + if cur_chunk >= chunk_num { + break; + } + + let local_offset = chunk_offset_ref[cur_chunk]; + let start_idx = cur_chunk * chunk_size; + let end_idx = vertex_num.min(start_idx + chunk_size); + + if end_idx > old_vertex_num { + if start_idx >= old_vertex_num { + for v in start_idx..end_idx { + new_offsets_ref[v] += local_offset; + } + } else { + for v in start_idx..old_vertex_num { + let offset = new_offsets_ref[v] + local_offset; + new_offsets_ref[v] = offset; + let old_offset = offsets_ref[v]; + let deg = degree_ref[v] as usize; + new_neighbors_ref[offset..offset + deg] + .copy_from_slice(&neighbors_ref[old_offset..old_offset + deg]); + } + for v in old_vertex_num..end_idx { + new_offsets_ref[v] += local_offset; + } + } + } else { + for v in start_idx..end_idx { + let offset = new_offsets_ref[v] + local_offset; + new_offsets_ref[v] = offset; + let old_offset = offsets_ref[v]; + let deg = degree_ref[v] as usize; + new_neighbors_ref[offset..offset + deg] + .copy_from_slice(&neighbors_ref[old_offset..old_offset + deg]); + } + } + } + }); + } + }); + + self.degree.resize(vertex_num, 0); + let new_degree = &mut self.degree; + if reverse { + for (src, dst) in edges.iter() { + let offset = new_offsets[dst.index()] + new_degree[dst.index()] as usize; + new_degree[dst.index()] += 1; + new_neighbors[offset] = *src; + } + } else { + for (src, dst) in edges.iter() { + let offset = new_offsets[src.index()] + new_degree[src.index()] as usize; + new_degree[src.index()] += 1; + new_neighbors[offset] = *dst; + } + } + + self.neighbors = new_neighbors; + self.offsets = new_offsets; + self.edge_num = cur_offset; + } + + fn insert_edges_with_prop( + &mut self, vertex_num:
usize, edges: &Vec<(I, I)>, edges_prop: &ColTable, reverse: bool, p: u32, + old_table: ColTable, + ) -> ColTable { + let mut new_degree = vec![0; vertex_num]; + if reverse { + for e in edges.iter() { + new_degree[e.1.index()] += 1; + } + } else { + for e in edges.iter() { + new_degree[e.0.index()] += 1; + } + } + let mut new_table = old_table.new_empty(); + new_table.resize(self.edge_num + edges.len()); + + let num_threads = p as usize; + + let old_vertex_num = self.offsets.len(); + let chunk_size = ((((vertex_num + num_threads - 1) / num_threads) + 3) / 4).max(1); // at least 1 so chunk_num is well defined when vertex_num == 0 + let chunk_num = (vertex_num + chunk_size - 1) / chunk_size; + + let mut chunk_offset = vec![0_usize; chunk_num]; + let safe_chunk_offset_ptr = SafeMutPtr::new(&mut chunk_offset); + let mut new_offsets = ArrayType::with_capacity(vertex_num); + new_offsets.resize(vertex_num, 0); + let safe_new_offsets_ptr = SafeMutPtr::new(&mut new_offsets); + let safe_new_degree_ptr = SafeMutPtr::new(&mut new_degree); + let safe_degree_ptr = SafePtr::new(&self.degree); + + let chunk_i = AtomicUsize::new(0); + + rayon::scope(|s| { + for _ in 0..num_threads { + s.spawn(|_| { + let chunk_i_ref = &chunk_i; + let new_offsets_ref = safe_new_offsets_ptr.get_mut(); + let chunk_offset_ref = safe_chunk_offset_ptr.get_mut(); + let degree_ref = safe_degree_ptr.get_ref(); + let new_degree_ref = safe_new_degree_ptr.get_mut(); + loop { + let cur_chunk = chunk_i_ref.fetch_add(1, Ordering::Relaxed); + if cur_chunk >= chunk_num { + break; + } + let mut local_offset = 0_usize; + + let start_idx = cur_chunk * chunk_size; + let end_idx = vertex_num.min(start_idx + chunk_size); + + if end_idx > old_vertex_num { + if start_idx >= old_vertex_num { + for v in start_idx..end_idx { + new_offsets_ref[v] = local_offset; + local_offset += (new_degree[v]) as usize; + } + } else { + for v in start_idx..old_vertex_num { + new_offsets_ref[v] = local_offset; + local_offset += (degree_ref[v] + new_degree_ref[v]) as usize; + } + for v in old_vertex_num..end_idx { +
new_offsets_ref[v] = local_offset; + local_offset += (new_degree[v]) as usize; + } + } + } else { + for v in start_idx..end_idx { + new_offsets_ref[v] = local_offset; + local_offset += (degree_ref[v] + new_degree_ref[v]) as usize; + } + } + chunk_offset_ref[cur_chunk] = local_offset; + } + }); + } + }); + let mut cur_offset = 0_usize; + for i in 0..chunk_num { + let tmp = chunk_offset[i] + cur_offset; + chunk_offset[i] = cur_offset; + cur_offset = tmp; + } + + let mut new_neighbors = ArrayType::with_capacity(cur_offset); + new_neighbors.resize(cur_offset, I::new(0)); + + let safe_new_neighbors_ptr = SafeMutPtr::new(&mut new_neighbors); + let safe_neighbors_ptr = SafePtr::new(&self.neighbors); + let safe_offsets_ptr = SafePtr::new(&self.offsets); + + let safe_new_table_ptr = SafeMutPtr::new(&mut new_table); + let safe_old_table_ptr = SafePtr::new(&old_table); + + let chunk_i = AtomicUsize::new(0); + + rayon::scope(|s| { + for _ in 0..num_threads { + s.spawn(|_| { + let chunk_i_ref = &chunk_i; + let neighbors_ref = safe_neighbors_ptr.get_ref(); + let new_neighbors_ref = safe_new_neighbors_ptr.get_mut(); + let new_offsets_ref = safe_new_offsets_ptr.get_mut(); + let chunk_offset_ref = safe_chunk_offset_ptr.get_mut(); + let offsets_ref = safe_offsets_ptr.get_ref(); + let degree_ref = safe_degree_ptr.get_ref(); + let new_table_ref = safe_new_table_ptr.get_mut(); + let old_table_ref = safe_old_table_ptr.get_ref(); + + loop { + let cur_chunk = chunk_i_ref.fetch_add(1, Ordering::Relaxed); + if cur_chunk >= chunk_num { + break; + } + + let local_offset = chunk_offset_ref[cur_chunk]; + let start_idx = cur_chunk * chunk_size; + let end_idx = vertex_num.min(start_idx + chunk_size); + + if end_idx > old_vertex_num { + if start_idx >= old_vertex_num { + for v in start_idx..end_idx { + new_offsets_ref[v] += local_offset; + } + } else { + for v in start_idx..old_vertex_num { + let offset = new_offsets_ref[v] + local_offset; + new_offsets_ref[v] = offset; + let old_offset =
offsets_ref[v]; + let deg = degree_ref[v] as usize; + new_neighbors_ref[offset..offset + deg] + .copy_from_slice(&neighbors_ref[old_offset..old_offset + deg]); + new_table_ref.copy_range(offset, old_table_ref, old_offset, deg); + } + for v in old_vertex_num..end_idx { + new_offsets_ref[v] += local_offset; + } + } + } else { + for v in start_idx..end_idx { + let offset = new_offsets_ref[v] + local_offset; + new_offsets_ref[v] = offset; + let old_offset = offsets_ref[v]; + let deg = degree_ref[v] as usize; + new_neighbors_ref[offset..offset + deg] + .copy_from_slice(&neighbors_ref[old_offset..old_offset + deg]); + new_table_ref.copy_range(offset, old_table_ref, old_offset, deg); + } + } + } + }); + } + }); + + self.degree.resize(vertex_num, 0); + let new_degree = &mut self.degree; + if reverse { + for (row_i, (src, dst)) in edges.iter().enumerate() { + let offset = new_offsets[dst.index()] + new_degree[dst.index()] as usize; + new_degree[dst.index()] += 1; + new_neighbors[offset] = *src; + new_table.set_table_row(offset, edges_prop, row_i); + } + } else { + for (row_i, (src, dst)) in edges.iter().enumerate() { + let offset = new_offsets[src.index()] + new_degree[src.index()] as usize; + new_degree[src.index()] += 1; + new_neighbors[offset] = *dst; + new_table.set_table_row(offset, edges_prop, row_i); + } + } + + self.neighbors = new_neighbors; + self.offsets = new_offsets; + self.edge_num = cur_offset; + + new_table + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/bmscsr.rs b/interactive_engine/executor/store/bmcsr/src/bmscsr.rs new file mode 100644 index 000000000000..ca1a9dac1d9b --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/bmscsr.rs @@ -0,0 +1,364 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//!
http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::any::Any; +use std::collections::HashSet; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +#[cfg(feature = "hugepage_csr")] +use huge_container::HugeVec; + +use crate::col_table::ColTable; +use crate::csr::{CsrBuildError, CsrTrait, NbrIter, NbrIterBeta, NbrOffsetIter, SafeMutPtr, SafePtr}; +use crate::graph::IndexType; + +#[cfg(feature = "hugepage_csr")] +type ArrayType = HugeVec; + +#[cfg(not(feature = "hugepage_csr"))] +type ArrayType = Vec; + +pub struct BatchMutableSingleCsr { + nbr_list: ArrayType, + + vertex_num: usize, + edge_num: usize, + + vertex_capacity: usize, +} + +pub struct BatchMutableSingleCsrBuilder { + nbr_list: ArrayType, + + vertex_num: usize, + edge_num: usize, + + vertex_capacity: usize, +} + +impl BatchMutableSingleCsrBuilder { + pub fn new() -> Self { + BatchMutableSingleCsrBuilder { + nbr_list: ArrayType::new(), + vertex_num: 0, + edge_num: 0, + vertex_capacity: 0, + } + } + + pub fn init(&mut self, degree: &Vec, reserve_rate: f64) { + let vertex_num = degree.len(); + let mut edge_num = 0_usize; + for i in 0..vertex_num { + edge_num += degree[i] as usize; + } + + self.vertex_num = vertex_num; + self.edge_num = edge_num; + + self.vertex_capacity = ((vertex_num as f64 * reserve_rate) as usize).max(vertex_num); // scale in f64 first: `as` binds tighter than `*`, so the rate used to be truncated before multiplying + + self.nbr_list + .resize(self.vertex_capacity, ::max()); + } + + pub fn put_edge(&mut self, src: I, dst: I) -> Result { + self.nbr_list[src.index()] = dst; + Ok(src.index()) + } + + pub fn finish(self) -> Result, CsrBuildError> { + Ok(BatchMutableSingleCsr { + nbr_list: self.nbr_list, +
vertex_num: self.vertex_num, + edge_num: self.edge_num, + vertex_capacity: self.vertex_capacity, + }) + } +} + +impl BatchMutableSingleCsr { + pub fn new() -> Self { + BatchMutableSingleCsr { nbr_list: ArrayType::new(), vertex_num: 0, edge_num: 0, vertex_capacity: 0 } + } + + pub fn resize_vertex(&mut self, vertex_num: usize) { + if vertex_num < self.vertex_num { + self.vertex_num = vertex_num; + } else if vertex_num == self.vertex_num { + return; + } else if vertex_num < self.vertex_capacity { + for i in self.vertex_num..vertex_num { + self.nbr_list[i] = ::max(); + } + self.vertex_num = vertex_num; + } else { + // warn!("resize vertex capacity from {} to {}", self.vertex_capacity, vertex_num); + self.nbr_list + .resize(vertex_num, ::max()); + self.vertex_num = vertex_num; + self.vertex_capacity = vertex_num; + } + } + + pub fn put_edge(&mut self, src: I, dst: I) { + self.nbr_list[src.index()] = dst; + } + + pub fn remove_vertex(&mut self, vertex: I) { + self.nbr_list[vertex.index()] = ::max(); + } + + pub fn remove_edge(&mut self, src: I, dst: I) { + if self.nbr_list[src.index()] == dst { + self.nbr_list[src.index()] = ::max(); + } + } + + pub fn get_edge(&self, src: I) -> Option { + if self.nbr_list[src.index()] == ::max() { + None + } else { + Some(self.nbr_list[src.index()]) + } + } + + pub fn get_edge_with_offset(&self, src: I) -> Option<(I, usize)> { + if self.nbr_list[src.index()] == ::max() { + None + } else { + Some((self.nbr_list[src.index()], src.index())) + } + } + + pub fn insert_edge(&mut self, src: I, dst: I) { + self.nbr_list[src.index()] = dst; + } +} + +unsafe impl Send for BatchMutableSingleCsr {} +unsafe impl Sync for BatchMutableSingleCsr {} + +impl CsrTrait for BatchMutableSingleCsr { + fn vertex_num(&self) -> I { + I::new(self.vertex_num) + } + + fn edge_num(&self) -> usize { + self.edge_num + } + + fn max_edge_offset(&self) -> usize { + self.vertex_num + } + + fn degree(&self, u: I) -> usize { + (self.nbr_list[u.index()] != ::max()) as usize // 1 when a neighbour is stored, 0 when the slot holds the `max()` sentinel
+ } + + fn serialize(&self, path: &String) { + let file = File::create(path).unwrap(); + let mut writer = BufWriter::new(file); + writer + .write_u64::(self.vertex_num as u64) + .unwrap(); + writer + .write_u64::(self.edge_num as u64) + .unwrap(); + writer + .write_u64::(self.vertex_capacity as u64) + .unwrap(); + writer + .write_u64::(self.nbr_list.len() as u64) + .unwrap(); + for i in 0..self.nbr_list.len() { + self.nbr_list[i].write(&mut writer).unwrap(); + } + writer.flush().unwrap(); + } + + fn deserialize(&mut self, path: &String) { + let file = File::open(path).unwrap(); + let mut reader = BufReader::new(file); + + self.vertex_num = reader.read_u64::().unwrap() as usize; + self.edge_num = reader.read_u64::().unwrap() as usize; + self.vertex_capacity = reader.read_u64::().unwrap() as usize; + let len = reader.read_u64::().unwrap() as usize; + self.nbr_list = ArrayType::with_capacity(len); + for _ in 0..len { + self.nbr_list + .push(I::read(&mut reader).unwrap()); + } + } + + fn get_edges(&self, src: I) -> Option> { + if self.nbr_list[src.index()] == ::max() { + None + } else { + Some(NbrIter::new(&self.nbr_list, src.index(), src.index() + 1)) + } + } + + fn get_edges_beta(&self, u: I) -> NbrIterBeta { + if self.nbr_list[u.index()] == ::max() { + NbrIterBeta::new(self.nbr_list.as_ptr(), self.nbr_list.as_ptr()) + } else { + NbrIterBeta::new(unsafe { self.nbr_list.as_ptr().add(u.index()) }, unsafe { + self.nbr_list.as_ptr().add(u.index() + 1) + }) + } + } + + fn get_edges_with_offset(&self, src: I) -> Option> { + if self.nbr_list[src.index()] == ::max() { + None + } else { + Some(NbrOffsetIter::new(&self.nbr_list, src.index(), src.index() + 1)) + } + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn delete_vertices(&mut self, vertices: &HashSet) { + for vertex in vertices { + self.remove_vertex(*vertex); + } + } + + fn parallel_delete_edges(&mut self, edges: &Vec<(I, I)>, reverse: bool, p: u32) { +
let edges_num = edges.len(); + + let safe_nbr_list_ptr = SafeMutPtr::new(&mut self.nbr_list); + let safe_edges_ptr = SafePtr::new(edges); + + let num_threads = p as usize; + let chunk_size = (edges_num + num_threads - 1) / num_threads; + rayon::scope(|s| { + for i in 0..num_threads { + let start_idx = i * chunk_size; + let end_idx = edges_num.min(start_idx + chunk_size); + s.spawn(move |_| { + let edges_ref = safe_edges_ptr.get_ref(); + let nbr_list_ref = safe_nbr_list_ptr.get_mut(); + if reverse { + for k in start_idx..end_idx { + let v = edges_ref[k].1; + nbr_list_ref[v.index()] = ::max(); + } + } else { + for k in start_idx..end_idx { + let v = edges_ref[k].0; + nbr_list_ref[v.index()] = ::max(); + } + } + }); + } + }); + } + + fn parallel_delete_edges_with_props( + &mut self, edges: &Vec<(I, I)>, reverse: bool, _: &mut ColTable, p: u32, + ) { + self.parallel_delete_edges(edges, reverse, p); + } + + fn insert_edges(&mut self, vertex_num: usize, edges: &Vec<(I, I)>, reverse: bool, p: u32) { + self.resize_vertex(vertex_num); + + let num_threads = p as usize; + let chunk_size = (edges.len() + num_threads - 1) / num_threads; + + let safe_nbr_list_ptr = SafeMutPtr::new(&mut self.nbr_list); + let safe_edges_ptr = SafePtr::new(edges); + + let edge_num = edges.len(); + + rayon::scope(|s| { + for i in 0..num_threads { + let start_idx = i * chunk_size; + let end_idx = (start_idx + chunk_size).min(edge_num); + s.spawn(move |_| { + let nbr_list_ref = safe_nbr_list_ptr.get_mut(); + let edges_ref = safe_edges_ptr.get_ref(); + if reverse { + for idx in start_idx..end_idx { + let (dst, src) = edges_ref.get(idx).unwrap(); + nbr_list_ref[src.index()] = *dst; + } + } else { + for idx in start_idx..end_idx { + let (src, dst) = edges_ref.get(idx).unwrap(); + nbr_list_ref[src.index()] = *dst; + } + } + }); + } + }); + } + + fn insert_edges_with_prop( + &mut self, vertex_num: usize, edges: &Vec<(I, I)>, edges_prop: &ColTable, reverse: bool, p: u32, + mut table: ColTable, + ) ->
ColTable { + self.resize_vertex(vertex_num); + table.resize(vertex_num); + + let num_threads = p as usize; + let chunk_size = (edges.len() + num_threads - 1) / num_threads; + + let safe_nbr_list_ptr = SafeMutPtr::new(&mut self.nbr_list); + let safe_edges_ptr = SafePtr::new(edges); + let safe_table_ptr = SafeMutPtr::new(&mut table); + + let edge_num = edges.len(); + + rayon::scope(|s| { + for i in 0..num_threads { + let start_idx = i * chunk_size; + let end_idx = (start_idx + chunk_size).min(edge_num); + s.spawn(move |_| { + let nbr_list_ref = safe_nbr_list_ptr.get_mut(); + let edges_ref = safe_edges_ptr.get_ref(); + let table_ref = safe_table_ptr.get_mut(); + + if reverse { + for idx in start_idx..end_idx { + let (dst, src) = edges_ref.get(idx).unwrap(); + nbr_list_ref[src.index()] = *dst; + table_ref.set_table_row(src.index(), edges_prop, idx); + } + } else { + for idx in start_idx..end_idx { + let (src, dst) = edges_ref.get(idx).unwrap(); + nbr_list_ref[src.index()] = *dst; + table_ref.set_table_row(src.index(), edges_prop, idx); + } + } + }); + } + }); + + table + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/col_table.rs b/interactive_engine/executor/store/bmcsr/src/col_table.rs new file mode 100644 index 000000000000..339eb11fa315 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/col_table.rs @@ -0,0 +1,581 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License.
+ +use std::collections::HashMap; +use std::fmt::Debug; +use std::fs::File; +use std::io::{BufReader, BufWriter, Read, Write}; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use csv::StringRecord; + +use crate::columns::*; +use crate::date::parse_date; +use crate::date_time::parse_datetime; +use crate::error::GDBResult; + +#[derive(Debug)] +pub struct ColTable { + columns: Vec>, + pub header: HashMap, + row_num: usize, +} + +impl ColTable { + pub fn new(types: Vec<(DataType, String)>) -> Self { + let mut columns = Vec::>::with_capacity(types.len()); + let mut header = HashMap::new(); + for pair in types.into_iter().enumerate() { + header.insert(pair.1 .1, pair.0); + match pair.1 .0 { + DataType::Int32 => { + columns.push(Box::new(Int32Column::new())); + } + DataType::UInt32 => { + columns.push(Box::new(UInt32Column::new())); + } + DataType::Int64 => { + columns.push(Box::new(Int64Column::new())); + } + DataType::UInt64 => { + columns.push(Box::new(UInt64Column::new())); + } + DataType::String => { + columns.push(Box::new(StringColumn::new())); + } + DataType::LCString => { + columns.push(Box::new(LCStringColumn::new())); + } + DataType::Double => { + columns.push(Box::new(DoubleColumn::new())); + } + DataType::Date => { + columns.push(Box::new(DateColumn::new())); + } + DataType::DateTime => { + columns.push(Box::new(DateTimeColumn::new())); + } + DataType::ID => { + columns.push(Box::new(IDColumn::new())); + } + DataType::NULL => { + error!("Unexpected column type"); + } + } + } + Self { columns, header, row_num: 0 } + } + + pub fn new_empty(&self) -> Self { + let mut types = Vec::<(DataType, String)>::new(); + types.resize(self.col_num(), (DataType::NULL, String::new())); + for col_i in 0..self.col_num() { + types[col_i].0 = self.columns[col_i].get_type(); + } + for (name, idx) in self.header.iter() { + types[*idx].1 = name.clone(); + } + Self::new(types) + } + + pub fn col_num(&self) -> usize { + self.columns.len() + } + + pub fn row_num(&self) 
-> usize { + self.row_num + } + + pub fn push(&mut self, row: &Vec) { + let col_num = self.columns.len(); + if row.len() < col_num { + info!("schema not match when push, row_len = {}, col num = {}", row.len(), col_num); + return; + } + for i in 0..col_num { + self.columns[i].push(row[i].clone()); + } + self.row_num += 1; + } + + pub fn insert(&mut self, index: usize, row: &Vec) { + let col_num = self.columns.len(); + if self.row_num <= index { + let null_num = index - self.row_num; + for i in 0..col_num { + let col = &mut self.columns[i]; + for _ in 0..null_num { + col.push(Item::Null); + } + } + self.row_num = index; + self.push(row); + } else { + for i in 0..col_num { + self.columns[i].set(index, row[i].clone()); + } + } + } + + pub fn add_property(&mut self, index_name: String, data_type: DataType) { + let property_num = self.columns.len(); + if !self.header.contains_key(&index_name) { + self.header.insert(index_name, property_num); + match data_type { + DataType::Int32 => { + let mut column = Int32Column::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::UInt32 => { + let mut column = UInt32Column::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::Int64 => { + let mut column = Int64Column::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::UInt64 => { + let mut column = UInt64Column::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::Double => { + let mut column = DoubleColumn::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::String => { + let mut column = StringColumn::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::Date => { + let mut column = DateColumn::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::DateTime => { + let mut column = DateTimeColumn::new(); + 
column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::LCString => { + let mut column = LCStringColumn::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::ID => { + let mut column = IDColumn::new(); + column.resize(self.row_num); + self.columns.push(Box::new(column)); + } + DataType::NULL => { + panic!("Data type of column can not be null"); + } + } + } + } + + pub fn set_property(&mut self, prop_name: String, index: &Vec, data: Box) { + if let Some(prop_id) = self.header.get(&prop_name) { + let mut column = self.columns.get_mut(*prop_id).unwrap(); + column.set_column_batch(index, &data); + } + } + + pub fn get_column_by_index(&self, index: usize) -> &'_ Box { + &self.columns[index] + } + + pub fn get_column_by_name(&self, name: &str) -> &'_ Box { + let index = self.header.get(name).unwrap(); + &self.columns[*index] + } + + pub fn get_item(&self, col_name: &str, row_i: usize) -> Option { + if let Some(col_i) = self.header.get(col_name) { + self.columns[*col_i].get(row_i) + } else { + None + } + } + + pub fn get_item_by_index(&self, col_i: usize, row_i: usize) -> Option { + if col_i < self.columns.len() { + self.columns[col_i].get(row_i) + } else { + None + } + } + + pub fn get_row(&self, row_i: usize) -> Option> { + if row_i < self.row_num { + let mut row = Vec::new(); + for col in self.columns.iter() { + row.push(col.get(row_i).unwrap().to_owned()); + } + Some(row) + } else { + None + } + } + + pub fn set_table_row(&mut self, self_i: usize, other: &ColTable, other_i: usize) { + if self.row_num <= self_i { + self.resize(self_i + 1); + } + for col_i in 0..self.col_num() { + self.columns[col_i].set_column_elem(self_i, &other.columns[col_i], other_i); + } + } + + pub fn move_row(&mut self, from: usize, to: usize) { + for col_i in 0..self.col_num() { + self.columns[col_i].move_elem(from, to); + } + } + + pub fn copy_range(&mut self, self_i: usize, other: &ColTable, other_i: usize, num: usize) { + if 
self.row_num < (self_i + num) { + self.resize(self_i + num); + } + for col_i in 0..self.col_num() { + self.columns[col_i].copy_range(self_i, &other.columns[col_i], other_i, num); + } + } + + pub fn resize(&mut self, row_num: usize) { + for col_i in 0..self.col_num() { + self.columns[col_i].resize(row_num); + } + self.row_num = row_num; + } + + pub fn serialize_table(&self, path: &String) { + let f = File::create(path).unwrap(); + let mut writer = BufWriter::new(f); + writer + .write_u64::(self.row_num as u64) + .unwrap(); + writer + .write_u64::(self.header.len() as u64) + .unwrap(); + for pair in self.header.iter() { + writer + .write_u64::(pair.0.len() as u64) + .unwrap(); + writer.write_all(pair.0.as_bytes()).unwrap(); + writer + .write_u64::(*pair.1 as u64) + .unwrap(); + } + + writer + .write_u64::(self.columns.len() as u64) + .unwrap(); + for col in self.columns.iter() { + writer + .write_i32::(col.get_type().to_i32()) + .unwrap(); + col.serialize(&mut writer).unwrap(); + } + } + + pub fn deserialize_table(&mut self, path: &String) { + let f = File::open(path).unwrap(); + let mut reader = BufReader::new(f); + self.row_num = reader.read_u64::().unwrap() as usize; + let header_len = reader.read_u64::().unwrap() as usize; + self.header.clear(); + for _ in 0..header_len { + let str_len = reader.read_u64::().unwrap() as usize; + let mut str_bytes = vec![0u8; str_len]; + reader.read_exact(&mut str_bytes).unwrap(); + let s = String::from_utf8(str_bytes).unwrap(); + let ind = reader.read_u64::().unwrap() as usize; + + self.header.insert(s, ind); + } + + let column_len = reader.read_u64::().unwrap() as usize; + self.columns.clear(); + for _ in 0..column_len { + let t = DataType::from_i32(reader.read_i32::().unwrap()).unwrap(); + match t { + DataType::Int32 => { + let mut col = Int32Column::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::UInt32 => { + let mut col = UInt32Column::new(); + col.deserialize(&mut 
reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::Int64 => { + let mut col = Int64Column::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::UInt64 => { + let mut col = UInt64Column::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::Double => { + let mut col = DoubleColumn::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::String => { + let mut col = StringColumn::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::Date => { + let mut col = DateColumn::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::DateTime => { + let mut col = DateTimeColumn::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::LCString => { + let mut col = LCStringColumn::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::ID => { + let mut col = IDColumn::new(); + col.deserialize(&mut reader).unwrap(); + self.columns.push(Box::new(col)); + } + DataType::NULL => { + let col = Int32Column::new(); + self.columns.push(Box::new(col)); + } + }; + } + } + + pub fn is_same(&self, other: &Self) -> bool { + if self.header != other.header { + info!("header not same"); + return false; + } + if self.columns.len() != other.columns.len() { + info!("columns num not same"); + return false; + } + let col_num = self.columns.len(); + for i in 0..col_num { + if self.columns[i].get_type() != other.columns[i].get_type() { + info!("column-{} type not same", i); + return false; + } + match self.columns[i].get_type() { + DataType::Int32 => { + if !self.columns[i] + .as_any() + .downcast_ref::() + .unwrap() + .is_same( + other.columns[i] + .as_any() + .downcast_ref::() + .unwrap(), + ) + { + info!("column-{} data not same", i); + return false; + } + } + 
DataType::DateTime => { + if !self.columns[i] + .as_any() + .downcast_ref::() + .unwrap() + .is_same( + other.columns[i] + .as_any() + .downcast_ref::() + .unwrap(), + ) + { + info!("column-{} data not same", i); + return false; + } + } + DataType::String => { + let lhs = &self.columns[i] + .as_any() + .downcast_ref::() + .unwrap() + .data; + let rhs = &other.columns[i] + .as_any() + .downcast_ref::() + .unwrap() + .data; + if lhs.len() != rhs.len() { + info!("column-{} data not same", i); + return false; + } + let num = lhs.len(); + for i in 0..num { + if lhs[i] != rhs[i] { + info!("column-{} data not same", i); + return false; + } + } + } + DataType::LCString => { + if !self.columns[i] + .as_any() + .downcast_ref::() + .unwrap() + .is_same( + other.columns[i] + .as_any() + .downcast_ref::() + .unwrap(), + ) + { + info!("column-{} data not same", i); + return false; + } + } + DataType::Date => { + if !self.columns[i] + .as_any() + .downcast_ref::() + .unwrap() + .is_same( + other.columns[i] + .as_any() + .downcast_ref::() + .unwrap(), + ) + { + info!("column-{} data not same", i); + return false; + } + } + _ => { + info!("unexpected type"); + return false; + } + } + } + return true; + } +} + +unsafe impl Sync for ColTable {} + +unsafe impl Send for ColTable {} + +pub fn parse_properties( + record: &StringRecord, header: &[(String, DataType)], selected: &[bool], +) -> GDBResult> { + let mut properties = Vec::new(); + for (index, val) in record.iter().enumerate() { + if selected[index] { + match header[index].1 { + DataType::Int32 => { + properties.push(Item::Int32(val.parse::()?)); + } + DataType::UInt32 => { + properties.push(Item::UInt32(val.parse::()?)); + } + DataType::Int64 => { + properties.push(Item::Int64(val.parse::()?)); + } + DataType::UInt64 => { + properties.push(Item::UInt64(val.parse::()?)); + } + DataType::String => { + properties.push(Item::String(val.to_string())); + } + DataType::Date => { + properties.push(Item::Date(parse_date(val)?)); + } + 
DataType::DateTime => { + properties.push(Item::DateTime(parse_datetime(val))); + } + DataType::Double => { + properties.push(Item::Double(val.parse::()?)); + } + DataType::NULL => { + error!("Unexpected field type"); + } + DataType::ID => {} + DataType::LCString => { + properties.push(Item::String(val.to_string())); + } + } + } + } + Ok(properties) +} + +pub fn parse_properties_by_mappings( + record: &StringRecord, header: &[(String, DataType)], mappings: &Vec, +) -> GDBResult> { + let mut properties = vec![]; + for (index, val) in record.iter().enumerate() { + if index < mappings.len() && mappings[index] >= 0 { + match header[mappings[index] as usize].1 { + DataType::Int32 => { + properties.push(Item::Int32(val.parse::()?)); + } + DataType::UInt32 => { + properties.push(Item::UInt32(val.parse::()?)); + } + DataType::Int64 => { + properties.push(Item::Int64(val.parse::()?)); + } + DataType::UInt64 => { + properties.push(Item::UInt64(val.parse::()?)); + } + DataType::String => { + properties.push(Item::String(val.to_string())); + } + DataType::Date => { + properties.push(Item::Date(parse_date(val)?)); + } + DataType::DateTime => { + properties.push(Item::DateTime(parse_datetime(val))); + } + DataType::Double => { + properties.push(Item::Double(val.parse::()?)); + } + DataType::NULL => { + error!("Unexpected field type"); + } + DataType::ID => {} + DataType::LCString => { + properties.push(Item::String(val.to_string())); + } + } + } + } + Ok(properties) +} diff --git a/interactive_engine/executor/store/bmcsr/src/columns.rs b/interactive_engine/executor/store/bmcsr/src/columns.rs new file mode 100644 index 000000000000..fa2d7992c083 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/columns.rs @@ -0,0 +1,1814 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! 
You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::any::Any; +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; +use std::fs::File; +use std::io::{BufReader, BufWriter, Read, Write}; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use dyn_type::object::RawType; +use dyn_type::CastError; +#[cfg(feature = "hugepage_table")] +use huge_container::HugeVec; +use pegasus_common::codec::{Decode, Encode}; +use pegasus_common::io::{ReadExt, WriteExt}; +use serde::{Deserialize, Serialize}; + +use crate::date::Date; +use crate::date_time::DateTime; +use crate::types::DefaultId; + +#[cfg(feature = "hugepage_table")] +pub type ColumnContainer = HugeVec; + +#[cfg(not(feature = "hugepage_table"))] +pub type ColumnContainer = Vec; + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)] +pub enum DataType { + Int32 = 1, + UInt32 = 2, + Int64 = 3, + UInt64 = 4, + Double = 5, + String = 6, + Date = 7, + DateTime = 8, + LCString = 9, + ID = 10, + NULL = 0, +} + +impl Encode for DataType { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + match *self { + DataType::NULL => writer.write_u8(0), + DataType::Int32 => writer.write_u8(1), + DataType::UInt32 => writer.write_u8(2), + DataType::Int64 => writer.write_u8(3), + DataType::UInt64 => writer.write_u8(4), + DataType::Double => writer.write_u8(5), + DataType::String => writer.write_u8(6), + DataType::Date => writer.write_u8(7), + DataType::DateTime => writer.write_u8(8), + DataType::LCString => writer.write_u8(9), + DataType::ID => writer.write_u8(10), + 
}; + Ok(()) + } +} + +impl Decode for DataType { + fn read_from(reader: &mut R) -> std::io::Result { + let data_type = match reader.read_u8()? { + 0 => DataType::NULL, + 1 => DataType::Int32, + 2 => DataType::UInt32, + 3 => DataType::Int64, + 4 => DataType::UInt64, + 5 => DataType::Double, + 6 => DataType::String, + 7 => DataType::Date, + 8 => DataType::DateTime, + 9 => DataType::LCString, + 10 => DataType::ID, + _ => panic!("Unknown data type"), + }; + Ok(data_type) + } +} + +impl DataType { + pub fn from_i32(n: i32) -> Option { + match n { + 0 => Some(Self::NULL), + 1 => Some(Self::Int32), + 2 => Some(Self::UInt32), + 3 => Some(Self::Int64), + 4 => Some(Self::UInt64), + 5 => Some(Self::Double), + 6 => Some(Self::String), + 7 => Some(Self::Date), + 8 => Some(Self::DateTime), + 9 => Some(Self::LCString), + 10 => Some(Self::ID), + _ => None, + } + } + + pub fn to_i32(&self) -> i32 { + match self { + Self::NULL => 0, + Self::Int32 => 1, + Self::UInt32 => 2, + Self::Int64 => 3, + Self::UInt64 => 4, + Self::Double => 5, + Self::String => 6, + Self::Date => 7, + Self::DateTime => 8, + Self::LCString => 9, + Self::ID => 10, + } + } +} + +impl<'a> From<&'a str> for DataType { + fn from(_token: &'a str) -> Self { + info!("token = {}", _token); + let token_str = _token.to_uppercase(); + let token = token_str.as_str(); + if token == "INT32" { + DataType::Int32 + } else if token == "UINT32" { + DataType::UInt32 + } else if token == "INT64" { + DataType::Int64 + } else if token == "UINT64" { + DataType::UInt64 + } else if token == "DOUBLE" { + DataType::Double + } else if token == "STRING" { + DataType::String + } else if token == "DATE" { + DataType::Date + } else if token == "DATETIME" { + DataType::DateTime + } else if token == "ID" { + DataType::ID + } else if token == "LCString" { + DataType::LCString + } else { + error!("Unsupported type {:?}", token); + DataType::NULL + } + } +} + +#[derive(Clone)] +pub enum Item { + Boolean(bool), + Int32(i32), + UInt32(u32), + 
Int64(i64), + UInt64(u64), + Float(f32), + Double(f64), + String(String), + Date(Date), + DateTime(DateTime), + VertexId(usize), + EdgeId((u64, u64)), + Null, +} + +#[derive(Clone)] +pub enum RefItem<'a> { + Boolean(&'a bool), + Int32(&'a i32), + UInt32(&'a u32), + Int64(&'a i64), + UInt64(&'a u64), + Float(&'a f32), + Double(&'a f64), + Date(&'a Date), + DateTime(&'a DateTime), + VertexId(&'a usize), + EdgeId((&'a u64, &'a u64)), + String(&'a String), + Null, +} + +impl<'a> RefItem<'a> { + pub fn to_owned(self) -> Item { + match self { + RefItem::Boolean(v) => Item::Boolean(*v), + RefItem::Int32(v) => Item::Int32(*v), + RefItem::UInt32(v) => Item::UInt32(*v), + RefItem::Int64(v) => Item::Int64(*v), + RefItem::UInt64(v) => Item::UInt64(*v), + RefItem::Float(v) => Item::Float(*v), + RefItem::Double(v) => Item::Double(*v), + RefItem::Date(v) => Item::Date(*v), + RefItem::DateTime(v) => Item::DateTime(*v), + RefItem::VertexId(v) => Item::VertexId(*v), + RefItem::EdgeId((src, dst)) => Item::EdgeId((*src, *dst)), + RefItem::String(v) => Item::String(v.clone()), + RefItem::Null => Item::Null, + } + } +} + +pub trait ConvertItem { + fn to_ref_item(&self) -> RefItem; + fn from_item(v: Item) -> Self; +} + +impl ConvertItem for i32 { + fn to_ref_item(&self) -> RefItem { + RefItem::Int32(self) + } + + fn from_item(v: Item) -> Self { + match v { + Item::Int32(v) => v, + _ => 0, + } + } +} + +impl ConvertItem for DateTime { + fn to_ref_item(&self) -> RefItem { + RefItem::DateTime(self) + } + + fn from_item(v: Item) -> Self { + match v { + Item::DateTime(v) => v, + _ => DateTime::empty(), + } + } +} + +impl ConvertItem for () { + fn to_ref_item(&self) -> RefItem { + RefItem::Null + } + + fn from_item(_v: Item) -> Self { + () + } +} + +impl Debug for Item { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Item::Int32(v) => { + write!(f, "int32[{}]", v) + } + Item::UInt32(v) => { + write!(f, "uint32[{}]", v) + } + Item::Int64(v) => { + write!(f, 
"int64[{}]", v) + } + Item::UInt64(v) => { + write!(f, "uint64[{}]", v) + } + Item::Double(v) => { + write!(f, "double[{}]", v) + } + Item::Date(v) => { + write!(f, "date[{}]", v.to_string()) + } + Item::DateTime(v) => { + write!(f, "datetime[{}]", v.to_string()) + } + Item::VertexId(v) => { + write!(f, "id[{}]", v) + } + Item::String(v) => { + write!(f, "string[{}]", v) + } + _ => { + write!(f, "") + } + } + } +} + +impl ToString for Item { + fn to_string(&self) -> String { + match self { + Item::Int32(v) => v.to_string(), + Item::UInt32(v) => v.to_string(), + Item::Int64(v) => v.to_string(), + Item::UInt64(v) => v.to_string(), + Item::Double(v) => v.to_string(), + Item::Date(v) => v.to_string(), + Item::DateTime(v) => v.to_string(), + Item::VertexId(v) => v.to_string(), + Item::String(v) => v.to_string(), + _ => "".to_string(), + } + } +} + +impl<'a> Debug for RefItem<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + RefItem::Int32(v) => { + write!(f, "int32[{}]", v) + } + RefItem::UInt32(v) => { + write!(f, "uint32[{}]", v) + } + RefItem::Int64(v) => { + write!(f, "int64[{}]", v) + } + RefItem::UInt64(v) => { + write!(f, "uint64[{}]", v) + } + RefItem::Double(v) => { + write!(f, "double[{}]", v) + } + RefItem::Date(v) => { + write!(f, "date[{}]", v.to_string()) + } + RefItem::DateTime(v) => { + write!(f, "datetime[{}]", v.to_string()) + } + RefItem::VertexId(v) => { + write!(f, "id[{}]", v) + } + RefItem::String(v) => { + write!(f, "string[{}]", v) + } + _ => { + write!(f, "") + } + } + } +} + +impl<'a> ToString for RefItem<'a> { + fn to_string(&self) -> String { + match self { + RefItem::Int32(v) => v.to_string(), + RefItem::UInt32(v) => v.to_string(), + RefItem::Int64(v) => v.to_string(), + RefItem::UInt64(v) => v.to_string(), + RefItem::Double(v) => v.to_string(), + RefItem::Date(v) => v.to_string(), + RefItem::DateTime(v) => v.to_string(), + RefItem::VertexId(v) => v.to_string(), + RefItem::String(v) => v.to_string(), + _ => 
"".to_string(), + } + } +} + +impl<'a> RefItem<'a> { + #[inline] + pub fn as_u64(&self) -> Result { + match self { + RefItem::Int32(v) => Ok(**v as u64), + RefItem::UInt32(v) => Ok(**v as u64), + RefItem::Int64(v) => Ok(**v as u64), + RefItem::UInt64(v) => Ok(**v), + RefItem::Double(v) => Ok(**v as u64), + RefItem::Date(_) => Ok(0_u64), + RefItem::DateTime(v) => Ok(v.to_i64() as u64), + RefItem::VertexId(v) => Ok(**v as u64), + RefItem::String(_) => Err(CastError::new::(RawType::String)), + _ => Ok(0_u64), + } + } + + #[inline] + pub fn as_i32(&self) -> Result { + match self { + RefItem::Int32(v) => Ok(**v), + RefItem::UInt32(v) => Ok(**v as i32), + RefItem::Int64(v) => Ok(**v as i32), + RefItem::UInt64(v) => Ok(**v as i32), + RefItem::Double(v) => Ok(**v as i32), + RefItem::Date(_) => Ok(0), + RefItem::DateTime(_) => Ok(0), + RefItem::VertexId(v) => Ok(**v as i32), + RefItem::String(_) => Err(CastError::new::(RawType::String)), + _ => Ok(0), + } + } + + #[inline] + pub fn as_str(&self) -> Result, CastError> { + match self { + RefItem::String(str) => Ok(Cow::Borrowed(*str)), + _ => Err(CastError::new::(RawType::Unknown)), + } + } + + #[inline] + pub fn as_datetime(&self) -> Result { + match self { + RefItem::Int32(_) => Err(CastError::new::(RawType::Integer)), + RefItem::UInt32(_) => Err(CastError::new::(RawType::Integer)), + RefItem::Int64(_) => Err(CastError::new::(RawType::Long)), + RefItem::UInt64(_) => Err(CastError::new::(RawType::Long)), + RefItem::Double(_) => Err(CastError::new::(RawType::Float)), + RefItem::Date(_) => Err(CastError::new::(RawType::Unknown)), + RefItem::DateTime(v) => Ok(**v), + RefItem::VertexId(_) => Err(CastError::new::(RawType::Long)), + RefItem::String(_) => Err(CastError::new::(RawType::String)), + _ => Err(CastError::new::(RawType::Unknown)), + } + } +} + +pub trait Column: Debug { + fn get_type(&self) -> DataType; + fn get(&self, index: usize) -> Option; + fn set(&mut self, index: usize, val: Item); + fn push(&mut self, val: Item); 
+ fn len(&self) -> usize; + fn as_any(&self) -> &dyn Any; + + fn set_column_batch(&mut self, index: &Vec, col: &Box); + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize); + fn move_elem(&mut self, from: usize, to: usize); + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize); + fn resize(&mut self, size: usize); + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()>; + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()>; +} + +pub struct Int32Column { + pub data: ColumnContainer, +} + +unsafe impl Send for Int32Column {} + +unsafe impl Sync for Int32Column {} + +impl Int32Column { + pub fn new() -> Self { + Self { data: ColumnContainer::new() } + } + + pub fn is_same(&self, other: &Self) -> bool { + if self.data.len() != other.data.len() { + return false; + } + let num = self.data.len(); + for k in 0..num { + if self.data[k] != other.data[k] { + return false; + } + } + return true; + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> Int32Column { + Int32Column { data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> Int32Column { + Int32Column { data } + } + + pub fn clone_from(other: &Int32Column) -> Int32Column { + Int32Column { data: other.data.clone() } + } +} + +impl Debug for Int32Column { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Int32Column: {:?}", self.data) + } +} + +impl Column for Int32Column { + fn get_type(&self) -> DataType { + DataType::Int32 + } + + fn get(&self, index: usize) -> Option { + self.data.get(index).map(|x| RefItem::Int32(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::Int32(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = 0; + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::Int32(v) => { + self.data.push(v); + } + _ => { + self.data.push(0); + } + } + } + + fn len(&self) -> usize { + 
self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(reader.read_i32::()?); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_i32::(*v)?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, 0); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index]; + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index]; + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index..self_index + num] + .copy_from_slice(&casted_col.data[col_index..col_index + num]); + } +} + +pub struct UInt32Column { + pub data: ColumnContainer, +} + +unsafe impl Send for UInt32Column {} + +unsafe impl Sync for UInt32Column {} + +impl UInt32Column { + pub fn new() -> Self { + Self { data: ColumnContainer::new() } + } + + pub fn is_same(&self, other: &Self) -> bool { + if self.data.len() != other.data.len() { + return false; + } + let num = self.data.len(); + for k in 0..num { + if self.data[k] != other.data[k] { + return false; + } + } + return true; + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> UInt32Column { + UInt32Column 
{ data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> UInt32Column { + UInt32Column { data } + } + + pub fn clone_from(other: &UInt32Column) -> UInt32Column { + UInt32Column { data: other.data.clone() } + } +} + +impl Debug for UInt32Column { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "UInt32Column: {:?}", self.data) + } +} + +impl Column for UInt32Column { + fn get_type(&self) -> DataType { + DataType::UInt32 + } + + fn get(&self, index: usize) -> Option { + self.data.get(index).map(|x| RefItem::UInt32(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::UInt32(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = 0; + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::UInt32(v) => { + self.data.push(v); + } + _ => { + self.data.push(0); + } + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? 
as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(reader.read_u32::()?); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_u32::(*v)?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, 0); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index]; + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index]; + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index..self_index + num] + .copy_from_slice(&casted_col.data[col_index..col_index + num]); + } +} + +pub struct Int64Column { + pub data: ColumnContainer, +} + +unsafe impl Send for Int64Column {} + +unsafe impl Sync for Int64Column {} + +impl Int64Column { + pub fn new() -> Self { + Self { data: ColumnContainer::new() } + } + + pub fn is_same(&self, other: &Self) -> bool { + if self.data.len() != other.data.len() { + return false; + } + let num = self.data.len(); + for k in 0..num { + if self.data[k] != other.data[k] { + return false; + } + } + return true; + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> Int64Column { + Int64Column { data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> Int64Column { + Int64Column { data } + } + + pub fn clone_from(other: &Int64Column) -> Int64Column { + 
Int64Column { data: other.data.clone() } + } +} + +impl Debug for Int64Column { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Int64Column: {:?}", self.data) + } +} + +impl Column for Int64Column { + fn get_type(&self) -> DataType { + DataType::Int32 + } + + fn get(&self, index: usize) -> Option { + self.data.get(index).map(|x| RefItem::Int64(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::Int64(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = 0; + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::Int64(v) => { + self.data.push(v); + } + _ => { + self.data.push(0); + } + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(reader.read_i64::()?); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_i64::(*v)?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, 0); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index]; + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index]; + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = 
col.as_any().downcast_ref::().unwrap(); + self.data[self_index..self_index + num] + .copy_from_slice(&casted_col.data[col_index..col_index + num]); + } +} + +pub struct UInt64Column { + pub data: ColumnContainer, +} + +unsafe impl Send for UInt64Column {} + +unsafe impl Sync for UInt64Column {} + +impl UInt64Column { + pub fn new() -> Self { + Self { data: ColumnContainer::new() } + } + + pub fn is_same(&self, other: &Self) -> bool { + if self.data.len() != other.data.len() { + return false; + } + let num = self.data.len(); + for k in 0..num { + if self.data[k] != other.data[k] { + return false; + } + } + return true; + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> UInt64Column { + UInt64Column { data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> UInt64Column { + UInt64Column { data } + } + + pub fn clone_from(other: &UInt64Column) -> UInt64Column { + UInt64Column { data: other.data.clone() } + } +} + +impl Debug for UInt64Column { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "UInt64Column: {:?}", self.data) + } +} + +impl Column for UInt64Column { + fn get_type(&self) -> DataType { + DataType::UInt64 + } + + fn get(&self, index: usize) -> Option { + self.data.get(index).map(|x| RefItem::UInt64(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::UInt64(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = 0; + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::UInt64(v) => { + self.data.push(v); + } + _ => { + self.data.push(0); + } + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? 
as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(reader.read_u64::()?); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_u64::(*v)?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, 0); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index]; + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index]; + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index..self_index + num] + .copy_from_slice(&casted_col.data[col_index..col_index + num]); + } +} + +pub struct IDColumn { + pub data: ColumnContainer, +} + +unsafe impl Send for IDColumn {} + +unsafe impl Sync for IDColumn {} + +impl IDColumn { + pub fn new() -> Self { + Self { data: ColumnContainer::new() } + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> IDColumn { + IDColumn { data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> IDColumn { + IDColumn { data } + } + + pub fn clone_from(other: &IDColumn) -> IDColumn { + IDColumn { data: other.data.clone() } + } +} + +impl Debug for IDColumn { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "IDColumn: {:?}", self.data) + } +} + +impl Column for IDColumn { + fn get_type(&self) -> DataType { + DataType::ID + } + + 
fn get(&self, index: usize) -> Option { + self.data + .get(index) + .map(|x| RefItem::VertexId(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::VertexId(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = 0; + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::VertexId(v) => { + self.data.push(v); + } + _ => { + self.data.push(0); + } + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(reader.read_u64::()? as DefaultId); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_u64::(*v as u64)?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, 0); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index]; + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index]; + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index..self_index + num] + .copy_from_slice(&casted_col.data[col_index..col_index + num]); + } +} + +pub struct DoubleColumn { + pub data: ColumnContainer, +} + +unsafe impl Send for DoubleColumn {} + +unsafe impl 
Sync for DoubleColumn {} + +impl DoubleColumn { + pub fn new() -> Self { + Self { data: ColumnContainer::new() } + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> DoubleColumn { + DoubleColumn { data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> DoubleColumn { + DoubleColumn { data } + } + + pub fn clone_from(other: &DoubleColumn) -> DoubleColumn { + DoubleColumn { data: other.data.clone() } + } +} + +impl Debug for DoubleColumn { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DoubleColumn: {:?}", self.data) + } +} + +impl Column for DoubleColumn { + fn get_type(&self) -> DataType { + DataType::Double + } + + fn get(&self, index: usize) -> Option { + self.data.get(index).map(|x| RefItem::Double(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::Double(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = 0_f64; + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::Double(v) => { + self.data.push(v); + } + _ => { + self.data.push(0_f64); + } + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? 
as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(reader.read_f64::()?); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_f64::(*v)?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, 0.0); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index]; + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index]; + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index..self_index + num] + .copy_from_slice(&casted_col.data[col_index..col_index + num]); + } +} + +pub struct StringColumn { + pub data: Vec, +} + +unsafe impl Send for StringColumn {} + +unsafe impl Sync for StringColumn {} + +impl StringColumn { + pub fn new() -> Self { + Self { data: Vec::new() } + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> StringColumn { + StringColumn { data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> StringColumn { + StringColumn { data } + } + + pub fn clone_from(other: &StringColumn) -> StringColumn { + StringColumn { data: other.data.clone() } + } +} + +impl Debug for StringColumn { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "StringColumn: {:?}", self.data) + } +} + +impl Column for StringColumn { + fn get_type(&self) -> 
DataType { + DataType::String + } + + fn get(&self, index: usize) -> Option { + self.data.get(index).map(|x| RefItem::String(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::String(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = String::from(""); + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::String(v) => { + self.data.push(v); + } + _ => { + self.data.push(String::from("")); + } + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? as usize; + let mut data = Vec::::with_capacity(row_num); + for _ in 0..row_num { + let length = reader.read_i32::()?; + let mut string_bytes = vec![0u8; length as usize]; + reader.read_exact(&mut string_bytes)?; + data.push(String::from_utf8(string_bytes).unwrap()); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_i32::(v.len() as i32)?; + writer.write_all(v.as_bytes())?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, String::new()); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index].clone(); + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index].clone(); + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from].clone(); + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = 
col.as_any().downcast_ref::().unwrap(); + for i in 0..num { + self.data[self_index + i] = casted_col.data[col_index + i].clone(); + } + } +} + +pub struct LCStringColumn { + pub data: ColumnContainer, + pub table: HashMap, + pub list: Vec, +} + +unsafe impl Send for LCStringColumn {} + +unsafe impl Sync for LCStringColumn {} + +impl LCStringColumn { + pub fn new() -> Self { + Self { data: ColumnContainer::new(), table: HashMap::new(), list: Vec::new() } + } + + pub fn is_same(&self, other: &Self) -> bool { + if self.data.len() != other.data.len() { + return false; + } + let num = self.data.len(); + if self.list != other.list { + return false; + } + for k in 0..num { + if self.data[k] != other.data[k] { + return false; + } + } + return true; + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec, table: HashMap, list: Vec) -> LCStringColumn { + LCStringColumn { data, table, list } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec, table: HashMap, list: Vec) -> LCStringColumn { + LCStringColumn { data, table, list } + } + + pub fn clone_from(other: &LCStringColumn) -> LCStringColumn { + LCStringColumn { data: other.data.clone(), table: other.table.clone(), list: other.list.clone() } + } +} + +impl Debug for LCStringColumn { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "LCStringColumn: {:?},{:?},{:?}", self.data, self.table, self.list) + } +} + +impl Column for LCStringColumn { + fn get_type(&self) -> DataType { + DataType::LCString + } + + fn get(&self, index: usize) -> Option { + self.data + .get(index) + .map(|x| RefItem::String(&self.list[*x as usize])) + } + + fn set(&mut self, index: usize, val: Item) { + let value = match val { + Item::String(v) => v, + _ => "".to_string(), + }; + if let Some(v) = self.table.get(&value) { + self.data[index] = *v; + } else { + assert!(self.list.len() < 65535); + let cur = self.list.len() as u16; + self.list.push(value.clone()); + self.table.insert(value, cur); + 
self.data[index] = cur; + } + } + + fn push(&mut self, val: Item) { + let value = match val { + Item::String(v) => v, + _ => "".to_string(), + }; + if let Some(v) = self.table.get(&value) { + self.data.push(*v); + } else { + assert!(self.list.len() < 65535); + let cur = self.list.len() as u16; + self.list.push(value.clone()); + self.table.insert(value, cur); + self.data.push(cur); + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(reader.read_u16::()?); + } + + let list_size = reader.read_u16::()? as usize; + let mut list = Vec::::with_capacity(list_size); + let mut table = HashMap::new(); + for i in 0..list_size { + let length = reader.read_i32::()?; + let mut string_bytes = vec![0u8; length as usize]; + reader.read_exact(&mut string_bytes)?; + let parsed_string = String::from_utf8(string_bytes).unwrap(); + list.push(parsed_string.clone()); + table.insert(parsed_string, i as u16); + } + + self.data = data; + self.table = table; + self.list = list; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_u16::(*v)?; + } + + writer.write_u16::(self.list.len() as u16)?; + for s in self.list.iter() { + let length = s.len() as i32; + writer.write_i32::(length)?; + writer.write_all(s.as_bytes())?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, 0); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + let val = casted_col.list[casted_col.data[index] as usize].clone(); + if let Some(idx) = 
self.table.get(&val) { + self.data[*i] = *idx; + } else { + let idx = self.table.len() as u16; + self.list.push(val.clone()); + self.table.insert(val, idx); + self.data[*i] = idx; + } + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + let val = casted_col.list[casted_col.data[col_index] as usize].clone(); + if let Some(idx) = self.table.get(&val) { + self.data[self_index] = *idx; + } else { + let idx = self.table.len() as u16; + self.list.push(val.clone()); + self.table.insert(val, idx); + self.data[self_index] = idx; + } + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for i in 0..num { + let val = casted_col.list[casted_col.data[col_index + i] as usize].clone(); + if let Some(idx) = self.table.get(&val) { + self.data[self_index + i] = *idx; + } else { + let idx = self.table.len() as u16; + self.list.push(val.clone()); + self.table.insert(val, idx); + self.data[self_index + i] = idx; + } + } + } +} + +pub struct DateColumn { + pub data: ColumnContainer, +} + +unsafe impl Send for DateColumn {} + +unsafe impl Sync for DateColumn {} + +impl DateColumn { + pub fn new() -> Self { + Self { data: ColumnContainer::new() } + } + + pub fn is_same(&self, other: &Self) -> bool { + if self.data.len() != other.data.len() { + return false; + } + let num = self.data.len(); + for k in 0..num { + if self.data[k].to_i32() != other.data[k].to_i32() { + return false; + } + } + return true; + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> DateColumn { + DateColumn { data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> DateColumn { + DateColumn { data } + } + + pub fn clone_from(other: &DateColumn) -> DateColumn { + DateColumn { 
data: other.data.clone() } + } +} + +impl Debug for DateColumn { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DateColumn: {:?}", self.data) + } +} + +impl Column for DateColumn { + fn get_type(&self) -> DataType { + DataType::Date + } + + fn get(&self, index: usize) -> Option { + self.data.get(index).map(|x| RefItem::Date(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::Date(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = Date::empty(); + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::Date(v) => { + self.data.push(v); + } + _ => { + self.data.push(Date::empty()); + } + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(Date::from_i32(reader.read_i32::()?)); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_i32::(v.to_i32())?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, Date::empty()); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index]; + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index]; + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + 
let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index..self_index + num] + .copy_from_slice(&casted_col.data[col_index..col_index + num]); + } +} + +pub struct DateTimeColumn { + pub data: ColumnContainer, +} + +unsafe impl Send for DateTimeColumn {} + +unsafe impl Sync for DateTimeColumn {} + +impl DateTimeColumn { + pub fn new() -> Self { + Self { data: ColumnContainer::new() } + } + + pub fn is_same(&self, other: &Self) -> bool { + if self.data.len() != other.data.len() { + return false; + } + let num = self.data.len(); + for k in 0..num { + if self.data[k].to_i64() != other.data[k].to_i64() { + return false; + } + } + return true; + } + + #[cfg(feature = "hugepage_table")] + pub fn from(data: HugeVec) -> DateTimeColumn { + DateTimeColumn { data } + } + + #[cfg(not(feature = "hugepage_table"))] + pub fn from(data: Vec) -> DateTimeColumn { + DateTimeColumn { data } + } + + pub fn clone_from(other: &DateTimeColumn) -> DateTimeColumn { + DateTimeColumn { data: other.data.clone() } + } +} + +impl Debug for DateTimeColumn { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DateTimeColumn: {:?}", self.data) + } +} + +impl Column for DateTimeColumn { + fn get_type(&self) -> DataType { + DataType::DateTime + } + + fn get(&self, index: usize) -> Option { + self.data + .get(index) + .map(|x| RefItem::DateTime(x)) + } + + fn set(&mut self, index: usize, val: Item) { + match val { + Item::DateTime(v) => { + self.data[index] = v; + } + _ => { + self.data[index] = DateTime::empty(); + } + } + } + + fn push(&mut self, val: Item) { + match val { + Item::DateTime(v) => { + self.data.push(v); + } + _ => { + self.data.push(DateTime::empty()); + } + } + } + + fn len(&self) -> usize { + self.data.len() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn deserialize(&mut self, reader: &mut BufReader) -> std::io::Result<()> { + let row_num = reader.read_u64::()? 
as usize; + let mut data = ColumnContainer::::with_capacity(row_num); + for _ in 0..row_num { + data.push(DateTime::new(reader.read_i64::()?)); + } + self.data = data; + Ok(()) + } + + fn serialize(&self, writer: &mut BufWriter) -> std::io::Result<()> { + writer.write_u64::(self.data.len() as u64)?; + for v in self.data.iter() { + writer.write_i64::(v.to_i64())?; + } + + Ok(()) + } + + fn resize(&mut self, size: usize) { + self.data.resize(size, DateTime::empty()); + } + + fn set_column_batch(&mut self, index: &Vec, col: &Box) { + if col.as_any().is::() { + let casted_col = col.as_any().downcast_ref::().unwrap(); + for (index, i) in index.iter().enumerate() { + self.data[*i] = casted_col.data[index]; + } + } + } + + fn set_column_elem(&mut self, self_index: usize, col: &Box, col_index: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index] = casted_col.data[col_index]; + } + + fn move_elem(&mut self, from: usize, to: usize) { + self.data[to] = self.data[from]; + } + + fn copy_range(&mut self, self_index: usize, col: &Box, col_index: usize, num: usize) { + let casted_col = col.as_any().downcast_ref::().unwrap(); + self.data[self_index..self_index + num] + .copy_from_slice(&casted_col.data[col_index..col_index + num]); + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/csr.rs b/interactive_engine/executor/store/bmcsr/src/csr.rs new file mode 100644 index 000000000000..10b92cd4306d --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/csr.rs @@ -0,0 +1,180 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::any::Any; +use std::collections::HashSet; +use std::marker::PhantomData; + +#[cfg(feature = "hugepage_csr")] +use huge_container::HugeVec; + +use crate::col_table::ColTable; +use crate::graph::IndexType; + +#[cfg(feature = "hugepage_csr")] +type ArrayType = HugeVec; + +#[cfg(not(feature = "hugepage_csr"))] +type ArrayType = Vec; + +pub struct NbrIter<'a, I> { + inner: std::slice::Iter<'a, I>, +} + +impl<'a, I> NbrIter<'a, I> { + pub fn new(vec: &'a ArrayType, start: usize, end: usize) -> Self { + NbrIter { inner: vec[start..end].iter() } + } +} + +impl<'a, I: IndexType> Iterator for NbrIter<'a, I> { + type Item = &'a I; + + fn next(&mut self) -> Option { + self.inner.next() + } +} + +pub struct NbrIterBeta { + start: *const I, + end: *const I, +} + +impl NbrIterBeta { + pub fn new(start: *const I, end: *const I) -> Self { + NbrIterBeta { start, end } + } +} + +impl Iterator for NbrIterBeta { + type Item = I; + fn next(&mut self) -> Option { + if self.start == self.end { + None + } else { + let ret = unsafe { *self.start }; + self.start = unsafe { self.start.add(1) }; + Some(ret) + } + } +} + +unsafe impl Sync for NbrIterBeta {} +unsafe impl Send for NbrIterBeta {} + +pub struct NbrOffsetIter<'a, I> { + inner: std::slice::Iter<'a, I>, + offset: usize, +} + +impl<'a, I> NbrOffsetIter<'a, I> { + pub fn new(vec: &'a ArrayType, start: usize, end: usize) -> Self { + NbrOffsetIter { inner: vec[start..end].iter(), offset: start } + } +} + +impl<'a, I: IndexType> Iterator for NbrOffsetIter<'a, I> { + type Item = (I, usize); + + fn next(&mut self) -> Option { + match self.inner.next() { + Some(x) => { + let ret = (x.clone(), self.offset); + self.offset += 1; + Some(ret) + } + None => None, + } + } +} + +pub trait CsrTrait: Send + Sync { + fn vertex_num(&self) -> I; + fn 
max_edge_offset(&self) -> usize; + fn edge_num(&self) -> usize; + fn degree(&self, u: I) -> usize; + fn serialize(&self, path: &String); + fn deserialize(&mut self, path: &String); + + fn get_edges(&self, u: I) -> Option>; + fn get_edges_beta(&self, u: I) -> NbrIterBeta; + fn get_edges_with_offset(&self, u: I) -> Option>; + + fn as_any(&self) -> &dyn Any; + fn as_mut_any(&mut self) -> &mut dyn Any; + + fn delete_vertices(&mut self, vertices: &HashSet); + fn parallel_delete_edges(&mut self, edges: &Vec<(I, I)>, reverse: bool, p: u32); + fn parallel_delete_edges_with_props( + &mut self, edges: &Vec<(I, I)>, reverse: bool, table: &mut ColTable, p: u32, + ); + + fn insert_edges(&mut self, vertex_num: usize, edges: &Vec<(I, I)>, reverse: bool, p: u32); + + fn insert_edges_with_prop( + &mut self, vertex_num: usize, edges: &Vec<(I, I)>, edges_prop: &ColTable, reverse: bool, p: u32, + old_table: ColTable, + ) -> ColTable; +} + +#[derive(Debug)] +pub enum CsrBuildError { + OffsetOutOfCapacity, + UnfinishedVertex, +} + +pub struct SafePtr(*const I, PhantomData); +unsafe impl Send for SafePtr {} +unsafe impl Sync for SafePtr {} + +impl Clone for SafePtr { + fn clone(&self) -> Self { + SafePtr(self.0.clone(), PhantomData) + } +} + +impl Copy for SafePtr {} + +impl SafePtr { + pub fn new(ptr: &I) -> Self { + Self { 0: ptr as *const I, 1: PhantomData } + } + + pub fn get_ref(&self) -> &I { + unsafe { &*self.0 } + } +} + +pub struct SafeMutPtr(*mut I, PhantomData); +unsafe impl Send for SafeMutPtr {} +unsafe impl Sync for SafeMutPtr {} + +impl SafeMutPtr { + pub fn new(ptr: &mut I) -> Self { + Self { 0: ptr as *mut I, 1: PhantomData } + } + + pub fn get_mut(&self) -> &mut I { + unsafe { &mut *self.0 } + } +} + +impl Clone for SafeMutPtr { + fn clone(&self) -> Self { + SafeMutPtr(self.0.clone(), PhantomData) + } +} + +impl Copy for SafeMutPtr {} diff --git a/interactive_engine/executor/store/bmcsr/src/date.rs b/interactive_engine/executor/store/bmcsr/src/date.rs new file mode 
100644 index 000000000000..e1de79e3efb5 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/date.rs @@ -0,0 +1,108 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::fmt::{Debug, Display, Formatter}; + +use chrono::{Datelike, Duration, NaiveDate}; + +use crate::date_time::DateTime; +use crate::error::GDBResult; + +#[derive(Copy, Clone)] +pub struct Date { + inner: i32, +} + +impl Date { + pub fn empty() -> Self { + Date { inner: 0 } + } + + pub fn from_i32(inner: i32) -> Self { + Self { inner } + } + + pub fn new(year: i32, month: u32, day: u32) -> Self { + Date { + inner: chrono::NaiveDate::from_ymd_opt(year as i32, month as u32, day as u32) + .unwrap() + .num_days_from_ce(), + } + } + + pub fn year(&self) -> i32 { + chrono::NaiveDate::from_num_days_from_ce_opt(self.inner) + .unwrap() + .year() + } + + pub fn month(&self) -> u32 { + chrono::NaiveDate::from_num_days_from_ce_opt(self.inner) + .unwrap() + .month() + } + + pub fn day(&self) -> u32 { + chrono::NaiveDate::from_num_days_from_ce_opt(self.inner) + .unwrap() + .day() + } + + pub fn to_i32(&self) -> i32 { + self.inner + } + + pub fn add_days(&self, days: i32) -> Self { + let din = NaiveDate::from_num_days_from_ce_opt(self.inner).unwrap(); + let duration = Duration::days(days as i64); + let dout = din + duration; + Self::from_i32(dout.num_days_from_ce()) + } +} + +impl PartialEq for Date { + fn eq(&self, other: 
&Self) -> bool { + self.inner == other.inner + } +} + +impl Eq for Date {} + +impl Display for Date { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:04}-{:02}-{:02}", self.year(), self.month(), self.day()) + } +} + +impl Debug for Date { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string()) + } +} + +pub fn parse_date(val: &str) -> GDBResult { + if let Ok(timestamp) = val.parse::() { + let datetime = DateTime::new(timestamp); + let year = datetime.year(); + let month = datetime.month() as u32; + let day = datetime.day() as u32; + Ok(Date::new(year, month, day)) + } else { + let year = val[0..4].parse::()?; + let month = val[5..7].parse::()?; + let day = val[8..10].parse::()?; + Ok(Date::new(year, month, day)) + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/date_time.rs b/interactive_engine/executor/store/bmcsr/src/date_time.rs new file mode 100644 index 000000000000..b4105490d844 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/date_time.rs @@ -0,0 +1,184 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::cmp::{Ord, Ordering, PartialOrd}; +use std::fmt::{Debug, Display, Formatter}; + +use chrono::{DateTime as CDateTime, TimeZone}; +use chrono::{Datelike, Duration, Timelike, Utc}; + +#[derive(Clone, Copy)] +pub struct DateTime { + inner: i64, +} + +impl DateTime { + pub fn empty() -> Self { + DateTime { inner: 0 } + } + + pub fn new(timestamp: i64) -> Self { + DateTime { inner: timestamp } + } + + pub fn from_datetime( + year: i32, month: u32, day: u32, hour: u32, minute: u32, second: u32, millisecond: u32, + ) -> Self { + let date_dt = CDateTime::::from_utc( + chrono::NaiveDate::from_ymd_opt(year as i32, month as u32, day as u32) + .unwrap() + .and_hms_milli_opt(hour as u32, minute as u32, second as u32, millisecond as u32) + .unwrap(), + Utc, + ) + .timestamp_millis(); + Self { inner: date_dt } + } + + pub fn year(&self) -> i32 { + chrono::NaiveDateTime::from_timestamp_millis(self.inner) + .unwrap() + .year() + } + + pub fn month(&self) -> i32 { + chrono::NaiveDateTime::from_timestamp_millis(self.inner) + .unwrap() + .month() as i32 + } + + pub fn day(&self) -> i32 { + chrono::NaiveDateTime::from_timestamp_millis(self.inner) + .unwrap() + .day() as i32 + } + + pub fn hour(&self) -> i32 { + chrono::NaiveDateTime::from_timestamp_millis(self.inner) + .unwrap() + .hour() as i32 + } + + pub fn minute(&self) -> i32 { + chrono::NaiveDateTime::from_timestamp_millis(self.inner) + .unwrap() + .minute() as i32 + } + + pub fn second(&self) -> i32 { + chrono::NaiveDateTime::from_timestamp_millis(self.inner) + .unwrap() + .second() as i32 + } + + pub fn millisecond(&self) -> i32 { + (chrono::NaiveDateTime::from_timestamp_millis(self.inner) + .unwrap() + .timestamp_subsec_nanos() + / 1000000) as i32 + } + + pub fn to_i64(&self) -> i64 { + self.inner + } + + pub fn to_days_i64(&self) -> i64 { + Utc.timestamp_millis_opt(self.inner) + .unwrap() + .date() + .and_hms(0, 0, 0) + .timestamp_millis() + } + + pub fn to_chrono_date_utc(&self) -> CDateTime { + 
CDateTime::::from_utc(chrono::NaiveDateTime::from_timestamp_millis(self.inner).unwrap(), Utc) + } + + pub fn from_chrono_date_utc(dt: CDateTime) -> Self { + Self::new(dt.timestamp_millis()) + } + + pub fn add_days(&self, days: u32) -> Self { + let utc_dt = self.to_chrono_date_utc(); + let duration = Duration::days(days as i64); + let ret = utc_dt + duration; + Self::from_chrono_date_utc(ret) + } + + pub fn minus_hours(&self, hours: u32) -> Self { + let utc_dt = self.to_chrono_date_utc(); + let duration = Duration::hours(hours as i64); + let ret = utc_dt - duration; + Self::from_chrono_date_utc(ret) + } +} + +impl PartialEq for DateTime { + fn eq(&self, other: &Self) -> bool { + self.inner == other.inner + } +} + +impl Eq for DateTime {} + +impl PartialOrd for DateTime { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for DateTime { + fn cmp(&self, other: &Self) -> Ordering { + self.inner.cmp(&other.inner) + } +} + +impl Display for DateTime { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}.{:03}+00:00", + self.year(), + self.month(), + self.day(), + self.hour(), + self.minute(), + self.second(), + self.millisecond(), + ) + } +} + +impl Debug for DateTime { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string()) + } +} + +pub fn parse_datetime(val: &str) -> DateTime { + let datetime = if let Ok(utc_dt) = val.parse::>() { + let tz_hour = val[24..26].parse::().unwrap(); + let tz_minute = val[27..29].parse::().unwrap(); + let duration_hour = Duration::hours(tz_hour as i64); + let duration_minute = Duration::minutes(tz_minute as i64); + let dt = utc_dt + duration_hour + duration_minute; + DateTime::from_chrono_date_utc(dt) + } else if let Ok(timestamp) = val.parse::() { + DateTime::new(timestamp) + } else { + panic!("Failed to parse datetime {}", val); + }; + datetime +} diff --git 
a/interactive_engine/executor/store/bmcsr/src/edge_trim.rs b/interactive_engine/executor/store/bmcsr/src/edge_trim.rs new file mode 100644 index 000000000000..003bc89ed152 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/edge_trim.rs @@ -0,0 +1,83 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::collections::HashSet; +use std::fmt::Debug; + +use serde::{Deserialize, Serialize}; + +use crate::schema::CsrGraphSchema; + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct EdgeTrimJson { + ie_enable: Vec>, + oe_enable: Vec>, +} + +impl EdgeTrimJson { + pub fn get_enable_indexs(&self, schema: &CsrGraphSchema) -> (HashSet, HashSet) { + let mut ie_index_set = HashSet::::new(); + let mut oe_index_set = HashSet::::new(); + + let vertex_label_num = schema.vertex_type_to_id.len(); + let edge_label_num = schema.edge_type_to_id.len(); + + for ie in &self.ie_enable { + let src_label_name = ie.get(0).expect("src label name not found"); + let dst_label_name = ie.get(2).expect("dst label name not found"); + let edge_label_name = ie.get(1).expect("edge label name not found"); + let src_label_id = *schema + .vertex_type_to_id + .get(src_label_name) + .expect("label id not found") as usize; + let dst_label_id = *schema + .vertex_type_to_id + .get(dst_label_name) + .expect("label id not found") as usize; + let edge_label_id = *schema + .edge_type_to_id + .get(edge_label_name) 
+ .expect("label id not found") as usize; + let index = src_label_id * vertex_label_num * edge_label_num + + dst_label_id * edge_label_num + + edge_label_id; + ie_index_set.insert(index); + } + + for oe in &self.oe_enable { + let src_label_name = oe.get(0).expect("src label name not found"); + let dst_label_name = oe.get(2).expect("dst label name not found"); + let edge_label_name = oe.get(1).expect("edge label name not found"); + let src_label_id = *schema + .vertex_type_to_id + .get(src_label_name) + .expect("label id not found") as usize; + let dst_label_id = *schema + .vertex_type_to_id + .get(dst_label_name) + .expect("label id not found") as usize; + let edge_label_id = *schema + .edge_type_to_id + .get(edge_label_name) + .expect("label id not found") as usize; + let index = src_label_id * vertex_label_num * edge_label_num + + dst_label_id * edge_label_num + + edge_label_id; + oe_index_set.insert(index); + } + + (ie_index_set, oe_index_set) + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/error.rs b/interactive_engine/executor/store/bmcsr/src/error.rs new file mode 100644 index 000000000000..5a55bd38d1d1 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/error.rs @@ -0,0 +1,89 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::any::Any; +use std::io::Error; +use std::num::{ParseFloatError, ParseIntError}; + +use dyn_type::CastError; + +pub type GDBResult = Result; + +#[derive(Debug)] +pub enum GDBError { + ModifyReadOnlyError, + BincodeError(std::boxed::Box), + JsonError(serde_json::Error), + IOError(std::io::Error), + DynError(Box), + CastError(CastError), + DBNotFoundError, + LruZeroCapacity, + JsonObjectFieldError, + BooleanExpressionError, + StringExpressionError, + NumberExpressionError, + EdgeNotFoundError, + VertexNotFoundError, + UnknownError, + CrossComparisonError, + OutOfBoundError, + ParseError, + InvalidFunctionCallError, + InvalidTypeError, + FieldNotExistError, +} + +impl From for GDBError { + fn from(error: Error) -> Self { + GDBError::IOError(error) + } +} + +impl From for GDBError { + fn from(_error: ParseIntError) -> Self { + GDBError::ParseError + } +} + +impl From for GDBError { + fn from(_error: ParseFloatError) -> Self { + GDBError::ParseError + } +} + +impl From for GDBError { + fn from(error: serde_json::Error) -> Self { + GDBError::JsonError(error) + } +} + +impl From> for GDBError { + fn from(error: Box) -> Self { + GDBError::BincodeError(error) + } +} + +impl From<()> for GDBError { + fn from(_error: ()) -> Self { + GDBError::UnknownError + } +} + +impl From> for GDBError { + fn from(error: Box) -> Self { + GDBError::DynError(error) + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/graph.rs b/interactive_engine/executor/store/bmcsr/src/graph.rs new file mode 100644 index 000000000000..951e2f9f952c --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/graph.rs @@ -0,0 +1,130 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! 
Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::fmt; +use std::hash::Hash; +use std::ops::AddAssign; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; + +/// Trait for the unsigned integer type used for node and edge indices. +/// +/// Marked `unsafe` because: the trait must faithfully preserve +/// and convert index values. +pub unsafe trait IndexType: + Copy + Default + Hash + Ord + fmt::Debug + 'static + AddAssign + Send + Sync +{ + fn new(x: usize) -> Self; + fn index(&self) -> usize; + fn max() -> Self; + + fn add_assign(&mut self, other: Self); + + fn read(reader: &mut R) -> std::io::Result; + fn write(&self, writer: &mut W) -> std::io::Result<()>; +} + +unsafe impl IndexType for usize { + #[inline(always)] + fn new(x: usize) -> Self { + x + } + #[inline(always)] + fn index(&self) -> Self { + *self + } + #[inline(always)] + fn max() -> Self { + ::std::usize::MAX + } + + #[inline(always)] + fn add_assign(&mut self, other: Self) { + *self += other; + } + + fn read(reader: &mut R) -> std::io::Result { + let ret = reader.read_u64::()? 
as usize; + Ok(ret) + } + + fn write(&self, writer: &mut W) -> std::io::Result<()> { + writer.write_u64::(*self as u64) + } +} + +unsafe impl IndexType for u32 { + #[inline(always)] + fn new(x: usize) -> Self { + x as u32 + } + #[inline(always)] + fn index(&self) -> usize { + *self as usize + } + #[inline(always)] + fn max() -> Self { + ::std::u32::MAX + } + + #[inline(always)] + fn add_assign(&mut self, other: Self) { + *self += other; + } + + fn read(reader: &mut R) -> std::io::Result { + let ret = reader.read_u32::()?; + Ok(ret) + } + + fn write(&self, writer: &mut W) -> std::io::Result<()> { + writer.write_u32::(*self) + } +} + +// Index into the NodeIndex and EdgeIndex arrays +/// Edge direction. +#[derive(Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash)] +#[repr(usize)] +pub enum Direction { + /// An `Outgoing` edge is an outward edge *from* the current node. + Outgoing = 0, + /// An `Incoming` edge is an inbound edge *to* the current node. + Incoming = 1, +} + +impl Direction { + /// Return the opposite `Direction`. + #[inline] + pub fn opposite(self) -> Direction { + match self { + Direction::Outgoing => Direction::Incoming, + Direction::Incoming => Direction::Outgoing, + } + } + + /// Return `0` for `Outgoing` and `1` for `Incoming`. + #[inline] + pub fn index(self) -> usize { + (self as usize) & 0x1 + } +} + +impl Clone for Direction { + #[inline] + fn clone(&self) -> Self { + *self + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/graph_db.rs b/interactive_engine/executor/store/bmcsr/src/graph_db.rs new file mode 100644 index 000000000000..5b3ce84971dd --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/graph_db.rs @@ -0,0 +1,668 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! 
http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use std::sync::Arc; + +use rayon::iter::IntoParallelIterator; +use rayon::prelude::*; + +use crate::bmcsr::BatchMutableCsr; +use crate::bmscsr::BatchMutableSingleCsr; +use crate::col_table::ColTable; +use crate::columns::{Column, DataType, Item, RefItem}; +use crate::csr::CsrTrait; +use crate::edge_trim::EdgeTrimJson; +use crate::error::GDBResult; +use crate::graph::{Direction, IndexType}; +use crate::schema::CsrGraphSchema; +use crate::sub_graph::{SingleSubGraph, SubGraph}; +use crate::types::*; +use crate::utils::{Iter, LabeledIterator, Range}; +use crate::vertex_map::VertexMap; + +/// A data structure to maintain a local view of the vertex. +#[derive(Debug, Clone)] +pub struct LocalVertex<'a, G: IndexType + Sync + Send, I: IndexType + Sync + Send> { + /// The vertex's global id + index: I, + /// The vertex's label + label: LabelId, + /// A property reference maintains a `Row` view of the properties, which is either + /// a reference or an owned structure, depending on the form of storage. 
+ /// + table: Option<&'a ColTable>, + id_list: &'a Vec, + corner_id_list: &'a Vec, +} + +impl<'a, G: IndexType + Sync + Send, I: IndexType + Sync + Send> LocalVertex<'a, G, I> { + pub fn new(index: I, label: LabelId, id_list: &'a Vec, corner_id_list: &'a Vec) -> Self { + LocalVertex { index, label, id_list, table: None, corner_id_list } + } + + pub fn with_property( + index: I, label: LabelId, id_list: &'a Vec, corner_id_list: &'a Vec, + table: Option<&'a ColTable>, + ) -> Self { + LocalVertex { index, label, id_list, table, corner_id_list } + } + + pub fn is_valid(&self) -> bool { + self.get_id() != ::max() + } + + pub fn get_id(&self) -> G { + let index = self.index.index(); + if index < self.id_list.len() { + self.id_list[index] + } else { + self.corner_id_list[::max().index() - index - 1] + } + } + + pub fn get_label(&self) -> LabelId { + self.label + } + + pub fn get_property(&self, key: &str) -> Option { + if let Some(prop) = self.table { + prop.get_item(key, self.index.index()) + } else { + None + } + } + + pub fn get_all_properties(&self) -> Option> { + if let Some(prop) = self.table { + let mut property_table = HashMap::new(); + for head in prop.header.keys() { + property_table.insert(head.clone(), prop.get_item(head, self.index.index()).unwrap()); + } + Some(property_table) + } else { + None + } + } +} + +/// A data structure to maintain a local view of the edge. +#[derive(Clone)] +pub struct LocalEdge<'a, G: IndexType + Sync + Send, I: IndexType + Sync + Send> { + /// The start vertex's global id + start: I, + /// The end vertex's global id + end: I, + /// The edge label id + label: LabelId, + src_label: LabelId, + dst_label: LabelId, + + offset: usize, + /// A property reference maintains a `Row` view of the properties, which is either + /// a reference or an owned structure, depending on the form of storage. 
+ table: Option<&'a ColTable>, + + vertex_map: &'a VertexMap, +} + +impl<'a, G: IndexType + Sync + Send, I: IndexType + Sync + Send> LocalEdge<'a, G, I> { + pub fn new( + start: I, end: I, label: LabelId, src_label: LabelId, dst_label: LabelId, + vertex_map: &'a VertexMap, offset: usize, properties: Option<&'a ColTable>, + ) -> Self { + LocalEdge { start, end, label, src_label, dst_label, offset, table: properties, vertex_map } + } + + pub fn get_src_id(&self) -> G { + self.vertex_map + .get_global_id(self.src_label, self.start) + .unwrap() + } + + pub fn get_dst_id(&self) -> G { + self.vertex_map + .get_global_id(self.dst_label, self.end) + .unwrap() + } + + pub fn get_src_label(&self) -> LabelId { + self.src_label + } + + pub fn get_offset(&self) -> usize { + self.offset + } + + pub fn get_dst_label(&self) -> LabelId { + self.dst_label + } + + pub fn get_label(&self) -> LabelId { + self.label + } + + pub fn get_src_lid(&self) -> I { + self.start + } + + pub fn get_dst_lid(&self) -> I { + self.end + } + + pub fn get_property(&self, key: &str) -> Option { + if let Some(prop) = self.table { + prop.get_item(key, self.offset) + } else { + None + } + } + + pub fn get_all_properties(&self) -> Option> { + if let Some(prop) = self.table { + let mut property_table = HashMap::new(); + for head in prop.header.keys() { + property_table.insert(head.clone(), prop.get_item(head, self.offset).unwrap()); + } + Some(property_table) + } else { + None + } + } +} + +pub struct GraphDB { + pub partition: usize, + pub ie: Vec>>, + pub oe: Vec>>, + + pub graph_schema: CsrGraphSchema, + + pub vertex_map: VertexMap, + + pub vertex_prop_table: Vec, + pub ie_edge_prop_table: HashMap, + pub oe_edge_prop_table: HashMap, + + pub vertex_label_num: usize, + pub edge_label_num: usize, +} + +impl GraphDB +where + G: Eq + IndexType + Send + Sync, + I: IndexType + Send + Sync, +{ + pub fn edge_label_to_index( + &self, src_label: LabelId, dst_label: LabelId, edge_label: LabelId, dir: Direction, + ) -> 
usize { + match dir { + Direction::Incoming => { + dst_label as usize * self.vertex_label_num * self.edge_label_num + + src_label as usize * self.edge_label_num + + edge_label as usize + } + Direction::Outgoing => { + src_label as usize * self.vertex_label_num * self.edge_label_num + + dst_label as usize * self.edge_label_num + + edge_label as usize + } + } + } + + pub fn get_vertices_num(&self, label: LabelId) -> usize { + self.vertex_map.vertex_num(label) + } + + pub fn get_edges_num(&self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId) -> usize { + let index = self.edge_label_to_index(src_label, dst_label, edge_label, Direction::Outgoing); + self.oe[index].edge_num() + } + + pub fn get_max_edge_offset( + &self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, dir: Direction, + ) -> usize { + let index = self.edge_label_to_index(src_label, dst_label, edge_label, Direction::Outgoing); + match dir { + Direction::Incoming => self.ie[index].max_edge_offset(), + Direction::Outgoing => self.oe[index].max_edge_offset(), + } + } + + pub fn get_global_id(&self, id: I, label: LabelId) -> Option { + self.vertex_map.get_global_id(label, id) + } + + pub fn get_internal_id(&self, id: G) -> I { + self.vertex_map.get_internal_id(id).unwrap().1 + } + + pub fn get_internal_id_beta(&self, id: G) -> Option { + if let Some((_, id)) = self.vertex_map.get_internal_id(id) { + Some(id) + } else { + None + } + } + + fn index_to_local_vertex(&self, label_id: LabelId, index: I, with_property: bool) -> LocalVertex { + if with_property { + LocalVertex::with_property( + index, + label_id, + &self.vertex_map.index_to_global_id[label_id as usize], + &self.vertex_map.index_to_corner_global_id[label_id as usize], + Some(&self.vertex_prop_table[label_id as usize]), + ) + } else { + LocalVertex::new( + index, + label_id, + &self.vertex_map.index_to_global_id[label_id as usize], + &self.vertex_map.index_to_corner_global_id[label_id as usize], + ) + } + } + + pub fn 
get_all_vertices(&self, labels: Option<&Vec>) -> Iter> { + if labels.is_none() { + let mut iters = vec![]; + let mut got_labels = vec![]; + for v in 0..self.vertex_label_num { + iters.push(Range::new(I::new(0), I::new(self.get_vertices_num(v as LabelId))).into_iter()); + got_labels.push(v as LabelId) + } + Iter::from_iter( + LabeledIterator::new(got_labels, iters) + .map(move |(label, index)| self.index_to_local_vertex(label, index, true)), + ) + } else if labels.unwrap().len() == 1 { + let label = labels.unwrap()[0]; + let range = Range::new(I::new(0), I::new(self.get_vertices_num(label))); + Iter::from_iter( + range + .into_iter() + .map(move |index| self.index_to_local_vertex(label, index, true)), + ) + } else { + let mut iters = vec![]; + let mut got_labels = vec![]; + for v in labels.unwrap() { + iters.push(Range::new(I::new(0), I::new(self.get_vertices_num(*v))).into_iter()); + got_labels.push(*v) + } + Iter::from_iter( + LabeledIterator::new(got_labels, iters) + .map(move |(label, index)| self.index_to_local_vertex(label, index, true)), + ) + } + } + + pub fn deserialize(dir: &str, partition: usize, trim_json_path: Option) -> GDBResult { + let root_dir = PathBuf::from_str(dir).unwrap(); + let schema_path = root_dir + .join(DIR_GRAPH_SCHEMA) + .join(FILE_SCHEMA); + let graph_schema = CsrGraphSchema::from_json_file(schema_path)?; + // graph_schema.desc(); + let partition_dir = root_dir + .join(DIR_BINARY_DATA) + .join(format!("partition_{}", partition)); + + let (ie_enable, oe_enable) = if let Some(trim_json_path) = &trim_json_path { + let edge_trim_path = PathBuf::from_str(trim_json_path).unwrap(); + let file = File::open(edge_trim_path)?; + let trim_json = + serde_json::from_reader::(file).map_err(std::io::Error::from)?; + trim_json.get_enable_indexs(&graph_schema) + } else { + (HashSet::::new(), HashSet::::new()) + }; + + let vertex_label_num = graph_schema.vertex_type_to_id.len(); + let edge_label_num = graph_schema.edge_type_to_id.len(); + + let csr_num = 
vertex_label_num * vertex_label_num * edge_label_num; + let mut ie: Vec>> = vec![]; + let mut oe: Vec>> = vec![]; + for _ in 0..csr_num { + ie.push(Box::new(BatchMutableSingleCsr::::new())); + oe.push(Box::new(BatchMutableSingleCsr::::new())); + } + + let mut csr_tasks = vec![]; + for e_label_i in 0..edge_label_num { + let edge_label_name = graph_schema.edge_label_names()[e_label_i].clone(); + for src_label_i in 0..vertex_label_num { + let src_label_name = graph_schema.vertex_label_names()[src_label_i].clone(); + for dst_label_i in 0..vertex_label_num { + let dst_label_name = graph_schema.vertex_label_names()[dst_label_i].clone(); + let index: usize = src_label_i * vertex_label_num * edge_label_num + + dst_label_i * edge_label_num + + e_label_i; + let ie_path = &partition_dir + .join(format!("ie_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + if Path::exists(ie_path) && (trim_json_path.is_none() || ie_enable.contains(&index)) { + csr_tasks.push((src_label_i, e_label_i, dst_label_i, Direction::Incoming)); + } + let oe_path = &partition_dir + .join(format!("oe_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + if Path::exists(oe_path) && (trim_json_path.is_none() || oe_enable.contains(&index)) { + csr_tasks.push((src_label_i, e_label_i, dst_label_i, Direction::Outgoing)); + } + } + } + } + + let csr_return: Vec>> = csr_tasks + .par_iter() + .map(|(src_label, edge_label, dst_label, dir)| { + let src_label_name = graph_schema.vertex_label_names()[*src_label].clone(); + let dst_label_name = graph_schema.vertex_label_names()[*dst_label].clone(); + let edge_label_name = graph_schema.edge_label_names()[*edge_label].clone(); + let index: usize = + src_label * vertex_label_num * edge_label_num + dst_label * edge_label_num + edge_label; + if *dir == Direction::Outgoing { + let oe_path = &partition_dir + .join(format!("oe_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + if Path::exists(oe_path) && (trim_json_path.is_none() || 
oe_enable.contains(&index)) { + info!("importing {}", oe_path.as_os_str().to_str().unwrap()); + let path_str = oe_path.to_str().unwrap().to_string(); + let mut oe_csr: Box> = if graph_schema.is_single_oe( + *src_label as LabelId, + *edge_label as LabelId, + *dst_label as LabelId, + ) { + Box::new(BatchMutableSingleCsr::::new()) + } else { + Box::new(BatchMutableCsr::::new()) + }; + oe_csr.deserialize(&path_str); + return oe_csr; + } + } else { + let ie_path = &partition_dir + .join(format!("ie_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + if Path::exists(ie_path) && (trim_json_path.is_none() || ie_enable.contains(&index)) { + info!("importing {}", ie_path.as_os_str().to_str().unwrap()); + let path_str = ie_path.to_str().unwrap().to_string(); + let mut ie_csr: Box> = if graph_schema.is_single_ie( + *src_label as LabelId, + *edge_label as LabelId, + *dst_label as LabelId, + ) { + Box::new(BatchMutableSingleCsr::::new()) + } else { + Box::new(BatchMutableCsr::::new()) + }; + ie_csr.deserialize(&path_str); + return ie_csr; + } + } + Box::new(BatchMutableSingleCsr::::new()) + }) + .collect(); + + for ((src_label_i, e_label_i, dst_label_i, dir), csr) in csr_tasks + .into_iter() + .zip(csr_return.into_iter()) + { + let index: usize = + src_label_i * vertex_label_num * edge_label_num + dst_label_i * edge_label_num + e_label_i; + if dir == Direction::Outgoing { + oe[index] = csr; + } else { + ie[index] = csr; + } + } + + let vertex_prop_table: Vec = (0..vertex_label_num) + .into_par_iter() + .map(|v_label| { + let v_label_name = graph_schema.vertex_label_names()[v_label].clone(); + let mut table = ColTable::new(vec![]); + let table_path = &partition_dir.join(format!("vp_{}", v_label_name)); + let table_path_str = table_path.to_str().unwrap().to_string(); + table.deserialize_table(&table_path_str); + table + }) + .collect(); + + let mut oe_edge_prop_table = HashMap::new(); + let mut ie_edge_prop_table = HashMap::new(); + for e_label_i in 0..edge_label_num { 
+ for src_label_i in 0..vertex_label_num { + for dst_label_i in 0..vertex_label_num { + let edge_index = src_label_i * vertex_label_num * edge_label_num + + dst_label_i * edge_label_num + + e_label_i; + let src_label_name = graph_schema.vertex_label_names()[src_label_i].clone(); + let dst_label_name = graph_schema.vertex_label_names()[dst_label_i].clone(); + let edge_label_name = graph_schema.edge_label_names()[e_label_i].clone(); + + let oe_edge_property_path = &partition_dir + .join(format!("oep_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + let oe_edge_property_path_str = oe_edge_property_path + .to_str() + .unwrap() + .to_string(); + if Path::new(&oe_edge_property_path_str).exists() { + let mut table = ColTable::new(vec![]); + info!( + "importing oe edge property: {}_{}_{}, {}", + src_label_name, edge_label_name, dst_label_name, oe_edge_property_path_str + ); + table.deserialize_table(&oe_edge_property_path_str); + oe_edge_prop_table.insert(edge_index, table); + } + + let ie_edge_property_path = &partition_dir + .join(format!("iep_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + let ie_edge_property_path_str = ie_edge_property_path + .to_str() + .unwrap() + .to_string(); + if Path::new(&ie_edge_property_path_str).exists() { + let mut table = ColTable::new(vec![]); + info!( + "importing ie edge property: {}_{}_{}, {}", + src_label_name, edge_label_name, dst_label_name, oe_edge_property_path_str + ); + table.deserialize_table(&ie_edge_property_path_str); + ie_edge_prop_table.insert(edge_index, table); + } + } + } + } + + let mut vertex_map = VertexMap::new(vertex_label_num); + let vm_path = &partition_dir.join("vm"); + let vm_path_str = vm_path.to_str().unwrap().to_string(); + vertex_map.deserialize(&vm_path_str); + + Ok(Self { + partition, + ie, + oe, + graph_schema: graph_schema, + vertex_prop_table, + vertex_map, + ie_edge_prop_table, + oe_edge_prop_table, + vertex_label_num, + edge_label_num, + }) + } + + pub fn get_sub_graph( 
+ &self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, dir: Direction, + ) -> SubGraph<'_, G, I> { + let index = self.edge_label_to_index(src_label, dst_label, edge_label, dir); + info!( + "get_sub_graph: {} - {} - {}, {:?}", + self.graph_schema.vertex_label_names()[src_label as usize], + self.graph_schema.edge_label_names()[edge_label as usize], + self.graph_schema.vertex_label_names()[dst_label as usize], + dir + ); + match dir { + Direction::Outgoing => SubGraph::new( + &self.oe[index] + .as_any() + .downcast_ref::>() + .unwrap(), + &self.vertex_map, + src_label, + dst_label, + edge_label, + &self.vertex_prop_table[src_label as usize], + self.oe_edge_prop_table.get(&index), + ), + Direction::Incoming => SubGraph::new( + &self.ie[index] + .as_any() + .downcast_ref::>() + .unwrap(), + &self.vertex_map, + src_label, + dst_label, + edge_label, + &self.vertex_prop_table[src_label as usize], + self.ie_edge_prop_table.get(&index), + ), + } + } + + pub fn get_single_sub_graph( + &self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, dir: Direction, + ) -> SingleSubGraph<'_, G, I> { + info!( + "get_single_sub_graph: {} - {} - {}, {:?}", + self.graph_schema.vertex_label_names()[src_label as usize], + self.graph_schema.edge_label_names()[edge_label as usize], + self.graph_schema.vertex_label_names()[dst_label as usize], + dir + ); + let index = self.edge_label_to_index(src_label, dst_label, edge_label, dir); + match dir { + Direction::Outgoing => SingleSubGraph::new( + &self.oe[index] + .as_any() + .downcast_ref::>() + .unwrap(), + &self.vertex_map, + src_label, + dst_label, + edge_label, + &self.vertex_prop_table[src_label as usize], + self.oe_edge_prop_table.get(&index), + ), + Direction::Incoming => SingleSubGraph::new( + &self.ie[index] + .as_any() + .downcast_ref::>() + .unwrap(), + &self.vertex_map, + src_label, + dst_label, + edge_label, + &self.vertex_prop_table[src_label as usize], + self.ie_edge_prop_table.get(&index), + ), + } + } + + 
pub fn insert_vertex(&mut self, label: LabelId, id: G, properties: Option>) { + let lid = self.vertex_map.add_vertex(id, label); + if let Some(properties) = properties { + self.vertex_prop_table[label as usize].insert(lid.index(), &properties); + } + } + + pub fn init_vertex_index_prop( + &mut self, index_name: String, vertex_label: LabelId, data_type: DataType, + ) { + if let Some(prop_index) = + self.graph_schema + .add_vertex_index_prop(index_name.clone(), vertex_label, data_type) + { + if let Some(mut col_table) = self + .vertex_prop_table + .get_mut(vertex_label as usize) + { + if !col_table.header.contains_key(&index_name) { + col_table.add_property(index_name, data_type); + } + } + } + } + + pub fn set_vertex_index_prop( + &mut self, index_name: String, vertex_label: LabelId, index: &Vec, data: Box, + ) { + if let Some(mut col_table) = self + .vertex_prop_table + .get_mut(vertex_label as usize) + { + col_table.set_property(index_name, index, data); + } + } + + pub fn init_edge_index_prop( + &mut self, index_name: String, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + data_type: DataType, + ) { + if let Some(prop_index) = self.graph_schema.add_edge_index_prop( + index_name.clone(), + src_label, + edge_label, + dst_label, + data_type, + ) { + let edge_index = src_label as usize * self.vertex_label_num * self.edge_label_num + + dst_label as usize * self.edge_label_num + + edge_label as usize; + if let Some(in_col_table) = self.ie_edge_prop_table.get_mut(&edge_index) { + in_col_table.add_property(index_name.clone(), data_type); + } + if let Some(out_col_table) = self.oe_edge_prop_table.get_mut(&edge_index) { + out_col_table.add_property(index_name, data_type); + } + } + } + + pub fn set_edge_index_prop( + &mut self, index_name: String, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + in_index: Option<&Vec>, in_data: Option>, out_index: Option<&Vec>, + out_data: Option>, + ) { + let edge_index = src_label as usize * 
self.vertex_label_num * self.edge_label_num + + dst_label as usize * self.edge_label_num + + edge_label as usize; + if in_index.is_some() { + if let Some(mut in_col_table) = self.ie_edge_prop_table.get_mut(&edge_index) { + in_col_table.set_property(index_name.clone(), in_index.unwrap(), in_data.unwrap()); + } + } + if out_index.is_some() { + if let Some(mut out_col_table) = self.oe_edge_prop_table.get_mut(&edge_index) { + out_col_table.set_property(index_name, out_index.unwrap(), out_data.unwrap()); + } + } + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/graph_loader.rs b/interactive_engine/executor/store/bmcsr/src/graph_loader.rs new file mode 100644 index 000000000000..c80defcf46f1 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/graph_loader.rs @@ -0,0 +1,636 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::collections::HashSet; +use std::fs::{create_dir_all, read_dir, File}; +use std::io::{BufReader, Read}; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use std::sync::Arc; + +use csv::{Reader, ReaderBuilder}; +use glob::glob; +use regex::Regex; +use rust_htslib::bgzf::Reader as GzReader; + +use crate::bmcsr::BatchMutableCsrBuilder; +use crate::bmscsr::BatchMutableSingleCsrBuilder; +use crate::col_table::{parse_properties, ColTable}; +use crate::columns::Item; +use crate::csr::CsrTrait; +use crate::error::{GDBError, GDBResult}; +use crate::graph::IndexType; +use crate::ldbc_parser::{LDBCEdgeParser, LDBCVertexParser}; +use crate::schema::{CsrGraphSchema, InputSchema, Schema}; +use crate::types::{DefaultId, InternalId, LabelId, DIR_BINARY_DATA}; +use crate::vertex_map::VertexMap; + +pub fn get_files_list_beta(prefix: &PathBuf, file_strings: &Vec) -> Vec { + let mut ret = vec![]; + for suffix in file_strings.iter() { + let path = prefix.to_str().unwrap().to_string() + "/" + suffix; + for entry in glob(&path).unwrap() { + match entry { + Ok(p) => ret.push(p), + Err(e) => warn!("parsing {} failed: {:?}", path, e), + } + } + } + ret +} + +pub fn get_files_list(prefix: &PathBuf, file_strings: &Vec) -> GDBResult> { + let mut path_lists = vec![]; + for file_string in file_strings { + let temp_path = PathBuf::from(prefix.to_string_lossy().to_string() + "/" + file_string); + let filename = temp_path + .file_name() + .ok_or(GDBError::UnknownError)? + .to_str() + .ok_or(GDBError::UnknownError)?; + if filename.contains("*") { + let re_string = "^".to_owned() + &filename.replace(".", "\\.").replace("*", ".*") + "$"; + let re = Regex::new(&re_string).unwrap(); + let parent_dir = temp_path.parent().unwrap(); + for _entry in read_dir(parent_dir)? { + let entry = _entry?; + let path = entry.path(); + let fname = path + .file_name() + .ok_or(GDBError::UnknownError)? 
+ .to_str() + .ok_or(GDBError::UnknownError)?; + if re.is_match(fname) { + path_lists.push(path); + } + } + } else { + path_lists.push(temp_path); + } + } + Ok(path_lists) +} + +pub(crate) fn keep_vertex(vid: G, peers: usize, work_id: usize) -> bool { + vid.index() % peers == work_id +} + +pub struct GraphLoader< + G: FromStr + Send + Sync + IndexType = DefaultId, + I: Send + Sync + IndexType = InternalId, +> { + input_dir: PathBuf, + partition_dir: PathBuf, + + work_id: usize, + peers: usize, + delim: u8, + input_schema: Arc, + graph_schema: Arc, + skip_header: bool, + vertex_map: VertexMap, +} + +impl GraphLoader { + pub fn new>( + input_dir: D, output_path: D, input_schema_file: D, graph_schema_file: D, work_id: usize, + peers: usize, + ) -> GraphLoader { + // The graph schema is parsed first because the input schema is resolved against it; each panic message names the file that failed. + let graph_schema = + CsrGraphSchema::from_json_file(graph_schema_file).expect("Read graph schema error!"); + let input_schema = InputSchema::from_json_file(input_schema_file, &graph_schema) + .expect("Read input schema error!"); + graph_schema.desc(); + + let vertex_label_num = graph_schema.vertex_type_to_id.len(); + let vertex_map = VertexMap::::new(vertex_label_num); + + let output_dir = output_path.as_ref(); + let partition_dir = output_dir + .join(DIR_BINARY_DATA) + .join(format!("partition_{}", work_id)); + + Self { + input_dir: input_dir.as_ref().to_path_buf(), + partition_dir, + + work_id, + peers, + delim: b'|', + input_schema: Arc::new(input_schema), + graph_schema: Arc::new(graph_schema), + skip_header: false, + + vertex_map, + } + } + + /// For specifying a different delimiter + pub fn with_delimiter(mut self, delim: u8) -> Self { + self.delim = delim; + self + } + + pub fn skip_header(&mut self) { + self.skip_header = true; + } + + fn load_vertices( + &mut self, vertex_type: LabelId, mut rdr: Reader, table: &mut ColTable, is_static: bool, + ) { + let input_header = self + .input_schema + .get_vertex_header(vertex_type) + .unwrap(); + let graph_header = self + .graph_schema + 
.get_vertex_header(vertex_type) + .unwrap(); + let mut keep_set = HashSet::new(); + for pair in graph_header { + keep_set.insert(pair.0.clone()); + } + let mut selected = vec![false; input_header.len()]; + let mut id_col_id = 0; + for (index, (n, _)) in input_header.iter().enumerate() { + if keep_set.contains(n) { + selected[index] = true; + } + if n == "id" { + id_col_id = index; + } + } + let parser = LDBCVertexParser::new(vertex_type, id_col_id); + info!("loading vertex-{}", vertex_type); + if is_static { + for result in rdr.records() { + if let Ok(record) = result { + let vertex_meta = parser.parse_vertex_meta(&record); + if let Ok(properties) = parse_properties(&record, input_header, selected.as_slice()) { + let vertex_index = self + .vertex_map + .add_vertex(vertex_meta.global_id, vertex_meta.label); + if properties.len() > 0 { + table.insert(vertex_index.index(), &properties); + } + } + } + } + } else { + for result in rdr.records() { + if let Ok(record) = result { + let vertex_meta = parser.parse_vertex_meta(&record); + if keep_vertex(vertex_meta.global_id, self.peers, self.work_id) { + if let Ok(properties) = parse_properties(&record, input_header, selected.as_slice()) + { + let vertex_index = self + .vertex_map + .add_vertex(vertex_meta.global_id, vertex_meta.label); + if properties.len() > 0 { + table.insert(vertex_index.index(), &properties); + } + } + } + } + } + } + } + + fn load_edges( + &mut self, src_vertex_type: LabelId, dst_vertex_type: LabelId, edge_type: LabelId, + is_src_static: bool, is_dst_static: bool, mut rdr: Reader, idegree: &mut Vec, + odegree: &mut Vec, parsed_edges: &mut Vec<(I, I, Vec)>, + ) { + info!("loading edge-{}-{}-{}", src_vertex_type, edge_type, dst_vertex_type); + let input_header = self + .input_schema + .get_edge_header(src_vertex_type, edge_type, dst_vertex_type) + .unwrap(); + let graph_header = self + .graph_schema + .get_edge_header(src_vertex_type, edge_type, dst_vertex_type) + .unwrap(); + let mut keep_set = 
HashSet::new(); + for pair in graph_header { + keep_set.insert(pair.0.clone()); + } + let mut selected = vec![false; input_header.len()]; + let mut src_col_id = 0; + let mut dst_col_id = 1; + for (index, (name, _)) in input_header.iter().enumerate() { + if keep_set.contains(name) { + selected[index] = true; + } + if name == "start_id" { + src_col_id = index; + } else if name == "end_id" { + dst_col_id = index; + } + } + + let src_num = self.vertex_map.vertex_num(src_vertex_type); + let dst_num = self.vertex_map.vertex_num(dst_vertex_type); + let mut parser = LDBCEdgeParser::::new(src_vertex_type, dst_vertex_type, edge_type); + parser.with_endpoint_col_id(src_col_id, dst_col_id); + + if is_src_static && is_dst_static { + for result in rdr.records() { + if let Ok(record) = result { + let edge_meta = parser.parse_edge_meta(&record); + if let Ok(properties) = parse_properties(&record, input_header, selected.as_slice()) { + let src_lid = self + .vertex_map + .add_corner_vertex(edge_meta.src_global_id, src_vertex_type); + if src_lid.index() < src_num { + odegree[src_lid.index()] += 1; + } + let dst_lid = self + .vertex_map + .add_corner_vertex(edge_meta.dst_global_id, dst_vertex_type); + if dst_lid.index() < dst_num { + idegree[dst_lid.index()] += 1; + } + parsed_edges.push((src_lid, dst_lid, properties)); + } + } + } + } else if is_src_static && !is_dst_static { + for result in rdr.records() { + if let Ok(record) = result { + let edge_meta = parser.parse_edge_meta(&record); + if let Ok(properties) = parse_properties(&record, input_header, selected.as_slice()) { + if keep_vertex(edge_meta.src_global_id, self.peers, self.work_id) + || keep_vertex(edge_meta.dst_global_id, self.peers, self.work_id) + { + let src_lid = self + .vertex_map + .add_corner_vertex(edge_meta.src_global_id, src_vertex_type); + if src_lid.index() < src_num { + odegree[src_lid.index()] += 1; + } + let dst_lid = self + .vertex_map + .add_corner_vertex(edge_meta.dst_global_id, dst_vertex_type); + if 
dst_lid.index() < dst_num { + idegree[dst_lid.index()] += 1; + } + parsed_edges.push((src_lid, dst_lid, properties)); + } + } + } + } + } else if !is_src_static && is_dst_static { + for result in rdr.records() { + if let Ok(record) = result { + let edge_meta = parser.parse_edge_meta(&record); + if let Ok(properties) = parse_properties(&record, input_header, selected.as_slice()) { + if keep_vertex(edge_meta.src_global_id, self.peers, self.work_id) { + let src_lid = self + .vertex_map + .add_corner_vertex(edge_meta.src_global_id, src_vertex_type); + if src_lid.index() < src_num { + odegree[src_lid.index()] += 1; + } + let dst_lid = self + .vertex_map + .add_corner_vertex(edge_meta.dst_global_id, dst_vertex_type); + if dst_lid.index() < dst_num { + idegree[dst_lid.index()] += 1; + } + parsed_edges.push((src_lid, dst_lid, properties)); + } + } + } + } + } else { + for result in rdr.records() { + if let Ok(record) = result { + let edge_meta = parser.parse_edge_meta(&record); + if let Ok(properties) = parse_properties(&record, input_header, selected.as_slice()) { + if keep_vertex(edge_meta.src_global_id, self.peers, self.work_id) + || keep_vertex(edge_meta.dst_global_id, self.peers, self.work_id) + { + let src_lid = self + .vertex_map + .add_corner_vertex(edge_meta.src_global_id, src_vertex_type); + if src_lid.index() < src_num { + odegree[src_lid.index()] += 1; + } + let dst_lid = self + .vertex_map + .add_corner_vertex(edge_meta.dst_global_id, dst_vertex_type); + if dst_lid.index() < dst_num { + idegree[dst_lid.index()] += 1; + } + parsed_edges.push((src_lid, dst_lid, properties)); + } + } + } + } + } + } + + pub fn load(&mut self) -> GDBResult<()> { + create_dir_all(&self.partition_dir)?; + + let v_label_num = self.graph_schema.vertex_type_to_id.len() as LabelId; + for v_label_i in 0..v_label_num { + let cols = self + .graph_schema + .get_vertex_header(v_label_i) + .unwrap(); + let mut header = vec![]; + for pair in cols.iter() { + header.push((pair.1.clone(), 
pair.0.clone())); + } + let mut table = ColTable::new(header); + let vertex_file_strings = self + .input_schema + .get_vertex_file(v_label_i) + .unwrap(); + let vertex_files = get_files_list(&self.input_dir, vertex_file_strings).unwrap(); + + for vertex_file in vertex_files.iter() { + if vertex_file + .clone() + .to_str() + .unwrap() + .ends_with(".csv") + { + let rdr = ReaderBuilder::new() + .delimiter(self.delim) + .buffer_capacity(4096) + .comment(Some(b'#')) + .flexible(true) + .has_headers(self.skip_header) + .from_reader(BufReader::new(File::open(&vertex_file).unwrap())); + self.load_vertices( + v_label_i, + rdr, + &mut table, + self.graph_schema.is_static_vertex(v_label_i), + ); + } else if vertex_file + .clone() + .to_str() + .unwrap() + .ends_with(".csv.gz") + { + let rdr = ReaderBuilder::new() + .delimiter(self.delim) + .buffer_capacity(4096) + .comment(Some(b'#')) + .flexible(true) + .has_headers(self.skip_header) + .from_reader(BufReader::new(GzReader::from_path(&vertex_file).unwrap())); + self.load_vertices( + v_label_i, + rdr, + &mut table, + self.graph_schema.is_static_vertex(v_label_i), + ); + } + } + + let table_path = self + .partition_dir + .join(format!("vp_{}", self.graph_schema.vertex_label_names()[v_label_i as usize])); + let table_path_str = table_path.to_str().unwrap().to_string(); + info!( + "vertex {}, size: {}", + self.graph_schema.vertex_label_names()[v_label_i as usize], + table.row_num() + ); + table.serialize_table(&table_path_str); + } + + let e_label_num = self.graph_schema.edge_type_to_id.len() as LabelId; + for e_label_i in 0..e_label_num { + let edge_label_name = self.graph_schema.edge_label_names()[e_label_i as usize].clone(); + + for src_label_i in 0..v_label_num { + for dst_label_i in 0..v_label_num { + let src_num = self.vertex_map.vertex_num(src_label_i); + let dst_num = self.vertex_map.vertex_num(dst_label_i); + let mut idegree = vec![0_i32; dst_num as usize]; + let mut odegree = vec![0_i32; src_num as usize]; + let mut 
parsed_edges = vec![]; + + if let Some(edge_file_strings) = + self.input_schema + .get_edge_file(src_label_i, e_label_i, dst_label_i) + { + for i in edge_file_strings { + info!("{}", i); + } + let edge_files = get_files_list(&self.input_dir, edge_file_strings).unwrap(); + for edge_file in edge_files.iter() { + info!("reading from file: {}", edge_file.clone().to_str().unwrap()); + if edge_file + .clone() + .to_str() + .unwrap() + .ends_with(".csv") + { + let rdr = ReaderBuilder::new() + .delimiter(self.delim) + .buffer_capacity(4096) + .comment(Some(b'#')) + .flexible(true) + .has_headers(self.skip_header) + .from_reader(BufReader::new(File::open(&edge_file).unwrap())); + self.load_edges( + src_label_i, + dst_label_i, + e_label_i, + self.graph_schema.is_static_vertex(src_label_i), + self.graph_schema.is_static_vertex(dst_label_i), + rdr, + &mut idegree, + &mut odegree, + &mut parsed_edges, + ); + } else if edge_file + .clone() + .to_str() + .unwrap() + .ends_with(".csv.gz") + { + let rdr = ReaderBuilder::new() + .delimiter(self.delim) + .buffer_capacity(4096) + .comment(Some(b'#')) + .flexible(true) + .has_headers(self.skip_header) + .from_reader(BufReader::new(GzReader::from_path(&edge_file).unwrap())); + self.load_edges( + src_label_i, + dst_label_i, + e_label_i, + self.graph_schema.is_static_vertex(src_label_i), + self.graph_schema.is_static_vertex(dst_label_i), + rdr, + &mut idegree, + &mut odegree, + &mut parsed_edges, + ); + } + } + } + if parsed_edges.is_empty() { + continue; + } + let src_label_name = + self.graph_schema.vertex_label_names()[src_label_i as usize].clone(); + let dst_label_name = + self.graph_schema.vertex_label_names()[dst_label_i as usize].clone(); + let cols = self + .graph_schema + .get_edge_header(src_label_i, e_label_i, dst_label_i) + .unwrap(); + let mut header = vec![]; + for pair in cols.iter() { + header.push((pair.1.clone(), pair.0.clone())); + } + let mut ie_edge_properties = ColTable::new(header.clone()); + let mut 
oe_edge_properties = ColTable::new(header.clone()); + if self + .graph_schema + .is_single_ie(src_label_i, e_label_i, dst_label_i) + { + let mut ie_csr_builder = BatchMutableSingleCsrBuilder::::new(); + let mut oe_csr_builder = BatchMutableCsrBuilder::::new(); + ie_csr_builder.init(&idegree, 1.2); + oe_csr_builder.init(&odegree, 1.2); + for e in parsed_edges.iter() { + let ie_offset = ie_csr_builder.put_edge(e.1, e.0).unwrap(); + let oe_offset = oe_csr_builder.put_edge(e.0, e.1).unwrap(); + if e.2.len() > 0 { + ie_edge_properties.insert(ie_offset, &e.2); + oe_edge_properties.insert(oe_offset, &e.2); + } + } + + let ie_csr = ie_csr_builder.finish().unwrap(); + let oe_csr = oe_csr_builder.finish().unwrap(); + + info!("start export ie"); + let ie_path = self + .partition_dir + .join(format!("ie_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + let ie_path_str = ie_path.to_str().unwrap().to_string(); + ie_csr.serialize(&ie_path_str); + info!("start export oe"); + let oe_path = self + .partition_dir + .join(format!("oe_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + let oe_path_str = oe_path.to_str().unwrap().to_string(); + oe_csr.serialize(&oe_path_str); + info!("finished export"); + } else if self + .graph_schema + .is_single_oe(src_label_i, e_label_i, dst_label_i) + { + let mut ie_csr_builder = BatchMutableCsrBuilder::::new(); + let mut oe_csr_builder = BatchMutableSingleCsrBuilder::::new(); + ie_csr_builder.init(&idegree, 1.2); + oe_csr_builder.init(&odegree, 1.2); + for e in parsed_edges.iter() { + let ie_offset = ie_csr_builder.put_edge(e.1, e.0).unwrap(); + let oe_offset = oe_csr_builder.put_edge(e.0, e.1).unwrap(); + if e.2.len() > 0 { + ie_edge_properties.insert(ie_offset, &e.2); + oe_edge_properties.insert(oe_offset, &e.2); + } + } + + let ie_csr = ie_csr_builder.finish().unwrap(); + let oe_csr = oe_csr_builder.finish().unwrap(); + + info!("start export ie"); + let ie_path = self + .partition_dir + .join(format!("ie_{}_{}_{}", 
src_label_name, edge_label_name, dst_label_name)); + let ie_path_str = ie_path.to_str().unwrap().to_string(); + ie_csr.serialize(&ie_path_str); + info!("start export oe"); + let oe_path = self + .partition_dir + .join(format!("oe_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + let oe_path_str = oe_path.to_str().unwrap().to_string(); + oe_csr.serialize(&oe_path_str); + info!("finished export"); + } else { + let mut ie_csr_builder = BatchMutableCsrBuilder::::new(); + let mut oe_csr_builder = BatchMutableCsrBuilder::::new(); + ie_csr_builder.init(&idegree, 1.2); + oe_csr_builder.init(&odegree, 1.2); + for e in parsed_edges.iter() { + let ie_offset = ie_csr_builder.put_edge(e.1, e.0).unwrap(); + let oe_offset = oe_csr_builder.put_edge(e.0, e.1).unwrap(); + if e.2.len() > 0 { + ie_edge_properties.insert(ie_offset, &e.2); + oe_edge_properties.insert(oe_offset, &e.2); + } + } + + let ie_csr = ie_csr_builder.finish().unwrap(); + let oe_csr = oe_csr_builder.finish().unwrap(); + + info!("start export ie, edge size {}", ie_csr.edge_num()); + let ie_path = self + .partition_dir + .join(format!("ie_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + let ie_path_str = ie_path.to_str().unwrap().to_string(); + ie_csr.serialize(&ie_path_str); + info!("start export oe"); + let oe_path = self + .partition_dir + .join(format!("oe_{}_{}_{}", src_label_name, edge_label_name, dst_label_name)); + let oe_path_str = oe_path.to_str().unwrap().to_string(); + oe_csr.serialize(&oe_path_str); + info!("finished export"); + } + if oe_edge_properties.row_num() > 0 { + let edge_property_path = self.partition_dir.join(format!( + "oep_{}_{}_{}", + self.graph_schema.vertex_label_names()[src_label_i as usize], + self.graph_schema.edge_label_names()[e_label_i as usize], + self.graph_schema.vertex_label_names()[dst_label_i as usize] + )); + let edge_property_path_str = edge_property_path.to_str().unwrap().to_string(); + 
oe_edge_properties.serialize_table(&edge_property_path_str); + } + if ie_edge_properties.row_num() > 0 { + let edge_property_path = self.partition_dir.join(format!( + "iep_{}_{}_{}", + self.graph_schema.vertex_label_names()[src_label_i as usize], + self.graph_schema.edge_label_names()[e_label_i as usize], + self.graph_schema.vertex_label_names()[dst_label_i as usize] + )); + let edge_property_path_str = edge_property_path.to_str().unwrap().to_string(); + ie_edge_properties.serialize_table(&edge_property_path_str); + } + } + } + } + + let vm_path = self.partition_dir.join("vm"); + let vm_path_str = vm_path.to_str().unwrap().to_string(); + self.vertex_map.serialize(&vm_path_str); + + Ok(()) + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/graph_modifier.rs b/interactive_engine/executor/store/bmcsr/src/graph_modifier.rs new file mode 100644 index 000000000000..53624b0006a1 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/graph_modifier.rs @@ -0,0 +1,3054 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::any::Any; +use std::collections::{HashMap, HashSet}; +use std::fmt::{Debug, Display, Formatter}; +use std::fs::File; +use std::io::{BufReader, Write}; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use std::time::Instant; + +use csv::ReaderBuilder; +use pegasus_common::codec::{Decode, Encode}; +use pegasus_common::io::{ReadExt, WriteExt}; +use rayon::prelude::*; +use rust_htslib::bgzf::Reader as GzReader; + +use crate::bmscsr::BatchMutableSingleCsr; +use crate::col_table::{parse_properties, parse_properties_by_mappings, ColTable}; +use crate::columns::*; +use crate::columns::*; +use crate::csr::CsrTrait; +use crate::date::Date; +use crate::date_time::DateTime; +use crate::error::GDBResult; +use crate::graph::{Direction, IndexType}; +use crate::graph_db::GraphDB; +use crate::graph_loader::{get_files_list, get_files_list_beta}; +use crate::ldbc_parser::{LDBCEdgeParser, LDBCVertexParser}; +use crate::schema::{CsrGraphSchema, InputSchema, Schema}; +use crate::types::{DefaultId, LabelId}; + +#[derive(Clone, Copy)] +pub enum WriteType { + Insert, + Delete, + Set, +} + +#[derive(Clone)] +pub struct ColumnInfo { + index: i32, + name: String, + data_type: DataType, +} + +impl ColumnInfo { + pub fn index(&self) -> i32 { + self.index + } + + pub fn name(&self) -> &String { + &self.name + } + + pub fn data_type(&self) -> DataType { + self.data_type + } +} + +#[derive(Clone)] +pub struct ColumnMappings { + column: ColumnInfo, + property_name: String, +} + +impl ColumnMappings { + pub fn new(index: i32, name: String, data_type: DataType, property_name: String) -> Self { + ColumnMappings { column: ColumnInfo { index, name, data_type }, property_name } + } +} + +impl Encode for ColumnMappings { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + writer.write_i32(self.column.index); + self.column.name.write_to(writer)?; + self.column.data_type.write_to(writer)?; + self.property_name.write_to(writer)?; + Ok(()) + } +} + +impl Decode for 
ColumnMappings { + fn read_from(reader: &mut R) -> std::io::Result { + let index = reader.read_i32()?; + let name = String::read_from(reader)?; + let data_type = DataType::read_from(reader)?; + let property_name = String::read_from(reader)?; + Ok(ColumnMappings { column: ColumnInfo { index, name, data_type }, property_name }) + } +} + +impl ColumnMappings { + pub fn column(&self) -> &ColumnInfo { + &self.column + } + + pub fn property_name(&self) -> &String { + &self.property_name + } +} + +#[derive(Clone, Copy, PartialEq)] +pub enum DataSource { + File, + Memory, +} + +#[derive(Clone)] +pub struct FileInput { + pub delimiter: String, + pub header_row: bool, + pub quoting: bool, + pub quote_char: String, + pub double_quote: bool, + pub escape_char: String, + pub block_size: String, + pub location: String, +} + +impl Encode for FileInput { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + self.delimiter.write_to(writer)?; + self.header_row.write_to(writer)?; + self.quoting.write_to(writer)?; + self.quote_char.write_to(writer)?; + self.double_quote.write_to(writer)?; + self.escape_char.write_to(writer)?; + self.block_size.write_to(writer)?; + self.location.write_to(writer)?; + Ok(()) + } +} + +impl Decode for FileInput { + fn read_from(reader: &mut R) -> std::io::Result { + let delimiter = String::read_from(reader)?; + let header_row = bool::read_from(reader)?; + let quoting = bool::read_from(reader)?; + let quote_char = String::read_from(reader)?; + let double_quote = bool::read_from(reader)?; + let escape_char = String::read_from(reader)?; + let block_size = String::read_from(reader)?; + let location = String::read_from(reader)?; + Ok(FileInput { + delimiter, + header_row, + quoting, + quote_char, + double_quote, + escape_char, + block_size, + location, + }) + } +} + +impl FileInput { + pub fn new(delimiter: String, header_row: bool, location: String) -> Self { + FileInput { + delimiter, + header_row, + quoting: true, + quote_char: "'".to_string(), + 
double_quote: true, + escape_char: "".to_string(), + block_size: "4Mb".to_string(), + location, + } + } +} + +fn write_column(column: &Box, writer: &mut W) -> std::io::Result<()> { + if let Some(int32_column) = column.as_any().downcast_ref::() { + writer.write_u8(0); + writer.write_u64(column.len() as u64); + for i in int32_column.data.iter() { + writer.write_i32(*i); + } + } + if let Some(uint32_column) = column.as_any().downcast_ref::() { + writer.write_u8(1); + writer.write_u64(column.len() as u64); + for i in uint32_column.data.iter() { + writer.write_u32(*i); + } + } + if let Some(int64_column) = column.as_any().downcast_ref::() { + writer.write_u8(2); + writer.write_u64(column.len() as u64); + for i in int64_column.data.iter() { + writer.write_i64(*i); + } + } + if let Some(uint64_column) = column.as_any().downcast_ref::() { + writer.write_u8(3); + writer.write_u64(column.len() as u64); + for i in uint64_column.data.iter() { + writer.write_u64(*i); + } + } + if let Some(id_column) = column.as_any().downcast_ref::() { + writer.write_u8(4); + writer.write_u64(column.len() as u64); + for i in id_column.data.iter() { + writer.write_u64(*i as u64); + } + } + if let Some(double_column) = column.as_any().downcast_ref::() { + writer.write_u8(5); + writer.write_u64(column.len() as u64); + for i in double_column.data.iter() { + writer.write_f64(*i); + } + } + if let Some(string_column) = column.as_any().downcast_ref::() { + writer.write_u8(6); + writer.write_u64(column.len() as u64); + for i in string_column.data.iter() { + i.write_to(writer); + } + } + if let Some(lc_string_column) = column.as_any().downcast_ref::() { + writer.write_u8(7); + writer.write_u64(lc_string_column.list.len() as u64); + for i in lc_string_column.list.iter() { + i.write_to(writer); + } + writer.write_u64(lc_string_column.data.len() as u64); + for i in lc_string_column.data.iter() { + writer.write_u16(*i); + } + } + if let Some(date_column) = column.as_any().downcast_ref::() { + 
writer.write_u8(8); + writer.write_u64(column.len() as u64); + for i in date_column.data.iter() { + writer.write_i32(i.to_i32()); + } + } + if let Some(datetime_column) = column.as_any().downcast_ref::() { + writer.write_u8(9); + writer.write_u64(column.len() as u64); + for i in datetime_column.data.iter() { + writer.write_i64(i.to_i64()); + } + } + Ok(()) +} + +fn read_column(reader: &mut R) -> std::io::Result> { + let data: Box = match reader.read_u8()? { + 0 => { + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(reader.read_i32()?); + } + Box::new(Int32Column { data }) + } + 1 => { + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(reader.read_u32()?); + } + Box::new(UInt32Column { data }) + } + 2 => { + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(reader.read_i64()?); + } + Box::new(Int64Column { data }) + } + 3 => { + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(reader.read_u64()?); + } + Box::new(UInt64Column { data }) + } + 4 => { + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(reader.read_u64()? as usize); + } + Box::new(IDColumn { data }) + } + 5 => { + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(reader.read_f64()?); + } + Box::new(DoubleColumn { data }) + } + 6 => { + let data_len = reader.read_u64()? 
as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(String::read_from(reader)?); + } + Box::new(StringColumn { data }) + } + 7 => { + let mut list = Vec::::new(); + let mut table = HashMap::::new(); + let list_len = reader.read_u64()? as usize; + for i in 0..list_len { + let name = String::read_from(reader)?; + list.push(name.clone()); + table.insert(name, i as u16); + } + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(reader.read_u16()?); + } + Box::new(LCStringColumn { data, table, list }) + } + 8 => { + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(Date::from_i32(reader.read_i32()?)); + } + Box::new(DateColumn { data }) + } + 9 => { + let data_len = reader.read_u64()? as usize; + let mut data = ColumnContainer::::with_capacity(data_len); + for i in 0..data_len { + data.push(DateTime::new(reader.read_i64()?)); + } + Box::new(DateTimeColumn { data }) + } + _ => panic!("Unknown column type"), + }; + Ok(data) +} + +fn clone_column(input: &Box) -> Box { + if let Some(int32_column) = input.as_any().downcast_ref::() { + Box::new(Int32Column::clone_from(int32_column)) + } else if let Some(uint32_column) = input.as_any().downcast_ref::() { + Box::new(UInt32Column::clone_from(uint32_column)) + } else if let Some(int64_column) = input.as_any().downcast_ref::() { + Box::new(Int64Column::clone_from(int64_column)) + } else if let Some(uint64_column) = input.as_any().downcast_ref::() { + Box::new(UInt64Column::clone_from(uint64_column)) + } else if let Some(id_column) = input.as_any().downcast_ref::() { + Box::new(IDColumn::clone_from(id_column)) + } else if let Some(doule_column) = input.as_any().downcast_ref::() { + Box::new(DoubleColumn::clone_from(doule_column)) + } else if let Some(string_column) = input.as_any().downcast_ref::() 
{ + Box::new(StringColumn::clone_from(string_column)) + } else if let Some(lc_string_column) = input.as_any().downcast_ref::() { + Box::new(LCStringColumn::clone_from(lc_string_column)) + } else if let Some(date_column) = input.as_any().downcast_ref::() { + Box::new(DateColumn::clone_from(date_column)) + } else if let Some(datetime_column) = input.as_any().downcast_ref::() { + Box::new(DateTimeColumn::clone_from(datetime_column)) + } else { + panic!("Unknown column type") + } +} + +pub struct ColumnMetadata { + data: Box, + column_name: String, + data_type: DataType, +} + +impl Encode for ColumnMetadata { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + write_column(&self.data, writer)?; + self.column_name.write_to(writer)?; + self.data_type.write_to(writer)?; + Ok(()) + } +} + +impl Decode for ColumnMetadata { + fn read_from(reader: &mut R) -> std::io::Result { + let data: Box = read_column(reader)?; + let column_name = String::read_from(reader)?; + let data_type = DataType::read_from(reader)?; + Ok(ColumnMetadata { data, column_name, data_type }) + } +} + +impl Clone for ColumnMetadata { + fn clone(&self) -> Self { + let data = clone_column(&self.data); + ColumnMetadata { data, column_name: self.column_name.clone(), data_type: self.data_type.clone() } + } +} + +impl ColumnMetadata { + pub fn new(data: Box, column_name: String, data_type: DataType) -> Self { + ColumnMetadata { data, column_name, data_type } + } + + pub fn data(&self) -> &Box { + &self.data + } + + pub fn take_data(&mut self) -> Box { + std::mem::replace(&mut self.data, Box::new(Int32Column::new())) + } + + pub fn column_name(&self) -> String { + self.column_name.clone() + } + + pub fn data_type(&self) -> DataType { + self.data_type + } +} + +#[derive(Clone)] +pub struct DataFrame { + columns: Vec, +} + +impl Encode for DataFrame { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + self.columns.write_to(writer)?; + Ok(()) + } +} + +impl Decode for DataFrame { + fn 
read_from(reader: &mut R) -> std::io::Result { + let columns = Vec::::read_from(reader)?; + Ok(DataFrame { columns }) + } +} + +impl DataFrame { + pub fn new_vertices_ids(data: Vec) -> Self { + let columns = + vec![ColumnMetadata::new(Box::new(UInt64Column { data }), "id".to_string(), DataType::ID)]; + DataFrame { columns } + } + + pub fn new_edges_ids(data: Vec) -> Self { + let columns = + vec![ColumnMetadata::new(Box::new(IDColumn { data }), "id".to_string(), DataType::ID)]; + DataFrame { columns } + } + + pub fn add_column(&mut self, column: ColumnMetadata) { + self.columns.push(column); + } + + pub fn columns(&self) -> &Vec { + &self.columns + } + + pub fn take_columns(&mut self) -> Vec { + std::mem::replace(&mut self.columns, Vec::new()) + } +} + +#[derive(Clone)] +pub struct Input { + data_source: DataSource, + file_input: Option, + memory_data: Option, +} + +impl Encode for Input { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + match self.data_source { + DataSource::File => writer.write_u8(0), + DataSource::Memory => writer.write_u8(1), + }; + self.file_input.write_to(writer)?; + self.memory_data.write_to(writer)?; + Ok(()) + } +} + +impl Decode for Input { + fn read_from(reader: &mut R) -> std::io::Result { + let data_source = match reader.read_u8()? 
{ + 0 => DataSource::File, + 1 => DataSource::Memory, + _ => panic!("Unknown DataSource type"), + }; + let file_input = Option::::read_from(reader)?; + let memory_data = Option::::read_from(reader)?; + Ok(Input { data_source, file_input, memory_data }) + } +} + +impl Input { + pub fn data_source(&self) -> DataSource { + self.data_source + } + + pub fn file_input(&self) -> Option<&FileInput> { + self.file_input.as_ref() + } + + pub fn memory_data(&self) -> Option<&DataFrame> { + self.memory_data.as_ref() + } + + pub fn take_memory_data(&mut self) -> Option { + self.memory_data.take() + } + + pub fn file(file: FileInput) -> Self { + Input { data_source: DataSource::File, file_input: Some(file), memory_data: None } + } + + pub fn memory(memory_data: DataFrame) -> Self { + Input { data_source: DataSource::Memory, file_input: None, memory_data: Some(memory_data) } + } +} + +#[derive(Clone)] +pub struct VertexMappings { + label_id: LabelId, + inputs: Vec, + column_mappings: Vec, +} + +impl Encode for VertexMappings { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + writer.write_u8(self.label_id); + self.inputs.write_to(writer)?; + self.column_mappings.write_to(writer)?; + Ok(()) + } +} + +impl Decode for VertexMappings { + fn read_from(reader: &mut R) -> std::io::Result { + let label_id = reader.read_u8()?; + let inputs = Vec::::read_from(reader)?; + let column_mappings = Vec::::read_from(reader)?; + Ok(VertexMappings { label_id, inputs, column_mappings }) + } +} + +impl VertexMappings { + pub fn new(label_id: LabelId, inputs: Vec, column_mappings: Vec) -> Self { + VertexMappings { label_id, inputs, column_mappings } + } + + pub fn vertex_label(&self) -> LabelId { + self.label_id + } + + pub fn inputs(&self) -> &Vec { + &self.inputs + } + + pub fn take_inputs(&mut self) -> Vec { + std::mem::replace(&mut self.inputs, Vec::new()) + } + + pub fn column_mappings(&self) -> &Vec { + &self.column_mappings + } +} + +#[derive(Clone)] +pub struct EdgeMappings { + 
src_label: LabelId, + edge_label: LabelId, + dst_label: LabelId, + inputs: Vec, + src_column_mappings: Vec, + dst_column_mappings: Vec, + column_mappings: Vec, +} + +impl Encode for EdgeMappings { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + writer.write_u8(self.src_label); + writer.write_u8(self.edge_label); + writer.write_u8(self.dst_label); + self.inputs.write_to(writer)?; + self.src_column_mappings.write_to(writer)?; + self.dst_column_mappings.write_to(writer)?; + self.column_mappings.write_to(writer)?; + Ok(()) + } +} + +impl Decode for EdgeMappings { + fn read_from(reader: &mut R) -> std::io::Result { + let src_label = reader.read_u8()?; + let edge_label = reader.read_u8()?; + let dst_label = reader.read_u8()?; + let inputs = Vec::::read_from(reader)?; + let src_column_mappings = Vec::::read_from(reader)?; + let dst_column_mappings = Vec::::read_from(reader)?; + let column_mappings = Vec::::read_from(reader)?; + Ok(EdgeMappings { + src_label, + edge_label, + dst_label, + inputs, + src_column_mappings, + dst_column_mappings, + column_mappings, + }) + } +} + +impl EdgeMappings { + pub fn new( + src_label: LabelId, edge_label: LabelId, dst_label: LabelId, inputs: Vec, + src_column_mappings: Vec, dst_column_mappings: Vec, + column_mappings: Vec, + ) -> Self { + EdgeMappings { + src_label, + edge_label, + dst_label, + inputs, + src_column_mappings, + dst_column_mappings, + column_mappings, + } + } + + pub fn src_label(&self) -> LabelId { + self.src_label + } + + pub fn edge_label(&self) -> LabelId { + self.edge_label + } + + pub fn dst_label(&self) -> LabelId { + self.dst_label + } + + pub fn inputs(&self) -> &Vec { + &self.inputs + } + + pub fn take_inputs(&mut self) -> Vec { + std::mem::replace(&mut self.inputs, Vec::new()) + } + + pub fn src_column_mappings(&self) -> &Vec { + &self.src_column_mappings + } + + pub fn dst_column_mappings(&self) -> &Vec { + &self.dst_column_mappings + } + + pub fn column_mappings(&self) -> &Vec { + 
&self.column_mappings + } +} + +#[derive(Clone)] +pub struct WriteOperation { + write_type: WriteType, + vertex_mappings: Option, + edge_mappings: Option, +} + +impl Debug for WriteOperation { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "This is a write operation") + } +} + +impl Encode for WriteOperation { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + match self.write_type { + WriteType::Insert => writer.write_u8(0), + WriteType::Delete => writer.write_u8(1), + WriteType::Set => writer.write_u8(2), + }; + self.vertex_mappings.write_to(writer)?; + self.edge_mappings.write_to(writer)?; + Ok(()) + } +} + +impl Decode for WriteOperation { + fn read_from(reader: &mut R) -> std::io::Result { + let write_type = match reader.read_u8()? { + 0 => WriteType::Insert, + 1 => WriteType::Delete, + 2 => WriteType::Set, + _ => panic!("Unknown write type"), + }; + let vertex_mappings = Option::::read_from(reader)?; + let edge_mappings = Option::::read_from(reader)?; + Ok(WriteOperation { write_type, vertex_mappings, edge_mappings }) + } +} + +unsafe impl Send for WriteOperation {} + +unsafe impl Sync for WriteOperation {} + +impl WriteOperation { + pub fn insert_vertices(vertex_mappings: VertexMappings) -> Self { + WriteOperation { + write_type: WriteType::Insert, + vertex_mappings: Some(vertex_mappings), + edge_mappings: None, + } + } + + pub fn insert_edges(edge_mappings: EdgeMappings) -> Self { + WriteOperation { + write_type: WriteType::Insert, + vertex_mappings: None, + edge_mappings: Some(edge_mappings), + } + } + + pub fn delete_vertices(vertex_mappings: VertexMappings) -> Self { + WriteOperation { + write_type: WriteType::Delete, + vertex_mappings: Some(vertex_mappings), + edge_mappings: None, + } + } + + pub fn delete_edges(edge_mappings: EdgeMappings) -> Self { + WriteOperation { + write_type: WriteType::Delete, + vertex_mappings: None, + edge_mappings: Some(edge_mappings), + } + } + + pub fn set_vertices(vertex_mappings: 
VertexMappings) -> Self { + WriteOperation { + write_type: WriteType::Set, + vertex_mappings: Some(vertex_mappings), + edge_mappings: None, + } + } + + pub fn set_edges(edge_mappings: EdgeMappings) -> Self { + WriteOperation { + write_type: WriteType::Set, + vertex_mappings: None, + edge_mappings: Some(edge_mappings), + } + } + + pub fn write_type(&self) -> WriteType { + self.write_type + } + + pub fn has_vertex_mappings(&self) -> bool { + self.vertex_mappings.is_some() + } + + pub fn vertex_mappings(&self) -> Option<&VertexMappings> { + self.vertex_mappings.as_ref() + } + + pub fn take_vertex_mappings(&mut self) -> Option { + self.vertex_mappings.take() + } + + pub fn has_edge_mappings(&self) -> bool { + self.edge_mappings.is_some() + } + + pub fn edge_mappings(&self) -> Option<&EdgeMappings> { + self.edge_mappings.as_ref() + } + + pub fn take_edge_mappings(&mut self) -> Option { + self.edge_mappings.take() + } +} + +pub struct AliasData { + pub alias_index: i32, + pub column_data: Box, +} + +impl Debug for AliasData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Alias index: {}, data: {:?}", self.alias_index, self.column_data) + } +} + +impl Encode for AliasData { + fn write_to(&self, writer: &mut W) -> std::io::Result<()> { + writer.write_i32(self.alias_index)?; + write_column(&self.column_data, writer)?; + Ok(()) + } +} + +impl Decode for AliasData { + fn read_from(reader: &mut R) -> std::io::Result { + let alias_index = reader.read_i32()?; + let column_data = read_column(reader)?; + Ok(AliasData { alias_index, column_data }) + } +} + +impl Clone for AliasData { + fn clone(&self) -> Self { + let column_data = clone_column(&self.column_data); + AliasData { alias_index: self.alias_index, column_data } + } +} + +unsafe impl Send for AliasData {} + +unsafe impl Sync for AliasData {} + +pub fn apply_write_operations( + graph: &mut GraphDB, mut write_operations: Vec, parallel: u32, +) { + let mut merged_delete_vertices_data: HashMap> = 
HashMap::new(); + for mut write_op in write_operations.drain(..) { + match write_op.write_type() { + WriteType::Insert => { + if let Some(mut vertex_mappings) = write_op.take_vertex_mappings() { + let vertex_label = vertex_mappings.vertex_label(); + let inputs = vertex_mappings.inputs(); + let column_mappings = vertex_mappings.column_mappings(); + for input in inputs.iter() { + insert_vertices(graph, vertex_label, input, column_mappings, parallel); + } + } + if let Some(edge_mappings) = write_op.take_edge_mappings() { + let src_label = edge_mappings.src_label(); + let edge_label = edge_mappings.edge_label(); + let dst_label = edge_mappings.dst_label(); + let inputs = edge_mappings.inputs(); + let src_column_mappings = edge_mappings.src_column_mappings(); + let dst_column_mappings = edge_mappings.dst_column_mappings(); + let column_mappings = edge_mappings.column_mappings(); + for input in inputs.iter() { + insert_edges( + graph, + src_label, + edge_label, + dst_label, + input, + src_column_mappings, + dst_column_mappings, + column_mappings, + parallel, + ); + } + } + } + WriteType::Delete => { + if let Some(mut vertex_mappings) = write_op.take_vertex_mappings() { + let vertex_label = vertex_mappings.vertex_label(); + let inputs = vertex_mappings.take_inputs(); + let column_mappings = vertex_mappings.column_mappings(); + for mut input in inputs.into_iter() { + match input.data_source() { + DataSource::Memory => { + let mut id_col = -1; + for column_mapping in column_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let property_name = column_mapping.property_name(); + if property_name == "id" { + id_col = column_index; + break; + } + } + if input.data_source() == DataSource::Memory { + let mut memory_data = input.take_memory_data().unwrap(); + let mut data = memory_data.take_columns(); + let mut vertex_id_column = data + .get_mut(id_col as usize) + .expect("Failed to get id column"); + let mut data = 
vertex_id_column.take_data(); + if let Some(uint64_column) = + data.as_any().downcast_ref::() + { + if let Some(mut combined_data) = + merged_delete_vertices_data.get_mut(&vertex_label) + { + combined_data.append(&mut uint64_column.data.clone()) + } else { + merged_delete_vertices_data + .insert(vertex_label, uint64_column.data.clone()); + } + } else { + panic!("Unknown data type"); + } + } + continue; + } + _ => {} + } + delete_vertices(graph, vertex_label, &input, column_mappings, parallel); + } + } + if let Some(edge_mappings) = write_op.take_edge_mappings() { + let src_label = edge_mappings.src_label(); + let edge_label = edge_mappings.edge_label(); + let dst_label = edge_mappings.dst_label(); + let inputs = edge_mappings.inputs(); + let src_column_mappings = edge_mappings.src_column_mappings(); + let dst_column_mappings = edge_mappings.dst_column_mappings(); + let column_mappings = edge_mappings.column_mappings(); + for input in inputs.iter() { + delete_edges( + graph, + src_label, + edge_label, + dst_label, + input, + src_column_mappings, + dst_column_mappings, + column_mappings, + parallel, + ); + } + } + } + WriteType::Set => { + if let Some(mut vertex_mappings) = write_op.take_vertex_mappings() { + let vertex_label = vertex_mappings.vertex_label(); + let mut inputs = vertex_mappings.take_inputs(); + let column_mappings = vertex_mappings.column_mappings(); + for mut input in inputs.drain(..) { + set_vertices(graph, vertex_label, input, column_mappings, parallel); + } + } + if let Some(mut edge_mappings) = write_op.take_edge_mappings() { + let src_label = edge_mappings.src_label(); + let edge_label = edge_mappings.edge_label(); + let dst_label = edge_mappings.dst_label(); + let mut inputs = edge_mappings.take_inputs(); + let src_column_mappings = edge_mappings.src_column_mappings(); + let dst_column_mappings = edge_mappings.dst_column_mappings(); + let column_mappings = edge_mappings.column_mappings(); + for mut input in inputs.drain(..) 
{ + set_edges( + graph, + src_label, + edge_label, + dst_label, + input, + src_column_mappings, + dst_column_mappings, + column_mappings, + parallel, + ); + } + } + } + }; + } + for (vertex_label, vertex_ids) in merged_delete_vertices_data.into_iter() { + let column_mappings = + vec![ColumnMappings::new(0, "id".to_string(), DataType::ID, "id".to_string())]; + let input = Input::memory(DataFrame::new_vertices_ids(vertex_ids)); + delete_vertices(graph, vertex_label, &input, &column_mappings, parallel); + } +} + +fn insert_vertices( + graph: &mut GraphDB, vertex_label: LabelId, input: &Input, column_mappings: &Vec, + parallel: u32, +) where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, +{ + let mut column_map = HashMap::new(); + let mut max_col = 0; + for column_mapping in column_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + column_map.insert(property_name.clone(), (column_index, data_type)); + if column_index >= max_col { + max_col = column_index + 1; + } + } + let mut id_col = -1; + if let Some((column_index, _)) = column_map.get("id") { + id_col = *column_index; + } + match input.data_source() { + DataSource::File => { + if let Some(file_input) = input.file_input() { + let file_location = &file_input.location; + let path = Path::new(file_location); + let input_dir = path + .parent() + .unwrap_or(Path::new("")) + .to_str() + .unwrap() + .to_string(); + let filename = path + .file_name() + .expect("Can not find filename") + .to_str() + .unwrap_or("") + .to_string(); + let filenames = vec![filename]; + let mut modifier = GraphModifier::new(input_dir); + if file_input.header_row { + modifier.skip_header(); + } + modifier.parallel(parallel); + let mut mappings = vec![-1; max_col as usize]; + if let Some(vertex_header) = graph + .graph_schema + .get_vertex_header(vertex_label) + { + for (i, (property_name, 
data_type)) in vertex_header.iter().enumerate() { + if let Some((column_index, column_data_type)) = column_map.get(property_name) { + mappings[*column_index as usize] = i as i32; + } + } + } else { + panic!("vertex label {} not found", vertex_label) + } + modifier + .apply_vertices_insert_with_filename(graph, vertex_label, &filenames, id_col, &mappings) + .unwrap(); + } + } + DataSource::Memory => { + if let Some(memory_data) = input.memory_data() { + todo!() + } + } + } +} + +pub fn insert_edges( + graph: &mut GraphDB, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, input: &Input, + src_vertex_mappings: &Vec, dst_vertex_mappings: &Vec, + column_mappings: &Vec, parallel: u32, +) where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, +{ + let mut column_map = HashMap::new(); + let mut max_col = 0; + for column_mapping in src_vertex_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + if property_name == "id" { + column_map.insert("src_id".to_string(), (column_index, data_type)); + } + if column_index >= max_col { + max_col = column_index + 1; + } + } + for column_mapping in dst_vertex_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + if property_name == "id" { + column_map.insert("dst_id".to_string(), (column_index, data_type)); + } + if column_index >= max_col { + max_col = column_index + 1; + } + } + for column_mapping in column_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + column_map.insert(property_name.clone(), (column_index, data_type)); + if column_index >= max_col { + max_col = column_index + 1; + } + } + let mut src_id_col = -1; 
+ let mut dst_id_col = -1; + if let Some((column_index, _)) = column_map.get("src_id") { + src_id_col = *column_index; + } + if let Some((column_index, _)) = column_map.get("dst_id") { + dst_id_col = *column_index; + } + match input.data_source() { + DataSource::File => { + if let Some(file_input) = input.file_input() { + let file_location = &file_input.location; + let path = Path::new(file_location); + let input_dir = path + .parent() + .unwrap_or(Path::new("")) + .to_str() + .unwrap() + .to_string(); + let filename = path + .file_name() + .expect("Can not find filename") + .to_str() + .unwrap_or("") + .to_string(); + let filenames = vec![filename]; + let mut modifier = GraphModifier::new(input_dir); + if file_input.header_row { + modifier.skip_header(); + } + modifier.parallel(parallel); + let mut mappings = vec![-1; max_col as usize]; + if let Some(edge_header) = graph + .graph_schema + .get_edge_header(src_label, edge_label, dst_label) + { + for (i, (property_name, _)) in edge_header.iter().enumerate() { + if let Some((column_index, _)) = column_map.get(property_name) { + mappings[*column_index as usize] = i as i32; + } + } + } else { + panic!("edge label {}_{}_{} not found", src_label, edge_label, dst_label) + } + modifier + .apply_edges_insert_with_filename( + graph, src_label, edge_label, dst_label, &filenames, src_id_col, dst_id_col, + &mappings, + ) + .unwrap(); + } + } + DataSource::Memory => { + if let Some(memory_data) = input.memory_data() { + todo!() + } + } + } +} + +pub fn delete_vertices( + graph: &mut GraphDB, vertex_label: LabelId, input: &Input, + column_mappings: &Vec, parallel: u32, +) { + let mut column_map = HashMap::new(); + for column_mapping in column_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + column_map.insert(property_name.clone(), (column_index, data_type)); + } + let mut id_col = -1; + if let 
Some((column_index, _)) = column_map.get("id") { + id_col = *column_index; + } + match input.data_source() { + DataSource::File => { + if let Some(file_input) = input.file_input() { + let file_location = &file_input.location; + let path = Path::new(file_location); + let input_dir = path + .parent() + .unwrap_or(Path::new("")) + .to_str() + .unwrap() + .to_string(); + let filename = path + .file_name() + .expect("Can not find filename") + .to_str() + .unwrap_or("") + .to_string(); + let filenames = vec![filename]; + let mut modifier = GraphModifier::new(input_dir); + if file_input.header_row { + modifier.skip_header(); + } + modifier.parallel(parallel); + modifier + .apply_vertices_delete_with_filename(graph, vertex_label, &filenames, id_col) + .unwrap(); + } + } + DataSource::Memory => { + if let Some(memory_data) = input.memory_data() { + let data = memory_data.columns(); + let vertex_id_column = data + .get(id_col as usize) + .expect("Failed to get id column"); + if let Some(uint64_column) = vertex_id_column + .data() + .as_any() + .downcast_ref::() + { + let data = uint64_column + .data + .iter() + .map(|&x| x as usize) + .collect(); + delete_vertices_by_ids(graph, vertex_label, &data, parallel); + } + } + } + } +} + +pub fn delete_edges( + graph: &mut GraphDB, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + input: &Input, src_vertex_mappings: &Vec, dst_vertex_mappings: &Vec, + column_mappings: &Vec, parallel: u32, +) { + let mut column_map = HashMap::new(); + for column_mapping in src_vertex_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + if property_name == "id" { + column_map.insert("src_id".to_string(), (column_index, data_type)); + } + } + for column_mapping in dst_vertex_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let 
property_name = column_mapping.property_name(); + if property_name == "id" { + column_map.insert("dst_id".to_string(), (column_index, data_type)); + } + } + for column_mapping in column_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + column_map.insert(property_name.clone(), (column_index, data_type)); + } + let mut src_id_col = -1; + let mut dst_id_col = -1; + if let Some((column_index, _)) = column_map.get("src_id") { + src_id_col = *column_index; + } + if let Some((column_index, _)) = column_map.get("dst_id") { + dst_id_col = *column_index; + } + match input.data_source() { + DataSource::File => { + if let Some(file_input) = input.file_input() { + let file_location = &file_input.location; + let path = Path::new(file_location); + let input_dir = path + .parent() + .unwrap_or(Path::new("")) + .to_str() + .unwrap() + .to_string(); + let filename = path + .file_name() + .expect("Can not find filename") + .to_str() + .unwrap_or("") + .to_string(); + let filenames = vec![filename]; + let mut modifier = GraphModifier::new(input_dir); + if file_input.header_row { + modifier.skip_header(); + } + modifier.parallel(parallel); + + modifier + .apply_edges_delete_with_filename( + graph, src_label, edge_label, dst_label, &filenames, src_id_col, dst_id_col, + ) + .unwrap(); + } + } + DataSource::Memory => { + if let Some(memory_data) = input.memory_data() { + todo!() + } + } + } +} + +pub fn delete_vertices_by_ids( + graph: &mut GraphDB, vertex_label: LabelId, global_ids: &Vec, parallel: u32, +) where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, +{ + let mut lids = HashSet::new(); + for v in global_ids.iter() { + if v.index() as u64 == u64::MAX { + continue; + } + if let Some(internal_id) = graph.vertex_map.get_internal_id(*v) { + lids.insert(internal_id.1); + } + } + let vertex_label_num = graph.vertex_label_num; + 
let edge_label_num = graph.edge_label_num; + for e_label_i in 0..edge_label_num { + for src_label_i in 0..vertex_label_num { + if graph + .graph_schema + .get_edge_header(src_label_i as LabelId, e_label_i as LabelId, vertex_label as LabelId) + .is_none() + { + continue; + } + let index = graph.edge_label_to_index( + src_label_i as LabelId, + vertex_label as LabelId, + e_label_i as LabelId, + Direction::Outgoing, + ); + let mut ie_csr = + std::mem::replace(&mut graph.ie[index], Box::new(BatchMutableSingleCsr::new())); + let mut ie_prop = graph.ie_edge_prop_table.remove(&index); + let mut oe_csr = + std::mem::replace(&mut graph.oe[index], Box::new(BatchMutableSingleCsr::new())); + let mut oe_prop = graph.oe_edge_prop_table.remove(&index); + let mut ie_to_delete = Vec::new(); + for v in lids.iter() { + if let Some(ie_list) = ie_csr.get_edges(*v) { + for e in ie_list { + ie_to_delete.push((*e, *v)); + } + } + } + ie_csr.delete_vertices(&lids); + if let Some(table) = oe_prop.as_mut() { + oe_csr.parallel_delete_edges_with_props(&ie_to_delete, false, table, parallel); + } else { + oe_csr.parallel_delete_edges(&ie_to_delete, false, parallel); + } + graph.ie[index] = ie_csr; + if let Some(table) = ie_prop { + graph.ie_edge_prop_table.insert(index, table); + } + graph.oe[index] = oe_csr; + if let Some(table) = oe_prop { + graph.oe_edge_prop_table.insert(index, table); + } + } + for dst_label_i in 0..vertex_label_num { + if graph + .graph_schema + .get_edge_header(vertex_label as LabelId, e_label_i as LabelId, dst_label_i as LabelId) + .is_none() + { + continue; + } + let index = graph.edge_label_to_index( + vertex_label as LabelId, + dst_label_i as LabelId, + e_label_i as LabelId, + Direction::Outgoing, + ); + let mut ie_csr = + std::mem::replace(&mut graph.ie[index], Box::new(BatchMutableSingleCsr::new())); + let mut ie_prop = graph.ie_edge_prop_table.remove(&index); + let mut oe_csr = + std::mem::replace(&mut graph.oe[index], Box::new(BatchMutableSingleCsr::new())); + let 
mut oe_prop = graph.oe_edge_prop_table.remove(&index); + let mut oe_to_delete = Vec::new(); + for v in lids.iter() { + if let Some(oe_list) = oe_csr.get_edges(*v) { + for e in oe_list { + oe_to_delete.push((*v, *e)); + } + } + } + oe_csr.delete_vertices(&lids); + if let Some(table) = ie_prop.as_mut() { + ie_csr.parallel_delete_edges_with_props(&oe_to_delete, true, table, parallel); + } else { + ie_csr.parallel_delete_edges(&oe_to_delete, true, parallel); + } + graph.ie[index] = ie_csr; + if let Some(table) = ie_prop { + graph.ie_edge_prop_table.insert(index, table); + } + graph.oe[index] = oe_csr; + if let Some(table) = oe_prop { + graph.oe_edge_prop_table.insert(index, table); + } + } + } + + // delete vertices + for v in lids.iter() { + graph.vertex_map.remove_vertex(vertex_label, v); + } +} + +pub fn set_vertices( + graph: &mut GraphDB, vertex_label: LabelId, mut input: Input, + column_mappings: &Vec, parallel: u32, +) { + let mut column_map = HashMap::new(); + for column_mapping in column_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + column_map.insert(property_name.clone(), (column_index, data_type)); + } + let mut id_col = -1; + if let Some((column_index, _)) = column_map.get("id") { + id_col = *column_index; + } + match input.data_source() { + DataSource::File => { + todo!() + } + DataSource::Memory => { + if let Some(mut memory_data) = input.take_memory_data() { + let mut column_data = memory_data.take_columns(); + let id_column = column_data + .get_mut(id_col as usize) + .expect("Failed to find id column"); + let mut data = id_column.take_data(); + let global_ids = { + if let Some(id_column) = data.as_any().downcast_ref::() { + id_column.data.clone() + } else if let Some(uint64_column) = data.as_any().downcast_ref::() { + let mut lid = vec![]; + for i in uint64_column.data.iter() { + lid.push(graph.get_internal_id(*i as 
usize)); + } + lid + } else { + panic!("DataType of id col is not VertexId") + } + }; + for (k, v) in column_map.iter() { + if k == "id" { + continue; + } + let column_index = v.0; + let column_data_type = v.1; + graph.init_vertex_index_prop(k.clone(), vertex_label, column_data_type); + let column = column_data + .get_mut(column_index as usize) + .expect("Failed to find column"); + graph.set_vertex_index_prop(k.clone(), vertex_label, &global_ids, column.take_data()); + } + } + } + } +} + +pub fn set_edges( + graph: &mut GraphDB, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + mut input: Input, src_vertex_mappings: &Vec, dst_vertex_mappings: &Vec, + column_mappings: &Vec, parallel: u32, +) { + let mut column_map = HashMap::new(); + for column_mapping in column_mappings { + let column = column_mapping.column(); + let column_index = column.index(); + let data_type = column.data_type(); + let property_name = column_mapping.property_name(); + column_map.insert(property_name.clone(), (column_index, data_type)); + } + match input.data_source() { + DataSource::File => { + todo!() + } + DataSource::Memory => { + if let Some(mut memory_data) = input.take_memory_data() { + let mut column_data = memory_data.take_columns(); + if !src_vertex_mappings.is_empty() { + let offset_col_id = src_vertex_mappings[0].column().index(); + let offset_column = column_data + .get_mut(offset_col_id as usize) + .expect("Failed to find id column"); + let mut data = offset_column.take_data(); + let offsets = { + if let Some(id_column) = data.as_any().downcast_ref::() { + id_column.data.clone() + } else { + panic!("DataType of id col is not VertexId") + } + }; + for (k, v) in column_map.iter() { + let column_index = v.0; + let column_data_type = v.1; + graph.init_edge_index_prop( + k.clone(), + src_label, + edge_label, + dst_label, + column_data_type, + ); + let mut column = column_data + .get_mut(column_index as usize) + .expect("Failed to find column"); + graph.set_edge_index_prop( 
+ k.clone(), + src_label, + edge_label, + dst_label, + None, + None, + Some(&offsets), + Some(column.take_data()), + ); + } + } + if !dst_vertex_mappings.is_empty() { + let offset_col_id = dst_vertex_mappings[0].column().index(); + let offset_column = column_data + .get_mut(offset_col_id as usize) + .expect("Failed to find id column"); + let mut data = offset_column.take_data(); + let offsets = { + if let Some(id_column) = data.as_any().downcast_ref::() { + id_column.data.clone() + } else { + panic!("DataType of id col is not VertexId") + } + }; + for (k, v) in column_map.iter() { + let column_index = v.0; + let column_data_type = v.1; + graph.init_edge_index_prop( + k.clone(), + src_label, + edge_label, + dst_label, + column_data_type, + ); + let mut column = column_data + .get_mut(column_index as usize) + .expect("Failed to find column"); + graph.set_edge_index_prop( + k.clone(), + src_label, + edge_label, + dst_label, + Some(&offsets), + Some(column.take_data()), + None, + None, + ); + } + } + } + } + } +} + +fn process_csv_rows(path: &PathBuf, mut process_row: F, skip_header: bool, delim: u8) +where + F: FnMut(&csv::StringRecord), +{ + if let Some(path_str) = path.clone().to_str() { + if path_str.ends_with(".csv.gz") { + if let Ok(gz_reader) = GzReader::from_path(&path) { + let mut rdr = ReaderBuilder::new() + .delimiter(delim) + .buffer_capacity(4096) + .comment(Some(b'#')) + .flexible(true) + .has_headers(skip_header) + .from_reader(gz_reader); + for result in rdr.records() { + if let Ok(record) = result { + process_row(&record); + } + } + } + } else if path_str.ends_with(".csv") { + if let Ok(file) = File::open(&path) { + let reader = BufReader::new(file); + let mut rdr = ReaderBuilder::new() + .delimiter(delim) + .buffer_capacity(4096) + .comment(Some(b'#')) + .flexible(true) + .has_headers(skip_header) + .from_reader(reader); + for result in rdr.records() { + if let Ok(record) = result { + process_row(&record); + } + } + } + } + } +} + +pub struct 
DeleteGenerator { + input_dir: PathBuf, + + delim: u8, + skip_header: bool, + + persons: Vec<(String, G)>, + comments: Vec<(String, G)>, + posts: Vec<(String, G)>, + forums: Vec<(String, G)>, + + person_set: HashSet, + comment_set: HashSet, + post_set: HashSet, + forum_set: HashSet, +} + +impl DeleteGenerator { + pub fn new(input_dir: &PathBuf) -> DeleteGenerator { + Self { + input_dir: input_dir.clone(), + delim: b'|', + skip_header: false, + + persons: vec![], + comments: vec![], + posts: vec![], + forums: vec![], + + person_set: HashSet::new(), + comment_set: HashSet::new(), + post_set: HashSet::new(), + forum_set: HashSet::new(), + } + } + + fn load_vertices(&self, input_prefix: PathBuf, label: LabelId) -> Vec<(String, G)> { + let mut ret = vec![]; + + let suffixes = vec!["*.csv.gz".to_string(), "*.csv".to_string()]; + let files = get_files_list(&input_prefix, &suffixes); + if files.is_err() { + warn!( + "Get vertex files {:?}/{:?} failed: {:?}", + &input_prefix, + &suffixes, + files.err().unwrap() + ); + return ret; + } + let files = files.unwrap(); + if files.is_empty() { + return ret; + } + let parser = LDBCVertexParser::::new(label, 1); + for file in files { + process_csv_rows( + &file, + |record| { + let vertex_meta = parser.parse_vertex_meta(&record); + ret.push(( + record + .get(0) + .unwrap() + .parse::() + .unwrap(), + vertex_meta.global_id, + )); + }, + self.skip_header, + self.delim, + ); + } + + ret + } + + pub fn with_delimiter(mut self, delim: u8) -> Self { + self.delim = delim; + self + } + + pub fn skip_header(&mut self) { + self.skip_header = true; + } + + fn iterate_persons(&mut self, graph: &GraphDB) + where + I: Send + Sync + IndexType, + { + let person_label = graph + .graph_schema + .get_vertex_label_id("PERSON") + .unwrap(); + + let comment_label = graph + .graph_schema + .get_vertex_label_id("COMMENT") + .unwrap(); + let post_label = graph + .graph_schema + .get_vertex_label_id("POST") + .unwrap(); + let forum_label = graph + 
.graph_schema + .get_vertex_label_id("FORUM") + .unwrap(); + + let hasCreator_label = graph + .graph_schema + .get_edge_label_id("HASCREATOR") + .unwrap(); + let hasModerator_label = graph + .graph_schema + .get_edge_label_id("HASMODERATOR") + .unwrap(); + + let comment_hasCreator_person = + graph.get_sub_graph(person_label, hasCreator_label, comment_label, Direction::Incoming); + let post_hasCreator_person = + graph.get_sub_graph(person_label, hasCreator_label, post_label, Direction::Incoming); + let forum_hasModerator_person = + graph.get_sub_graph(person_label, hasModerator_label, forum_label, Direction::Incoming); + + let forum_title_column = graph.vertex_prop_table[forum_label as usize] + .get_column_by_name("title") + .as_any() + .downcast_ref::() + .unwrap(); + + for (dt, id) in self.persons.iter() { + if let Some((got_label, lid)) = graph.vertex_map.get_internal_id(*id) { + if got_label != person_label { + warn!("Vertex {} is not a person", LDBCVertexParser::::get_original_id(*id)); + continue; + } + for e in comment_hasCreator_person + .get_adj_list(lid) + .unwrap() + { + let oid = graph + .vertex_map + .get_global_id(comment_label, *e) + .unwrap(); + self.comments.push((dt.clone(), oid)); + } + + for e in post_hasCreator_person + .get_adj_list(lid) + .unwrap() + { + let oid = graph + .vertex_map + .get_global_id(post_label, *e) + .unwrap(); + self.posts.push((dt.clone(), oid)); + } + + for e in forum_hasModerator_person + .get_adj_list(lid) + .unwrap() + { + let title = forum_title_column.get(e.index()).unwrap(); + let title_string = title.to_string(); + if title_string.starts_with("Album") || title_string.starts_with("Wall") { + let oid = graph + .vertex_map + .get_global_id(forum_label, *e) + .unwrap(); + self.forums.push((dt.clone(), oid)); + } + } + } else { + warn!("Vertex Person - {} does not exist", LDBCVertexParser::::get_original_id(*id)); + continue; + } + } + } + + fn iterate_forums(&mut self, graph: &GraphDB) + where + I: Send + Sync + 
IndexType, + { + let forum_label = graph + .graph_schema + .get_vertex_label_id("FORUM") + .unwrap(); + let post_label = graph + .graph_schema + .get_vertex_label_id("POST") + .unwrap(); + + let containerOf_label = graph + .graph_schema + .get_edge_label_id("CONTAINEROF") + .unwrap(); + + let forum_containerOf_post = + graph.get_sub_graph(forum_label, containerOf_label, post_label, Direction::Outgoing); + for (dt, id) in self.forums.iter() { + if let Some((got_label, lid)) = graph.vertex_map.get_internal_id(*id) { + if got_label != forum_label { + warn!("Vertex {} is not a forum", LDBCVertexParser::::get_original_id(*id)); + continue; + } + + for e in forum_containerOf_post + .get_adj_list(lid) + .unwrap() + { + let oid = graph + .vertex_map + .get_global_id(post_label, *e) + .unwrap(); + self.posts.push((dt.clone(), oid)); + } + } else { + warn!("Vertex Forum - {} does not exist", LDBCVertexParser::::get_original_id(*id)); + continue; + } + } + } + + fn iterate_posts(&mut self, graph: &GraphDB) + where + I: Send + Sync + IndexType, + { + let post_label = graph + .graph_schema + .get_vertex_label_id("POST") + .unwrap(); + let comment_label = graph + .graph_schema + .get_vertex_label_id("COMMENT") + .unwrap(); + + let replyOf_label = graph + .graph_schema + .get_edge_label_id("REPLYOF") + .unwrap(); + + let comment_replyOf_post = + graph.get_sub_graph(post_label, replyOf_label, comment_label, Direction::Incoming); + for (dt, id) in self.posts.iter() { + if let Some((got_label, lid)) = graph.vertex_map.get_internal_id(*id) { + if got_label != post_label { + warn!("Vertex {} is not a post", LDBCVertexParser::::get_original_id(*id)); + continue; + } + + for e in comment_replyOf_post.get_adj_list(lid).unwrap() { + let oid = graph + .vertex_map + .get_global_id(comment_label, *e) + .unwrap(); + self.comments.push((dt.clone(), oid)); + } + } else { + warn!("Vertex Post - {} does not exist", LDBCVertexParser::::get_original_id(*id)); + continue; + } + } + } + + fn 
iterate_comments(&mut self, graph: &GraphDB) + where + I: Send + Sync + IndexType, + { + let comment_label = graph + .graph_schema + .get_vertex_label_id("COMMENT") + .unwrap(); + + let replyOf_label = graph + .graph_schema + .get_edge_label_id("REPLYOF") + .unwrap(); + + let comment_replyOf_comment = + graph.get_sub_graph(comment_label, replyOf_label, comment_label, Direction::Incoming); + let mut index = 0; + while index < self.comments.len() { + let (dt, id) = self.comments[index].clone(); + if let Some((got_label, lid)) = graph.vertex_map.get_internal_id(id) { + if got_label != comment_label { + warn!("Vertex {} is not a comment", LDBCVertexParser::::get_original_id(id)); + index += 1; + continue; + } + + for e in comment_replyOf_comment + .get_adj_list(lid) + .unwrap() + { + let oid = graph + .vertex_map + .get_global_id(comment_label, *e) + .unwrap(); + self.comments.push((dt.clone(), oid)); + } + index += 1; + } else { + warn!("Vertex Comment - {} does not exist", LDBCVertexParser::::get_original_id(id)); + index += 1; + continue; + } + } + } + + pub fn generate(&mut self, graph: &GraphDB, batch_id: &str) + where + I: Send + Sync + IndexType, + { + let output_dir = self + .input_dir + .join("extra_deletes") + .join("dynamic"); + std::fs::create_dir_all(&output_dir).unwrap(); + + let prefix = self.input_dir.join("deletes").join("dynamic"); + + let person_label = graph + .graph_schema + .get_vertex_label_id("PERSON") + .unwrap(); + self.persons = self.load_vertices( + prefix + .clone() + .join("Person") + .join(format!("batch_id={}", batch_id)), + person_label, + ); + self.person_set = self.persons.iter().map(|(_, id)| *id).collect(); + + let comment_label = graph + .graph_schema + .get_vertex_label_id("COMMENT") + .unwrap(); + self.comments = self.load_vertices( + prefix + .clone() + .join("Comment") + .join(format!("batch_id={}", batch_id)), + comment_label, + ); + self.comment_set = self + .comments + .iter() + .map(|(_, id)| *id) + .collect(); + + let 
post_label = graph + .graph_schema + .get_vertex_label_id("POST") + .unwrap(); + self.posts = self.load_vertices( + prefix + .clone() + .join("Post") + .join(format!("batch_id={}", batch_id)), + post_label, + ); + self.post_set = self.posts.iter().map(|(_, id)| *id).collect(); + + let forum_label = graph + .graph_schema + .get_vertex_label_id("FORUM") + .unwrap(); + self.forums = self.load_vertices( + prefix + .clone() + .join("Forum") + .join(format!("batch_id={}", batch_id)), + forum_label, + ); + self.forum_set = self.forums.iter().map(|(_, id)| *id).collect(); + + self.iterate_persons(graph); + self.iterate_forums(graph); + self.iterate_posts(graph); + self.iterate_comments(graph); + + let batch_dir = format!("batch_id={}", batch_id); + + let person_dir_path = output_dir + .clone() + .join("Person") + .join(&batch_dir); + std::fs::create_dir_all(&person_dir_path).unwrap(); + let mut person_file = File::create(person_dir_path.join("part-0.csv")).unwrap(); + writeln!(person_file, "deletionDate|id").unwrap(); + for (dt, id) in self.persons.iter() { + if !self.person_set.contains(id) { + self.person_set.insert(*id); + writeln!(person_file, "{}|{}", dt, LDBCVertexParser::::get_original_id(*id)).unwrap(); + } + } + + let forum_dir_path = output_dir + .clone() + .join("Forum") + .join(&batch_dir); + std::fs::create_dir_all(&forum_dir_path).unwrap(); + let mut forum_file = File::create(forum_dir_path.join("part-0.csv")).unwrap(); + writeln!(forum_file, "deletionDate|id").unwrap(); + for (dt, id) in self.forums.iter() { + if !self.forum_set.contains(id) { + self.forum_set.insert(*id); + writeln!(forum_file, "{}|{}", dt, LDBCVertexParser::::get_original_id(*id)).unwrap(); + } + } + + let post_dir_path = output_dir.clone().join("Post").join(&batch_dir); + std::fs::create_dir_all(&post_dir_path).unwrap(); + let mut post_file = File::create(post_dir_path.join("part-0.csv")).unwrap(); + writeln!(post_file, "deletionDate|id").unwrap(); + for (dt, id) in self.posts.iter() { + 
if !self.post_set.contains(id) { + self.post_set.insert(*id); + writeln!(post_file, "{}|{}", dt, LDBCVertexParser::::get_original_id(*id)).unwrap(); + } + } + + let comment_dir_path = output_dir + .clone() + .join("Comment") + .join(&batch_dir); + std::fs::create_dir_all(&comment_dir_path).unwrap(); + let mut comment_file = File::create(comment_dir_path.join("part-0.csv")).unwrap(); + writeln!(comment_file, "deletionDate|id").unwrap(); + for (dt, id) in self.comments.iter() { + if !self.comment_set.contains(id) { + self.comment_set.insert(*id); + writeln!(comment_file, "{}|{}", dt, LDBCVertexParser::::get_original_id(*id)).unwrap(); + } + } + } +} + +pub struct GraphModifier { + input_dir: PathBuf, + + delim: u8, + skip_header: bool, + parallel: u32, +} + +struct CsrRep { + src_label: LabelId, + edge_label: LabelId, + dst_label: LabelId, + + ie_csr: Box>, + ie_prop: Option, + oe_csr: Box>, + oe_prop: Option, +} + +impl GraphModifier { + pub fn new>(input_dir: D) -> GraphModifier { + Self { input_dir: input_dir.as_ref().to_path_buf(), delim: b'|', skip_header: false, parallel: 0 } + } + + pub fn with_delimiter(mut self, delim: u8) -> Self { + self.delim = delim; + self + } + + pub fn skip_header(&mut self) { + self.skip_header = true; + } + + pub fn parallel(&mut self, parallel: u32) { + self.parallel = parallel; + } + + fn take_csr( + &self, graph: &mut GraphDB, src_label_i: LabelId, dst_label_i: LabelId, e_label_i: LabelId, + ) -> CsrRep + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + let index = graph.edge_label_to_index(src_label_i, dst_label_i, e_label_i, Direction::Outgoing); + + CsrRep { + src_label: src_label_i, + edge_label: e_label_i, + dst_label: dst_label_i, + + ie_csr: std::mem::replace(&mut graph.ie[index], Box::new(BatchMutableSingleCsr::new())), + ie_prop: graph.ie_edge_prop_table.remove(&index), + oe_csr: std::mem::replace(&mut graph.oe[index], Box::new(BatchMutableSingleCsr::new())), + oe_prop: 
graph.oe_edge_prop_table.remove(&index), + } + } + + fn take_csrs_with_label(&self, graph: &mut GraphDB, label: LabelId) -> Vec> + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + let vertex_label_num = graph.vertex_label_num; + let edge_label_num = graph.edge_label_num; + let mut results = vec![]; + for e_label_i in 0..edge_label_num { + for label_i in 0..vertex_label_num { + if !graph + .graph_schema + .get_edge_header(label as LabelId, e_label_i as LabelId, label_i as LabelId) + .is_none() + { + let index = graph.edge_label_to_index( + label as LabelId, + label_i as LabelId, + e_label_i as LabelId, + Direction::Outgoing, + ); + results.push(CsrRep { + src_label: label as LabelId, + edge_label: e_label_i as LabelId, + dst_label: label_i as LabelId, + ie_csr: std::mem::replace( + &mut graph.ie[index], + Box::new(BatchMutableSingleCsr::new()), + ), + ie_prop: graph.ie_edge_prop_table.remove(&index), + oe_csr: std::mem::replace( + &mut graph.oe[index], + Box::new(BatchMutableSingleCsr::new()), + ), + oe_prop: graph.oe_edge_prop_table.remove(&index), + }); + } + if !graph + .graph_schema + .get_edge_header(label_i as LabelId, e_label_i as LabelId, label as LabelId) + .is_none() + { + if label_i as LabelId != label { + let index = graph.edge_label_to_index( + label_i as LabelId, + label as LabelId, + e_label_i as LabelId, + Direction::Outgoing, + ); + results.push(CsrRep { + src_label: label_i as LabelId, + edge_label: e_label_i as LabelId, + dst_label: label as LabelId, + ie_csr: std::mem::replace( + &mut graph.ie[index], + Box::new(BatchMutableSingleCsr::new()), + ), + ie_prop: graph.ie_edge_prop_table.remove(&index), + oe_csr: std::mem::replace( + &mut graph.oe[index], + Box::new(BatchMutableSingleCsr::new()), + ), + oe_prop: graph.oe_edge_prop_table.remove(&index), + }); + } + } + } + } + results + } + fn take_csrs(&self, graph: &mut GraphDB) -> Vec> + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType 
+ Eq, + { + let vertex_label_num = graph.vertex_label_num; + let edge_label_num = graph.edge_label_num; + let mut results = vec![]; + + for e_label_i in 0..edge_label_num { + for src_label_i in 0..vertex_label_num { + for dst_label_i in 0..vertex_label_num { + if graph + .graph_schema + .get_edge_header( + src_label_i as LabelId, + e_label_i as LabelId, + dst_label_i as LabelId, + ) + .is_none() + { + continue; + } + + let index = graph.edge_label_to_index( + src_label_i as LabelId, + dst_label_i as LabelId, + e_label_i as LabelId, + Direction::Outgoing, + ); + + results.push(CsrRep { + src_label: src_label_i as LabelId, + edge_label: e_label_i as LabelId, + dst_label: dst_label_i as LabelId, + + ie_csr: std::mem::replace( + &mut graph.ie[index], + Box::new(BatchMutableSingleCsr::new()), + ), + ie_prop: graph.ie_edge_prop_table.remove(&index), + oe_csr: std::mem::replace( + &mut graph.oe[index], + Box::new(BatchMutableSingleCsr::new()), + ), + oe_prop: graph.oe_edge_prop_table.remove(&index), + }); + } + } + } + + results + } + + fn set_csr(&self, graph: &mut GraphDB, reps: CsrRep) + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + let index = + graph.edge_label_to_index(reps.src_label, reps.dst_label, reps.edge_label, Direction::Outgoing); + + graph.ie[index] = reps.ie_csr; + if let Some(table) = reps.ie_prop { + graph.ie_edge_prop_table.insert(index, table); + } + graph.oe[index] = reps.oe_csr; + if let Some(table) = reps.oe_prop { + graph.oe_edge_prop_table.insert(index, table); + } + } + + fn set_csrs(&self, graph: &mut GraphDB, mut reps: Vec>) + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + for result in reps.drain(..) 
{ + let index = graph.edge_label_to_index( + result.src_label, + result.dst_label, + result.edge_label, + Direction::Outgoing, + ); + + graph.ie[index] = result.ie_csr; + if let Some(table) = result.ie_prop { + graph.ie_edge_prop_table.insert(index, table); + } + graph.oe[index] = result.oe_csr; + if let Some(table) = result.oe_prop { + graph.oe_edge_prop_table.insert(index, table); + } + } + } + + fn parallel_delete_rep( + &self, input: &mut CsrRep, graph: &GraphDB, edge_file_strings: &Vec, + input_header: &[(String, DataType)], delete_sets: &Vec>, p: u32, + ) where + G: FromStr + Send + Sync + IndexType + Eq, + I: Send + Sync + IndexType, + { + let src_label = input.src_label; + let edge_label = input.edge_label; + let dst_label = input.dst_label; + + let graph_header = graph + .graph_schema + .get_edge_header(src_label, edge_label, dst_label); + if graph_header.is_none() { + return (); + } + + let src_delete_set = &delete_sets[src_label as usize]; + let dst_delete_set = &delete_sets[dst_label as usize]; + let mut delete_edge_set = Vec::new(); + + let mut src_col_id = 0; + let mut dst_col_id = 1; + + for (index, (n, _)) in input_header.iter().enumerate() { + if n == "start_id" { + src_col_id = index; + } + if n == "end_id" { + dst_col_id = index; + } + } + + let mut parser = LDBCEdgeParser::::new(src_label, dst_label, edge_label); + parser.with_endpoint_col_id(src_col_id, dst_col_id); + + let edge_files = get_files_list(&self.input_dir.clone(), edge_file_strings); + if edge_files.is_err() { + return (); + } + + let edge_files = edge_files.unwrap(); + for edge_file in edge_files.iter() { + process_csv_rows( + edge_file, + |record| { + let edge_meta = parser.parse_edge_meta(&record); + if let Some((got_src_label, src_lid)) = graph + .vertex_map + .get_internal_id(edge_meta.src_global_id) + { + if let Some((got_dst_label, dst_lid)) = graph + .vertex_map + .get_internal_id(edge_meta.dst_global_id) + { + if got_src_label != src_label || got_dst_label != dst_label { + 
return; + } + if src_delete_set.contains(&src_lid) || dst_delete_set.contains(&dst_lid) { + return; + } + delete_edge_set.push((src_lid, dst_lid)); + } + } + }, + self.skip_header, + self.delim, + ); + } + + if src_delete_set.is_empty() && dst_delete_set.is_empty() && delete_edge_set.is_empty() { + return (); + } + + let mut oe_to_delete = Vec::new(); + let mut ie_to_delete = Vec::new(); + + for v in src_delete_set.iter() { + if let Some(oe_list) = input.oe_csr.get_edges(*v) { + for e in oe_list { + if !dst_delete_set.contains(e) { + oe_to_delete.push((*v, *e)); + } + } + } + } + for v in dst_delete_set.iter() { + if let Some(ie_list) = input.ie_csr.get_edges(*v) { + for e in ie_list { + if !src_delete_set.contains(e) { + ie_to_delete.push((*e, *v)); + } + } + } + } + + input.oe_csr.delete_vertices(src_delete_set); + if let Some(table) = input.oe_prop.as_mut() { + input + .oe_csr + .parallel_delete_edges_with_props(&delete_edge_set, false, table, p); + input + .oe_csr + .parallel_delete_edges_with_props(&ie_to_delete, false, table, p); + } else { + input + .oe_csr + .parallel_delete_edges(&delete_edge_set, false, p); + input + .oe_csr + .parallel_delete_edges(&ie_to_delete, false, p); + } + + input.ie_csr.delete_vertices(dst_delete_set); + if let Some(table) = input.ie_prop.as_mut() { + input + .ie_csr + .parallel_delete_edges_with_props(&delete_edge_set, true, table, p); + input + .ie_csr + .parallel_delete_edges_with_props(&oe_to_delete, true, table, p); + } else { + input + .ie_csr + .parallel_delete_edges(&delete_edge_set, true, p); + input + .ie_csr + .parallel_delete_edges(&oe_to_delete, true, p); + } + } + + pub fn apply_vertices_delete_with_filename( + &mut self, graph: &mut GraphDB, label: LabelId, filenames: &Vec, id_col: i32, + ) -> GDBResult<()> + where + G: FromStr + Send + Sync + IndexType + Eq, + I: Send + Sync + IndexType, + { + let mut delete_sets = vec![HashSet::new(); graph.vertex_label_num as usize]; + let mut delete_set = HashSet::new(); + 
info!("Deleting vertex - {}", graph.graph_schema.vertex_label_names()[label as usize]); + let vertex_files_prefix = self.input_dir.clone(); + let vertex_files = get_files_list(&vertex_files_prefix, filenames).unwrap(); + if vertex_files.is_empty() { + return Ok(()); + } + + let parser = LDBCVertexParser::::new(label as LabelId, id_col as usize); + for vertex_file in vertex_files.iter() { + process_csv_rows( + vertex_file, + |record| { + let vertex_meta = parser.parse_vertex_meta(&record); + let (got_label, lid) = graph + .vertex_map + .get_internal_id(vertex_meta.global_id) + .unwrap(); + if got_label == label as LabelId { + delete_set.insert(lid); + } + }, + self.skip_header, + self.delim, + ); + } + + delete_sets[label as usize] = delete_set; + + let mut input_reps = self.take_csrs_with_label(graph, label); + input_reps.iter_mut().for_each(|rep| { + let edge_file_strings = vec![]; + let input_header = graph + .graph_schema + .get_edge_header(rep.src_label, rep.edge_label, rep.dst_label) + .unwrap(); + self.parallel_delete_rep( + rep, + graph, + &edge_file_strings, + &input_header, + &delete_sets, + self.parallel, + ); + }); + self.set_csrs(graph, input_reps); + let delete_set = &delete_sets[label as usize]; + for v in delete_set.iter() { + graph.vertex_map.remove_vertex(label, v); + } + + Ok(()) + } + + pub fn apply_edges_delete_with_filename( + &mut self, graph: &mut GraphDB, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + filenames: &Vec, src_id_col: i32, dst_id_col: i32, + ) -> GDBResult<()> + where + G: FromStr + Send + Sync + IndexType + Eq, + I: Send + Sync + IndexType, + { + let mut input_resp = self.take_csr(graph, src_label, dst_label, edge_label); + let mut input_header: Vec<(String, DataType)> = vec![]; + input_header.resize( + std::cmp::max(src_id_col as usize, dst_id_col as usize) + 1, + ("".to_string(), DataType::NULL), + ); + input_header[src_id_col as usize] = ("start_id".to_string(), DataType::ID); + input_header[dst_id_col as 
usize] = ("end_id".to_string(), DataType::ID); + let delete_sets = vec![HashSet::new(); graph.vertex_label_num as usize]; + self.parallel_delete_rep( + &mut input_resp, + graph, + filenames, + &input_header, + &delete_sets, + self.parallel, + ); + self.set_csr(graph, input_resp); + Ok(()) + } + + fn apply_deletes( + &mut self, graph: &mut GraphDB, delete_schema: &InputSchema, + ) -> GDBResult<()> + where + G: FromStr + Send + Sync + IndexType + Eq, + I: Send + Sync + IndexType, + { + let vertex_label_num = graph.vertex_label_num; + let mut delete_sets = vec![]; + for v_label_i in 0..vertex_label_num { + let mut delete_set = HashSet::new(); + if let Some(vertex_file_strings) = delete_schema.get_vertex_file(v_label_i as LabelId) { + if !vertex_file_strings.is_empty() { + info!( + "Deleting vertex - {}", + graph.graph_schema.vertex_label_names()[v_label_i as usize] + ); + let vertex_files_prefix = self.input_dir.clone(); + let vertex_files = get_files_list_beta(&vertex_files_prefix, &vertex_file_strings); + if vertex_files.is_empty() { + delete_sets.push(delete_set); + continue; + } + let input_header = delete_schema + .get_vertex_header(v_label_i as LabelId) + .unwrap(); + let mut id_col = 0; + for (index, (n, _)) in input_header.iter().enumerate() { + if n == "id" { + id_col = index; + break; + } + } + let parser = LDBCVertexParser::::new(v_label_i as LabelId, id_col); + for vertex_file in vertex_files.iter() { + process_csv_rows( + vertex_file, + |record| { + let vertex_meta = parser.parse_vertex_meta(&record); + let (got_label, lid) = graph + .vertex_map + .get_internal_id(vertex_meta.global_id) + .unwrap(); + if got_label == v_label_i as LabelId { + delete_set.insert(lid); + } + }, + self.skip_header, + self.delim, + ); + } + } + } + delete_sets.push(delete_set); + } + + let mut input_reps = self.take_csrs(graph); + input_reps.iter_mut().for_each(|rep| { + let default_vec: Vec = vec![]; + let edge_file_strings = delete_schema + .get_edge_file(rep.src_label, 
rep.edge_label, rep.dst_label) + .unwrap_or_else(|| &default_vec); + let input_header = delete_schema + .get_edge_header(rep.src_label, rep.edge_label, rep.dst_label) + .unwrap_or_else(|| &[]); + + self.parallel_delete_rep( + rep, + graph, + &edge_file_strings, + &input_header, + &delete_sets, + self.parallel, + ); + }); + self.set_csrs(graph, input_reps); + + for v_label_i in 0..vertex_label_num { + let delete_set = &delete_sets[v_label_i as usize]; + if delete_set.is_empty() { + continue; + } + for v in delete_set.iter() { + graph + .vertex_map + .remove_vertex(v_label_i as LabelId, v); + } + } + + Ok(()) + } + + pub fn apply_vertices_insert_with_filename( + &mut self, graph: &mut GraphDB, label: LabelId, filenames: &Vec, id_col: i32, + mappings: &Vec, + ) -> GDBResult<()> + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + let graph_header = graph + .graph_schema + .get_vertex_header(label as LabelId) + .unwrap(); + let header = graph_header.to_vec(); + + let parser = LDBCVertexParser::::new(label as LabelId, id_col as usize); + let vertex_files_prefix = self.input_dir.clone(); + + let vertex_files = get_files_list(&vertex_files_prefix, filenames); + if vertex_files.is_err() { + warn!( + "Get vertex files {:?}/{:?} failed: {:?}", + &vertex_files_prefix, + filenames, + vertex_files.err().unwrap() + ); + return Ok(()); + } + let vertex_files = vertex_files.unwrap(); + if vertex_files.is_empty() { + return Ok(()); + } + for vertex_file in vertex_files.iter() { + process_csv_rows( + vertex_file, + |record| { + let vertex_meta = parser.parse_vertex_meta(&record); + if let Ok(properties) = parse_properties_by_mappings(&record, &header, mappings) { + graph.insert_vertex(vertex_meta.label, vertex_meta.global_id, Some(properties)); + } + }, + self.skip_header, + self.delim, + ); + } + + Ok(()) + } + + fn apply_vertices_inserts( + &mut self, graph: &mut GraphDB, input_schema: &InputSchema, + ) -> GDBResult<()> + where + I: Send + Sync 
+ IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + let v_label_num = graph.vertex_label_num; + for v_label_i in 0..v_label_num { + if let Some(vertex_file_strings) = input_schema.get_vertex_file(v_label_i as LabelId) { + if vertex_file_strings.is_empty() { + continue; + } + + let input_header = input_schema + .get_vertex_header(v_label_i as LabelId) + .unwrap(); + let graph_header = graph + .graph_schema + .get_vertex_header(v_label_i as LabelId) + .unwrap(); + let mut keep_set = HashSet::new(); + for pair in graph_header { + keep_set.insert(pair.0.clone()); + } + let mut selected = vec![false; input_header.len()]; + let mut id_col_id = 0; + for (index, (n, _)) in input_header.iter().enumerate() { + if keep_set.contains(n) { + selected[index] = true; + } + if n == "id" { + id_col_id = index; + } + } + let parser = LDBCVertexParser::::new(v_label_i as LabelId, id_col_id); + let vertex_files_prefix = self.input_dir.clone(); + + let vertex_files = get_files_list(&vertex_files_prefix, &vertex_file_strings); + if vertex_files.is_err() { + warn!( + "Get vertex files {:?}/{:?} failed: {:?}", + &vertex_files_prefix, + &vertex_file_strings, + vertex_files.err().unwrap() + ); + continue; + } + let vertex_files = vertex_files.unwrap(); + if vertex_files.is_empty() { + continue; + } + for vertex_file in vertex_files.iter() { + process_csv_rows( + vertex_file, + |record| { + let vertex_meta = parser.parse_vertex_meta(&record); + if let Ok(properties) = + parse_properties(&record, input_header, selected.as_slice()) + { + graph.insert_vertex( + vertex_meta.label, + vertex_meta.global_id, + Some(properties), + ); + } + }, + self.skip_header, + self.delim, + ); + } + } + } + + Ok(()) + } + + fn load_insert_edges( + &self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + input_header: &[(String, DataType)], graph_schema: &CsrGraphSchema, files: &Vec, + ) -> GDBResult<(Vec<(G, G)>, Option)> + where + G: FromStr + Send + Sync + IndexType + Eq, + { + let mut 
edges = vec![]; + + let graph_header = graph_schema + .get_edge_header(src_label, edge_label, dst_label) + .unwrap(); + let mut table_header = vec![]; + let mut keep_set = HashSet::new(); + for pair in graph_header { + table_header.push((pair.1.clone(), pair.0.clone())); + keep_set.insert(pair.0.clone()); + } + + let mut selected = vec![false; input_header.len()]; + let mut src_col_id = 0; + let mut dst_col_id = 1; + for (index, (n, _)) in input_header.iter().enumerate() { + if keep_set.contains(n) { + selected[index] = true; + } + if n == "start_id" { + src_col_id = index; + } + if n == "end_id" { + dst_col_id = index; + } + } + + let mut parser = LDBCEdgeParser::::new(src_label, dst_label, edge_label); + parser.with_endpoint_col_id(src_col_id, dst_col_id); + + if table_header.is_empty() { + for file in files.iter() { + process_csv_rows( + file, + |record| { + let edge_meta = parser.parse_edge_meta(&record); + edges.push((edge_meta.src_global_id, edge_meta.dst_global_id)); + }, + self.skip_header, + self.delim, + ); + } + Ok((edges, None)) + } else { + let mut prop_table = ColTable::new(table_header); + for file in files.iter() { + process_csv_rows( + file, + |record| { + let edge_meta = parser.parse_edge_meta(&record); + let properties = + parse_properties(&record, input_header, selected.as_slice()).unwrap(); + edges.push((edge_meta.src_global_id, edge_meta.dst_global_id)); + prop_table.push(&properties); + }, + self.skip_header, + self.delim, + ) + } + Ok((edges, Some(prop_table))) + } + } + + fn parallel_insert_rep( + &self, input: &mut CsrRep, graph: &GraphDB, edge_file_strings: &Vec, + input_header: &[(String, DataType)], p: u32, + ) where + G: FromStr + Send + Sync + IndexType + Eq, + I: Send + Sync + IndexType, + { + let t = Instant::now(); + let src_label = input.src_label; + let edge_label = input.edge_label; + let dst_label = input.dst_label; + + let graph_header = graph + .graph_schema + .get_edge_header(src_label, edge_label, dst_label); + if 
graph_header.is_none() { + return; + } + + if edge_file_strings.is_empty() { + return; + } + + let edge_files = get_files_list(&self.input_dir.clone(), edge_file_strings); + if edge_files.is_err() { + return; + } + let edge_files = edge_files.unwrap(); + if edge_files.is_empty() { + return; + } + + let (edges, table) = self + .load_insert_edges::( + src_label, + edge_label, + dst_label, + input_header, + &graph.graph_schema, + &edge_files, + ) + .unwrap(); + + let parsed_edges: Vec<(I, I)> = edges + .par_iter() + .map(|(src, dst)| { + let (got_src_label, src_lid) = graph.vertex_map.get_internal_id(*src).unwrap(); + let (got_dst_label, dst_lid) = graph.vertex_map.get_internal_id(*dst).unwrap(); + if got_src_label != src_label || got_dst_label != dst_label { + warn!("insert edges with wrong label"); + (::max(), ::max()) + } else { + (src_lid, dst_lid) + } + }) + .collect(); + + let new_src_num = graph.vertex_map.vertex_num(src_label); + input.oe_prop = if let Some(old_table) = input.oe_prop.take() { + Some(input.oe_csr.insert_edges_with_prop( + new_src_num, + &parsed_edges, + table.as_ref().unwrap(), + false, + p, + old_table, + )) + } else { + input + .oe_csr + .insert_edges(new_src_num, &parsed_edges, false, p); + None + }; + + let new_dst_num = graph.vertex_map.vertex_num(dst_label); + input.ie_prop = if let Some(old_table) = input.ie_prop.take() { + Some(input.ie_csr.insert_edges_with_prop( + new_dst_num, + &parsed_edges, + table.as_ref().unwrap(), + true, + p, + old_table, + )) + } else { + input + .ie_csr + .insert_edges(new_dst_num, &parsed_edges, true, p); + None + }; + + println!( + "insert edge (parallel{}): {} - {} - {}: {}", + p, + graph.graph_schema.vertex_label_names()[src_label as usize], + graph.graph_schema.edge_label_names()[edge_label as usize], + graph.graph_schema.vertex_label_names()[dst_label as usize], + t.elapsed().as_secs_f32(), + ); + } + + pub fn apply_edges_insert_with_filename( + &mut self, graph: &mut GraphDB, src_label: LabelId, 
edge_label: LabelId, dst_label: LabelId, + filenames: &Vec, src_id_col: i32, dst_id_col: i32, mappings: &Vec, + ) -> GDBResult<()> + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + let mut parser = LDBCEdgeParser::::new(src_label, dst_label, edge_label); + parser.with_endpoint_col_id(src_id_col as usize, dst_id_col as usize); + + let edge_files_prefix = self.input_dir.clone(); + let edge_files = get_files_list(&edge_files_prefix, filenames); + if edge_files.is_err() { + warn!( + "Get vertex files {:?}/{:?} failed: {:?}", + &edge_files_prefix, + filenames, + edge_files.err().unwrap() + ); + return Ok(()); + } + let edge_files = edge_files.unwrap(); + let mut input_reps = self.take_csr(graph, src_label, dst_label, edge_label); + let mut edges = vec![]; + let graph_header = graph + .graph_schema + .get_edge_header(src_label, edge_label, dst_label) + .unwrap(); + let mut table_header = vec![]; + for pair in graph_header { + table_header.push((pair.1.clone(), pair.0.clone())); + } + let mut prop_table = ColTable::new(table_header.clone()); + if table_header.is_empty() { + for file in edge_files { + process_csv_rows( + &file, + |record| { + let edge_meta = parser.parse_edge_meta(&record); + edges.push((edge_meta.src_global_id, edge_meta.dst_global_id)); + }, + self.skip_header, + self.delim, + ); + } + } else { + for file in edge_files { + process_csv_rows( + &file, + |record| { + let edge_meta = parser.parse_edge_meta(&record); + edges.push((edge_meta.src_global_id, edge_meta.dst_global_id)); + if let Ok(properties) = + parse_properties_by_mappings(&record, &graph_header, mappings) + { + prop_table.push(&properties); + } + }, + self.skip_header, + self.delim, + ) + } + } + + let parsed_edges: Vec<(I, I)> = edges + .par_iter() + .map(|(src, dst)| { + let (got_src_label, src_lid) = graph.vertex_map.get_internal_id(*src).unwrap(); + let (got_dst_label, dst_lid) = graph.vertex_map.get_internal_id(*dst).unwrap(); + if got_src_label != 
src_label || got_dst_label != dst_label { + warn!("insert edges with wrong label"); + (::max(), ::max()) + } else { + (src_lid, dst_lid) + } + }) + .collect(); + let new_src_num = graph.vertex_map.vertex_num(src_label); + input_reps.oe_prop = if let Some(old_table) = input_reps.oe_prop.take() { + Some(input_reps.oe_csr.insert_edges_with_prop( + new_src_num, + &parsed_edges, + &prop_table, + false, + self.parallel, + old_table, + )) + } else { + input_reps + .oe_csr + .insert_edges(new_src_num, &parsed_edges, false, self.parallel); + None + }; + + let new_dst_num = graph.vertex_map.vertex_num(dst_label); + input_reps.ie_prop = if let Some(old_table) = input_reps.ie_prop.take() { + Some(input_reps.ie_csr.insert_edges_with_prop( + new_dst_num, + &parsed_edges, + &prop_table, + true, + self.parallel, + old_table, + )) + } else { + input_reps + .ie_csr + .insert_edges(new_dst_num, &parsed_edges, true, self.parallel); + None + }; + self.set_csr(graph, input_reps); + Ok(()) + } + + fn apply_edges_inserts( + &mut self, graph: &mut GraphDB, input_schema: &InputSchema, + ) -> GDBResult<()> + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + let mut input_reps = self.take_csrs(graph); + for ir in input_reps.iter_mut() { + let edge_files = input_schema.get_edge_file(ir.src_label, ir.edge_label, ir.dst_label); + if edge_files.is_none() { + continue; + } + let input_header = input_schema + .get_edge_header(ir.src_label, ir.edge_label, ir.dst_label) + .unwrap(); + self.parallel_insert_rep(ir, graph, edge_files.unwrap(), input_header, self.parallel); + } + self.set_csrs(graph, input_reps); + + Ok(()) + } + + pub fn insert(&mut self, graph: &mut GraphDB, insert_schema: &InputSchema) -> GDBResult<()> + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + self.apply_vertices_inserts(graph, &insert_schema)?; + self.apply_edges_inserts(graph, &insert_schema)?; + Ok(()) + } + + pub fn delete(&mut self, graph: &mut 
GraphDB, delete_schema: &InputSchema) -> GDBResult<()> + where + I: Send + Sync + IndexType, + G: FromStr + Send + Sync + IndexType + Eq, + { + self.apply_deletes(graph, &delete_schema)?; + Ok(()) + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/ldbc_parser.rs b/interactive_engine/executor/store/bmcsr/src/ldbc_parser.rs new file mode 100644 index 000000000000..2d7680c9e00b --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/ldbc_parser.rs @@ -0,0 +1,143 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::marker::PhantomData; +use std::str::FromStr; + +use abomonation_derive::Abomonation; +use csv::StringRecord; + +use crate::graph::IndexType; +use crate::types::*; + +/// The vertex's meta data including global_id and label_id +#[derive(Abomonation, PartialEq, Clone, Debug)] +pub struct VertexMeta { + pub global_id: G, + pub label: LabelId, +} + +/// The edge's meta data after parsing from the csv file. 
+#[derive(Abomonation, PartialEq, Clone, Debug)] +pub struct EdgeMeta { + pub src_global_id: G, + pub src_label_id: LabelId, + pub dst_global_id: G, + pub dst_label_id: LabelId, + pub label_id: LabelId, +} + +/// Define parsing a LDBC vertex +#[derive(Clone)] +pub struct LDBCVertexParser { + vertex_type: LabelId, + id_index: usize, + ph: PhantomData, +} + +pub const LABEL_SHIFT_BITS: usize = 8 * (std::mem::size_of::() - std::mem::size_of::()); + +impl LDBCVertexParser { + pub fn to_global_id(ldbc_id: usize, label_id: LabelId) -> G { + let global_id: usize = ((label_id as usize) << LABEL_SHIFT_BITS) | ldbc_id; + G::new(global_id) + } + + pub fn encode_local_id(local_id: usize, label_id: LabelId) -> G { + let encode_id: usize = ((label_id as usize) << LABEL_SHIFT_BITS) | local_id; + G::new(encode_id) + } + + pub fn decode_local_id(encoded_id: usize) -> (LabelId, G) { + let label_id = (encoded_id >> LABEL_SHIFT_BITS) as LabelId; + let local_id: usize = ((1_usize << LABEL_SHIFT_BITS) - 1) & encoded_id.index(); + return (label_id, G::new(local_id)); + } + + pub fn get_label_id(global_id: G) -> LabelId { + (global_id.index() >> LABEL_SHIFT_BITS) as LabelId + } + + pub fn get_original_id(global_id: G) -> G { + let mask = (1_usize << LABEL_SHIFT_BITS) - 1; + G::new(global_id.index() & mask) + } +} + +impl LDBCVertexParser { + pub fn new(vertex_type: LabelId, id_index: usize) -> Self { + Self { vertex_type, id_index, ph: PhantomData } + } + + pub fn parse_vertex_meta(&self, record: &StringRecord) -> VertexMeta { + let global_id = Self::to_global_id( + record + .get(self.id_index) + .unwrap() + .parse::() + .unwrap(), + self.vertex_type, + ); + VertexMeta { global_id, label: self.vertex_type } + } +} + +/// Define parsing a LDBC edge +#[derive(Clone)] +pub struct LDBCEdgeParser { + src_vertex_type: LabelId, + dst_vertex_type: LabelId, + edge_type: LabelId, + src_col_id: usize, + dst_col_id: usize, + ph: PhantomData, +} + +impl LDBCEdgeParser { + pub fn new(src_vertex_type: 
LabelId, dst_vertex_type: LabelId, edge_type: LabelId) -> Self { + Self { src_vertex_type, dst_vertex_type, edge_type, src_col_id: 0, dst_col_id: 1, ph: PhantomData } + } + + pub fn with_endpoint_col_id(&mut self, src_col_id: usize, dst_col_id: usize) { + self.src_col_id = src_col_id; + self.dst_col_id = dst_col_id; + } + + pub fn parse_edge_meta(&self, record: &StringRecord) -> EdgeMeta { + let src_global_id = LDBCVertexParser::to_global_id( + record + .get(self.src_col_id) + .unwrap() + .parse::() + .unwrap(), + self.src_vertex_type, + ); + let dst_global_id = LDBCVertexParser::to_global_id( + record + .get(self.dst_col_id) + .unwrap() + .parse::() + .unwrap(), + self.dst_vertex_type, + ); + EdgeMeta { + src_global_id, + src_label_id: self.src_vertex_type, + dst_global_id, + dst_label_id: self.dst_vertex_type, + label_id: self.edge_type, + } + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/lib.rs b/interactive_engine/executor/store/bmcsr/src/lib.rs new file mode 100644 index 000000000000..22da529e96ba --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/lib.rs @@ -0,0 +1,45 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +extern crate abomonation_derive; +#[macro_use] +extern crate dyn_type; +#[macro_use] +extern crate log; +extern crate core; +extern crate serde; +extern crate serde_derive; +extern crate serde_json; + +pub mod bmcsr; +pub mod bmscsr; +pub mod col_table; +pub mod columns; +pub mod csr; +pub mod date; +pub mod date_time; +pub mod edge_trim; +pub mod error; +pub mod graph; +pub mod graph_db; +pub mod graph_loader; +pub mod graph_modifier; +pub mod ldbc_parser; +pub mod schema; +pub mod sub_graph; +pub mod traverse; +pub mod types; +pub mod utils; +pub mod vertex_map; diff --git a/interactive_engine/executor/store/bmcsr/src/schema.rs b/interactive_engine/executor/store/bmcsr/src/schema.rs new file mode 100644 index 000000000000..ef4c9cd89b57 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/schema.rs @@ -0,0 +1,745 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. 
+ +use std::collections::{HashMap, HashSet}; +use std::fmt::Debug; +use std::fs::File; +use std::hash::Hash; +use std::path::Path; + +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +use crate::columns::DataType; +use crate::types::*; + +/// The starting id field in an edge file +pub const START_ID_FIELD: &'static str = "start_id"; +/// The end id field in an edge file +pub const END_ID_FIELD: &'static str = "end_id"; + +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)] +pub enum PartitionType { + Dynamic, + Static, + Null, +} + +impl<'a> From<&'a str> for PartitionType { + fn from(_token: &'a str) -> Self { + let token_str = _token.to_uppercase(); + let token = token_str.as_str(); + if token == "DYNAMIC" { + PartitionType::Dynamic + } else if token == "STATIC" { + PartitionType::Static + } else { + error!("Unsupported type {:?}", token); + PartitionType::Null + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)] +pub enum EdgeStrategy { + Single, + Multiple, + Null, +} + +impl<'a> From<&'a str> for EdgeStrategy { + fn from(_token: &'a str) -> Self { + let token_str = _token.to_uppercase(); + let token = token_str.as_str(); + if token == "SINGLE" { + EdgeStrategy::Single + } else if token == "MULTI" { + EdgeStrategy::Multiple + } else { + error!("Unsupported type {:?}", token); + EdgeStrategy::Null + } + } +} + +/// An edge's label is consisted of three elements: +/// edge_label, src_vertex_label and dst_vertex_label. 
+#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)] +pub struct EdgeLabelTuple { + pub edge_label: LabelId, + pub src_vertex_label: LabelId, + pub dst_vertex_label: LabelId, +} + +pub trait Schema { + /// Get the header for the certain type of vertex if any + fn get_vertex_header(&self, vertex_type_id: LabelId) -> Option<&[(String, DataType)]>; + + /// Get the header for the certain type of edge if any + fn get_edge_header( + &self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + ) -> Option<&[(String, DataType)]>; + + /// Get the schema for the certain type of vertex if any. + fn get_vertex_schema(&self, vertex_type_id: LabelId) -> Option<&HashMap>; + + /// Get the schema for the certain + /// type of edge if any. + fn get_edge_schema( + &self, edge_type_id: (LabelId, LabelId, LabelId), + ) -> Option<&HashMap>; + + /// Get a certain vertex type's id if any + fn get_vertex_label_id(&self, vertex_type: &str) -> Option; + + /// Get a certain edge type's id + fn get_edge_label_id(&self, edge_type: &str) -> Option; +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct CsrGraphSchema { + /// Map from vertex types to labelid + pub vertex_type_to_id: HashMap, + /// Map from edge types to `EdgeLabelTuple` + pub edge_type_to_id: HashMap, + /// Map from vertex/edge (labelid) to its property name, data types and index in the row + vertex_prop_meta: HashMap>, + vertex_prop_vec: HashMap>, + vertex_partition_type: HashMap, + + edge_prop_meta: HashMap<(LabelId, LabelId, LabelId), HashMap>, + edge_prop_vec: HashMap<(LabelId, LabelId, LabelId), Vec<(String, DataType)>>, + edge_single_ie: HashSet<(LabelId, LabelId, LabelId)>, + edge_single_oe: HashSet<(LabelId, LabelId, LabelId)>, +} + +impl CsrGraphSchema { + pub fn vertex_label_names(&self) -> Vec { + let mut ret = vec![]; + let vertex_label_num = self.vertex_type_to_id.len(); + for _ in 0..vertex_label_num { + ret.push(String::new()); + } + for pair in self.vertex_type_to_id.iter() { + 
ret[*pair.1 as usize] = pair.0.clone(); + } + + ret + } + + pub fn edge_label_names(&self) -> Vec { + let mut ret = vec![]; + let edge_label_num = self.edge_type_to_id.len(); + for _ in 0..edge_label_num { + ret.push(String::new()); + } + for pair in self.edge_type_to_id.iter() { + ret[*pair.1 as usize] = pair.0.clone(); + } + + ret + } + + pub fn is_static_vertex(&self, vertex_label: LabelId) -> bool { + *self + .vertex_partition_type + .get(&vertex_label) + .unwrap() + == PartitionType::Static + } + + pub fn is_single_ie(&self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId) -> bool { + if self + .edge_single_ie + .contains(&(src_label, edge_label, dst_label)) + { + true + } else { + false + } + } + + pub fn is_single_oe(&self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId) -> bool { + if self + .edge_single_oe + .contains(&(src_label, edge_label, dst_label)) + { + true + } else { + false + } + } + + pub fn desc(&self) { + info!( + "vertex label num: {}, edge label num: {}", + self.vertex_type_to_id.len(), + self.edge_type_to_id.len() + ); + let mut vertex_type_names = vec![]; + let mut edge_type_names = vec![]; + vertex_type_names.resize(self.vertex_type_to_id.len(), "".to_string()); + edge_type_names.resize(self.edge_type_to_id.len(), "".to_string()); + + for pair in self.vertex_type_to_id.iter() { + vertex_type_names[*pair.1 as usize] = pair.0.clone(); + info!("vertex label: {}, id: {}", pair.0.clone(), pair.1); + } + for pair in self.edge_type_to_id.iter() { + edge_type_names[*pair.1 as usize] = pair.0.clone(); + info!("edge label: {}, id: {}", pair.0.clone(), pair.1); + } + + info!("Single IE: "); + for tup in self.edge_single_ie.iter() { + info!( + "\t{} - {} - {}", + vertex_type_names[tup.0 as usize], + edge_type_names[tup.1 as usize], + vertex_type_names[tup.2 as usize] + ); + } + info!("Single OE: "); + for tup in self.edge_single_oe.iter() { + info!( + "\t{} - {} - {}", + vertex_type_names[tup.0 as usize], + 
edge_type_names[tup.1 as usize], + vertex_type_names[tup.2 as usize] + ); + } + } + + pub fn from_json_file>(path: P) -> std::io::Result { + let file = File::open(path)?; + let schema_json = + serde_json::from_reader::(file).map_err(std::io::Error::from)?; + Ok(CsrGraphSchema::from(&schema_json)) + } + + pub fn to_json_file>(&self, path: P) -> std::io::Result<()> { + let file = File::create(path)?; + let schema_json = CsrGraphSchemaJson::from(self); + serde_json::to_writer_pretty::(file, &schema_json) + .map_err(std::io::Error::from) + } + + /// Get a certain edge type's id, together with its start- and edge- vertices's type + /// while giving the `full_edge_type` that is "__" + pub fn get_edge_label_tuple(&self, full_edge_type: &str) -> Option { + let mut parts = full_edge_type.split("_"); + let src_label_id = + if let Some(src_label) = parts.next() { self.get_vertex_label_id(src_label) } else { None }; + let edge_label_id = + if let Some(edge_label) = parts.next() { self.get_edge_label_id(edge_label) } else { None }; + let dst_label_id = + if let Some(dst_label) = parts.next() { self.get_vertex_label_id(dst_label) } else { None }; + + if src_label_id.is_some() && edge_label_id.is_some() && dst_label_id.is_some() { + Some(EdgeLabelTuple { + edge_label: edge_label_id.unwrap(), + src_vertex_label: src_label_id.unwrap(), + dst_vertex_label: dst_label_id.unwrap(), + }) + } else { + None + } + } + + pub fn add_vertex_index_prop( + &mut self, index_name: String, vertex_label: LabelId, data_type: DataType, + ) -> Option { + if let Some(mut prop_meta) = self.vertex_prop_meta.get_mut(&vertex_label) { + if let Some(mut prop_list) = self.vertex_prop_vec.get_mut(&vertex_label) { + if let Some((_, index_label)) = prop_meta.get(&index_name) { + return Some(*index_label); + } else { + let index_label = prop_list.len(); + prop_meta.insert(index_name.clone(), (data_type, index_label)); + prop_list.push((index_name.clone(), data_type)); + return Some(index_label); + } + } + } + 
None + } + + pub fn add_edge_index_prop( + &mut self, index_name: String, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + data_type: DataType, + ) -> Option { + if let Some(mut prop_meta) = self + .edge_prop_meta + .get_mut(&(src_label, edge_label, dst_label)) + { + if let Some(mut prop_list) = self + .edge_prop_vec + .get_mut(&(src_label, edge_label, dst_label)) + { + if let Some((_, index_label)) = prop_meta.get(&index_name) { + return Some(*index_label); + } else { + let index_label = prop_list.len(); + prop_meta.insert(index_name.clone(), (data_type, index_label)); + prop_list.push((index_name.clone(), data_type)); + return Some(index_label); + } + } + } + None + } +} + +fn is_map_eq( + map1: &HashMap, map2: &HashMap, +) -> bool { + map1.iter().sorted().eq(map2.iter().sorted()) +} + +impl PartialEq for CsrGraphSchema { + fn eq(&self, other: &Self) -> bool { + let mut is_eq = is_map_eq(&self.vertex_type_to_id, &other.vertex_type_to_id) + && is_map_eq(&self.edge_type_to_id, &other.edge_type_to_id) + && is_map_eq(&self.vertex_prop_vec, &other.vertex_prop_vec) + && is_map_eq(&self.edge_prop_vec, &other.edge_prop_vec) + && self.vertex_prop_meta.len() == other.vertex_prop_meta.len() + && self.edge_prop_meta.len() == other.edge_prop_meta.len(); + + if is_eq { + for ((k1, v1), (k2, v2)) in self + .vertex_prop_meta + .iter() + .sorted_by(|e1, e2| e1.0.cmp(e2.0)) + .zip( + other + .vertex_prop_meta + .iter() + .sorted_by(|e1, e2| e1.0.cmp(e2.0)), + ) + { + is_eq = k1 == k2 && is_map_eq(v1, v2); + if !is_eq { + break; + } + } + + for ((k1, v1), (k2, v2)) in self + .edge_prop_meta + .iter() + .sorted_by(|e1, e2| e1.0.cmp(e2.0)) + .zip( + other + .edge_prop_meta + .iter() + .sorted_by(|e1, e2| e1.0.cmp(e2.0)), + ) + { + is_eq = k1 == k2 && is_map_eq(v1, v2); + if !is_eq { + break; + } + } + } + + is_eq + } +} + +impl<'a> From<&'a CsrGraphSchemaJson> for CsrGraphSchema { + fn from(schema_json: &'a CsrGraphSchemaJson) -> Self { + let mut vertex_type_to_id = 
HashMap::new(); + let mut vertex_partition_type = HashMap::new(); + let mut vertex_label = 0 as LabelId; + for vertex_info in &schema_json.vertex { + vertex_type_to_id.insert(vertex_info.label.clone(), vertex_label); + vertex_partition_type.insert(vertex_label, vertex_info.partition_type.clone()); + vertex_label += 1; + } + let mut edge_type_to_id = HashMap::new(); + let mut edge_label = 0 as LabelId; + for edge_info in &schema_json.edge { + if !edge_type_to_id.contains_key(&edge_info.label) { + edge_type_to_id.insert(edge_info.label.clone(), edge_label); + edge_label += 1; + } + } + let mut vertex_prop_meta: HashMap> = + HashMap::with_capacity(schema_json.vertex.len()); + let mut vertex_prop_vec: HashMap> = + HashMap::with_capacity(schema_json.vertex.len()); + let mut edge_prop_meta: HashMap<(LabelId, LabelId, LabelId), HashMap> = + HashMap::with_capacity(schema_json.edge.len()); + let mut edge_prop_vec: HashMap<(LabelId, LabelId, LabelId), Vec<(String, DataType)>> = + HashMap::with_capacity(schema_json.edge.len()); + let mut edge_single_ie = HashSet::new(); + let mut edge_single_oe = HashSet::new(); + + for vertex_info in &schema_json.vertex { + let label_id = vertex_type_to_id[&vertex_info.label]; + let vertex_map = vertex_prop_meta + .entry(label_id) + .or_insert_with(HashMap::new); + let vertex_vec = vertex_prop_vec + .entry(label_id) + .or_insert_with(Vec::new); + + for (index, column) in vertex_info.properties.iter().enumerate() { + vertex_map.insert(column.name.clone(), (column.data_type.clone(), index)); + vertex_vec.push((column.name.clone(), column.data_type.clone())); + } + } + + for edge_info in &schema_json.edge { + let src_label_id = vertex_type_to_id[&edge_info.src_label]; + let dst_label_id = vertex_type_to_id[&edge_info.dst_label]; + let label_id = edge_type_to_id[&edge_info.label]; + let edge_map = edge_prop_meta + .entry((src_label_id, label_id, dst_label_id)) + .or_insert_with(HashMap::new); + let edge_vec = edge_prop_vec + 
.entry((src_label_id, label_id, dst_label_id)) + .or_insert_with(Vec::new); + + if edge_info.ie_strategy.is_some() + && *edge_info.ie_strategy.as_ref().unwrap() == EdgeStrategy::Single + { + edge_single_ie.insert((src_label_id, label_id, dst_label_id)); + } + + if edge_info.oe_strategy.is_some() + && *edge_info.oe_strategy.as_ref().unwrap() == EdgeStrategy::Single + { + edge_single_oe.insert((src_label_id, label_id, dst_label_id)); + } + + if let Some(properties) = &edge_info.properties { + for (index, column) in properties.iter().enumerate() { + edge_map.insert(column.name.clone(), (column.data_type.clone(), index)); + edge_vec.push((column.name.clone(), column.data_type.clone())); + } + } + } + + Self { + vertex_type_to_id, + edge_type_to_id, + vertex_prop_meta, + vertex_prop_vec, + vertex_partition_type, + edge_prop_meta, + edge_prop_vec, + edge_single_ie, + edge_single_oe, + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct InputSchema { + /// Map from vertex label id to headers in input file + vertex_headers: HashMap>, + /// Map from src_vertex, edge, dst_vertex label id to headers in input file + edge_headers: HashMap<(LabelId, LabelId, LabelId), Vec<(String, DataType)>>, + + /// Map for vertex label id to input file + vertex_files: HashMap>, + /// Map for src_vertex, edge, dst_vertex label id to input file + edge_files: HashMap<(LabelId, LabelId, LabelId), Vec>, +} + +impl InputSchema { + pub fn get_vertex_header(&self, vertex_label: LabelId) -> Option<&[(String, DataType)]> { + self.vertex_headers + .get(&vertex_label) + .map(|vec| vec.as_slice()) + } + + pub fn get_edge_header( + &self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + ) -> Option<&[(String, DataType)]> { + self.edge_headers + .get(&(src_label, edge_label, dst_label)) + .map(|vec| vec.as_slice()) + } + + pub fn get_vertex_file(&self, vertex_label: LabelId) -> Option<&Vec> { + self.vertex_files.get(&vertex_label) + } + + pub fn get_edge_file( + &self, 
src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + ) -> Option<&Vec> { + self.edge_files + .get(&(src_label, edge_label, dst_label)) + } + + pub fn from_string(s: String, graph_schema: &CsrGraphSchema) -> std::io::Result { + let input_json: InputSchemaJson = serde_json::from_str(&s)?; + let mut vertex_headers = HashMap::new(); + let mut vertex_files = HashMap::new(); + for vertex in &input_json.vertex { + if let Some(vertex_label) = graph_schema + .vertex_type_to_id + .get(&vertex.label) + { + let mut properties = vec![]; + for column in &vertex.columns { + properties.push((column.name.clone(), column.data_type.clone())); + } + vertex_headers.insert(*vertex_label, properties); + vertex_files.insert(*vertex_label, vertex.files.clone()); + } + } + let mut edge_headers = HashMap::new(); + let mut edge_files = HashMap::new(); + for edge in &input_json.edge { + if let (Some(src_label), Some(edge_label), Some(dst_label)) = ( + graph_schema + .vertex_type_to_id + .get(&edge.src_label), + graph_schema.edge_type_to_id.get(&edge.label), + graph_schema + .vertex_type_to_id + .get(&edge.dst_label), + ) { + let mut properties = vec![]; + for column in &edge.columns { + properties.push((column.name.clone(), column.data_type.clone())); + } + edge_headers.insert((*src_label, *edge_label, *dst_label), properties); + edge_files.insert((*src_label, *edge_label, *dst_label), edge.files.clone()); + } + } + Ok(InputSchema { vertex_headers, edge_headers, vertex_files, edge_files }) + } + + pub fn from_json_file>(path: P, graph_schema: &CsrGraphSchema) -> std::io::Result { + let file = File::open(path)?; + let input_json = + serde_json::from_reader::(file).map_err(std::io::Error::from)?; + let mut vertex_headers = HashMap::new(); + let mut vertex_files = HashMap::new(); + for vertex in &input_json.vertex { + if let Some(vertex_label) = graph_schema + .vertex_type_to_id + .get(&vertex.label) + { + let mut properties = vec![]; + for column in &vertex.columns { + 
properties.push((column.name.clone(), column.data_type.clone())); + } + vertex_headers.insert(*vertex_label, properties); + vertex_files.insert(*vertex_label, vertex.files.clone()); + } + } + let mut edge_headers = HashMap::new(); + let mut edge_files = HashMap::new(); + for edge in &input_json.edge { + if let (Some(src_label), Some(edge_label), Some(dst_label)) = ( + graph_schema + .vertex_type_to_id + .get(&edge.src_label), + graph_schema.edge_type_to_id.get(&edge.label), + graph_schema + .vertex_type_to_id + .get(&edge.dst_label), + ) { + let mut properties = vec![]; + for column in &edge.columns { + properties.push((column.name.clone(), column.data_type.clone())); + } + edge_headers.insert((*src_label, *edge_label, *dst_label), properties); + edge_files.insert((*src_label, *edge_label, *dst_label), edge.files.clone()); + } + } + Ok(InputSchema { vertex_headers, edge_headers, vertex_files, edge_files }) + } +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +struct ColumnInfo { + name: String, + data_type: DataType, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +struct InputVertex { + label: String, + columns: Vec, + files: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +struct InputEdge { + src_label: String, + dst_label: String, + label: String, + columns: Vec, + files: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +struct VertexInfo { + label: String, + partition_type: PartitionType, + properties: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +struct EdgeInfo { + src_label: String, + dst_label: String, + label: String, + #[serde(skip_serializing_if = "Option::is_none")] + ie_strategy: Option, + #[serde(skip_serializing_if = "Option::is_none")] + oe_strategy: Option, + #[serde(skip_serializing_if = "Option::is_none")] + properties: Option>, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +struct CsrGraphSchemaJson { + vertex: Vec, + edge: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +struct 
InputSchemaJson { + vertex: Vec, + edge: Vec, +} + +impl<'a> From<&'a CsrGraphSchema> for CsrGraphSchemaJson { + fn from(schema: &'a CsrGraphSchema) -> Self { + let vertex_label_num = schema.vertex_type_to_id.len(); + let edge_label_num = schema.edge_type_to_id.len(); + + let mut vertex_info_vec = vec![ + VertexInfo { + label: "".to_string(), + partition_type: PartitionType::Dynamic, + properties: vec![], + }; + vertex_label_num + ]; + let mut edge_info_vec = vec![]; + + let mut vertex_names = vec!["".to_string(); vertex_label_num]; + let mut edge_names = vec!["".to_string(); edge_label_num]; + + for (vertex_label, label) in &schema.vertex_type_to_id { + vertex_names[*label as usize] = vertex_label.clone(); + let partition_type = schema.vertex_partition_type.get(label).unwrap(); + if let Some(column) = schema.vertex_prop_vec.get(label) { + let mut properties = vec![]; + for (col_name, data_type) in column { + properties.push(ColumnInfo { name: col_name.clone(), data_type: data_type.clone() }); + } + vertex_info_vec[*label as usize] = VertexInfo { + label: vertex_label.clone(), + partition_type: partition_type.clone(), + properties: properties, + } + } + } + + for (edge_label, label) in &schema.edge_type_to_id { + edge_names[*label as usize] = edge_label.clone(); + } + + for ((src_label, label, dst_label), columns) in &schema.edge_prop_vec { + let src_label_name = vertex_names[*src_label as usize].clone(); + let label_name = edge_names[*label as usize].clone(); + let dst_label_name = vertex_names[*dst_label as usize].clone(); + let ie_strategy = if schema + .edge_single_ie + .contains(&(*src_label, *label, *dst_label)) + { + Some(EdgeStrategy::Single) + } else { + None + }; + let oe_strategy = if schema + .edge_single_oe + .contains(&(*src_label, *label, *dst_label)) + { + Some(EdgeStrategy::Single) + } else { + None + }; + if columns.len() > 0 { + let mut properties = vec![]; + for (col_name, data_type) in columns { + properties.push(ColumnInfo { name: 
col_name.clone(), data_type: data_type.clone() }); + } + edge_info_vec.push(EdgeInfo { + src_label: src_label_name, + dst_label: dst_label_name, + label: label_name, + ie_strategy: ie_strategy, + oe_strategy: oe_strategy, + properties: Some(properties), + }); + } else { + edge_info_vec.push(EdgeInfo { + src_label: src_label_name, + dst_label: dst_label_name, + label: label_name, + ie_strategy: ie_strategy, + oe_strategy: oe_strategy, + properties: None, + }); + } + } + + edge_info_vec + .sort_by(|a, b| schema.edge_type_to_id[&a.label].cmp(&schema.edge_type_to_id[&b.label])); + Self { vertex: vertex_info_vec, edge: edge_info_vec } + } +} + +impl Schema for CsrGraphSchema { + fn get_vertex_header(&self, vertex_type_id: LabelId) -> Option<&[(String, DataType)]> { + self.vertex_prop_vec + .get(&vertex_type_id) + .map(|vec| vec.as_slice()) + } + fn get_edge_header( + &self, src_label: LabelId, edge_label: LabelId, dst_label: LabelId, + ) -> Option<&[(String, DataType)]> { + self.edge_prop_vec + .get(&(src_label, edge_label, dst_label)) + .map(|vec| vec.as_slice()) + } + + fn get_vertex_schema(&self, vertex_type_id: LabelId) -> Option<&HashMap> { + self.vertex_prop_meta.get(&vertex_type_id) + } + + fn get_edge_schema( + &self, edge_type_id: (LabelId, LabelId, LabelId), + ) -> Option<&HashMap> { + self.edge_prop_meta.get(&edge_type_id) + } + + fn get_vertex_label_id(&self, vertex_type: &str) -> Option { + self.vertex_type_to_id.get(vertex_type).cloned() + } + + fn get_edge_label_id(&self, edge_type: &str) -> Option { + self.edge_type_to_id.get(edge_type).cloned() + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/sub_graph.rs b/interactive_engine/executor/store/bmcsr/src/sub_graph.rs new file mode 100644 index 000000000000..5c33b98e7628 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/sub_graph.rs @@ -0,0 +1,128 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! 
Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use crate::bmcsr::BatchMutableCsr; +use crate::bmscsr::BatchMutableSingleCsr; +use crate::col_table::ColTable; +use crate::csr::{CsrTrait, NbrIter, NbrIterBeta, NbrOffsetIter}; +use crate::graph::IndexType; +use crate::types::{DefaultId, InternalId, LabelId}; +use crate::vertex_map::VertexMap; + +#[derive(Copy, Clone)] +pub struct SubGraph<'a, G: Send + Sync + IndexType = DefaultId, I: Send + Sync + IndexType = InternalId> { + pub csr: &'a BatchMutableCsr, + pub vm: &'a VertexMap, + pub src_label: LabelId, + pub dst_label: LabelId, + pub e_label: LabelId, + + pub vertex_data: &'a ColTable, + pub edge_data: Option<&'a ColTable>, +} + +impl<'a, G: Send + Sync + IndexType, I: Send + Sync + IndexType> SubGraph<'a, G, I> { + pub fn new( + csr: &'a BatchMutableCsr, vm: &'a VertexMap, src_label: LabelId, dst_label: LabelId, + e_label: LabelId, vertex_data: &'a ColTable, edge_data: Option<&'a ColTable>, + ) -> Self { + SubGraph { csr, vm, src_label, dst_label, e_label, vertex_data, edge_data } + } + + pub fn get_vertex_num(&self) -> I { + self.csr.vertex_num() + } + + pub fn get_adj_list(&self, src: I) -> Option> { + self.csr.get_edges(src) + } + + pub fn get_adj_list_beta(&self, src: I) -> NbrIterBeta { + self.csr.get_edges_beta(src) + } + + pub fn get_adj_list_with_offset(&self, src: I) -> Option> { + self.csr.get_edges_with_offset(src) + } + + pub fn get_properties(&self) -> Option<&'a ColTable> { + 
self.edge_data + } + + pub fn degree(&self, src_id: I) -> i64 { + self.csr.degree(src_id) as i64 + } +} + +#[derive(Copy, Clone)] +pub struct SingleSubGraph< + 'a, + G: Send + Sync + IndexType = DefaultId, + I: Send + Sync + IndexType = InternalId, +> { + pub csr: &'a BatchMutableSingleCsr, + pub vm: &'a VertexMap, + pub src_label: LabelId, + pub dst_label: LabelId, + pub e_label: LabelId, + + pub vertex_data: &'a ColTable, + pub edge_data: Option<&'a ColTable>, +} + +impl<'a, G, I> SingleSubGraph<'a, G, I> +where + G: Send + Sync + IndexType, + I: Send + Sync + IndexType, +{ + pub fn new( + csr: &'a BatchMutableSingleCsr, vm: &'a VertexMap, src_label: LabelId, dst_label: LabelId, + e_label: LabelId, vertex_data: &'a ColTable, edge_data: Option<&'a ColTable>, + ) -> Self { + Self { csr, vm, src_label, dst_label, e_label, vertex_data, edge_data } + } + + pub fn get_vertex_num(&self) -> I { + self.csr.vertex_num() + } + + pub fn get_properties(&self) -> Option<&'a ColTable> { + self.edge_data + } + + pub fn get_adj_list(&self, src: I) -> Option> { + self.csr.get_edges(src) + } + + pub fn get_adj_list_beta(&self, src: I) -> NbrIterBeta { + self.csr.get_edges_beta(src) + } + + pub fn get_adj_list_with_offset(&self, src: I) -> Option> { + self.csr.get_edges_with_offset(src) + } + + pub fn get_edge(&self, src: I) -> Option { + self.csr.get_edge(src) + } + + pub fn get_edge_with_offset(&self, src: I) -> Option<(I, usize)> { + self.csr.get_edge_with_offset(src) + } + + pub fn degree(&self, src_id: I) -> i64 { + self.csr.degree(src_id) as i64 + } +} diff --git a/interactive_engine/executor/store/bmcsr/src/traverse.rs b/interactive_engine/executor/store/bmcsr/src/traverse.rs new file mode 100644 index 000000000000..b8e69f9d324e --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/traverse.rs @@ -0,0 +1,299 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! 
you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::fs::File; +use std::io::Write; +use std::path::PathBuf; +use std::str::FromStr; + +use crate::bmcsr::BatchMutableCsr; +use crate::bmscsr::BatchMutableSingleCsr; +use crate::col_table::ColTable; +use crate::columns::DataType; +use crate::csr::CsrTrait; +use crate::graph::{Direction, IndexType}; +use crate::graph_db::GraphDB; +use crate::ldbc_parser::LDBCVertexParser; +use crate::schema::Schema; +use crate::types::LabelId; + +fn traverse_vertices( + graph: &GraphDB, output_dir: &str, +) { + let vertex_label_names = graph.graph_schema.vertex_label_names(); + let output_dir_path = PathBuf::from_str(output_dir).unwrap(); + for n in vertex_label_names.iter() { + if let Some(v_label) = graph.graph_schema.get_vertex_label_id(n) { + let header = graph + .graph_schema + .get_vertex_header(v_label) + .unwrap(); + let file_name = format!("{}.csv", n); + let file_path = output_dir_path.join(file_name); + let mut file = File::create(file_path).unwrap(); + + let v_labels = vec![v_label]; + for v in graph.get_all_vertices(Some(&v_labels)) { + if v.is_valid() { + let id = LDBCVertexParser::::get_original_id(v.get_id()); + write!(file, "{}", id.index()).unwrap(); + for c in header { + if c.1 != DataType::ID { + write!( + file, + "|{}", + v.get_property(c.0.as_str()) + .unwrap() + .to_string() + ) + .unwrap(); + } + } + writeln!(file).unwrap(); + } + } + } + } +} + +fn output_csr( + graph: &GraphDB, output_path: &str, csr: &BatchMutableCsr, prop_table: Option<&ColTable>, + label: 
LabelId, neighbor_label: LabelId, dir: Direction, +) where + G: Eq + IndexType + Send + Sync, + I: IndexType + Send + Sync, +{ + let mut file = File::create(output_path).unwrap(); + for v in 0..csr.vertex_num().index() { + let src_global_id = graph.get_global_id(I::new(v), label).unwrap(); + let src_oid = LDBCVertexParser::::get_original_id(src_global_id); + if let Some(table) = prop_table { + let col_num = table.col_num(); + if let Some(edges) = csr.get_edges_with_offset(I::new(v)) { + for (nbr, offset) in edges { + let dst_global_id = graph + .get_global_id(nbr, neighbor_label) + .unwrap(); + let dst_oid = LDBCVertexParser::::get_original_id(dst_global_id); + if dir == Direction::Outgoing { + write!(file, "{}|{}", src_oid.index(), dst_oid.index()).unwrap(); + } else { + write!(file, "{}|{}", dst_oid.index(), src_oid.index()).unwrap(); + } + for c in 0..col_num { + write!( + file, + "|{}", + table + .get_item_by_index(c, offset) + .unwrap() + .to_string() + ) + .unwrap(); + } + writeln!(file).unwrap(); + } + } + } else { + if let Some(edges) = csr.get_edges(I::new(v)) { + for e in edges { + let dst_global_id = graph.get_global_id(*e, neighbor_label).unwrap(); + let dst_oid = LDBCVertexParser::::get_original_id(dst_global_id); + if dir == Direction::Outgoing { + writeln!(file, "{}|{}", src_oid.index(), dst_oid.index()).unwrap(); + } else { + writeln!(file, "{}|{}", dst_oid.index(), src_oid.index()).unwrap(); + } + } + } + } + } +} + +fn output_single_csr( + graph: &GraphDB, output_path: &str, csr: &BatchMutableSingleCsr, + prop_table: Option<&ColTable>, label: LabelId, neighbor_label: LabelId, dir: Direction, +) where + G: Eq + IndexType + Send + Sync, + I: IndexType + Send + Sync, +{ + let mut file = File::create(output_path).unwrap(); + for v in 0..csr.vertex_num().index() { + let src_global_id = graph.get_global_id(I::new(v), label).unwrap(); + let src_oid = LDBCVertexParser::::get_original_id(src_global_id); + if let Some(table) = prop_table { + let col_num = 
table.col_num(); + if let Some(edges) = csr.get_edges_with_offset(I::new(v)) { + for (nbr, offset) in edges { + let dst_global_id = graph + .get_global_id(nbr, neighbor_label) + .unwrap(); + let dst_oid = LDBCVertexParser::::get_original_id(dst_global_id); + if dir == Direction::Outgoing { + write!(file, "{}|{}", src_oid.index(), dst_oid.index()).unwrap(); + } else { + write!(file, "{}|{}", dst_oid.index(), src_oid.index()).unwrap(); + } + for c in 0..col_num { + write!( + file, + "|{}", + table + .get_item_by_index(c, offset) + .unwrap() + .to_string() + ) + .unwrap(); + } + writeln!(file).unwrap(); + } + } + } else { + if let Some(edges) = csr.get_edges(I::new(v)) { + for e in edges { + let dst_global_id = graph.get_global_id(*e, neighbor_label).unwrap(); + let dst_oid = LDBCVertexParser::::get_original_id(dst_global_id); + if dir == Direction::Outgoing { + writeln!(file, "{}|{}", src_oid.index(), dst_oid.index()).unwrap(); + } else { + writeln!(file, "{}|{}", dst_oid.index(), src_oid.index()).unwrap(); + } + } + } + } + } +} + +fn traverse_edges( + graph: &GraphDB, output_dir: &str, +) { + let vertex_label_num = graph.vertex_label_num; + let edge_label_num = graph.edge_label_num; + for src_label in 0..vertex_label_num { + let src_label_name = graph.graph_schema.vertex_label_names()[src_label].clone(); + for dst_label in 0..vertex_label_num { + let dst_label_name = graph.graph_schema.vertex_label_names()[dst_label].clone(); + for edge_label in 0..edge_label_num { + let edge_label_name = graph.graph_schema.edge_label_names()[edge_label].clone(); + if let Some(_) = graph.graph_schema.get_edge_header( + src_label as LabelId, + edge_label as LabelId, + dst_label as LabelId, + ) { + let oe_file_name = + format!("oe_{}_{}_{}.csv", src_label_name, edge_label_name, dst_label_name); + let oe_file_path = PathBuf::from_str(output_dir) + .unwrap() + .join(oe_file_name); + + let oe_index = graph.edge_label_to_index( + src_label as LabelId, + dst_label as LabelId, + edge_label 
as LabelId, + Direction::Outgoing, + ); + if graph.graph_schema.is_single_oe( + src_label as LabelId, + edge_label as LabelId, + dst_label as LabelId, + ) { + let csr = graph.oe[oe_index] + .as_any() + .downcast_ref::>() + .unwrap(); + output_single_csr( + graph, + oe_file_path.to_str().unwrap(), + csr, + graph.oe_edge_prop_table.get(&oe_index), + src_label as LabelId, + dst_label as LabelId, + Direction::Outgoing, + ); + } else { + let csr = graph.oe[oe_index] + .as_any() + .downcast_ref::>() + .unwrap(); + info!("output oe csr: {}", oe_file_path.to_str().unwrap()); + output_csr( + graph, + oe_file_path.to_str().unwrap(), + csr, + graph.oe_edge_prop_table.get(&oe_index), + src_label as LabelId, + dst_label as LabelId, + Direction::Outgoing, + ); + } + + let ie_file_name = + format!("ie_{}_{}_{}.csv", src_label_name, edge_label_name, dst_label_name); + let ie_file_path = PathBuf::from_str(output_dir) + .unwrap() + .join(ie_file_name); + // reverse src and dst label + let ie_index = graph.edge_label_to_index( + dst_label as LabelId, + src_label as LabelId, + edge_label as LabelId, + Direction::Incoming, + ); + if graph.graph_schema.is_single_ie( + src_label as LabelId, + edge_label as LabelId, + dst_label as LabelId, + ) { + let csr = graph.ie[ie_index] + .as_any() + .downcast_ref::>() + .unwrap(); + output_single_csr( + graph, + ie_file_path.to_str().unwrap(), + csr, + graph.ie_edge_prop_table.get(&ie_index), + dst_label as LabelId, + src_label as LabelId, + Direction::Incoming, + ); + } else { + let csr = graph.ie[ie_index] + .as_any() + .downcast_ref::>() + .unwrap(); + info!("output ie csr: {}", ie_file_path.to_str().unwrap()); + output_csr( + graph, + ie_file_path.to_str().unwrap(), + csr, + graph.ie_edge_prop_table.get(&ie_index), + dst_label as LabelId, + src_label as LabelId, + Direction::Incoming, + ); + } + } + } + } + } +} + +pub fn traverse( + graph: &GraphDB, output_dir: &str, +) { + traverse_vertices(graph, output_dir); + traverse_edges(graph, 
output_dir); +} diff --git a/interactive_engine/executor/store/bmcsr/src/types.rs b/interactive_engine/executor/store/bmcsr/src/types.rs new file mode 100644 index 000000000000..8b550bf2bd59 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/types.rs @@ -0,0 +1,28 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +pub type DefaultId = usize; +pub type InternalId = usize; +pub type LabelId = u8; + +pub static INVALID_LABEL_ID: LabelId = 0xff; +pub static VERSION: &str = env!("CARGO_PKG_VERSION"); +pub static NAME: &str = env!("CARGO_PKG_NAME"); + +pub const FILE_SCHEMA: &'static str = "schema.json"; +pub const DIR_GRAPH_SCHEMA: &'static str = "graph_schema"; + +pub const DIR_BINARY_DATA: &'static str = "graph_data_bin"; +pub const DIR_SPLIT_RAW_DATA: &'static str = "graph_split_raw"; diff --git a/interactive_engine/executor/store/bmcsr/src/utils.rs b/interactive_engine/executor/store/bmcsr/src/utils.rs new file mode 100644 index 000000000000..a22e941b5c26 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/utils.rs @@ -0,0 +1,304 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! 
Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::fs::File; +use std::io::BufReader; +use std::path::{Path, PathBuf}; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use csv::{ReaderBuilder, StringRecord}; +use rust_htslib::bgzf::Reader as GzReader; + +use crate::columns::{DataType, Item}; +use crate::error::GDBResult; +use crate::graph::IndexType; +use crate::graph_loader::get_files_list; +use crate::ldbc_parser::LDBCVertexParser; +use crate::types::LabelId; + +pub struct Iter<'a, T> { + inner: Box + 'a + Send>, +} + +impl<'a, T> Iter<'a, T> { + pub fn from_iter + 'a + Send>(iter: I) -> Self { + Iter { inner: Box::new(iter) } + } + + pub fn from_iter_box(iter: Box + 'a + Send>) -> Self { + Iter { inner: iter } + } +} + +impl<'a, T> Iterator for Iter<'a, T> { + type Item = T; + + #[inline(always)] + fn next(&mut self) -> Option { + self.inner.next() + } + + #[inline(always)] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + + #[inline(always)] + fn count(self) -> usize { + self.inner.count() + } +} + +unsafe impl<'a, T> Send for Iter<'a, T> {} + +pub struct Range { + begin: I, + end: I, +} + +pub struct RangeIterator { + cur: I, + end: I, +} + +impl Iterator for RangeIterator { + type Item = I; + + fn next(&mut self) -> Option { + if self.cur == self.end { + None + } else { + let ret = self.cur.clone(); + self.cur += I::new(1); + Some(ret) + } + } +} + +impl Range { + pub fn new(begin: I, end: I) -> Self { + Range { begin, end } + } + + pub fn into_iter(self) -> RangeIterator { + RangeIterator { cur: self.begin.clone(), end: self.end.clone() } + } +} + +pub struct LabeledIterator { + labels: Vec, + iterators: Vec, + cur: usize, +} + +impl 
LabeledIterator { + pub fn new(labels: Vec, iterators: Vec) -> Self { + Self { labels, iterators, cur: 0 } + } +} + +impl Iterator for LabeledIterator { + type Item = (L, I::Item); + + fn next(&mut self) -> Option { + loop { + if self.cur == self.labels.len() { + return None; + } + if let Some(item) = self.iterators[self.cur].next() { + return Some((self.labels[self.cur], item)); + } else { + self.cur += 1; + } + } + } +} + +unsafe impl Send for LabeledIterator {} + +pub struct LabeledRangeIterator { + labels: Vec, + iterators: Vec>, + cur: usize, +} + +impl LabeledRangeIterator { + pub fn new(labels: Vec, iterators: Vec>) -> Self { + Self { labels, iterators, cur: 0 } + } +} + +impl Iterator for LabeledRangeIterator { + type Item = (L, I); + + fn next(&mut self) -> Option { + loop { + if self.cur == self.labels.len() { + return None; + } + if let Some(item) = self.iterators[self.cur].next() { + return Some((self.labels[self.cur], item)); + } else { + self.cur += 1; + } + } + } + + fn nth(&mut self, n: usize) -> Option { + let mut remaining = n; + while remaining != 0 { + if self.cur == self.labels.len() { + return None; + } + let cur_remaining = self.iterators[self.cur].end.index() - self.iterators[self.cur].cur.index(); + let cur_cur = self.iterators[self.cur].cur.index(); + if cur_remaining >= remaining { + self.iterators[self.cur].cur = I::new(cur_cur + remaining); + return self.next(); + } else { + remaining -= cur_remaining; + self.cur += 1; + } + } + None + } +} + +pub fn get_partition(id: &u64, workers: usize, num_servers: usize) -> u64 { + let id_usize = *id as usize; + let magic_num = id_usize / num_servers; + // The partitioning logics is as follows: + // 1. `R = id - magic_num * num_servers = id % num_servers` routes a given id + // to the machine R that holds its data. + // 2. `R * workers` shifts the worker's id in the machine R. + // 3. `magic_num % workers` then picks up one of the workers in the machine R + // to do the computation. 
+ ((id_usize - magic_num * num_servers) * workers + magic_num % workers) as u64 +} + +pub fn get_2d_partition(id_hi: u64, id_low: u64, workers: usize, num_servers: usize) -> u64 { + let server_id = id_hi % num_servers as u64; + let worker_id = id_low % workers as u64; + server_id * workers as u64 + worker_id +} + +pub fn parse_vertex_id_from_file( + vertex_label: LabelId, id_col: i32, file_locations: Vec, skip_header: bool, delim: u8, id: u32, + parallel: u32, +) -> Vec { + let mut id_list = vec![]; + for file_location in file_locations { + let path = Path::new(&file_location); + let input_dir = path + .parent() + .unwrap_or(Path::new("")) + .to_path_buf(); + let filename = vec![path + .file_name() + .expect("Can not find filename") + .to_str() + .unwrap_or("") + .to_string()]; + let parser = LDBCVertexParser::::new(vertex_label, id_col as usize); + let files = get_files_list(&input_dir, &filename).unwrap(); + for file in files.iter() { + if let Some(path_str) = file.clone().to_str() { + if path_str.ends_with(".csv.gz") { + if let Ok(gz_reader) = GzReader::from_path(&path_str) { + let mut rdr = ReaderBuilder::new() + .delimiter(delim) + .buffer_capacity(4096) + .comment(Some(b'#')) + .flexible(true) + .has_headers(skip_header) + .from_reader(gz_reader); + for (index, result) in rdr.records().enumerate() { + if index % parallel as usize == id as usize { + if let Ok(record) = result { + let vertex_meta = parser.parse_vertex_meta(&record); + id_list.push(vertex_meta.global_id as u64); + } + } + } + } + } else if file.ends_with(".csv") { + if let Ok(file) = File::open(&file) { + let reader = BufReader::new(file); + let mut rdr = ReaderBuilder::new() + .delimiter(delim) + .buffer_capacity(4096) + .comment(Some(b'#')) + .flexible(true) + .has_headers(skip_header) + .from_reader(reader); + for (index, result) in rdr.records().enumerate() { + if index % parallel as usize == id as usize { + if let Ok(record) = result { + let vertex_meta = parser.parse_vertex_meta(&record); 
+ id_list.push(vertex_meta.global_id as u64); + } + } + } + } + } + } + } + } + id_list +} + +pub fn parse_properties( + record: &StringRecord, header: &[(String, DataType)], selected: &[i32], +) -> GDBResult> { + let mut properties = Vec::new(); + for (index, val) in record.iter().enumerate() { + if selected[index] > 0 { + match header[index].1 { + DataType::Int32 => { + properties.push(Item::Int32(val.parse::()?)); + } + DataType::UInt32 => { + properties.push(Item::UInt32(val.parse::()?)); + } + DataType::Int64 => { + properties.push(Item::Int64(val.parse::()?)); + } + DataType::UInt64 => { + properties.push(Item::UInt64(val.parse::()?)); + } + DataType::String => { + properties.push(Item::String(val.to_string())); + } + DataType::Date => { + properties.push(Item::Date(crate::date::parse_date(val)?)); + } + DataType::DateTime => { + properties.push(Item::DateTime(crate::date_time::parse_datetime(val))); + } + DataType::Double => { + properties.push(Item::Double(val.parse::()?)); + } + DataType::NULL => { + error!("Unexpected field type"); + } + DataType::ID => {} + DataType::LCString => { + properties.push(Item::String(val.to_string())); + } + } + } + } + Ok(properties) +} diff --git a/interactive_engine/executor/store/bmcsr/src/vertex_map.rs b/interactive_engine/executor/store/bmcsr/src/vertex_map.rs new file mode 100644 index 000000000000..765c92e543d2 --- /dev/null +++ b/interactive_engine/executor/store/bmcsr/src/vertex_map.rs @@ -0,0 +1,293 @@ +// +//! Copyright 2020 Alibaba Group Holding Limited. +//! +//! Licensed under the Apache License, Version 2.0 (the "License"); +//! you may not use this file except in compliance with the License. +//! You may obtain a copy of the License at +//! +//! http://www.apache.org/licenses/LICENSE-2.0 +//! +//! Unless required by applicable law or agreed to in writing, software +//! distributed under the License is distributed on an "AS IS" BASIS, +//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+//! See the License for the specific language governing permissions and +//! limitations under the License. + +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use dashmap::DashMap; +use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; + +use crate::graph::IndexType; +use crate::ldbc_parser::LDBCVertexParser; +use crate::types::*; + +pub struct VertexMap { + // global_id_to_index: FnvHashMap, + global_id_to_index: DashMap, + labeled_num: Vec, + vertices_num: Vec, + pub index_to_global_id: Vec>, + labeled_corner_num: Vec, + pub index_to_corner_global_id: Vec>, + label_num: LabelId, +} + +impl VertexMap +where + G: Send + Sync + IndexType, + I: Send + Sync + IndexType, +{ + pub fn new(num_labels: usize) -> Self { + let mut labeled_num = Vec::with_capacity(num_labels); + let mut vertices_num = Vec::with_capacity(num_labels); + let mut index_to_global_id = Vec::with_capacity(num_labels); + let mut labeled_corner_num = Vec::with_capacity(num_labels); + let mut index_to_corner_global_id = Vec::with_capacity(num_labels); + for _ in 0..num_labels { + labeled_num.push(0_usize); + vertices_num.push(0_usize); + index_to_global_id.push(Vec::new()); + labeled_corner_num.push(0_usize); + index_to_corner_global_id.push(Vec::new()); + } + Self { + // global_id_to_index: FnvHashMap::default(), + global_id_to_index: DashMap::new(), + labeled_num, + vertices_num, + index_to_global_id, + labeled_corner_num, + index_to_corner_global_id, + label_num: num_labels as LabelId, + } + } + + pub fn remove_vertex(&mut self, label: LabelId, internal_id: &I) { + let internal_id = internal_id.index(); + if internal_id < self.labeled_num[label as usize] { + if self.index_to_global_id[label as usize].len() <= internal_id { + return; + } + let global_id = self.index_to_global_id[label as usize][internal_id].clone(); + self.global_id_to_index.remove(&global_id); + 
self.index_to_global_id[label as usize][internal_id] = ::max(); + self.vertices_num[label as usize] -= 1; + } else { + let index = ::max().index() - internal_id - 1; + if self.index_to_corner_global_id[label as usize].len() <= index { + return; + } + let global_id = self.index_to_corner_global_id[label as usize][index].clone(); + self.global_id_to_index.remove(&global_id); + self.index_to_corner_global_id[label as usize][index] = ::max(); + } + } + + pub fn add_vertex(&mut self, global_id: G, label: LabelId) -> I { + assert_eq!(label, LDBCVertexParser::get_label_id(global_id)); + + if let Some(vertex) = self.global_id_to_index.get(&global_id) { + assert!(vertex.index() < self.labeled_num[label as usize]); + vertex.clone() + } else { + let v = I::new(self.labeled_num[label as usize]); + self.labeled_num[label as usize] += 1; + self.vertices_num[label as usize] += 1; + self.index_to_global_id[label as usize].push(global_id); + self.global_id_to_index.insert(global_id, v); + v + } + } + + pub fn add_corner_vertex(&mut self, global_id: G, label: LabelId) -> I { + assert_eq!(label, LDBCVertexParser::get_label_id(global_id)); + + if let Some(vertex) = self.global_id_to_index.get(&global_id) { + vertex.clone() + } else { + let v = I::new(::max().index() - self.labeled_corner_num[label as usize] - 1); + self.labeled_corner_num[label as usize] += 1; + self.index_to_corner_global_id[label as usize].push(global_id); + self.global_id_to_index.insert(global_id, v); + v + } + } + + pub fn get_internal_id(&self, global_id: G) -> Option<(LabelId, I)> { + if let Some(internal_id) = self.global_id_to_index.get(&global_id) { + Some((LDBCVertexParser::get_label_id(global_id), *internal_id)) + } else { + None + } + } + + pub fn get_global_id(&self, label: LabelId, internal_id: I) -> Option { + let internal_id = internal_id.index(); + if internal_id < self.labeled_num[label as usize] { + self.index_to_global_id[label as usize] + .get(internal_id) + .cloned() + } else { + 
self.index_to_corner_global_id[label as usize] + .get(::max().index() - internal_id - 1) + .cloned() + } + } + + pub fn shrink_to_fit(&mut self) { + self.global_id_to_index.shrink_to_fit(); + for list in &mut self.index_to_global_id { + list.shrink_to_fit(); + } + for list in &mut self.index_to_corner_global_id { + list.shrink_to_fit(); + } + } + + pub fn label_num(&self) -> LabelId { + self.label_num + } + + pub fn vertex_num(&self, label: LabelId) -> usize { + self.labeled_num[label as usize] + } + + pub fn actual_vertices_num(&self, label: LabelId) -> usize { + self.vertices_num[label as usize] + } + + pub fn corner_vertex_num(&self, label: LabelId) -> usize { + self.labeled_corner_num[label as usize] + } + + pub fn total_vertex_num(&self) -> usize { + self.labeled_num.iter().sum() + } + + pub fn serialize(&self, path: &String) { + let f = File::create(path).unwrap(); + let mut writer = BufWriter::new(f); + writer.write_u8(self.label_num).unwrap(); + + for n in self.labeled_num.iter() { + writer + .write_u64::(*n as u64) + .unwrap(); + } + for n in self.labeled_corner_num.iter() { + writer + .write_u64::(*n as u64) + .unwrap(); + } + + for i in 0..self.label_num { + assert_eq!(self.index_to_global_id[i as usize].len(), self.labeled_num[i as usize]); + assert_eq!( + self.index_to_corner_global_id[i as usize].len(), + self.labeled_corner_num[i as usize] + ); + + for v in self.index_to_global_id[i as usize].iter() { + v.write(&mut writer).unwrap(); + } + for v in self.index_to_corner_global_id[i as usize].iter() { + v.write(&mut writer).unwrap(); + } + } + + writer.flush().unwrap(); + } + + pub fn deserialize(&mut self, path: &String) { + let f = File::open(path).unwrap(); + let mut reader = BufReader::new(f); + self.label_num = reader.read_u8().unwrap(); + + self.labeled_num.clear(); + self.labeled_corner_num.clear(); + for _ in 0..self.label_num { + self.labeled_num + .push(reader.read_u64::().unwrap() as usize); + } + self.vertices_num = 
self.labeled_num.clone(); + for _ in 0..self.label_num { + self.labeled_corner_num + .push(reader.read_u64::().unwrap() as usize); + } + + self.index_to_global_id.clear(); + self.index_to_corner_global_id.clear(); + self.global_id_to_index.clear(); + for i in 0..self.label_num { + let iv_num = self.labeled_num[i as usize]; + let mut native_ids = Vec::::with_capacity(iv_num); + for _ in 0..iv_num { + native_ids.push(G::read(&mut reader).unwrap()); + } + + let ov_num = self.labeled_corner_num[i as usize]; + let mut corner_ids = Vec::::with_capacity(ov_num); + for _ in 0..ov_num { + corner_ids.push(G::read(&mut reader).unwrap()); + } + + native_ids + .par_iter() + .enumerate() + .for_each(|(index, v)| { + self.global_id_to_index + .insert(*v, I::new(index)); + }); + + self.index_to_global_id.push(native_ids); + self.index_to_corner_global_id.push(corner_ids); + } + + // for i in 0..self.label_num { + // let mut index = 0_usize; + // for v in self.index_to_global_id[i as usize].iter() { + // self.global_id_to_index + // .insert(*v, I::new(index)); + // index += 1; + // } + // } + } + + pub fn desc(&self) { + info!("label_num = {}, entry num = {}", self.label_num, self.global_id_to_index.len()); + for i in 0..self.label_num { + info!( + "label-{}: native: {}, corner: {}", + i, + self.index_to_global_id[i as usize].len(), + self.index_to_corner_global_id[i as usize].len() + ) + } + } + + pub fn is_same(&self, other: &Self) -> bool { + // if self.global_id_to_index != other.global_id_to_index { + // return false; + // } + if self.label_num != other.label_num { + return false; + } + if self.index_to_global_id != other.index_to_global_id { + return false; + } + if self.labeled_corner_num != other.labeled_corner_num { + return false; + } + if self.index_to_corner_global_id != other.index_to_corner_global_id { + return false; + } + if self.label_num != other.label_num { + return false; + } + + return true; + } +} diff --git a/interactive_engine/executor/store/mcsr/README.md 
b/interactive_engine/executor/store/mcsr/README.md index a9fcaab93d05..363c1883bfe2 100644 --- a/interactive_engine/executor/store/mcsr/README.md +++ b/interactive_engine/executor/store/mcsr/README.md @@ -1,23 +1,24 @@ # Mutable CSR Store -## Build binary data -```bash -INPUT_PATH=$1 -OUTPUT_PATH=$2 -INPUT_SCHEMA_PATH=$3 -GRAPH_SCHEMA_PATH=$4 -PARTITION_NUM=$5 -PARTITION_ID=$6 +## Prepare schema for data loading +When loading the graph into storage, two schema files must be provided: the input schema and the graph schema. The input schema specifies the path to the input file, while the graph schema defines the structure of the graph. +### Input schema +The input schema contains the following information for the graph loading: +- Mappings from vertex label to file path of raw data +- Mappings from vertex label to column info of raw data +- Mappings from a tuple that includes the labels of the source vertex, edge, and target vertex to file path of raw data +- Mappings from a tuple that includes the labels of the source vertex, edge, and target vertex column info of raw data -cmd="./target/release/build_csr_partition $INPUT_PATH $OUTPUT_PATH $INPUT_SCHEMA_PATH $GRAPH_SCHEMA_PATH -p $PARTITION_NUM -i $PARTITION_ID" -echo $cmd -eval $cmd -``` +The schema file is formatted using Json. We have provided a sampled schema file for modern graph in `data/modern_input.json`. -## Split raw data -You can split raw data before build binary data. -```bash -#!/bin/bash +### Graph schema +The graph schema contains the following information for the graph storage: +- Mapping from vertex label to label id. +- Mapping from edge label to a 3-tuple, which contains edge label id, source vertex label id, and target vertex label id. +- The properties (name and datatype) of each type of vertex/edge. + The schema file is formatted using Json. We have provided a sampled schema file for modern graph in `data/modern_schema.json`. 
+## Build Binary Data +```bash INPUT_PATH=$1 OUTPUT_PATH=$2 INPUT_SCHEMA_PATH=$3 @@ -25,7 +26,9 @@ GRAPH_SCHEMA_PATH=$4 PARTITION_NUM=$5 PARTITION_ID=$6 -cmd="./target/release/partition_raw_data $INPUT_PATH $OUTPUT_PATH $INPUT_SCHEMA_PATH $GRAPH_SCHEMA_PATH -p $PARTITION_NUM -i $PARTITION_ID" +#USAGE: +# build_csr_partition -i -p [--skip_header] +cmd="./target/release/build_csr_partition $INPUT_PATH $OUTPUT_PATH $INPUT_SCHEMA_PATH $GRAPH_SCHEMA_PATH -p $PARTITION_NUM -i $PARTITION_ID" echo $cmd eval $cmd ``` \ No newline at end of file diff --git a/interactive_engine/executor/store/mcsr/data/modern_graph/created.csv b/interactive_engine/executor/store/mcsr/data/modern_graph/created.csv new file mode 100644 index 000000000000..162928a61317 --- /dev/null +++ b/interactive_engine/executor/store/mcsr/data/modern_graph/created.csv @@ -0,0 +1,5 @@ +src_id|dst_id|weight +1|3|0.4 +4|5|1.0 +4|3|0.4 +6|3|0.2 diff --git a/interactive_engine/executor/store/mcsr/data/modern_graph/knows.csv b/interactive_engine/executor/store/mcsr/data/modern_graph/knows.csv new file mode 100644 index 000000000000..0987e08bb369 --- /dev/null +++ b/interactive_engine/executor/store/mcsr/data/modern_graph/knows.csv @@ -0,0 +1,3 @@ +src_id|dst_id|weight +1|2|0.5 +1|4|1.0 diff --git a/interactive_engine/executor/store/mcsr/data/modern_graph/person.csv b/interactive_engine/executor/store/mcsr/data/modern_graph/person.csv new file mode 100644 index 000000000000..1ec20cd59a3e --- /dev/null +++ b/interactive_engine/executor/store/mcsr/data/modern_graph/person.csv @@ -0,0 +1,5 @@ +id|name|age +2|vadas|27 +6|peter|35 +4|josh|32 +1|marko|29 diff --git a/interactive_engine/executor/store/mcsr/data/modern_graph/software.csv b/interactive_engine/executor/store/mcsr/data/modern_graph/software.csv new file mode 100644 index 000000000000..79c6d8f92ed3 --- /dev/null +++ b/interactive_engine/executor/store/mcsr/data/modern_graph/software.csv @@ -0,0 +1,3 @@ +id|name|lang +3|lop|java +5|ripple|java diff --git 
a/interactive_engine/executor/store/mcsr/data/modern_input.json b/interactive_engine/executor/store/mcsr/data/modern_input.json new file mode 100644 index 000000000000..fe263a1c3107 --- /dev/null +++ b/interactive_engine/executor/store/mcsr/data/modern_input.json @@ -0,0 +1,90 @@ +{ + "vertex": [ + { + "label": "person", + "columns": [ + { + "name": "id", + "data_type": "ID" + }, + { + "name": "name", + "data_type": "String" + }, + { + "name": "age", + "data_type": "Int32" + } + ], + "files":[ + "person.csv" + ] + }, + { + "label": "software", + "columns": [ + { + "name": "id", + "data_type": "ID" + }, + { + "name": "name", + "data_type": "String" + }, + { + "name": "lang", + "data_type": "String" + } + ], + "files":[ + "software.csv" + ] + } + ], + "edge": [ + { + "src_label": "person", + "dst_label": "person", + "label": "knows", + "columns": [ + { + "name": "start_id", + "data_type": "ID" + }, + { + "name": "end_id", + "data_type": "ID" + }, + { + "name": "weight", + "data_type": "Double" + } + ], + "files": [ + "knows.csv" + ] + }, + { + "src_label": "person", + "dst_label": "software", + "label": "created", + "columns": [ + { + "name": "start_id", + "data_type": "ID" + }, + { + "name": "end_id", + "data_type": "ID" + }, + { + "name": "weight", + "data_type": "Double" + } + ], + "files": [ + "created.csv" + ] + } + ] +} \ No newline at end of file diff --git a/interactive_engine/executor/store/mcsr/data/modern_schema.json b/interactive_engine/executor/store/mcsr/data/modern_schema.json new file mode 100644 index 000000000000..847179ddc85b --- /dev/null +++ b/interactive_engine/executor/store/mcsr/data/modern_schema.json @@ -0,0 +1,56 @@ +{ + "vertex": [ + { + "label": "person", + "partition_type": "Dynamic", + "properties": [ + { + "name": "name", + "data_type": "String" + }, + { + "name": "age", + "data_type": "Int32" + } + ] + }, + { + "label": "software", + "partition_type": "Dynamic", + "properties": [ + { + "name": "name", + "data_type": "String" + }, + { 
+ "name": "lang", + "data_type": "String" + } + ] + } + ], + "edge": [ + { + "src_label": "person", + "dst_label": "person", + "label": "knows", + "properties": [ + { + "name": "weight", + "data_type": "Double" + } + ] + }, + { + "src_label": "person", + "dst_label": "software", + "label": "created", + "properties": [ + { + "name": "weight", + "data_type": "Double" + } + ] + } + ] +} \ No newline at end of file