From bc78dfcd5b9323f590fbfe535e9448933758213e Mon Sep 17 00:00:00 2001
From: Connor Tsui
Date: Tue, 13 Feb 2024 13:32:17 -0500
Subject: [PATCH] remove datafusion for faster compiles

---
 eggstrain/Cargo.toml                |  2 +-
 eggstrain/src/main.rs               |  4 +-
 eggstrain/src/storage_client/mod.rs | 70 ++++++++++++++---------
 3 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/eggstrain/Cargo.toml b/eggstrain/Cargo.toml
index 380808a..cab8161 100644
--- a/eggstrain/Cargo.toml
+++ b/eggstrain/Cargo.toml
@@ -14,7 +14,7 @@ authors = [
 anyhow = "1"
 arrow = "50"
 async-trait = "0.1"
-datafusion = "35"
+# datafusion = "35"
 serde_json = "1"
 substrait = { version = "0.24", features = ["pbjson"] }
 tokio = { version = "1", features = ["full"] }
diff --git a/eggstrain/src/main.rs b/eggstrain/src/main.rs
index d26262e..91c5bd2 100644
--- a/eggstrain/src/main.rs
+++ b/eggstrain/src/main.rs
@@ -7,6 +7,6 @@ use execution::substrait::deserialize::get_json;
 #[tokio::main]
 async fn main() {
     println!("Hello, world!");
-    // get_json("../substrait/substrait_plan_example.json");
-    get_json("../substrait/basic_query.json");
+    get_json("../substrait/substrait_plan_example.json");
+    // get_json("../substrait/basic_query.json");
 }
diff --git a/eggstrain/src/storage_client/mod.rs b/eggstrain/src/storage_client/mod.rs
index 20b9a7f..bc3f04e 100644
--- a/eggstrain/src/storage_client/mod.rs
+++ b/eggstrain/src/storage_client/mod.rs
@@ -1,10 +1,10 @@
 //! Right now we have this in a submodule `storage_client.rs`, but the IO service
 //! team would probably create a crate and we could import it easily into our `Cargo.toml` file
 
-use datafusion::execution::SendableRecordBatchStream;
+// use datafusion::execution::SendableRecordBatchStream;
 
-use datafusion::common::arrow::array::{Int32Array, RecordBatch};
-use datafusion::common::arrow::datatypes::{DataType, Field, Schema};
+// use datafusion::common::arrow::array::{Int32Array, RecordBatch};
+// use datafusion::common::arrow::datatypes::{DataType, Field, Schema};
 use std::sync::Arc;
 
 // Placeholder types to let this compile
@@ -27,35 +27,35 @@ pub enum BlobData {
     Tuple(RecordId),
 }
 
-impl StorageClient {
-    /// Have some sort of way to create a `StorageClient` on our local node.
-    pub fn new(_id: usize) -> Self {
-        Self
-    }
-
-    /// The only other function we need exposed would be a way to actually get data.
-    /// What we should get is a stream of `RecordBatch`es, which is just Apache Arrow
-    /// data in memory.
-    ///
-    /// The executor node really should not know what the underlying data is on the Blob data store.
-    /// In our case it is Parquet, but since the Execution Engine is not in charge of loading
-    /// those Parquet files, it should just receive it as in-memory Arrow data.
-    ///
-    /// Note that we will likely re-export the `SendableRecordBatchStream` from DataFusion
-    /// and use that as the return type instead.
-    pub async fn request_data(&self, _request: BlobData) -> SendableRecordBatchStream {
-        todo!()
-    }
-
-    pub async fn sample_request_data(_request: BlobData) -> SendableRecordBatchStream {
-        todo!("Return some sample data")
-    }
-
-    /// https://docs.rs/datafusion/latest/datafusion/common/arrow/array/struct.RecordBatch.html
-    pub async fn request_synchronous_data() -> RecordBatch {
-        let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
-        let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
-
-        RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
-    }
-}
+// impl StorageClient {
+//     /// Have some sort of way to create a `StorageClient` on our local node.
+//     pub fn new(_id: usize) -> Self {
+//         Self
+//     }
+
+//     /// The only other function we need exposed would be a way to actually get data.
+//     /// What we should get is a stream of `RecordBatch`es, which is just Apache Arrow
+//     /// data in memory.
+//     ///
+//     /// The executor node really should not know what the underlying data is on the Blob data store.
+//     /// In our case it is Parquet, but since the Execution Engine is not in charge of loading
+//     /// those Parquet files, it should just receive it as in-memory Arrow data.
+//     ///
+//     /// Note that we will likely re-export the `SendableRecordBatchStream` from DataFusion
+//     /// and use that as the return type instead.
+//     pub async fn request_data(&self, _request: BlobData) -> SendableRecordBatchStream {
+//         todo!()
+//     }
+
+//     pub async fn sample_request_data(_request: BlobData) -> SendableRecordBatchStream {
+//         todo!("Return some sample data")
+//     }
+
+//     /// https://docs.rs/datafusion/latest/datafusion/common/arrow/array/struct.RecordBatch.html
+//     pub async fn request_synchronous_data() -> RecordBatch {
+//         let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+//         let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+
+//         RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
+//     }
+// }
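
Editor's note: since the commit comments out the `datafusion::common::arrow` re-exports, a natural follow-up is to build the same sample batch against the `arrow = "50"` crate that stays in Cargo.toml. The sketch below is an illustrative assumption, not part of the commit; only the `StorageClient` name and the `request_synchronous_data` body are taken from the code above.

use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

// Hypothetical stand-in for the placeholder client in storage_client/mod.rs.
pub struct StorageClient;

impl StorageClient {
    /// Same single-column sample batch as the commented-out
    /// `request_synchronous_data`, using the `arrow` crate's own paths
    /// instead of DataFusion's re-exports.
    pub async fn request_synchronous_data() -> RecordBatch {
        // One non-nullable Int32 column named "id".
        let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
        let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);

        // try_new only fails if the columns don't match the schema.
        RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
    }
}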