
Commit

remove datafusion for faster compiles

connortsui20 committed Feb 13, 2024
1 parent 73cc3c8 commit bc78dfc

Showing 3 changed files with 38 additions and 38 deletions.
eggstrain/Cargo.toml (2 changes: 1 addition & 1 deletion)

@@ -14,7 +14,7 @@ authors = [
 anyhow = "1"
 arrow = "50"
 async-trait = "0.1"
-datafusion = "35"
+# datafusion = "35"
 serde_json = "1"
 substrait = { version = "0.24", features = ["pbjson"] }
 tokio = { version = "1", features = ["full"] }
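
Commenting the dependency out is the quickest way to drop it from the build. A feature-gated alternative, sketched below purely as an illustration (it is not what this commit does), would keep DataFusion available behind an opt-in Cargo feature while leaving the default build just as fast:

# Hypothetical alternative, not part of this commit: make DataFusion
# opt-in so that a plain `cargo build` never compiles it.
[dependencies]
datafusion = { version = "35", optional = true }

[features]
datafusion = ["dep:datafusion"]

With that layout, `cargo build` skips DataFusion entirely, and `cargo build --features datafusion` restores it without editing the manifest.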
eggstrain/src/main.rs (4 changes: 2 additions & 2 deletions)

@@ -7,6 +7,6 @@ use execution::substrait::deserialize::get_json;
 #[tokio::main]
 async fn main() {
     println!("Hello, world!");
-    // get_json("../substrait/substrait_plan_example.json");
-    get_json("../substrait/basic_query.json");
+    get_json("../substrait/substrait_plan_example.json");
+    // get_json("../substrait/basic_query.json");
 }
eggstrain/src/storage_client/mod.rs (70 changes: 35 additions & 35 deletions)

@@ -1,10 +1,10 @@
 //! Right now we have this in a submodule `storage_client.rs`, but the IO service
 //! team would probably create a crate that we could then import easily in our `Cargo.toml` file.
 
-use datafusion::execution::SendableRecordBatchStream;
+// use datafusion::execution::SendableRecordBatchStream;
 
-use datafusion::common::arrow::array::{Int32Array, RecordBatch};
-use datafusion::common::arrow::datatypes::{DataType, Field, Schema};
+// use datafusion::common::arrow::array::{Int32Array, RecordBatch};
+// use datafusion::common::arrow::datatypes::{DataType, Field, Schema};
 use std::sync::Arc;
 
 // Placeholder types to let this compile
@@ -27,35 +27,35 @@ pub enum BlobData {
     Tuple(RecordId),
 }
 
-impl StorageClient {
-    /// Have some sort of way to create a `StorageClient` on our local node.
-    pub fn new(_id: usize) -> Self {
-        Self
-    }
-
-    /// The only other function we need exposed would be a way to actually get data.
-    /// What we should get is a stream of `RecordBatch`es, which is just Apache Arrow
-    /// data in memory.
-    ///
-    /// The executor node really should not know what the underlying data is on the Blob data store.
-    /// In our case it is Parquet, but since the Execution Engine is not in charge of loading
-    /// those Parquet files, it should just receive the data as in-memory Arrow data.
-    ///
-    /// Note that we will likely re-export the `SendableRecordBatchStream` type from DataFusion
-    /// and use that as the return type instead.
-    pub async fn request_data(&self, _request: BlobData) -> SendableRecordBatchStream {
-        todo!()
-    }
-
-    pub async fn sample_request_data(_request: BlobData) -> SendableRecordBatchStream {
-        todo!("Return some sample data")
-    }
-
-    /// https://docs.rs/datafusion/latest/datafusion/common/arrow/array/struct.RecordBatch.html
-    pub async fn request_synchronous_data() -> RecordBatch {
-        let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
-        let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
-
-        RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
-    }
-}
+// impl StorageClient {
+//     /// Have some sort of way to create a `StorageClient` on our local node.
+//     pub fn new(_id: usize) -> Self {
+//         Self
+//     }
+
+//     /// The only other function we need exposed would be a way to actually get data.
+//     /// What we should get is a stream of `RecordBatch`es, which is just Apache Arrow
+//     /// data in memory.
+//     ///
+//     /// The executor node really should not know what the underlying data is on the Blob data store.
+//     /// In our case it is Parquet, but since the Execution Engine is not in charge of loading
+//     /// those Parquet files, it should just receive the data as in-memory Arrow data.
+//     ///
+//     /// Note that we will likely re-export the `SendableRecordBatchStream` type from DataFusion
+//     /// and use that as the return type instead.
+//     pub async fn request_data(&self, _request: BlobData) -> SendableRecordBatchStream {
+//         todo!()
+//     }
+
+//     pub async fn sample_request_data(_request: BlobData) -> SendableRecordBatchStream {
+//         todo!("Return some sample data")
+//     }
+
+//     /// https://docs.rs/datafusion/latest/datafusion/common/arrow/array/struct.RecordBatch.html
+//     pub async fn request_synchronous_data() -> RecordBatch {
+//         let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+//         let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+
+//         RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
+//     }
+// }
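
Since `arrow = "50"` stays in `Cargo.toml`, the commented-out placeholder could later be rebuilt without DataFusion by constructing the batch through the `arrow` crate directly and defining a local stream alias. The sketch below is illustrative, not part of this commit: it assumes the `futures` crate is added as a dependency (it is not in the manifest above), and `sample_record_batch` is a hypothetical name.

use std::pin::Pin;
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;
use futures::stream::Stream;

// Local stand-in for DataFusion's `SendableRecordBatchStream`: a pinned,
// boxed, `Send`-able stream of Arrow record batches. Unlike DataFusion's
// trait object, this alias carries no `schema()` accessor.
pub type SendableRecordBatchStream =
    Pin<Box<dyn Stream<Item = Result<RecordBatch, ArrowError>> + Send>>;

// Builds the same single-column batch as the commented-out
// `request_synchronous_data`, using plain `arrow` paths instead of the
// `datafusion::common::arrow` re-exports.
pub fn sample_record_batch() -> RecordBatch {
    let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
    let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);

    RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
}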
