Streaming Writes implementation #305
```diff
@@ -2,3 +2,6 @@ pub mod properties;
 
 #[cfg(feature = "async")]
 pub mod fetch;
+
+#[cfg(feature = "async")]
+pub mod stream;
```
@@ -0,0 +1,31 @@
```rust
use futures::AsyncWrite;

/// Wrapper that implements `AsyncWrite` by delegating to the writer side of a
/// browser `WritableStream`.
pub struct WrappedWritableStream<'writer> {
    pub stream: wasm_streams::writable::IntoAsyncWrite<'writer>,
}

impl<'writer> AsyncWrite for WrappedWritableStream<'writer> {
    fn poll_write(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &[u8],
    ) -> std::task::Poll<std::io::Result<usize>> {
        AsyncWrite::poll_write(std::pin::Pin::new(&mut self.get_mut().stream), cx, buf)
    }

    fn poll_flush(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
    ) -> std::task::Poll<std::io::Result<()>> {
        AsyncWrite::poll_flush(std::pin::Pin::new(&mut self.get_mut().stream), cx)
    }

    fn poll_close(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
    ) -> std::task::Poll<std::io::Result<()>> {
        AsyncWrite::poll_close(std::pin::Pin::new(&mut self.get_mut().stream), cx)
    }
}

// Asserts `Send` so the wrapper can be handed to APIs that require it; the
// wrapped JS stream handle never actually crosses a thread boundary on wasm.
unsafe impl<'writer> Send for WrappedWritableStream<'writer> {}
```
@@ -0,0 +1,70 @@
```rust
use crate::common::stream::WrappedWritableStream;
use crate::error::{ParquetWasmError, Result};
use async_compat::CompatExt;
use futures::channel::oneshot;
use futures::StreamExt;
use parquet::arrow::async_writer::AsyncArrowWriter;
use wasm_bindgen_futures::spawn_local;

pub async fn transform_parquet_stream(
    batches: impl futures::Stream<Item = Result<arrow_wasm::RecordBatch>> + 'static,
    writer_properties: crate::writer_properties::WriterProperties,
) -> Result<wasm_streams::readable::sys::ReadableStream> {
    let options = Some(writer_properties.into());

    let raw_stream = wasm_streams::transform::sys::TransformStream::new();
    if let Ok(raw_stream) = raw_stream {
        let (writable_stream, output_stream) = {
            let raw_writable = raw_stream.writable();
            let inner_writer =
                wasm_streams::WritableStream::from_raw(raw_writable).into_async_write();
            let writable_stream = WrappedWritableStream {
                stream: inner_writer,
            };
            (writable_stream, raw_stream.readable())
        };
        // Construct a channel for the purpose of signalling errors occurring at the start of the stream.
        // Errors that occur during writing will have to fuse the stream.
        let (sender, receiver) = oneshot::channel::<Result<()>>();
        spawn_local(async move {
            let mut adapted_stream = batches.peekable();
            let mut pinned_stream = std::pin::pin!(adapted_stream);
            let first_batch = pinned_stream.as_mut().peek().await;
            if let Some(Ok(first_batch)) = first_batch {
                let schema = first_batch.schema().into_inner();
                let writer = AsyncArrowWriter::try_new(writable_stream.compat(), schema, options);
                match writer {
                    Ok(mut writer) => {
                        // Unblock the calling thread's receiver (indicating that stream initialization was error-free).
                        let _ = sender.send(Ok(()));
                        while let Some(batch) = pinned_stream.next().await {
                            if let Ok(batch) = batch {
                                let _ = writer.write(&batch.into()).await;
                            }
                        }
                        let _ = writer.close().await;
                    }
                    Err(err) => {
                        let _ = sender.send(Err(ParquetWasmError::ParquetError(Box::new(err))));
                    }
                }
            } else if let Some(Err(err)) = first_batch {
                let _ = sender.send(Err(ParquetWasmError::DynCastingError(
                    err.to_string().into(),
                )));
            } else {
                let _ = sender.send(Err(ParquetWasmError::DynCastingError(
                    "null first batch".to_string().into(),
                )));
            }
        });
        match receiver.await.unwrap() {
            Ok(()) => Ok(output_stream),
            Err(err) => Err(err),
        }
    } else {
        Err(ParquetWasmError::PlatformSupportError(
            "Failed to create TransformStream".to_string(),
        ))
    }
}
```
```diff
@@ -1,7 +1,7 @@
 import * as wasm from "../../pkg/node/parquet_wasm";
 import { readFileSync } from "fs";
 import { tableFromIPC, tableToIPC } from "apache-arrow";
-import { testArrowTablesEqual, readExpectedArrowData } from "./utils";
+import { testArrowTablesEqual, readExpectedArrowData, temporaryServer } from "./utils";
 import { describe, it, expect } from "vitest";
 
 // Path from repo root
```
@@ -89,3 +89,21 @@ it("reads empty file", async (t) => {
```js
  expect(table.numCols).toStrictEqual(0);
  // console.log("empty table schema", table.schema);
});

it("read stream-write stream-read stream round trip (no writer properties provided)", async (t) => {
  const server = await temporaryServer();
  const listeningPort = server.addresses()[0].port;
  const rootUrl = `http://localhost:${listeningPort}`;

  const expectedTable = readExpectedArrowData();

  const url = `${rootUrl}/1-partition-brotli.parquet`;
  const originalStream = await wasm.readParquetStream(url);

  const stream = await wasm.transformParquetStream(originalStream);
  const accumulatedBuffer = new Uint8Array(await new Response(stream).arrayBuffer());
```
Comment on lines +103 to +104:

Out of curiosity, how would you write this to a file in Node? Can you pass the …

Reply:

It turns out to be quite ergonomic:

```js
const destinationWritable = Writable.toWeb(handle.createWriteStream());
await outputStream.pipeTo(destinationWritable);
```

alternatively:

```js
await handle.writeFile(outputStream);
```

Deno's version of the former is pretty succinct too:

```js
await outputStream.pipeTo(handle.writable);
```

(there's ~10 different ways to do it, e.g. …)
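For context, a fuller, self-contained sketch of that Node approach might look like the following. The package import path, URL, and output file name are illustrative, and it assumes Node 17+ (where `stream.Writable.toWeb` is available):

```ts
import { open } from "node:fs/promises";
import { Writable } from "node:stream";
import * as wasm from "parquet-wasm"; // illustrative; the tests import from ../../pkg/node/parquet_wasm

// Fetch a Parquet file as a stream of record batches, re-encode it, and pipe
// the resulting Parquet byte stream straight to disk without materializing
// the whole file in memory.
const batches = await wasm.readParquetStream("http://localhost:3000/1-partition-brotli.parquet");
const parquetStream = await wasm.transformParquetStream(batches);

const handle = await open("roundtrip.parquet", "w");
const destination = Writable.toWeb(handle.createWriteStream());
await parquetStream.pipeTo(destination); // pipeTo closes the destination when the source ends
```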
```js
  const roundtripTable = tableFromIPC(wasm.readParquet(accumulatedBuffer).intoIPCStream());

  testArrowTablesEqual(expectedTable, roundtripTable);
  await server.close();
})
```
Can you add a docstring here? Potentially include an example too? (The docstring will be included in the typescript-generated typedefs and seen by JS users)
Maybe include an example of how you can pass in a `File` handle? So you can write out to a Parquet file on disk without materializing the buffer in memory?
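One possible shape for that docstring, sketched here as JSDoc on the exported function. The wording, parameter names, and the `@example` body are illustrative rather than the author's final text:

```ts
/**
 * Re-encode a stream of Arrow record batches into a stream of Parquet bytes.
 *
 * The returned ReadableStream can be consumed incrementally, so the encoded
 * Parquet file never has to be fully materialized in memory.
 *
 * @example
 * // Node: stream the output straight to a file on disk.
 * import { open } from "node:fs/promises";
 * import { Writable } from "node:stream";
 *
 * const batches = await readParquetStream("http://example.com/input.parquet");
 * const parquetStream = await transformParquetStream(batches);
 *
 * const handle = await open("output.parquet", "w");
 * await parquetStream.pipeTo(Writable.toWeb(handle.createWriteStream()));
 *
 * @param stream - a ReadableStream of RecordBatch values
 * @param writerProperties - optional WriterProperties controlling the Parquet encoding
 * @returns a ReadableStream of Parquet-encoded bytes
 */
```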