-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #139 from effigies/feat/tsv-maxrows
feat: Process TSV files as streams and validate only the first 1000 rows by default
- Loading branch information
Showing
19 changed files
with
401 additions
and
90 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,7 @@ | |
"@std/io": "jsr:@std/[email protected]", | ||
"@std/log": "jsr:@std/[email protected]", | ||
"@std/path": "jsr:@std/[email protected]", | ||
"@std/streams": "jsr:@std/[email protected]", | ||
"@std/yaml": "jsr:@std/yaml@^1.0.4" | ||
}, | ||
"tasks": { | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import { assert, assertEquals } from '@std/assert' | ||
import { createUTF8Stream, UnicodeDecodeError } from './streams.ts' | ||
import { streamFromUint8Array, streamFromString } from '../tests/utils.ts' | ||
|
||
// Test suite for createUTF8Stream: construction, plain UTF-8 decoding, and
// rejection of UTF-16 input.
Deno.test('createUTF8Stream', async (t) => {
  await t.step('should return a TransformStream with UTF8StreamTransformer', () => {
    const stream = createUTF8Stream()
    assertEquals(stream instanceof TransformStream, true)
  })

  await t.step('should correctly transform UTF-8 input', async () => {
    const rawstream = streamFromString('Hello, world!')
    const reader = rawstream.pipeThrough(createUTF8Stream()).getReader()
    const { value } = await reader.read()
    assertEquals(value, 'Hello, world!')

    await reader.cancel()
  })

  await t.step('should throw UnicodeDecodeError for UTF-16 input', async () => {
    // 0xFF 0xFE is the UTF-16 little-endian byte order mark
    const rawStream = streamFromUint8Array(new Uint8Array([0xFF, 0xFE, 0x00, 0x00]))

    let reader: ReadableStreamDefaultReader<string> | undefined
    let value: string | undefined
    let caught: unknown
    try {
      // The exception can't be localized to either of the following lines
      // but is raised before the second returns
      reader = rawStream.pipeThrough(createUTF8Stream()).getReader()
      value = (await reader.read()).value
    } catch (e: unknown) {
      caught = e
    } finally {
      // Actually invoke cancel(). On a stream that has already errored,
      // cancel() rejects with the stored error, so that rejection is
      // deliberately ignored here; the error itself is asserted below.
      if (reader) await reader.cancel().catch(() => {})
    }

    // Assertions live outside the try block so that a failed expectation is
    // reported directly instead of being swallowed by the catch clause.
    assert(
      caught instanceof UnicodeDecodeError,
      'Expected UnicodeDecodeError, got ' + (caught === undefined ? value : caught),
    )
    assertEquals(caught.message, 'This file appears to be UTF-16')
  })
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/**
 * Error raised when a file that is being decoded as UTF-8 text appears to
 * be UTF-16 encoded instead.
 *
 * Note that the reported `name` is `'UnicodeDecode'`, which differs from the
 * class name.
 */
export class UnicodeDecodeError extends Error {
  override name = 'UnicodeDecode'

  /**
   * @param message Human-readable description of the decoding problem
   */
  constructor(message: string) {
    super(message)
  }
}
|
||
/**
 * A transformer that decodes a stream of bytes as UTF-8 text and throws a
 * UnicodeDecodeError if the input appears to start with a UTF-16 BOM.
 *
 * Decoding is non-fatal: malformed byte sequences are replaced with U+FFFD
 * rather than rejected. Neither byte of a UTF-16 byte order mark is valid
 * UTF-8, so a UTF-16 file shows up as decoded text that starts with U+FFFD.
 * Only the very start of the decoded text is inspected; replacement
 * characters later in the stream are passed through unchanged.
 */
export class UTF8StreamTransformer implements Transformer<Uint8Array, string> {
  private readonly decoder: TextDecoder
  // True until the first non-empty piece of decoded text has been inspected
  private firstChunk: boolean

  constructor() {
    this.decoder = new TextDecoder('utf-8')
    this.firstChunk = true
  }

  /**
   * Decode one chunk of bytes and forward the resulting text.
   *
   * @param chunk Raw bytes read from the source stream
   * @param controller Controller that receives the decoded text
   * @throws UnicodeDecodeError if the first decoded text starts with U+FFFD
   */
  transform(chunk: Uint8Array, controller: TransformStreamDefaultController<string>): void {
    // stream: true keeps incomplete multi-byte sequences buffered in the
    // decoder until the following chunk (or flush) completes them
    const decoded = this.decoder.decode(chunk, { stream: true })

    // A zero-length chunk, or one holding only part of a multi-byte
    // sequence, decodes to ''. Keep checking until there is actual text so
    // that a BOM arriving in a later chunk cannot slip past the check.
    if (this.firstChunk && decoded.length > 0) {
      if (decoded.startsWith('\uFFFD')) {
        throw new UnicodeDecodeError('This file appears to be UTF-16')
      }
      this.firstChunk = false
    }

    controller.enqueue(decoded)
  }

  /**
   * Emit any text still buffered in the decoder once the input has ended.
   *
   * @param controller Controller that receives the remaining decoded text
   */
  flush(controller: TransformStreamDefaultController<string>): void {
    const final = this.decoder.decode()
    if (final) {
      controller.enqueue(final)
    }
  }
}
|
||
/**
 * Builds a TransformStream that turns a stream of bytes into a stream of
 * UTF-8 decoded text, backed by a fresh UTF8StreamTransformer.
 *
 * @returns A new stream accepting Uint8Array chunks and producing strings
 */
export function createUTF8Stream(): TransformStream<Uint8Array, string> {
  const transformer = new UTF8StreamTransformer()
  return new TransformStream(transformer)
}
Oops, something went wrong.