Skip to content

Commit

Permalink
Merge pull request #139 from effigies/feat/tsv-maxrows
Browse files Browse the repository at this point in the history
feat: Process TSV files as streams and validate only the first 1000 rows by default
  • Loading branch information
rwblair authored Jan 16, 2025
2 parents da2c6e6 + 4db5044 commit bd92283
Show file tree
Hide file tree
Showing 19 changed files with 401 additions and 90 deletions.
1 change: 1 addition & 0 deletions deno.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"@std/io": "jsr:@std/[email protected]",
"@std/log": "jsr:@std/[email protected]",
"@std/path": "jsr:@std/[email protected]",
"@std/streams": "jsr:@std/[email protected]",
"@std/yaml": "jsr:@std/yaml@^1.0.4"
},
"tasks": {
Expand Down
20 changes: 16 additions & 4 deletions deno.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion src/files/deno.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ import { readAll, readerFromStreamReader } from '@std/io'
import { basename, dirname, fromFileUrl, join } from '@std/path'
import { EOL } from '@std/fs'
import type { FileTree } from '../types/filetree.ts'
import { BIDSFileDeno, readFileTree, UnicodeDecodeError } from './deno.ts'
import { BIDSFileDeno, readFileTree } from './deno.ts'
import { UnicodeDecodeError } from './streams.ts'
import { requestReadPermission } from '../setup/requestPermissions.ts'
import { FileIgnoreRules } from './ignore.ts'

Expand Down
35 changes: 8 additions & 27 deletions src/files/deno.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,9 @@ import { type BIDSFile, FileTree } from '../types/filetree.ts'
import { requestReadPermission } from '../setup/requestPermissions.ts'
import { FileIgnoreRules, readBidsIgnore } from './ignore.ts'
import { logger } from '../utils/logger.ts'
import { createUTF8Stream } from './streams.ts'
export { type BIDSFile, FileTree }

/**
* Thrown when a text file is decoded as UTF-8 but contains UTF-16 characters
*
* Note that the `name` reported on instances is 'UnicodeDecode', which is
* not the same string as the class name.
*/
export class UnicodeDecodeError extends Error {
constructor(message: string) {
super(message)
// Tag the error so it can be told apart from a plain Error by name
this.name = 'UnicodeDecode'
}
}

/**
* Deno implementation of BIDSFile
*/
Expand Down Expand Up @@ -67,27 +58,17 @@ export class BIDSFileDeno implements BIDSFile {
* Read the entire file and decode as utf-8 text
*/
async text(): Promise<string> {
const streamReader = this.stream
.pipeThrough(new TextDecoderStream('utf-8'))
.getReader()
let data = ''
const reader = this.stream.pipeThrough(createUTF8Stream()).getReader()
const chunks: string[] = []
try {
// Read once to check for unicode issues
const { done, value } = await streamReader.read()
// Check for UTF-16 BOM
if (value && value.startsWith('\uFFFD')) {
throw new UnicodeDecodeError('This file appears to be UTF-16')
}
if (done) return data
data += value
// Continue reading the rest of the file if no unicode issues were found
while (true) {
const { done, value } = await streamReader.read()
if (done) return data
data += value
const { done, value } = await reader.read()
if (done) break
chunks.push(value)
}
return chunks.join('')
} finally {
streamReader.releaseLock()
reader.releaseLock()
}
}

Expand Down
6 changes: 5 additions & 1 deletion src/files/filetree.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ import { FileIgnoreRules } from './ignore.ts'

const nullFile = {
size: 0,
stream: new ReadableStream(),
stream: new ReadableStream({
start(controller) {
controller.close()
}
}),
text: () => Promise.resolve(''),
readBytes: async (size: number, offset?: number) => new Uint8Array(),
parent: new FileTree('', '/'),
Expand Down
1 change: 0 additions & 1 deletion src/files/json.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import { type assert, assertObjectMatch } from '@std/assert'
import type { BIDSFileDeno, UnicodeDecodeError } from './deno.ts'
import type { BIDSFile } from '../types/filetree.ts'
import type { FileIgnoreRules } from './ignore.ts'

Expand Down
37 changes: 37 additions & 0 deletions src/files/streams.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import { assert, assertEquals } from '@std/assert'
import { createUTF8Stream, UnicodeDecodeError } from './streams.ts'
import { streamFromUint8Array, streamFromString } from '../tests/utils.ts'

/**
 * Tests for createUTF8Stream: construction, decoding of plain UTF-8 input,
 * and rejection of input that begins with a UTF-16 byte-order mark.
 */
Deno.test('createUTF8Stream', async (t) => {
  await t.step('should return a TransformStream with UTF8StreamTransformer', () => {
    const stream = createUTF8Stream()
    assertEquals(stream instanceof TransformStream, true)
  })

  await t.step('should correctly transform UTF-8 input', async () => {
    const rawstream = streamFromString('Hello, world!')
    const reader = rawstream.pipeThrough(createUTF8Stream()).getReader()
    const { value } = await reader.read()
    assertEquals(value, 'Hello, world!')

    await reader.cancel()
  })

  await t.step('should throw UnicodeDecodeError for UTF-16 input', async () => {
    // 0xFF 0xFE is the UTF-16 little-endian byte-order mark
    const rawStream = streamFromUint8Array(new Uint8Array([0xFF, 0xFE, 0x00, 0x00]))

    let reader: ReadableStreamDefaultReader<string> | undefined
    try {
      // The exception can't be localized to either of the following lines
      // but is raised before the second returns
      reader = rawStream.pipeThrough(createUTF8Stream()).getReader()
      const { value } = await reader.read()
      assert(false, 'Expected UnicodeDecodeError, got ' + value)
    } catch (e: unknown) {
      // Narrow the caught value instead of typing it as `any`
      assert(e instanceof UnicodeDecodeError, `Expected UnicodeDecodeError, got ${e}`)
      assertEquals(e.message, 'This file appears to be UTF-16')
    } finally {
      // Actually invoke cancel(). Cancelling a stream that has already
      // errored rejects with the stored error, which is the expected outcome
      // here, so that rejection is deliberately ignored.
      await reader?.cancel().catch(() => {})
    }
  })
})
51 changes: 51 additions & 0 deletions src/files/streams.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/**
 * Error raised when text that was expected to be UTF-8 turns out to
 * contain UTF-16 data.
 *
 * Instances report `name` as 'UnicodeDecode' (not the class name).
 */
export class UnicodeDecodeError extends Error {
  // Own property on every instance, shadowing Error.prototype.name
  override name = 'UnicodeDecode'

  /**
   * @param message Human-readable description of the decoding problem
   */
  constructor(message: string) {
    super(message)
  }
}

/**
 * A transformer that decodes a byte stream as UTF-8 text and throws
 * a UnicodeDecodeError if a UTF-16 BOM is detected at the start of the input.
 *
 * Detection relies on the decoder replacing undecodable bytes with U+FFFD:
 * the first decoded text is rejected when it begins with that replacement
 * character. The check stays armed until the decoder has actually produced
 * some text, so a zero-length leading chunk (or a leading chunk holding only
 * an incomplete multi-byte sequence) cannot switch it off prematurely.
 */
export class UTF8StreamTransformer implements Transformer<Uint8Array, string> {
  private decoder: TextDecoder
  // True until the first non-empty decoded text has been inspected
  private firstChunk: boolean

  constructor() {
    this.decoder = new TextDecoder('utf-8')
    this.firstChunk = true
  }

  /**
   * Decode one chunk of bytes and pass the resulting text downstream.
   *
   * @param chunk Raw bytes read from the source stream
   * @param controller Controller used to enqueue the decoded text
   * @throws UnicodeDecodeError if the first decoded text starts with U+FFFD
   */
  transform(chunk: Uint8Array, controller: TransformStreamDefaultController<string>) {
    // stream: true lets multi-byte sequences span chunk boundaries
    const decoded = this.decoder.decode(chunk, { stream: true })
    if (this.firstChunk && decoded.length > 0) {
      // Check the first decoded text for a UTF-16 BOM
      if (decoded.startsWith('\uFFFD')) {
        throw new UnicodeDecodeError('This file appears to be UTF-16')
      }
      this.firstChunk = false
    }
    controller.enqueue(decoded)
  }

  /**
   * Emit whatever text the decoder is still buffering once the input ends.
   *
   * @param controller Controller used to enqueue the remaining text
   */
  flush(controller: TransformStreamDefaultController<string>) {
    const final = this.decoder.decode()
    if (final) {
      controller.enqueue(final)
    }
  }
}

/**
 * Build a TransformStream backed by a fresh UTF8StreamTransformer, for
 * piping a stream of bytes into a stream of decoded UTF-8 text.
 *
 * @returns A new TransformStream wrapping its own transformer instance
 */
export function createUTF8Stream() {
  const transformer = new UTF8StreamTransformer()
  const stream = new TransformStream(transformer)
  return stream
}
Loading

0 comments on commit bd92283

Please sign in to comment.