diff --git a/deno.json b/deno.json index bd997341..3bb91614 100644 --- a/deno.json +++ b/deno.json @@ -41,6 +41,7 @@ "@std/io": "jsr:@std/io@0.225.0", "@std/log": "jsr:@std/log@0.224.9", "@std/path": "jsr:@std/path@1.0.8", + "@std/streams": "jsr:@std/streams@1.0.8", "@std/yaml": "jsr:@std/yaml@^1.0.4" }, "tasks": { diff --git a/deno.lock b/deno.lock index c90229f9..6248251c 100644 --- a/deno.lock +++ b/deno.lock @@ -1,7 +1,7 @@ { "version": "4", "specifiers": { - "jsr:@bids/schema@0.11.3+2": "0.11.3+1", + "jsr:@bids/schema@1.0.0": "1.0.0", "jsr:@cliffy/flags@1.0.0-rc.7": "1.0.0-rc.7", "jsr:@cliffy/internal@1.0.0-rc.7": "1.0.0-rc.7", "jsr:@effigies/cliffy-command@1.0.0-dev.8": "1.0.0-dev.8", @@ -15,6 +15,7 @@ "jsr:@std/assert@~0.213.1": "0.213.1", "jsr:@std/bytes@^1.0.2": "1.0.2", "jsr:@std/bytes@^1.0.2-rc.3": "1.0.2", + "jsr:@std/bytes@^1.0.3": "1.0.4", "jsr:@std/encoding@0.213": "0.213.1", "jsr:@std/encoding@^1.0.5": "1.0.5", "jsr:@std/fmt@1.0.3": "1.0.3", @@ -31,6 +32,7 @@ "jsr:@std/path@1.0.8": "1.0.8", "jsr:@std/path@^1.0.6": "1.0.6", "jsr:@std/path@^1.0.7": "1.0.8", + "jsr:@std/streams@1.0.8": "1.0.8", "jsr:@std/text@~1.0.7": "1.0.8", "jsr:@std/yaml@^1.0.4": "1.0.5", "npm:@bids/nifti-reader-js@0.6.9": "0.6.9", @@ -39,8 +41,8 @@ "npm:ignore@6.0.2": "6.0.2" }, "jsr": { - "@bids/schema@0.11.3+1": { - "integrity": "331d9975fe35175e5fc4990e97abf6459f564bc630309ae80ceee98211123cfb" + "@bids/schema@1.0.0": { + "integrity": "866fe0f636b73e08bf6ba739941821a24199cc72d0f312a81328ec7ecafaceee" }, "@cliffy/flags@1.0.0-rc.7": { "integrity": "318d9be98f6a6417b108e03dec427dea96cdd41a15beb21d2554ae6da450a781", @@ -104,6 +106,9 @@ "@std/bytes@1.0.2": { "integrity": "fbdee322bbd8c599a6af186a1603b3355e59a5fb1baa139f8f4c3c9a1b3e3d57" }, + "@std/bytes@1.0.4": { + "integrity": "11a0debe522707c95c7b7ef89b478c13fb1583a7cfb9a85674cd2cc2e3a28abc" + }, "@std/encoding@0.213.1": { "integrity": "fcbb6928713dde941a18ca5db88ca1544d0755ec8fb20fe61e2dc8144b390c62" }, @@ -154,6 +159,12 @@ "@std/path@1.0.8": { "integrity": "548fa456bb6a04d3c1a1e7477986b6cffbce95102d0bb447c67c4ee70e0364be" }, + "@std/streams@1.0.8": { + "integrity": "b41332d93d2cf6a82fe4ac2153b930adf1a859392931e2a19d9fabfb6f154fb3", + "dependencies": [ + "jsr:@std/bytes@^1.0.3" + ] + }, "@std/text@1.0.8": { "integrity": "40ba34caa095f393e78796e5eda37b8b4e2cc6cfd6f51f34658ad7487b1451e4" }, @@ -402,7 +413,7 @@ }, "workspace": { "dependencies": [ - "jsr:@bids/schema@0.11.3+2", + "jsr:@bids/schema@1.0.0", "jsr:@effigies/cliffy-command@1.0.0-dev.8", "jsr:@effigies/cliffy-table@1.0.0-dev.5", "jsr:@libs/xml@6.0.1", @@ -412,6 +423,7 @@ "jsr:@std/io@0.225.0", "jsr:@std/log@0.224.9", "jsr:@std/path@1.0.8", + "jsr:@std/streams@1.0.8", "jsr:@std/yaml@^1.0.4", "npm:@bids/nifti-reader-js@0.6.9", "npm:ajv@8.17.1", diff --git a/src/files/deno.test.ts b/src/files/deno.test.ts index 5182b271..9b972a3f 100644 --- a/src/files/deno.test.ts +++ b/src/files/deno.test.ts @@ -3,7 +3,8 @@ import { readAll, readerFromStreamReader } from '@std/io' import { basename, dirname, fromFileUrl, join } from '@std/path' import { EOL } from '@std/fs' import type { FileTree } from '../types/filetree.ts' -import { BIDSFileDeno, readFileTree, UnicodeDecodeError } from './deno.ts' +import { BIDSFileDeno, readFileTree } from './deno.ts' +import { UnicodeDecodeError } from './streams.ts' import { requestReadPermission } from '../setup/requestPermissions.ts' import { FileIgnoreRules } from './ignore.ts' diff --git a/src/files/deno.ts b/src/files/deno.ts index 69338667..56e36a88 100644 --- 
a/src/files/deno.ts +++ b/src/files/deno.ts @@ -7,18 +7,9 @@ import { type BIDSFile, FileTree } from '../types/filetree.ts' import { requestReadPermission } from '../setup/requestPermissions.ts' import { FileIgnoreRules, readBidsIgnore } from './ignore.ts' import { logger } from '../utils/logger.ts' +import { createUTF8Stream } from './streams.ts' export { type BIDSFile, FileTree } -/** - * Thrown when a text file is decoded as UTF-8 but contains UTF-16 characters - */ -export class UnicodeDecodeError extends Error { - constructor(message: string) { - super(message) - this.name = 'UnicodeDecode' - } -} - /** * Deno implementation of BIDSFile */ @@ -67,27 +58,17 @@ export class BIDSFileDeno implements BIDSFile { * Read the entire file and decode as utf-8 text */ async text(): Promise { - const streamReader = this.stream - .pipeThrough(new TextDecoderStream('utf-8')) - .getReader() - let data = '' + const reader = this.stream.pipeThrough(createUTF8Stream()).getReader() + const chunks: string[] = [] try { - // Read once to check for unicode issues - const { done, value } = await streamReader.read() - // Check for UTF-16 BOM - if (value && value.startsWith('\uFFFD')) { - throw new UnicodeDecodeError('This file appears to be UTF-16') - } - if (done) return data - data += value - // Continue reading the rest of the file if no unicode issues were found while (true) { - const { done, value } = await streamReader.read() - if (done) return data - data += value + const { done, value } = await reader.read() + if (done) break + chunks.push(value) } + return chunks.join('') } finally { - streamReader.releaseLock() + reader.releaseLock() } } diff --git a/src/files/filetree.ts b/src/files/filetree.ts index 49d187ad..718a2b14 100644 --- a/src/files/filetree.ts +++ b/src/files/filetree.ts @@ -5,7 +5,11 @@ import { FileIgnoreRules } from './ignore.ts' const nullFile = { size: 0, - stream: new ReadableStream(), + stream: new ReadableStream({ + start(controller) { + controller.close() + } + }), text: () => Promise.resolve(''), readBytes: async (size: number, offset?: number) => new Uint8Array(), parent: new FileTree('', '/'), diff --git a/src/files/json.test.ts b/src/files/json.test.ts index 09056846..6b367892 100644 --- a/src/files/json.test.ts +++ b/src/files/json.test.ts @@ -1,5 +1,4 @@ import { type assert, assertObjectMatch } from '@std/assert' -import type { BIDSFileDeno, UnicodeDecodeError } from './deno.ts' import type { BIDSFile } from '../types/filetree.ts' import type { FileIgnoreRules } from './ignore.ts' diff --git a/src/files/streams.test.ts b/src/files/streams.test.ts new file mode 100644 index 00000000..fdc85ee6 --- /dev/null +++ b/src/files/streams.test.ts @@ -0,0 +1,37 @@ +import { assert, assertEquals } from '@std/assert' +import { createUTF8Stream, UnicodeDecodeError } from './streams.ts' +import { streamFromUint8Array, streamFromString } from '../tests/utils.ts' + +Deno.test('createUTF8Stream', async (t) => { + await t.step('should return a TransformStream with UTF8StreamTransformer', () => { + const stream = createUTF8Stream() + assertEquals(stream instanceof TransformStream, true) + }) + + await t.step('should correctly transform UTF-8 input', async () => { + const rawstream = streamFromString('Hello, world!') + const reader = rawstream.pipeThrough(createUTF8Stream()).getReader() + const { value } = await reader.read() + assertEquals(value, 'Hello, world!') + + await reader.cancel() + }) + + await t.step('should throw UnicodeDecodeError for UTF-16 input', async () => { + const rawStream 
diff --git a/src/files/streams.test.ts b/src/files/streams.test.ts
new file mode 100644
index 00000000..fdc85ee6
--- /dev/null
+++ b/src/files/streams.test.ts
@@ -0,0 +1,37 @@
+import { assert, assertEquals } from '@std/assert'
+import { createUTF8Stream, UnicodeDecodeError } from './streams.ts'
+import { streamFromUint8Array, streamFromString } from '../tests/utils.ts'
+
+Deno.test('createUTF8Stream', async (t) => {
+  await t.step('should return a TransformStream with UTF8StreamTransformer', () => {
+    const stream = createUTF8Stream()
+    assertEquals(stream instanceof TransformStream, true)
+  })
+
+  await t.step('should correctly transform UTF-8 input', async () => {
+    const rawstream = streamFromString('Hello, world!')
+    const reader = rawstream.pipeThrough(createUTF8Stream()).getReader()
+    const { value } = await reader.read()
+    assertEquals(value, 'Hello, world!')
+
+    await reader.cancel()
+  })
+
+  await t.step('should throw UnicodeDecodeError for UTF-16 input', async () => {
+    const rawStream = streamFromUint8Array(new Uint8Array([0xFF, 0xFE, 0x00, 0x00]))
+
+    let reader
+    try {
+      // The exception can't be localized to either of the following lines
+      // but is raised before the second returns
+      reader = rawStream.pipeThrough(createUTF8Stream()).getReader()
+      const { value } = await reader.read()
+      assert(false, 'Expected UnicodeDecodeError, got ' + value)
+    } catch (e: any) {
+      assertEquals(e instanceof UnicodeDecodeError, true)
+      assertEquals(e?.message, 'This file appears to be UTF-16')
+    } finally {
+      if (reader) await reader.cancel()
+    }
+  })
+})
diff --git a/src/files/streams.ts b/src/files/streams.ts
new file mode 100644
index 00000000..d22bfc95
--- /dev/null
+++ b/src/files/streams.ts
@@ -0,0 +1,51 @@
+/**
+ * Thrown when a text file is decoded as UTF-8 but contains UTF-16 characters
+ */
+export class UnicodeDecodeError extends Error {
+  constructor(message: string) {
+    super(message)
+    this.name = 'UnicodeDecode'
+  }
+}
+
+/**
+ * A transformer that ensures the input stream is valid UTF-8 and throws
+ * a UnicodeDecodeError if a UTF-16 BOM is detected
+ */
+export class UTF8StreamTransformer implements Transformer<Uint8Array, string> {
+  private decoder: TextDecoder
+  private firstChunk: boolean
+
+  constructor() {
+    this.decoder = new TextDecoder('utf-8')
+    this.firstChunk = true
+  }
+
+  transform(chunk: Uint8Array, controller: TransformStreamDefaultController<string>) {
+    // Check first chunk for UTF-16 BOM
+    if (this.firstChunk) {
+      const decoded = this.decoder.decode(chunk, { stream: true })
+      if (decoded.startsWith('\uFFFD')) {
+        throw new UnicodeDecodeError('This file appears to be UTF-16')
+      }
+      this.firstChunk = false
+      controller.enqueue(decoded)
+    } else {
+      controller.enqueue(this.decoder.decode(chunk, { stream: true }))
+    }
+  }
+
+  flush(controller: TransformStreamDefaultController<string>) {
+    const final = this.decoder.decode()
+    if (final) {
+      controller.enqueue(final)
+    }
+  }
+}
+
+/**
+ * Creates a TransformStream that validates and decodes UTF-8 text
+ */
+export function createUTF8Stream() {
+  return new TransformStream<Uint8Array, string>(new UTF8StreamTransformer())
+}
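The UTF-16 check in `UTF8StreamTransformer` leans on a property of non-fatal `TextDecoder('utf-8')`: 0xFF and 0xFE can never appear as lead bytes of a valid UTF-8 sequence, so a UTF-16 byte-order mark decodes to replacement characters (U+FFFD) at the start of the first chunk. A standalone sketch of the same heuristic outside the stream machinery:

```ts
const decoder = new TextDecoder('utf-8') // non-fatal: invalid bytes become U+FFFD
const utf16leBom = new Uint8Array([0xff, 0xfe])

// Both bytes are invalid in UTF-8, so the decoded text starts with U+FFFD.
const decoded = decoder.decode(utf16leBom, { stream: true })
console.log(decoded.startsWith('\uFFFD')) // true
```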
diff --git a/src/files/tsv.test.ts b/src/files/tsv.test.ts
new file mode 100644
index 00000000..217a9b69
--- /dev/null
+++ b/src/files/tsv.test.ts
@@ -0,0 +1,175 @@
+import { assert, assertEquals, assertNotStrictEquals, assertObjectMatch, assertStrictEquals } from '@std/assert'
+import { pathToFile } from './filetree.ts'
+import { loadTSV } from './tsv.ts'
+import { streamFromString } from '../tests/utils.ts'
+
+Deno.test('TSV loading', async (t) => {
+  await t.step('Empty file produces empty map', async () => {
+    const file = pathToFile('/empty.tsv')
+    file.stream = streamFromString('')
+
+    const map = await loadTSV(file)
+    // map.size looks for a column called 'size', so work around it
+    assertEquals(Object.keys(map).length, 0)
+  })
+
+  await t.step('Single row file produces header-only map', async () => {
+    const file = pathToFile('/single_row.tsv')
+    file.stream = streamFromString('a\tb\tc\n')
+
+    const map = await loadTSV(file)
+    assertEquals(map.a, [])
+    assertEquals(map.b, [])
+    assertEquals(map.c, [])
+  })
+
+  await t.step('Single column file produces single column map', async () => {
+    const file = pathToFile('/single_column.tsv')
+    file.stream = streamFromString('a\n1\n2\n3\n')
+
+    const map = await loadTSV(file)
+    assertEquals(map.a, ['1', '2', '3'])
+  })
+
+  await t.step('Missing final newline is ignored', async () => {
+    const file = pathToFile('/missing_newline.tsv')
+    file.stream = streamFromString('a\n1\n2\n3')
+
+    const map = await loadTSV(file)
+    assertEquals(map.a, ['1', '2', '3'])
+  })
+
+  await t.step('Empty row throws issue', async () => {
+    const file = pathToFile('/empty_row.tsv')
+    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n\n4\t5\t6\n')
+
+    try {
+      await loadTSV(file)
+      assert(false, 'Expected TSV_EMPTY_LINE to be thrown')
+    } catch (e: any) {
+      assertObjectMatch(e, { key: 'TSV_EMPTY_LINE', line: 3 })
+    }
+  })
+
+  await t.step('Mismatched row length throws issue', async () => {
+    const file = pathToFile('/mismatched_row.tsv')
+    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n4\t5\n')
+
+    try {
+      await loadTSV(file)
+      assert(false, 'Expected TSV_EQUAL_ROWS to be thrown')
+    } catch (e: any) {
+      assertObjectMatch(e, { key: 'TSV_EQUAL_ROWS', line: 3 })
+    }
+  })
+
+  await t.step('maxRows limits the number of rows read', async () => {
+    const file = pathToFile('/long.tsv')
+    // Use 1500 to avoid overlap with default initial capacity
+    const text = 'a\tb\tc\n' + '1\t2\t3\n'.repeat(1500)
+    file.stream = streamFromString(text)
+
+    let map = await loadTSV(file, 0)
+    assertEquals(map.a, [])
+    assertEquals(map.b, [])
+    assertEquals(map.c, [])
+
+    // Do not assume that caching respects maxRows in this test
+    loadTSV.cache.clear()
+    file.stream = streamFromString(text)
+    map = await loadTSV(file, 1)
+    assertEquals(map.a, ['1'])
+    assertEquals(map.b, ['2'])
+    assertEquals(map.c, ['3'])
+
+    loadTSV.cache.clear()
+    file.stream = streamFromString(text)
+    map = await loadTSV(file, 2)
+    assertEquals(map.a, ['1', '1'])
+    assertEquals(map.b, ['2', '2'])
+    assertEquals(map.c, ['3', '3'])
+
+    loadTSV.cache.clear()
+    file.stream = streamFromString(text)
+    map = await loadTSV(file, -1)
+    assertEquals(map.a, Array(1500).fill('1'))
+    assertEquals(map.b, Array(1500).fill('2'))
+    assertEquals(map.c, Array(1500).fill('3'))
+
+    loadTSV.cache.clear()
+    // Check that maxRows does not truncate shorter files
+    file.stream = streamFromString('a\tb\tc\n1\t2\t3\n4\t5\t6\n7\t8\t9\n')
+    map = await loadTSV(file, 4)
+    assertEquals(map.a, ['1', '4', '7'])
+    assertEquals(map.b, ['2', '5', '8'])
+    assertEquals(map.c, ['3', '6', '9'])
+  })
+
+  await t.step('caching avoids multiple reads', async () => {
+    loadTSV.cache.clear()
+    const file = pathToFile('/long.tsv')
+    // Use 1500 to avoid overlap with default initial capacity
+    const text = 'a\tb\tc\n' + '1\t2\t3\n'.repeat(1500)
+    file.stream = streamFromString(text)
+
+    let map = await loadTSV(file, 2)
+    assertEquals(map.a, ['1', '1'])
+    assertEquals(map.b, ['2', '2'])
+    assertEquals(map.c, ['3', '3'])
+
+    // Replace stream to ensure cache does not depend on deep object equality
+    file.stream = streamFromString(text)
+    let repeatMap = await loadTSV(file, 2)
+    assertStrictEquals(map, repeatMap)
+
+    loadTSV.cache.clear()
+    // DO NOT replace stream so the next read verifies the previous stream wasn't read
+    repeatMap = await loadTSV(file, 2)
+    assertEquals(repeatMap.a, ['1', '1'])
+    assertEquals(repeatMap.b, ['2', '2'])
+    assertEquals(repeatMap.c, ['3', '3'])
+    // Same contents, different objects
+    assertNotStrictEquals(map, repeatMap)
+  })
+
+  await t.step('caching is keyed on maxRows', async () => {
+    const file = pathToFile('/long.tsv')
+    // Use 1500 to avoid overlap with default initial capacity
+    const text = 'a\tb\tc\n' + '1\t2\t3\n'.repeat(1500)
+    file.stream = streamFromString(text)
+
+    let map = await loadTSV(file, 2)
+    assertEquals(map.a, ['1', '1'])
+    assertEquals(map.b, ['2', '2'])
+    assertEquals(map.c, ['3', '3'])
+
+    file.stream = streamFromString(text)
+    let repeatMap = await loadTSV(file, 3)
+    assertNotStrictEquals(map, repeatMap)
+    assertEquals(repeatMap.a, ['1', '1', '1'])
+    assertEquals(repeatMap.b, ['2', '2', '2'])
+    assertEquals(repeatMap.c, ['3', '3', '3'])
+
+    file.stream = streamFromString(text)
+    repeatMap = await loadTSV(file, 2)
+    assertStrictEquals(map, repeatMap)
+    assertEquals(repeatMap.a, ['1', '1'])
+    assertEquals(repeatMap.b, ['2', '2'])
+    assertEquals(repeatMap.c, ['3', '3'])
+  })
+
+  await t.step('Raises issue on duplicate header', async () => {
+    const file = pathToFile('/duplicate_header.tsv')
+    file.stream = streamFromString('a\ta\n1\t2\n')
+
+    try {
+      await loadTSV(file)
+      assert(false, 'Expected error')
+    } catch (e: any) {
+      assertObjectMatch(e, { key: 'TSV_COLUMN_HEADER_DUPLICATE', evidence: 'a, a' })
+    }
+  })
+
+  // Tests will have populated the memoization cache
+  loadTSV.cache.clear()
+})
diff --git a/src/files/tsv.ts b/src/files/tsv.ts
index ea818e04..93aecbfa 100644
--- a/src/files/tsv.ts
+++ b/src/files/tsv.ts
@@ -2,44 +2,63 @@
  * TSV
  * Module for parsing TSV
  */
+import { TextLineStream } from '@std/streams'
 import { ColumnsMap } from '../types/columns.ts'
 import type { BIDSFile } from '../types/filetree.ts'
 import { filememoizeAsync } from '../utils/memoize.ts'
-import type { WithCache } from '../utils/memoize.ts'
+import { createUTF8Stream } from './streams.ts'
 
-const normalizeEOL = (str: string): string => str.replace(/\r\n/g, '\n').replace(/\r/g, '\n')
-// Typescript resolved `row && !/^\s*$/.test(row)` as `string | boolean`
-const isContentfulRow = (row: string): boolean => !!(row && !/^\s*$/.test(row))
+async function _loadTSV(file: BIDSFile, maxRows: number = -1): Promise<ColumnsMap> {
+  const reader = file.stream
+    .pipeThrough(createUTF8Stream())
+    .pipeThrough(new TextLineStream())
+    .getReader()
 
-async function _loadTSV(file: BIDSFile): Promise<ColumnsMap> {
-  return await file.text().then(parseTSV)
-}
+  try {
+    const headerRow = await reader.read()
+    const headers = (headerRow.done || !headerRow.value) ? [] : headerRow.value.split('\t')
 
-export const loadTSV = filememoizeAsync(_loadTSV)
+    if (new Set(headers).size !== headers.length) {
+      throw { key: 'TSV_COLUMN_HEADER_DUPLICATE', evidence: headers.join(', ') }
+    }
 
-function parseTSV(contents: string) {
-  const columns = new ColumnsMap()
-  const rows: string[][] = normalizeEOL(contents)
-    .split('\n')
-    .filter(isContentfulRow)
-    .map((str) => str.split('\t'))
-  const headers = rows.length ? rows[0] : []
+    // Initialize columns in array for construction efficiency
+    const initialCapacity = maxRows >= 0 ? maxRows : 1000
+    const columns: string[][] = headers.map(() => new Array(initialCapacity))
 
-  if (rows.some((row) => row.length !== headers.length)) {
-    throw { key: 'TSV_EQUAL_ROWS' }
-  }
+    maxRows = maxRows >= 0 ?
maxRows : Infinity + let rowIndex = 0 // Keep in scope after loop + for (; rowIndex < maxRows; rowIndex++) { + const { done, value } = await reader.read() + if (done) break - headers.map((x) => { - columns[x] = [] - }) - if (headers.length !== Object.keys(columns).length) { - throw { key: 'TSV_COLUMN_HEADER_DUPLICATE', evidence: headers.join(', ') } - } - for (let i = 1; i < rows.length; i++) { - for (let j = 0; j < headers.length; j++) { - const col = columns[headers[j]] as string[] - col.push(rows[i][j]) + // Expect a newline at the end of the file, but otherwise error on empty lines + if (!value) { + const nextRow = await reader.read() + if (nextRow.done) break + throw { key: 'TSV_EMPTY_LINE', line: rowIndex + 2 } + } + + const values = value.split('\t') + if (values.length !== headers.length) { + throw { key: 'TSV_EQUAL_ROWS', line: rowIndex + 2 } + } + columns.forEach((column, columnIndex) => { + // Double array size if we exceed the current capacity + if (rowIndex >= column.length) { + column.length = column.length * 2 + } + column[rowIndex] = values[columnIndex] + }) } + + // Construct map, truncating columns to number of rows read + return new ColumnsMap( + headers.map((header, index) => [header, columns[index].slice(0, rowIndex)]), + ) + } finally { + await reader.cancel() } - return columns } + +export const loadTSV = filememoizeAsync(_loadTSV) diff --git a/src/issues/list.ts b/src/issues/list.ts index edd0a6f5..7bc69059 100644 --- a/src/issues/list.ts +++ b/src/issues/list.ts @@ -84,6 +84,10 @@ export const bidsIssues: IssueDefinitionRecord = { severity: 'error', reason: 'All rows must have the same number of columns as there are headers.', }, + TSV_EMPTY_LINE: { + severity: 'error', + reason: 'An empty line was found in the TSV file.', + }, TSV_COLUMN_MISSING: { severity: 'error', reason: 'A required column is missing', diff --git a/src/schema/associations.ts b/src/schema/associations.ts index bc7eef45..0fc5d62d 100644 --- a/src/schema/associations.ts +++ b/src/schema/associations.ts @@ -36,8 +36,8 @@ const associationLookup = { suffix: 'events', extensions: ['.tsv'], inherit: true, - load: async (file: BIDSFile): Promise => { - const columns = await loadTSV(file) + load: async (file: BIDSFile, options: { maxRows: number }): Promise => { + const columns = await loadTSV(file, options.maxRows) .catch((e) => { return new Map() }) @@ -53,8 +53,9 @@ const associationLookup = { inherit: true, load: async ( file: BIDSFile, + options: { maxRows: number }, ): Promise => { - const columns = await loadTSV(file) + const columns = await loadTSV(file, options.maxRows) .catch((e) => { return new Map() }) @@ -69,7 +70,7 @@ const associationLookup = { suffix: 'm0scan', extensions: ['.nii', '.nii.gz'], inherit: false, - load: (file: BIDSFile): Promise => { + load: (file: BIDSFile, options: any): Promise => { return Promise.resolve({ path: file.path }) }, }, @@ -77,7 +78,7 @@ const associationLookup = { suffix: 'magnitude', extensions: ['.nii', '.nii.gz'], inherit: false, - load: (file: BIDSFile): Promise => { + load: (file: BIDSFile, options: any): Promise => { return Promise.resolve({ path: file.path }) }, }, @@ -85,7 +86,7 @@ const associationLookup = { suffix: 'magnitude1', extensions: ['.nii', '.nii.gz'], inherit: false, - load: (file: BIDSFile): Promise => { + load: (file: BIDSFile, options: any): Promise => { return Promise.resolve({ path: file.path }) }, }, @@ -93,7 +94,7 @@ const associationLookup = { suffix: 'dwi', extensions: ['.bval'], inherit: true, - load: async (file: BIDSFile): 
Promise => {
+    load: async (file: BIDSFile, options: any): Promise => {
       const contents = await file.text()
       const rows = parseBvalBvec(contents)
       return {
@@ -109,7 +110,7 @@
     suffix: 'dwi',
     extensions: ['.bvec'],
     inherit: true,
-    load: async (file: BIDSFile): Promise => {
+    load: async (file: BIDSFile, options: any): Promise => {
       const contents = await file.text()
       const rows = parseBvalBvec(contents)
@@ -128,8 +129,8 @@
     suffix: 'channels',
     extensions: ['.tsv'],
     inherit: true,
-    load: async (file: BIDSFile): Promise => {
-      const columns = await loadTSV(file)
+    load: async (file: BIDSFile, options: { maxRows: number }): Promise => {
+      const columns = await loadTSV(file, options.maxRows)
         .catch((e) => {
          return new Map()
        })
@@ -145,7 +146,7 @@
     suffix: 'coordsystem',
     extensions: ['.json'],
     inherit: true,
-    load: (file: BIDSFile): Promise => {
+    load: (file: BIDSFile, options: any): Promise => {
       return Promise.resolve({ path: file.path })
     },
   },
@@ -154,6 +155,7 @@
 export async function buildAssociations(
   source: BIDSFile,
   issues: DatasetIssues,
+  maxRows: number = -1,
 ): Promise<Associations> {
   const associations: Associations = {}
@@ -177,7 +179,7 @@
   if (file) {
     // @ts-expect-error Matching load return value to key is hard
-    associations[key] = await load(file).catch((error) => {
+    associations[key] = await load(file, { maxRows }).catch((error) => {
       if (error.key) {
         issues.add({ code: error.key, location: file.path })
       }
diff --git a/src/schema/context.ts b/src/schema/context.ts
index 273215af..2fa81dce 100644
--- a/src/schema/context.ts
+++ b/src/schema/context.ts
@@ -232,7 +232,7 @@ export class BIDSContext implements Context {
       return
     }
 
-    this.columns = await loadTSV(this.file)
+    this.columns = await loadTSV(this.file, this.dataset.options?.maxRows)
       .catch((error) => {
         if (error.key) {
           this.dataset.issues.add({ code: error.key, location: this.file.path })
@@ -247,7 +247,11 @@
   }
 
   async loadAssociations(): Promise<void> {
-    this.associations = await buildAssociations(this.file, this.dataset.issues)
+    this.associations = await buildAssociations(
+      this.file,
+      this.dataset.issues,
+      this.dataset.options?.maxRows,
+    )
     return
   }
diff --git a/src/setup/options.test.ts b/src/setup/options.test.ts
index d7ef2879..02386b27 100644
--- a/src/setup/options.test.ts
+++ b/src/setup/options.test.ts
@@ -10,6 +10,7 @@ Deno.test('options parsing', async (t) => {
       json: true,
       color: false,
       blacklistModalities: [],
+      maxRows: 1000,
     })
   })
 })
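The `--max-rows` flag registered below is the source of the `maxRows` value threaded through `buildAssociations` and `loadTSV` above: 0 reads headers only, a positive n caps reading at n rows, and -1 removes the cap. A hedged sketch of exercising `loadTSV` directly, mirroring the tests above (import paths are illustrative and depend on the calling module's location):

```ts
import { loadTSV } from '../files/tsv.ts'
import { pathToFile } from '../files/filetree.ts'
import { streamFromString } from './utils.ts'

const file = pathToFile('/demo.tsv')
file.stream = streamFromString('a\tb\n1\t2\n3\t4\n')
const headersOnly = await loadTSV(file, 0) // { a: [], b: [] }

// Results are memoized per (file path, maxRows); clear before re-reading.
loadTSV.cache.clear()
file.stream = streamFromString('a\tb\n1\t2\n3\t4\n')
const firstRow = await loadTSV(file, 1) // { a: ['1'], b: ['2'] }
```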
diff --git a/src/setup/options.ts b/src/setup/options.ts
index 9caa0fe3..bed55cd1 100644
--- a/src/setup/options.ts
+++ b/src/setup/options.ts
@@ -30,6 +30,7 @@ export type ValidatorOptions = {
   outfile?: string
   blacklistModalities: string[]
   prune?: boolean
+  maxRows?: number
 }
 
 const modalityType = new EnumType(
@@ -50,6 +51,11 @@ export const validateCommand: Command = new Com
     'Specify a schema version to use for validation',
   )
   .option('-c, --config <path>', 'Path to a JSON configuration file')
+  .option(
+    '--max-rows <rows:number>',
+    'Maximum number of rows to validate in TSVs. Use 0 to validate headers only. Use -1 to validate all.',
+    { default: 1000 },
+  )
   .option('-v, --verbose', 'Log more extensive information about issues')
   .option('--ignoreWarnings', 'Disregard non-critical issues')
   .option(
diff --git a/src/tests/regression.test.ts b/src/tests/regression.test.ts
index e5aa0664..a89de5c6 100644
--- a/src/tests/regression.test.ts
+++ b/src/tests/regression.test.ts
@@ -2,6 +2,7 @@ import { assert } from '@std/assert'
 import { pathsToTree } from '../files/filetree.ts'
 import { validate } from '../validators/bids.ts'
 import type { BIDSFile } from '../types/filetree.ts'
+import { streamFromString } from './utils.ts'
 
 Deno.test('Regression tests', async (t) => {
   await t.step('Verify ignored files in scans.tsv do not trigger error', async () => {
@@ -17,7 +18,7 @@ Deno.test('Regression tests', async (t) => {
     // Without ignore, NOT_INCLUDED is triggered for CT, but the scans file is happy
     let ds = pathsToTree(paths)
     let scans_tsv = ds.get('sub-01/sub-01_scans.tsv') as BIDSFile
-    scans_tsv.text = () => Promise.resolve(scans_content)
+    scans_tsv.stream = streamFromString(scans_content)
     let result = await validate(ds, {
       datasetPath: '/dataset',
       debug: 'ERROR',
@@ -30,7 +31,7 @@ Deno.test('Regression tests', async (t) => {
     // With ignore, NOT_INCLUDED is not triggered for CT, and the scans file is still happy
     ds = pathsToTree(paths, ignore)
     scans_tsv = ds.get('sub-01/sub-01_scans.tsv') as BIDSFile
-    scans_tsv.text = () => Promise.resolve(scans_content)
+    scans_tsv.stream = streamFromString(scans_content)
     result = await validate(ds, {
       datasetPath: '/dataset',
       debug: 'ERROR',
diff --git a/src/tests/utils.ts b/src/tests/utils.ts
new file mode 100644
index 00000000..661d9224
--- /dev/null
+++ b/src/tests/utils.ts
@@ -0,0 +1,12 @@
+export function streamFromUint8Array(arr: Uint8Array): ReadableStream<Uint8Array> {
+  return new ReadableStream({
+    start(controller) {
+      controller.enqueue(arr)
+      controller.close()
+    },
+  })
+}
+
+export function streamFromString(str: string): ReadableStream<Uint8Array> {
+  return streamFromUint8Array(new TextEncoder().encode(str))
+}
diff --git a/src/types/columns.ts b/src/types/columns.ts
index 77a1a06a..eb5ca0de 100644
--- a/src/types/columns.ts
+++ b/src/types/columns.ts
@@ -1,9 +1,9 @@
 // Allow ColumnsMap to be accessed as an object too
 export class ColumnsMap extends Map<string, string[]> {
   [key: string]: Map<string, string[]>[keyof Map<string, string[]>] | string[]
-  constructor() {
+  constructor(iterable?: Iterable<readonly [string, string[]]>) {
     super()
-    const columns = new Map() as ColumnsMap
+    const columns = new Map(iterable) as ColumnsMap
     return new Proxy(columns, columnMapAccessorProxy)
   }
 }
diff --git a/src/utils/memoize.ts b/src/utils/memoize.ts
index 3f4aae25..e32560b0 100644
--- a/src/utils/memoize.ts
+++ b/src/utils/memoize.ts
@@ -1,5 +1,6 @@
 export type WithCache<T> = T & { cache: Map<string, any> }
-interface HasParent {
+interface FileLike {
+  path: string,
   parent: { path: string }
 }
@@ -14,20 +15,21 @@ export const memoize = (
   return cached
 }
 
-export function filememoizeAsync<T, F extends HasParent>(
-  fn: (file: F) => Promise<T>,
-): WithCache<(file: F) => Promise<T>> {
-  const cache = new Map<string, Map<F, T>>()
-  const cached = async function (this: any, file: F): Promise<T> {
+export function filememoizeAsync<T, F extends FileLike>(
+  fn: (file: F, ...args: any[]) => Promise<T>,
+): WithCache<(file: F, ...args: any[]) => Promise<T>> {
+  const cache = new Map<string, Map<string, T>>()
+  const cached = async function (this: any, file: F, ...args: any[]): Promise<T> {
     let subcache = cache.get(file.parent.path)
     if (!subcache) {
       subcache = new Map()
      cache.set(file.parent.path, subcache)
    }
-    let val = subcache.get(file)
+    const key = `${file.path}:${args.join(',')}`
+    let val =
subcache.get(key) if (!val) { - val = await fn.call(this, file) - subcache.set(file, val) + val = await fn.call(this, file, ...args) + subcache.set(key, val) } return val }
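For reference, the cache in `filememoizeAsync` is now two-level: the outer map is keyed by the parent directory path, the inner map by file path plus the extra arguments, which is why `loadTSV` results are keyed on `maxRows` (as exercised in `tsv.test.ts` above). A sketch of the key construction, using only names that appear in the patch:

```ts
// Outer key: parent directory path; inner key: file path plus arguments.
const cache = new Map<string, Map<string, unknown>>()

function cacheKey(file: { path: string }, args: unknown[]): string {
  return `${file.path}:${args.join(',')}`
}

// loadTSV(file, 2) and loadTSV(file, 3) occupy distinct cache slots:
cacheKey({ path: '/sub-01/sub-01_scans.tsv' }, [2]) // '/sub-01/sub-01_scans.tsv:2'
cacheKey({ path: '/sub-01/sub-01_scans.tsv' }, [3]) // '/sub-01/sub-01_scans.tsv:3'
```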