diff --git a/lib/Validate.ts b/lib/Validate.ts
index 3b5818c..17a4440 100644
--- a/lib/Validate.ts
+++ b/lib/Validate.ts
@@ -1,4 +1,4 @@
-function buildAbsoluteIriRegex(): RegExp {
+function buildAbsoluteIriRfc3987Regex(): RegExp {
   // The syntax is defined in https://www.rfc-editor.org/rfc/rfc3987#section-2.2
   // Rules are defined in reversed order
 
@@ -58,13 +58,51 @@ function buildAbsoluteIriRegex(): RegExp {
   return new RegExp(iri, 'u');
 }
 
-const IRI_REGEX: RegExp = buildAbsoluteIriRegex();
+const STRICT_IRI_REGEX: RegExp = buildAbsoluteIriRfc3987Regex();
+// eslint-disable-next-line no-control-regex
+const PRAGMATIC_IRI_REGEX = /^[A-Za-z][\d+\-.A-Za-z]*:[^\u0000-\u0020"<>\\^`{|}]*$/u;
 
 /**
- * Validate a given IRI according to RFC 3987.
+ * Possible ways of validating an IRI
+ */
+export enum IriValidationStrategy {
+  /**
+   * Validates the IRI according to RFC 3987.
+   */
+  Strict = 'strict',
+
+  /**
+   * Validates that the IRI has a valid scheme and does not contain any character forbidden by the Turtle specification.
+   */
+  Pragmatic = 'pragmatic',
+
+  /**
+   * Does not validate the IRI at all.
+   */
+  None = 'none'
+}
+
+/**
+ * Validate a given IRI according to the given strategy.
+ *
+ * By default the IRI is fully validated according to RFC 3987.
+ * But it is possible to do a lighter and faster validation using the "pragmatic" strategy.
+ *
  * @param {string} iri a string that may be an IRI.
+ * @param {IriValidationStrategy} strategy IRI validation strategy.
  * @return {Error | undefined} An error if the IRI is invalid, or undefined if it is valid.
  */
-export function validateIri(iri: string): Error | undefined {
-  return IRI_REGEX.test(iri) ? undefined : new Error(`Invalid IRI according to RFC 3987: '${iri}'`);
+export function validateIri(
+  iri: string, strategy: IriValidationStrategy = IriValidationStrategy.Strict,
+): Error | undefined {
+  switch (strategy) {
+    case IriValidationStrategy.Strict:
+      return STRICT_IRI_REGEX.test(iri) ? undefined : new Error(`Invalid IRI according to RFC 3987: '${iri}'`);
+    case IriValidationStrategy.Pragmatic:
+      return PRAGMATIC_IRI_REGEX.test(iri) ? undefined : new Error(`Invalid IRI according to RDF Turtle: '${iri}'`);
+    case IriValidationStrategy.None:
+      return undefined;
+    default:
+      return new Error(`Not supported validation strategy "${strategy}"`);
+  }
 }
diff --git a/perf/validateIri.ts b/perf/validateIri.ts
index ba4d731..67b94f6 100644
--- a/perf/validateIri.ts
+++ b/perf/validateIri.ts
@@ -1,6 +1,6 @@
 /* eslint-disable no-console */
 
-import { validateIri } from '..';
+import { IriValidationStrategy, validateIri } from '..';
 
 const ITERATIONS = 100_000;
 
@@ -94,21 +94,19 @@ console.time(FULL_TIMER);
 
 for (let i = 0; i < ITERATIONS; i++) {
   for (const iri of VALID_ABSOLUTE_IRIS) {
-    validateIri(iri);
+    validateIri(iri, IriValidationStrategy.Strict);
   }
 }
 
 console.timeEnd(FULL_TIMER);
 
 const PARTIAL_TIMER = 'Partial IRI validation';
-// eslint-disable-next-line no-control-regex
-const PARTIAL_REGEX = /^[a-zA-Z][a-zA-Z0-9+\-.]*:[^\u0000-\u0020"<>\\^`{|}]$/u;
 
 console.time(PARTIAL_TIMER);
 
 for (let i = 0; i < ITERATIONS; i++) {
   for (const iri of VALID_ABSOLUTE_IRIS) {
-    PARTIAL_REGEX.test(iri);
+    validateIri(iri, IriValidationStrategy.Pragmatic);
   }
 }
 
diff --git a/test/Validate-test.ts b/test/Validate-test.ts
index 81934d9..b3195f7 100644
--- a/test/Validate-test.ts
+++ b/test/Validate-test.ts
@@ -1,4 +1,4 @@
-import { validateIri } from '../lib/Validate';
+import { IriValidationStrategy, validateIri } from '../lib/Validate';
 
 const VALID_ABSOLUTE_IRIS = [
   'file://foo',
@@ -96,11 +96,14 @@ const VALID_ABSOLUTE_IRIS = [
   'http://example.com/?\u{E000}',
 ];
 
-const INVALID_ABSOLUTE_IRIS = [
+const ALWAYS_INVALID_ABSOLUTE_IRIS = [
   '',
   'foo',
   'http://example.com/beepbeep\u0007\u0007',
   'http://example.com/\n',
+];
+
+const STRICTLY_INVALID_ABSOLUTE_IRIS = [
   // "::", // not OK, per Roy Fielding on the W3C uri list on 2004-04-01
   //
   // the following test cases are from a Perl script by David A. Wheeler
@@ -183,13 +186,31 @@ const INVALID_ABSOLUTE_IRIS = [
 describe('Validate', () => {
   for (const iri of VALID_ABSOLUTE_IRIS) {
     test(`the IRI '${iri}' should be valid`, () => {
-      expect(validateIri(iri)).toBeUndefined();
+      expect(validateIri(iri, IriValidationStrategy.Strict)).toBeUndefined();
+      expect(validateIri(iri, IriValidationStrategy.Pragmatic)).toBeUndefined();
     });
   }
 
-  for (const iri of INVALID_ABSOLUTE_IRIS) {
-    test(`the IRI '${iri}' should be invalid`, () => {
+  for (const iri of ALWAYS_INVALID_ABSOLUTE_IRIS) {
+    test(`the IRI '${iri}' should be invalid according to pragmatic and strict modes`, () => {
+      expect(validateIri(iri, IriValidationStrategy.Pragmatic)).toBeInstanceOf(Error);
+      expect(validateIri(iri, IriValidationStrategy.Strict)).toBeInstanceOf(Error);
+    });
+  }
+
+  for (const iri of STRICTLY_INVALID_ABSOLUTE_IRIS) {
+    test(`the IRI '${iri}' should be invalid according to strict mode`, () => {
       expect(validateIri(iri)).toBeInstanceOf(Error);
     });
   }
+
+  test('the validateIri function should not fail on invalid strategy', () => {
+    // @ts-expect-error
+    expect(validateIri('http://example.com/', 'foo')).toBeInstanceOf(Error);
+  });
+
+  test('the validateIri function should always validate with the none strategy', () => {
+    expect(validateIri('', IriValidationStrategy.None)).toBeUndefined();
+    expect(validateIri('\n', IriValidationStrategy.None)).toBeUndefined();
+  });
 });