From 7555bdb4cdda0d07e7c15b58783cb9ed5b822cf6 Mon Sep 17 00:00:00 2001 From: Thomas Tanon Date: Mon, 15 Aug 2022 12:02:00 +0200 Subject: [PATCH] Add a pragmatic validation mode Checks if the scheme exists and is valid and that no character forbidden by Turtle is used anywhere else in the IRI. Also adds the "none" validation strategy to easily express in downstream usages that no IRI validation is wanted --- lib/Validate.ts | 48 ++++++++++++++++++++++++++++++++++++++----- perf/validateIri.ts | 8 +++----- test/Validate-test.ts | 31 +++++++++++++++++++++++----- 3 files changed, 72 insertions(+), 15 deletions(-) diff --git a/lib/Validate.ts b/lib/Validate.ts index 3b5818c..17a4440 100644 --- a/lib/Validate.ts +++ b/lib/Validate.ts @@ -1,4 +1,4 @@ -function buildAbsoluteIriRegex(): RegExp { +function buildAbsoluteIriRfc3987Regex(): RegExp { // The syntax is defined in https://www.rfc-editor.org/rfc/rfc3987#section-2.2 // Rules are defined in reversed order @@ -58,13 +58,51 @@ function buildAbsoluteIriRegex(): RegExp { return new RegExp(iri, 'u'); } -const IRI_REGEX: RegExp = buildAbsoluteIriRegex(); +const STRICT_IRI_REGEX: RegExp = buildAbsoluteIriRfc3987Regex(); +// eslint-disable-next-line no-control-regex +const PRAGMATIC_IRI_REGEX = /^[A-Za-z][\d+-.A-Za-z]*:[^\u0000-\u0020"<>\\^`{|}]*$/u; /** - * Validate a given IRI according to RFC 3987. + * Possible ways of validating an IRI + */ +export enum IriValidationStrategy { + /** + * Validates the IRI according to RFC 3987. + */ + Strict = 'strict', + + /** + * Validates that the IRI has a valid scheme and does not contain any character forbidden by the Turtle specification. + */ + Pragmatic = 'pragmatic', + + /** + * Does not validate the IRI at all. + */ + None = 'none' +} + +/** + * Validate a given IRI according to the given strategy. + * + * By default the IRI is fully validated according to RFC 3987. + * But it is possible to do a lighter and faster validation using the "pragmatic" strategy. 
+ * * @param {string} iri a string that may be an IRI. + * @param {IriValidationStrategy} strategy IRI validation strategy. * @return {Error | undefined} An error if the IRI is invalid, or undefined if it is valid. */ -export function validateIri(iri: string): Error | undefined { - return IRI_REGEX.test(iri) ? undefined : new Error(`Invalid IRI according to RFC 3987: '${iri}'`); +export function validateIri( + iri: string, strategy: IriValidationStrategy = IriValidationStrategy.Strict, +): Error | undefined { + switch (strategy) { + case IriValidationStrategy.Strict: + return STRICT_IRI_REGEX.test(iri) ? undefined : new Error(`Invalid IRI according to RFC 3987: '${iri}'`); + case IriValidationStrategy.Pragmatic: + return PRAGMATIC_IRI_REGEX.test(iri) ? undefined : new Error(`Invalid IRI according to RDF Turtle: '${iri}'`); + case IriValidationStrategy.None: + return undefined; + default: + return new Error(`Not supported validation strategy "${strategy}"`); + } } diff --git a/perf/validateIri.ts b/perf/validateIri.ts index ba4d731..67b94f6 100644 --- a/perf/validateIri.ts +++ b/perf/validateIri.ts @@ -1,6 +1,6 @@ /* eslint-disable no-console */ -import { validateIri } from '..'; +import { IriValidationStrategy, validateIri } from '..'; const ITERATIONS = 100_000; @@ -94,21 +94,19 @@ console.time(FULL_TIMER); for (let i = 0; i < ITERATIONS; i++) { for (const iri of VALID_ABSOLUTE_IRIS) { - validateIri(iri); + validateIri(iri, IriValidationStrategy.Strict); } } console.timeEnd(FULL_TIMER); const PARTIAL_TIMER = 'Partial IRI validation'; -// eslint-disable-next-line no-control-regex -const PARTIAL_REGEX = /^[a-zA-Z][a-zA-Z0-9+\-.]*:[^\u0000-\u0020"<>\\^`{|}]$/u; console.time(PARTIAL_TIMER); for (let i = 0; i < ITERATIONS; i++) { for (const iri of VALID_ABSOLUTE_IRIS) { - PARTIAL_REGEX.test(iri); + validateIri(iri, IriValidationStrategy.Pragmatic); } } diff --git a/test/Validate-test.ts b/test/Validate-test.ts index 81934d9..b3195f7 100644 --- a/test/Validate-test.ts 
+++ b/test/Validate-test.ts @@ -1,4 +1,4 @@ -import { validateIri } from '../lib/Validate'; +import { IriValidationStrategy, validateIri } from '../lib/Validate'; const VALID_ABSOLUTE_IRIS = [ 'file://foo', @@ -96,11 +96,14 @@ const VALID_ABSOLUTE_IRIS = [ 'http://example.com/?\u{E000}', ]; -const INVALID_ABSOLUTE_IRIS = [ +const ALWAYS_INVALID_ABSOLUTE_IRIS = [ '', 'foo', 'http://example.com/beepbeep\u0007\u0007', 'http://example.com/\n', +]; + +const STRICTLY_INVALID_ABSOLUTE_IRIS = [ // "::", // not OK, per Roy Fielding on the W3C uri list on 2004-04-01 // // the following test cases are from a Perl script by David A. Wheeler @@ -183,13 +186,31 @@ const INVALID_ABSOLUTE_IRIS = [ describe('Validate', () => { for (const iri of VALID_ABSOLUTE_IRIS) { test(`the IRI '${iri}' should be valid`, () => { - expect(validateIri(iri)).toBeUndefined(); + expect(validateIri(iri, IriValidationStrategy.Strict)).toBeUndefined(); + expect(validateIri(iri, IriValidationStrategy.Pragmatic)).toBeUndefined(); }); } - for (const iri of INVALID_ABSOLUTE_IRIS) { - test(`the IRI '${iri}' should be invalid`, () => { + for (const iri of ALWAYS_INVALID_ABSOLUTE_IRIS) { + test(`the IRI '${iri}' should be invalid according to pragmatic and strict modes`, () => { + expect(validateIri(iri, IriValidationStrategy.Pragmatic)).toBeInstanceOf(Error); + expect(validateIri(iri, IriValidationStrategy.Strict)).toBeInstanceOf(Error); + }); + } + + for (const iri of STRICTLY_INVALID_ABSOLUTE_IRIS) { + test(`the IRI '${iri}' should be invalid according to strict mode`, () => { expect(validateIri(iri)).toBeInstanceOf(Error); }); } + + test('the validateIri function should not fail on invalid strategy', () => { + // @ts-expect-error + expect(validateIri('http://example.com/', 'foo')).toBeInstanceOf(Error); + }); + + test('the validateIri function should always validate with the none strategy', () => { + expect(validateIri('', IriValidationStrategy.None)).toBeUndefined(); + expect(validateIri('\n', 
IriValidationStrategy.None)).toBeUndefined(); + }); });