Skip to content

Commit

Permalink
Add a pragmatic validation mode
Browse files Browse the repository at this point in the history
Checks if the scheme exists and is valid and that no character forbidden by Turtle is used anywhere else in the IRI.

Also adds the "none" validation strategy, so that downstream users can easily express that no IRI validation is wanted.
  • Loading branch information
Tpt authored and rubensworks committed Aug 15, 2022
1 parent 7e8157d commit 7555bdb
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 15 deletions.
48 changes: 43 additions & 5 deletions lib/Validate.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
function buildAbsoluteIriRegex(): RegExp {
function buildAbsoluteIriRfc3987Regex(): RegExp {
// The syntax is defined in https://www.rfc-editor.org/rfc/rfc3987#section-2.2
// Rules are defined in reversed order

Expand Down Expand Up @@ -58,13 +58,51 @@ function buildAbsoluteIriRegex(): RegExp {
return new RegExp(iri, 'u');
}

const IRI_REGEX: RegExp = buildAbsoluteIriRegex();
const STRICT_IRI_REGEX: RegExp = buildAbsoluteIriRfc3987Regex();
// Lightweight pattern used by the "pragmatic" validation strategy:
//   - a scheme: one ASCII letter followed by ASCII letters, digits, "+", "-" or ".", then ":"
//   - followed by any number of characters other than U+0000-U+0020 and " < > \ ^ ` { | }
// The hyphen is deliberately placed last in the scheme character class so it is a literal "-".
// Written as `+-.` it would form a range (U+002B-U+002E) that also lets "," through.
// eslint-disable-next-line no-control-regex
const PRAGMATIC_IRI_REGEX = /^[A-Za-z][\d+.A-Za-z-]*:[^\u0000-\u0020"<>\\^`{|}]*$/u;

/**
* Validate a given IRI according to RFC 3987.
* Possible ways of validating an IRI
*/
export enum IriValidationStrategy {
/**
* Validates the IRI according to RFC 3987.
* This is the strategy `validateIri` uses when none is passed explicitly.
*/
Strict = 'strict',

/**
* Validates that the IRI has a valid scheme and does not contain any character forbidden by the Turtle specification.
* Lighter than `Strict`: the remainder of the IRI is not checked against the RFC 3987 grammar.
*/
Pragmatic = 'pragmatic',

/**
* Does not validate the IRI at all.
* `validateIri` returns `undefined` (no error) for every input under this strategy.
*/
None = 'none'
}

/**
* Validate a given IRI according to the given strategy.
*
* By default the IRI is fully validated according to RFC 3987.
* However, a lighter and faster validation is available through the "pragmatic" strategy.
*
* @param {string} iri a string that may be an IRI.
* @param {IriValidationStrategy} strategy IRI validation strategy.
* @return {Error | undefined} An error if the IRI is invalid, or undefined if it is valid.
*/
export function validateIri(iri: string): Error | undefined {
return IRI_REGEX.test(iri) ? undefined : new Error(`Invalid IRI according to RFC 3987: '${iri}'`);
export function validateIri(
  iri: string, strategy: IriValidationStrategy = IriValidationStrategy.Strict,
): Error | undefined {
  // Full RFC 3987 grammar check.
  if (strategy === IriValidationStrategy.Strict) {
    if (STRICT_IRI_REGEX.test(iri)) {
      return undefined;
    }
    return new Error(`Invalid IRI according to RFC 3987: '${iri}'`);
  }

  // Scheme check plus rejection of the characters Turtle forbids inside an IRI.
  if (strategy === IriValidationStrategy.Pragmatic) {
    if (PRAGMATIC_IRI_REGEX.test(iri)) {
      return undefined;
    }
    return new Error(`Invalid IRI according to RDF Turtle: '${iri}'`);
  }

  // No validation requested: every input is accepted as-is.
  if (strategy === IriValidationStrategy.None) {
    return undefined;
  }

  // Reached only when a caller bypasses the type system with an unknown strategy value;
  // the problem is reported as a returned Error rather than thrown.
  return new Error(`Not supported validation strategy "${strategy}"`);
}
8 changes: 3 additions & 5 deletions perf/validateIri.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* eslint-disable no-console */

import { validateIri } from '..';
import { IriValidationStrategy, validateIri } from '..';

const ITERATIONS = 100_000;

Expand Down Expand Up @@ -94,21 +94,19 @@ console.time(FULL_TIMER);

for (let i = 0; i < ITERATIONS; i++) {
for (const iri of VALID_ABSOLUTE_IRIS) {
validateIri(iri);
validateIri(iri, IriValidationStrategy.Strict);
}
}

console.timeEnd(FULL_TIMER);

const PARTIAL_TIMER = 'Partial IRI validation';
// eslint-disable-next-line no-control-regex
const PARTIAL_REGEX = /^[a-zA-Z][a-zA-Z0-9+\-.]*:[^\u0000-\u0020"<>\\^`{|}]$/u;

console.time(PARTIAL_TIMER);

for (let i = 0; i < ITERATIONS; i++) {
for (const iri of VALID_ABSOLUTE_IRIS) {
PARTIAL_REGEX.test(iri);
validateIri(iri, IriValidationStrategy.Pragmatic);
}
}

Expand Down
31 changes: 26 additions & 5 deletions test/Validate-test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { validateIri } from '../lib/Validate';
import { IriValidationStrategy, validateIri } from '../lib/Validate';

const VALID_ABSOLUTE_IRIS = [
'file://foo',
Expand Down Expand Up @@ -96,11 +96,14 @@ const VALID_ABSOLUTE_IRIS = [
'http://example.com/?\u{E000}',
];

const INVALID_ABSOLUTE_IRIS = [
const ALWAYS_INVALID_ABSOLUTE_IRIS = [
'',
'foo',
'http://example.com/beepbeep\u0007\u0007',
'http://example.com/\n',
];

const STRICTLY_INVALID_ABSOLUTE_IRIS = [
// "::", // not OK, per Roy Fielding on the W3C uri list on 2004-04-01
//
// the following test cases are from a Perl script by David A. Wheeler
Expand Down Expand Up @@ -183,13 +186,31 @@ const INVALID_ABSOLUTE_IRIS = [
describe('Validate', () => {
for (const iri of VALID_ABSOLUTE_IRIS) {
test(`the IRI '${iri}' should be valid`, () => {
expect(validateIri(iri)).toBeUndefined();
expect(validateIri(iri, IriValidationStrategy.Strict)).toBeUndefined();
expect(validateIri(iri, IriValidationStrategy.Pragmatic)).toBeUndefined();
});
}

for (const iri of INVALID_ABSOLUTE_IRIS) {
test(`the IRI '${iri}' should be invalid`, () => {
for (const iri of ALWAYS_INVALID_ABSOLUTE_IRIS) {
test(`the IRI '${iri}' should be invalid according to pragmatic and strict modes`, () => {
expect(validateIri(iri, IriValidationStrategy.Pragmatic)).toBeInstanceOf(Error);
expect(validateIri(iri, IriValidationStrategy.Strict)).toBeInstanceOf(Error);
});
}

for (const iri of STRICTLY_INVALID_ABSOLUTE_IRIS) {
test(`the IRI '${iri}' should be invalid according to strict mode`, () => {
expect(validateIri(iri)).toBeInstanceOf(Error);
});
}

test('the validateIri function should not fail on invalid strategy', () => {
// @ts-expect-error
expect(validateIri('http://example.com/', 'foo')).toBeInstanceOf(Error);
});

test('the validateIri function should always validate with the none strategy', () => {
expect(validateIri('', IriValidationStrategy.None)).toBeUndefined();
expect(validateIri('\n', IriValidationStrategy.None)).toBeUndefined();
});
});

0 comments on commit 7555bdb

Please sign in to comment.