Skip to content

Commit

Permalink
Add link filters and their extraction from VoID descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
surilindur committed Nov 26, 2024
1 parent 2d170e2 commit 13a8225
Show file tree
Hide file tree
Showing 22 changed files with 561 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
"ccqs:config/rdf-metadata-extract/actors.json",
"ccqs:config/rdf-metadata-extract/mediators.json",
"ccqslt:config/rdf-metadata-extract/actors/traverse.json",
"ccqslt:config/rdf-metadata-extract/actors/link-filter-void.json",
"ccqs:config/rdf-parse/actors.json",
"ccqs:config/rdf-parse/mediators.json",
"ccqs:config/rdf-parse-html/actors.json",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"ccqslt:config/extract-links/actors/content-policies-conditional.json",
"ccqslt:config/extract-links/actors/quad-pattern-query.json",
"ccqslt:config/rdf-resolve-hypermedia-links/actors/traverse-replace-conditional.json",
"ccqslt:config/rdf-resolve-hypermedia-links-queue/actors/wrapper-filter.json",
"ccqslt:config/rdf-resolve-hypermedia-links-queue/actors/wrapper-limit-count.json"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"@context": [
"https://linkedsoftwaredependencies.org/bundles/npm/@comunica/runner/^4.0.0/components/context.jsonld",
"https://linkedsoftwaredependencies.org/bundles/npm/@comunica/actor-rdf-metadata-extract-link-filter-void/^0.0.0/components/context.jsonld"
],
"@id": "urn:comunica:default:Runner",
"@type": "Runner",
"actors": [
{
"@id": "urn:comunica:default:rdf-metadata-extract/actors#link-filter-void",
"@type": "ActorRdfMetadataExtractLinkFilterVoid"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"@context": [
"https://linkedsoftwaredependencies.org/bundles/npm/@comunica/runner/^4.0.0/components/context.jsonld",
"https://linkedsoftwaredependencies.org/bundles/npm/@comunica/actor-rdf-resolve-hypermedia-links-queue-wrapper-filter/^0.0.0/components/context.jsonld"
],
"@id": "urn:comunica:default:Runner",
"@type": "Runner",
"actors": [
{
"@id": "urn:comunica:default:rdf-resolve-hypermedia-links-queue/actors#wrapper-filter",
"@type": "ActorRdfResolveHypermediaLinksQueueWrapperFilter",
"beforeActors": { "@id": "urn:comunica:default:rdf-resolve-hypermedia-links-queue/actors#fifo" },
"mediatorRdfResolveHypermediaLinksQueue": { "@id": "urn:comunica:default:rdf-resolve-hypermedia-links-queue/mediators#main" }
}
]
}
2 changes: 2 additions & 0 deletions engines/query-sparql-link-traversal-solid/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@
"@comunica/actor-rdf-metadata-extract-hydra-controls": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-hydra-count": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-hydra-pagesize": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-link-filter-void": "^0.6.0",
"@comunica/actor-rdf-metadata-extract-patch-sparql-update": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-put-accepted": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-request-time": "^4.0.2",
Expand All @@ -282,6 +283,7 @@
"@comunica/actor-rdf-parse-xml-rdfa": "^4.0.2",
"@comunica/actor-rdf-resolve-hypermedia-links-next": "^4.0.2",
"@comunica/actor-rdf-resolve-hypermedia-links-queue-fifo": "^4.0.2",
"@comunica/actor-rdf-resolve-hypermedia-links-queue-wrapper-filter": "^0.6.0",
"@comunica/actor-rdf-resolve-hypermedia-links-traverse": "^0.6.0",
"@comunica/actor-rdf-serialize-jsonld": "^4.0.2",
"@comunica/actor-rdf-serialize-n3": "^4.0.2",
Expand Down
2 changes: 2 additions & 0 deletions engines/query-sparql-link-traversal/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@
"@comunica/actor-rdf-metadata-extract-hydra-controls": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-hydra-count": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-hydra-pagesize": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-link-filter-void": "^0.6.0",
"@comunica/actor-rdf-metadata-extract-patch-sparql-update": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-put-accepted": "^4.0.2",
"@comunica/actor-rdf-metadata-extract-request-time": "^4.0.2",
Expand All @@ -278,6 +279,7 @@
"@comunica/actor-rdf-parse-xml-rdfa": "^4.0.2",
"@comunica/actor-rdf-resolve-hypermedia-links-next": "^4.0.2",
"@comunica/actor-rdf-resolve-hypermedia-links-queue-fifo": "^4.0.2",
"@comunica/actor-rdf-resolve-hypermedia-links-queue-wrapper-filter": "^0.6.0",
"@comunica/actor-rdf-resolve-hypermedia-links-queue-wrapper-limit-count": "^0.6.0",
"@comunica/actor-rdf-resolve-hypermedia-links-traverse": "^0.6.0",
"@comunica/actor-rdf-resolve-hypermedia-links-traverse-replace-conditional": "^0.6.0",
Expand Down
35 changes: 35 additions & 0 deletions packages/actor-rdf-metadata-extract-link-filter-void/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Comunica VoID Link Filter RDF Metadata Extract Actor

[![npm version](https://badge.fury.io/js/%40comunica%2Factor-rdf-metadata-extract-link-filter-void.svg)](https://www.npmjs.com/package/@comunica/actor-rdf-metadata-extract-link-filter-void)

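An [RDF Metadata Extract](https://github.com/comunica/comunica/tree/master/packages/bus-rdf-metadata-extract) actor that
creates link filters based on [VoID descriptions](https://www.w3.org/TR/void/) to filter out redundant links:
for every described dataset that declares a `void:sparqlEndpoint`, links matching its `void:uriSpace` or `void:uriRegexPattern` are rejected.
The discovered filters are appended to the link filter list stored in the context.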

This module is part of the [Comunica framework](https://github.com/comunica/comunica),
and should only be used by [developers that want to build their own query engine](https://comunica.dev/docs/modify/).

[Click here if you just want to query with Comunica](https://comunica.dev/docs/query/).

## Install

```bash
$ yarn add @comunica/actor-rdf-metadata-extract-link-filter-void
```

## Configure

After installing, this package can be added to your engine's configuration as follows:
```json
{
"@context": [
"https://linkedsoftwaredependencies.org/bundles/npm/@comunica/actor-rdf-metadata-extract-link-filter-void/^0.0.0/components/context.jsonld"
],
"actors": [
{
"@id": "urn:comunica:default:rdf-metadata-extract/actors#link-filter-void",
"@type": "ActorRdfMetadataExtractLinkFilterVoid"
}
]
}
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import type { MediatorExtractLinks } from '@comunica/bus-extract-links';
import type {
IActionRdfMetadataExtract,
IActorRdfMetadataExtractOutput,
IActorRdfMetadataExtractArgs,
} from '@comunica/bus-rdf-metadata-extract';
import { ActorRdfMetadataExtract } from '@comunica/bus-rdf-metadata-extract';
import { KeysRdfResolveHypermediaLinks } from '@comunica/context-entries-link-traversal';
import type { IActorTest, TestResult } from '@comunica/core';
import { passTestVoid, failTest } from '@comunica/core';
import type { ILink } from '@comunica/types';
import type { LinkFilterType } from '@comunica/types-link-traversal';
import type * as RDF from '@rdfjs/types';

/**
 * Comunica RDF metadata extract actor to collect link filters from VoID descriptions.
 *
 * For every subject in the metadata stream that has a `void:sparqlEndpoint`,
 * a link filter is derived from its `void:uriSpace` or `void:uriRegexPattern`.
 * The resulting filters are appended to the filter list found in the action context.
 */
export class ActorRdfMetadataExtractLinkFilterVoid extends ActorRdfMetadataExtract {
  // NOTE(review): this field is declared but never assigned or read in this class — confirm whether it is still needed.
  private readonly mediatorExtractLinks: MediatorExtractLinks;

  public constructor(args: IActorRdfMetadataExtractArgs) {
    super(args);
  }

  /**
   * Only accept actions whose context carries a link filter list to write into.
   * @param action The metadata extraction action.
   * @returns A passing test when the context has the link filters entry, a failing test otherwise.
   */
  public async test(action: IActionRdfMetadataExtract): Promise<TestResult<IActorTest>> {
    if (!action.context.has(KeysRdfResolveHypermediaLinks.linkFilters)) {
      return failTest('unable to extract link filters without context storage target present');
    }
    return passTestVoid();
  }

  /**
   * Extract link filters from the metadata stream and append them to the list stored in the context.
   * @param action The metadata extraction action.
   * @returns Always an empty metadata object; the filters are delivered through the context list.
   */
  public async run(action: IActionRdfMetadataExtract): Promise<IActorRdfMetadataExtractOutput> {
    const discoveredFilters = await this.extractFilters(action.metadata);
    if (discoveredFilters.length > 0) {
      const linkFilters = action.context.getSafe(KeysRdfResolveHypermediaLinks.linkFilters);
      linkFilters.push(...discoveredFilters);
    }
    return { metadata: {}};
  }

  /**
   * Consume a quad stream and build link filters from the VoID descriptions found in it.
   *
   * A filter returns `false` for links whose URL falls in the described URI space or matches the described pattern.
   * Only filters of subjects that also have a `void:sparqlEndpoint` are returned.
   * When a subject has several `void:uriSpace` / `void:uriRegexPattern` values, the last usable one wins.
   * Patterns that are not valid JavaScript regular expressions are skipped.
   *
   * @param stream The stream of metadata quads.
   * @returns The discovered filters; the promise rejects when the stream emits an error.
   */
  public async extractFilters(stream: RDF.Stream): Promise<LinkFilterType[]> {
    return new Promise<LinkFilterType[]>((resolve, reject) => {
      const filters = new Map<string, LinkFilterType>();
      const subjectsWithEndpoints = new Set<string>();
      stream
        .on('error', reject)
        .on('data', (quad: RDF.Quad) => {
          switch (quad.predicate.value) {
            case 'http://rdfs.org/ns/void#sparqlEndpoint':
              subjectsWithEndpoints.add(quad.subject.value);
              break;
            case 'http://rdfs.org/ns/void#uriRegexPattern': {
              const regexFilter = ActorRdfMetadataExtractLinkFilterVoid.createRegexFilter(quad.object.value);
              if (regexFilter) {
                filters.set(quad.subject.value, regexFilter);
              }
              break;
            }
            case 'http://rdfs.org/ns/void#uriSpace': {
              // Capture only the string, so the closure does not keep the whole quad alive
              const uriSpace = quad.object.value;
              filters.set(quad.subject.value, (link: ILink) => !link.url.startsWith(uriSpace));
              break;
            }
          }
        })
        .on('end', () => {
          // The endpoint triple may arrive after the pattern triples, so the selection happens at the end
          const output: LinkFilterType[] = [];
          for (const [ subject, filter ] of filters) {
            if (subjectsWithEndpoints.has(subject)) {
              output.push(filter);
            }
          }
          resolve(output);
        });
    });
  }

  /**
   * Create a filter that rejects links whose URL matches the given pattern.
   * The expression is compiled once here instead of on every link test.
   * @param pattern The regular expression source, taken from the VoID description.
   * @returns The filter, or undefined when the pattern cannot be compiled.
   */
  private static createRegexFilter(pattern: string): LinkFilterType | undefined {
    let regex: RegExp;
    try {
      regex = new RegExp(pattern, 'u');
    } catch {
      // The pattern comes from remote data; a malformed one must not throw later, each time a link is tested
      return undefined;
    }
    // Without the 'g' or 'y' flags, test() keeps no state between calls, so the instance can be shared
    return (link: ILink) => !regex.test(link.url);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from './ActorRdfMetadataExtractLinkFilterVoid';
48 changes: 48 additions & 0 deletions packages/actor-rdf-metadata-extract-link-filter-void/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"name": "@comunica/actor-rdf-metadata-extract-link-filter-void",
"version": "0.6.0",
"description": "An actor to extract link filters based on VoID descriptions",
"lsd:module": true,
"license": "MIT",
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/comunica-association"
},
"homepage": "https://comunica.dev/",
"repository": {
"type": "git",
"url": "https://github.com/comunica/comunica-feature-link-traversal.git",
"directory": "packages/actor-rdf-metadata-extract-link-filter-void"
},
"bugs": {
"url": "https://github.com/comunica/comunica-feature-link-traversal/issues"
},
"keywords": [
"comunica",
"actor",
"rdf-metadata-link-filter-void"
],
"sideEffects": false,
"main": "lib/index.js",
"typings": "lib/index",
"publishConfig": {
"access": "public"
},
"files": [
"components",
"lib/**/*.d.ts",
"lib/**/*.js",
"lib/**/*.js.map"
],
"scripts": {
"build": "yarn run build:ts && yarn run build:components",
"build:ts": "node \"../../node_modules/typescript/bin/tsc\"",
"build:components": "componentsjs-generator"
},
"dependencies": {
"@comunica/bus-extract-links": "^0.6.0",
"@comunica/bus-rdf-metadata-extract": "^4.0.2",
"@comunica/context-entries-link-traversal": "^0.6.0",
"@comunica/core": "^4.0.2",
"@comunica/types": "^4.0.2",
"@comunica/types-link-traversal": "^0.6.0",
"@rdfjs/types": "*"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import { KeysRdfResolveHypermediaLinks } from '@comunica/context-entries-link-traversal';
import { Bus, ActionContext } from '@comunica/core';
import type { LinkFilterType } from '@comunica/types-link-traversal';
import { DataFactory } from 'rdf-data-factory';
import '@comunica/utils-jest';

import { ActorRdfMetadataExtractLinkFilterVoid } from '../lib/ActorRdfMetadataExtractLinkFilterVoid';

const streamifyArray = require('streamify-array');

const DF = new DataFactory();

const voidUriSpace = DF.namedNode('http://rdfs.org/ns/void#uriSpace');
const voidUriRegexPattern = DF.namedNode('http://rdfs.org/ns/void#uriRegexPattern');
const voidSparqlEndpoint = DF.namedNode('http://rdfs.org/ns/void#sparqlEndpoint');

// Unit tests for the VoID link filter metadata extract actor:
// the context precondition (test), the registration of filters (run), and the parsing of VoID quads (extractFilters).
describe('ActorRdfMetadataExtractLinkFilterVoid', () => {
  let bus: any;
  let actor: ActorRdfMetadataExtractLinkFilterVoid;
  let linkFilters: LinkFilterType[];

  beforeEach(() => {
    jest.resetAllMocks();
    bus = new Bus({ name: 'bus' });
    actor = new ActorRdfMetadataExtractLinkFilterVoid({ bus, name: 'actor' });
    // Fresh list per test, so that filters registered by one test do not leak into another
    linkFilters = [];
  });

  describe('test', () => {
    it('should pass with filter storage in context', async() => {
      await expect(actor.test({
        context: new ActionContext({ [KeysRdfResolveHypermediaLinks.linkFilters.name]: linkFilters }),
        metadata: <any>{},
        requestTime: 0,
        url: 'url',
      })).resolves.toPassTestVoid();
    });

    it('should fail without filter storage in context', async() => {
      await expect(actor.test({
        context: new ActionContext(),
        metadata: <any>{},
        requestTime: 0,
        url: 'url',
      })).resolves.toFailTest('unable to extract link filters without context storage target present');
    });
  });

  describe('run', () => {
    it('should register discovered link filters', async() => {
      // The extraction itself is covered separately, so it is stubbed here
      jest.spyOn(actor, 'extractFilters').mockResolvedValue([ <any>'filter' ]);
      await expect(actor.run({
        context: new ActionContext({ [KeysRdfResolveHypermediaLinks.linkFilters.name]: linkFilters }),
        metadata: <any>{},
        requestTime: 0,
        url: 'url',
      })).resolves.toEqual({ metadata: {}});
      expect(linkFilters).toEqual([ 'filter' ]);
    });

    it('should not register any filters when none are discovered', async() => {
      jest.spyOn(actor, 'extractFilters').mockResolvedValue([]);
      await expect(actor.run({
        context: new ActionContext({ [KeysRdfResolveHypermediaLinks.linkFilters.name]: linkFilters }),
        metadata: <any>{},
        requestTime: 0,
        url: 'url',
      })).resolves.toEqual({ metadata: {}});
      expect(linkFilters).toEqual([]);
    });
  });

  describe('extractFilters', () => {
    it('should parse filters from void:uriSpace', async() => {
      const subjectWithEndpoint = DF.blankNode();
      const subjectWithoutEndpoint = DF.blankNode();
      const stream = streamifyArray([
        DF.quad(subjectWithEndpoint, voidSparqlEndpoint, DF.literal('http://localhost/endpoint')),
        DF.quad(subjectWithEndpoint, voidUriSpace, DF.literal('http://localhost/')),
        DF.quad(subjectWithoutEndpoint, voidUriSpace, DF.literal('http://otherhost/')),
      ]);
      const filters = await actor.extractFilters(stream);
      expect(filters).toHaveLength(1);
      expect(filters[0]({ url: 'http://localhost/some/uri' })).toBeFalsy();
      // Links outside the URI space must pass; this also shows the endpoint-less description was ignored
      expect(filters[0]({ url: 'http://otherhost/some/uri' })).toBeTruthy();
    });

    it('should parse filters from void:uriRegexPattern', async() => {
      const subjectWithEndpoint = DF.blankNode();
      const subjectWithoutEndpoint = DF.blankNode();
      const stream = streamifyArray([
        DF.quad(subjectWithEndpoint, voidSparqlEndpoint, DF.literal('http://localhost/endpoint')),
        DF.quad(subjectWithEndpoint, voidUriRegexPattern, DF.literal('^http://localhost/')),
        DF.quad(subjectWithoutEndpoint, voidUriRegexPattern, DF.literal('^http://otherhost/')),
      ]);
      const filters = await actor.extractFilters(stream);
      expect(filters).toHaveLength(1);
      expect(filters[0]({ url: 'http://localhost/some/uri' })).toBeFalsy();
      // Links that do not match the pattern must pass; this also shows the endpoint-less description was ignored
      expect(filters[0]({ url: 'http://otherhost/some/uri' })).toBeTruthy();
    });
  });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Comunica Hypermedia Links Queue Wrapper for Link Filtering

[![npm version](https://badge.fury.io/js/%40comunica%2Factor-rdf-resolve-hypermedia-links-queue-wrapper-filter.svg)](https://www.npmjs.com/package/@comunica/actor-rdf-resolve-hypermedia-links-queue-wrapper-filter)

An [RDF Resolve Hypermedia Links Queue](https://github.com/comunica/comunica/tree/master/packages/bus-rdf-resolve-hypermedia-links-queue) actor
that wraps another link queue provided by the bus,
and filters the links that can be taken out of that queue.

This module is part of the [Comunica framework](https://github.com/comunica/comunica),
and should only be used by [developers that want to build their own query engine](https://comunica.dev/docs/modify/).

[Click here if you just want to query with Comunica](https://comunica.dev/docs/query/).

## Install

```bash
$ yarn add @comunica/actor-rdf-resolve-hypermedia-links-queue-wrapper-filter
```

## Configure

After installing, this package can be added to your engine's configuration as follows:
```json
{
"@context": [
"https://linkedsoftwaredependencies.org/bundles/npm/@comunica/actor-rdf-resolve-hypermedia-links-queue-wrapper-filter/^0.0.0/components/context.jsonld"
],
"actors": [
{
"@id": "urn:comunica:default:rdf-resolve-hypermedia-links-queue/actors#wrapper-filter",
"@type": "ActorRdfResolveHypermediaLinksQueueWrapperFilter",
"beforeActors": { "@id": "urn:comunica:default:rdf-resolve-hypermedia-links-queue/actors#fifo" },
"mediatorRdfResolveHypermediaLinksQueue": { "@id": "urn:comunica:default:rdf-resolve-hypermedia-links-queue/mediators#main" }
}
]
}
```
Loading

0 comments on commit 13a8225

Please sign in to comment.