Skip to content

Commit

Permalink
Updates and bugfixes to Bloom filter handling
Browse files Browse the repository at this point in the history
  • Loading branch information
surilindur committed Mar 4, 2024
1 parent 1052acf commit 78fcc38
Show file tree
Hide file tree
Showing 9 changed files with 210 additions and 194 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"@context": [
"https://linkedsoftwaredependencies.org/bundles/npm/@comunica/config-query-sparql-link-traversal/^0.0.0/components/context.jsonld"
],
"comment": "Non-adaptive configuration using cAll for link extraction",
"import": [
"ccqslt:config/config-solid-base.json",
"ccqslt:config/extract-links/actors/all.json"
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
"mediatorRdfResolveHypermediaLinksQueue": {
"@id": "urn:comunica:default:rdf-resolve-hypermedia-links-queue/mediators#main"
},
"ignorePatterns": [
"publicTypeIndex$",
"privateTypeIndex$"
]
"ignorePattern": "(public|private)TypeIndex$",
"alwaysReject": "^https?:\/\/www.w3.org\/"
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ export class ActorContextPreprocessLinkFilter extends ActorContextPreprocess {
}

public async run(action: IAction): Promise<IActorContextPreprocessOutput> {
return { ...action, context: action.context.set(KeyLinkFilters, []) };
return { ...action, context: action.context.set(KeyLinkFilters, new Map()) };
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,17 @@ export class ActorRdfMetadataExtractLinkFilter extends ActorRdfMetadataExtract {
}

public async run(action: IActionRdfMetadataExtract): Promise<IActorRdfMetadataExtractOutput> {
const filterData = await this.extractFilters(action.metadata);
const filterData = await this.collectFilterData(action.metadata);
if (filterData.size > 0) {
const filters = action.context.getSafe<ILinkFilter[]>(KeyLinkFilters);
for (const data of filterData.values()) {
const parseResult = await this.mediatorRdfParseLinkFilter.mediate({ data, context: action.context });
filters.push(parseResult.filter);
const filters = action.context.getSafe<Map<string, ILinkFilter>>(KeyLinkFilters);
for (const [ filterUri, filterQuads ] of filterData) {
if (!filters.has(filterUri)) {
const parseResult = await this.mediatorRdfParseLinkFilter.mediate({
data: filterQuads,
context: action.context,
});
filters.set(parseResult.filter.uri, parseResult.filter);
}
}
}
return { metadata: {}};
Expand All @@ -48,7 +53,7 @@ export class ActorRdfMetadataExtractLinkFilter extends ActorRdfMetadataExtract {
* @param stream The RDF metadata stream to process
* @returns The collected membership filter data
*/
protected async extractFilters(stream: RDF.Stream): Promise<Map<string, RDF.Quad[]>> {
protected async collectFilterData(stream: RDF.Stream): Promise<Map<string, RDF.Quad[]>> {
return new Promise((resolve, reject) => {
const filters = new Map<string, RDF.Quad[]>();
const quads = new Map<string, RDF.Quad[]>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,30 @@ export class ActorRdfParseLinkFilterBloom extends ActorRdfParseLinkFilter {
}

public async run(action: IActionRdfParseLinkFilter): Promise<IActorRdfParseLinkFilterOutput> {
const uri = action.data.find(quad =>
quad.subject.termType === 'NamedNode' &&
quad.predicate.value === ActorRdfParseLinkFilterBloom.RDF_TYPE &&
quad.object.value === ActorRdfParseLinkFilterBloom.MEM_BLOOMFILTER)!.subject.value;
const hashBits = Number.parseInt(action.data.find(quad =>
quad.predicate.value === ActorRdfParseLinkFilterBloom.MEM_BITSIZE)!.object.value, 10);
const hashCount = Number.parseInt(action.data.find(quad =>
quad.predicate.value === ActorRdfParseLinkFilterBloom.MEM_HASHSIZE)!.object.value, 10);
const dataset = action.data.find(quad =>
quad.predicate.value === ActorRdfParseLinkFilterBloom.MEM_SOURCECOLLECTION)!.object.value;
const property = action.data.find(quad =>
quad.predicate.value === ActorRdfParseLinkFilterBloom.MEM_PROJECTEDPROPERTY ||
quad.predicate.value === ActorRdfParseLinkFilterBloom.MEM_PROJECTEDRESOURCE)!.object.value;
const projectedProperty = action.data.find(quad =>
quad.predicate.value === ActorRdfParseLinkFilterBloom.MEM_PROJECTEDPROPERTY)?.object.value;
const projectedResource = action.data.find(quad =>
quad.predicate.value === ActorRdfParseLinkFilterBloom.MEM_PROJECTEDRESOURCE)?.object.value;
const buffer = Buffer.from(action.data.find(quad =>
quad.predicate.value === ActorRdfParseLinkFilterBloom.MEM_BINARYREPRESENTATION)!.object.value, 'base64');
return { filter: new LinkFilterBloom({ dataset, hashBits, hashCount, buffer, property }) };
return { filter: new LinkFilterBloom({
uri,
dataset,
hashBits,
hashCount,
buffer,
projectedProperty,
projectedResource,
}) };
}
}
70 changes: 21 additions & 49 deletions packages/actor-rdf-parse-link-filter-bloom/lib/LinkFilterBloom.ts
Original file line number Diff line number Diff line change
@@ -1,77 +1,50 @@
import type { ILinkFilter, ILinkFilterAction } from '@comunica/bus-rdf-parse-link-filter';
import { LinkFilter, type ILinkFilterArgs } from '@comunica/bus-rdf-parse-link-filter';
import type * as RDF from '@rdfjs/types';
import { Bloem } from 'bloem';
import type { Algebra } from 'sparqlalgebrajs';

/**
* An approximate membership filter that is backed by a Bloom filter.
*/
export class LinkFilterBloom implements ILinkFilter {
export class LinkFilterBloom extends LinkFilter {
private readonly filter: Bloem;
private readonly dataset: string;
private readonly property?: string;
private readonly resource?: string;
private readonly projectedProperty?: string;
private readonly projectedResource?: string;

public constructor(args: ILinkFilterBloomArgs) {
super(args);
this.filter = new Bloem(args.hashBits, args.hashCount, args.buffer);
this.dataset = args.dataset;
if (!args.property && !args.resource) {
this.projectedProperty = args.projectedProperty;
this.projectedResource = args.projectedResource;
if (!this.projectedProperty && !this.projectedResource) {
throw new Error('Bloom link filter requires a property or resource to filter by');
}
this.property = args.property;
this.resource = args.resource;
}

public test(action: ILinkFilterAction): boolean {
return action.link.url.startsWith(this.dataset) && action.patterns.some(pattern =>
(this.property && pattern.predicate.termType === 'NamedNode' && pattern.predicate.value === this.property) ||
(this.resource && (
(pattern.subject.termType === 'NamedNode' && pattern.subject.value === this.resource) ||
(pattern.object.termType === 'NamedNode' && pattern.object.value === this.resource)
)));
}

public run(action: ILinkFilterAction): boolean {
for (const pattern of action.patterns) {
if (this.property &&
public answers(patterns: Algebra.Pattern[]): boolean {
for (const pattern of patterns) {
if (this.projectedProperty &&
pattern.predicate.termType === 'NamedNode' &&
pattern.predicate.value === this.property &&
pattern.predicate.value === this.projectedProperty &&
(this.filterHasTerm(pattern.subject) || this.filterHasTerm(pattern.object))
) {
/*
console.log(`Accept <${action.link.url}>`);
console.log(`\tFilter for <${this.dataset}>`);
console.log(`\tContains one of: ${pattern.subject.value}, ${pattern.object.value}`);
*/
return true;
}
if (this.resource) {
if (this.projectedResource) {
if (pattern.subject.termType === 'NamedNode' &&
pattern.subject.value === this.resource &&
pattern.subject.value === this.projectedResource &&
(this.filterHasTerm(pattern.predicate) || this.filterHasTerm(pattern.object))
) {
/*
console.log(`Accept <${action.link.url}>`);
console.log(`\tFilter for <${this.dataset}>`);
console.log(`\tContains one of: ${pattern.predicate.value}, ${pattern.object.value}`);
*/
return true;
}
if (pattern.object.termType === 'NamedNode' &&
pattern.object.value === this.resource &&
pattern.object.value === this.projectedResource &&
(this.filterHasTerm(pattern.predicate) || this.filterHasTerm(pattern.subject))
) {
/*
console.log(`Accept <${action.link.url}>`);
console.log(`\tFilter for <${this.dataset}>`);
console.log(`\tContains one of: ${pattern.predicate.value}, ${pattern.subject.value}`);
*/
return true;
}
}
}
/*
console.log(`Reject <${action.link.url}>`);
*/
return false;
}

Expand All @@ -81,15 +54,14 @@ export class LinkFilterBloom implements ILinkFilter {
* @returns Whether the term is contained in the filter OR the term is of type that cannot be in it.
*/
protected filterHasTerm(term: RDF.Term): boolean {
return term.termType === 'Variable' || (term.termType === 'NamedNode' && this.filter.has(Buffer.from(term.value)));
return term.termType !== 'NamedNode' || this.filter.has(Buffer.from(term.value));
}
}

export interface ILinkFilterBloomArgs {
dataset: string;
property?: string;
resource?: string;
export interface ILinkFilterBloomArgs extends ILinkFilterArgs {
buffer: Buffer;
hashBits: number;
hashCount: number;
buffer: Buffer;
projectedProperty?: string;
projectedResource?: string;
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,28 @@ export class ActorRdfResolveHypermediaLinksQueueWrapperLinkFilter extends ActorR
const { linkQueue } = await this.mediatorRdfResolveHypermediaLinksQueue.mediate({ ...action, context: subContext });
const operation = action.context.getSafe<Algebra.Operation>(KeysInitQuery.query);
const patterns = ActorRdfResolveHypermediaLinksQueueWrapperLinkFilter.extractOperationPatterns(operation);
const filters = action.context.getSafe<ILinkFilter[]>(KeyLinkFilters);
const filters = action.context.getSafe<Map<string, ILinkFilter>>(KeyLinkFilters);
const accept = (link: ILink): boolean => {
let acceptLink = true;
let acceptingFilter: ILinkFilter | undefined;
if (!this.ignorePattern?.test(link.url)) {
if (this.alwaysReject?.test(link.url)) {
acceptLink = false;
} else {
const applicableFilters = filters.filter(filter => filter.test({ link, patterns }));
acceptingFilter = applicableFilters.find(filter => filter.run({ link, patterns }));
acceptLink = applicableFilters.length === 0 || acceptingFilter !== undefined;
let foundApplicableFilters = false;
for (const filter of filters.values()) {
if (link.url.startsWith(filter.dataset)) {
foundApplicableFilters = true;
if (filter.answers(patterns)) {
acceptingFilter = filter;
break;
}
}
}
acceptLink = !foundApplicableFilters || acceptingFilter !== undefined;
}
}
console.log(`${acceptLink ? 'Accept' : 'Reject'} <${link.url}>`, acceptingFilter);
// Debug: console.log(`${acceptLink ? 'Accept' : 'Reject'} <${link.url}> filter ${acceptingFilter?.uri}`);
return acceptLink;
};
return { linkQueue: new LinkQueueWrapperFilter(linkQueue, accept) };
Expand Down
26 changes: 19 additions & 7 deletions packages/bus-rdf-parse-link-filter/lib/LinkFilter.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,29 @@
import type { ILink } from '@comunica/bus-rdf-resolve-hypermedia-links-queue';
import { ActionContextKey } from '@comunica/core';
import type { Algebra } from 'sparqlalgebrajs';

export interface ILinkFilter {
test: (action: ILinkFilterAction) => boolean;
run: (action: ILinkFilterAction) => boolean;
uri: string;
dataset: string;
answers: (patterns: Algebra.Pattern[]) => boolean;
}

export interface ILinkFilterAction {
link: ILink;
patterns: Algebra.Pattern[];
/**
 * Base class for link filters. It stores the two identifying fields required
 * by ILinkFilter and leaves the actual pattern check to subclasses.
 */
export abstract class LinkFilter implements ILinkFilter {
  /** Identifier of this filter, copied from the constructor arguments. */
  public uri: string;

  /** Dataset value of this filter, copied from the constructor arguments. */
  public dataset: string;

  /**
   * @param args The identifying values to store on the filter.
   */
  public constructor(args: ILinkFilterArgs) {
    const { uri, dataset } = args;
    this.uri = uri;
    this.dataset = dataset;
  }

  /**
   * Check the given query patterns against this filter.
   * @param patterns The patterns to check.
   * @returns The subclass-defined verdict for the patterns.
   */
  public abstract answers(patterns: Algebra.Pattern[]): boolean;
}

/**
 * Constructor arguments accepted by the LinkFilter base class.
 */
export interface ILinkFilterArgs {
/** Value stored as the filter's `uri` field. */
uri: string;
/** Value stored as the filter's `dataset` field. */
dataset: string;
}

export const KeyLinkFilters = new ActionContextKey<ILinkFilter[]>(
export const KeyLinkFilters = new ActionContextKey<Map<string, ILinkFilter>>(
'@comunica/bus-rdf-parse:link-filters',
);
Loading

0 comments on commit 78fcc38

Please sign in to comment.