Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 16, 2024
1 parent f40a74a commit 542503f
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -303,8 +304,8 @@ private boolean[] needReplacement(CellParser[] cellParsers) {
boolean[] placeholders = new boolean[cellParsers.length];

for (int i = 0; i < cellParsers.length; i++) {
placeholders[i] &= (cellParsers[i].getAboutPlaceholders() != null);
placeholders[i] &= (cellParsers[i].getValuePlaceholders() != null);
placeholders[i] = (cellParsers[i].getAboutPlaceholders().length > 0) ||
(cellParsers[i].getValuePlaceholders().length > 0);
}
return placeholders;
}
Expand All @@ -328,16 +329,15 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse

// check for placeholder / column name that's being used to create subject IRI
int aboutIndex = getAboutIndex(aboutURL, cellParsers);
String placeholder = (aboutIndex > -1) ? "{" + cellParsers[aboutIndex].getNameEncoded() + "}" : null;
String placeholder = (aboutIndex > -1) ? cellParsers[aboutIndex].getNameEncoded() : null;

// check which columns need replacement in aboutURL/valueURL
boolean[] needReplacement = needReplacement(cellParsers);

boolean doReplace = false;
for (int i = 0; i < needReplacement.length; i++) {
if (needReplacement[i]) {
doReplace = true;
return;
break;
}
}

Expand Down Expand Up @@ -374,7 +374,7 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
values.put(cellParsers[i].getNameEncoded(), val.stringValue());
}
if (!cellParsers[i].isSuppressed() && !needReplacement[i]) {
handleStatement(handler, cellParsers[i], cells[i], aboutSubject, val);
handler.handleStatement(buildStatement(cellParsers[i], cells[i], aboutSubject, val));
}
}
// second pass, this time to replace placeholders in URLs with column values
Expand All @@ -383,16 +383,15 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
continue;
}
if (!cellParsers[i].isSuppressed()) {
handleStatement(handler, cellParsers[i], cells[i], aboutSubject, values);
handler.handleStatement(buildStatement(cellParsers[i], cells[i], aboutSubject, values));
}
}
// virtual columns, if any
for (int i = cells.length; i < cellParsers.length; i++) {
if (doReplace) {
values.put("{_col}", Long.toString(i));
}
System.err.println("column: " + i);
handleStatement(handler, cellParsers[i], aboutSubject, values, needReplacement[i]);
handler.handleStatement(buildStatement(cellParsers[i], null, aboutSubject, values));
}
line++;
}
Expand All @@ -409,14 +408,12 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
* @param cell
* @param aboutSubject
*/
private void handleStatement(RDFHandler handler, CellParser cellParser, String cell, Resource aboutSubject,
Value val) {
private Statement buildStatement(CellParser cellParser, String cell, Resource aboutSubject, Value val) {
Resource s = cellParser.getAboutUrl(cell);
IRI predicate = cellParser.getPropertyIRI();
Resource o = cellParser.getValueUrl(cell);

Statement stmt = Statements.statement((s != null) ? s : aboutSubject, predicate, (o != null) ? o : val, null);
handler.handleStatement(stmt);
return Statements.statement((s != null) ? s : aboutSubject, predicate, (o != null) ? o : val, null);
}

/**
Expand All @@ -427,35 +424,16 @@ private void handleStatement(RDFHandler handler, CellParser cellParser, String c
* @param cell
* @param aboutSubject
*/
private void handleStatement(RDFHandler handler, CellParser cellParser, String cell, Resource aboutSubject,
private Statement buildStatement(CellParser cellParser, String cell, Resource aboutSubject,
Map<String, String> values) {
Resource s = cellParser.getAboutUrl(values);
IRI predicate = cellParser.getPropertyIRI();
Value o = cellParser.getValueUrl(values, cell);
if (o == null) {
if (o == null && cell != null) {
o = cellParser.parse(cell);
}

Statement stmt = Statements.statement((s != null) ? s : aboutSubject, predicate, o, null);
handler.handleStatement(stmt);
}

/**
* Generate statement
*
* @param handler
* @param cellParser
* @param cells
* @param aboutSubject
*/
private void handleStatement(RDFHandler handler, CellParser cellParser, Resource aboutSubject,
Map<String, String> values, boolean needsReplacement) {
Resource s = cellParser.getAboutUrl(values);
IRI predicate = cellParser.getPropertyIRI();
Resource o = (needsReplacement) ? cellParser.getValueUrl(values, null) : cellParser.getValueUrl(null);

Statement stmt = Statements.statement((s != null) ? s : aboutSubject, predicate, o, null);
handler.handleStatement(stmt);
return Statements.statement((s != null) ? s : aboutSubject, predicate, o, null);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ protected static CellParser getCellParser(Model metadata, Resource column) {
Models.getPropertyString(metadata, column, CSVW.TRIM)
.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v)));

Models.getPropertyString(metadata, column, CSVW.ABOUT_URL).ifPresent(v -> parser.setAboutUrl(v));
Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueUrl(v));

// use a property from a vocabulary as predicate, or create a property relative to the namespace of the CSV
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.MatchResult;
Expand Down Expand Up @@ -52,10 +50,10 @@ public abstract class CellParser {
private boolean suppressed = false;

private String aboutPlaceholder;
private String[] aboutPlaceholders;
private String[] aboutPlaceholders = new String[0];

private String valuePlaceholder;
private String[] valuePlaceholders;
private String[] valuePlaceholders = new String[0];

/**
* Get name of the column
Expand All @@ -82,7 +80,7 @@ public String getNameEncoded() {
*/
public void setName(String name) {
this.name = name;
this.encodedName = URLEncoder.encode(name, StandardCharsets.UTF_8);
this.encodedName = "{" + URLEncoder.encode(name, StandardCharsets.UTF_8) + "}";
}

/**
Expand Down Expand Up @@ -198,15 +196,7 @@ private String[] getPlaceholders(String template) {
.map(MatchResult::group)
.filter(m -> !m.equals(ownPlaceholder))
.collect(Collectors.toSet());
System.err.println("placeholders " + placeholders);
System.err.println("own placeholders " + ownPlaceholder);

if (placeholders.isEmpty()) {
System.err.println("no placeholder for " + template);
return null;
}
return placeholders.toArray(new String[placeholders.size()]);

return placeholders.toArray(String[]::new);
}

/**
Expand Down Expand Up @@ -304,7 +294,6 @@ public IRI getValueUrl(String cell) {
}
String s = valueUrl;
if (valuePlaceholder != null && cell != null) {
System.err.println("repace " + valuePlaceholder + " " + cell);
s = valueUrl.replace(valuePlaceholder, getValueOrDefault(cell));
}
return Values.iri(s);
Expand Down Expand Up @@ -338,7 +327,6 @@ public IRI getValueUrl(Map<String, String> values, String cell) {
*/
public void setValueUrl(String valueUrl) {
this.valueUrl = valueUrl;
System.err.println("valueurl " + valueUrl);
// check if this URL contains column placeholders
this.valuePlaceholder = getOwnPlaceholder(valueUrl);
this.valuePlaceholders = getPlaceholders(valueUrl);
Expand Down
4 changes: 4 additions & 0 deletions core/rio/csvw/src/test/resources/painters-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@
{ "name": "country_id",
"suppressOutput": "true" },
{ "name": "country_name_nl",
"aboutUrl": "https://www.wikidata.org/wiki/{country_id}",
"propertyUrl": "rdfs:label",
"lang": "nl" },
{ "name": "country_name_en",
"aboutUrl": "https://www.wikidata.org/wiki/{country_id}",
"propertyUrl": "rdfs:label",
"lang": "en" },
{ "name": "date_of_birth",
"propertyUrl": "schema:birthDate",
Expand Down
3 changes: 1 addition & 2 deletions core/rio/csvw/src/test/resources/painters.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"wikidata_id","first_name","last_name","country_id","country_name_nl","country_name_en","date_of_birth","married","languages"
"Q5582"," Vincent","van Gogh","Q29999","Nederland","The Netherlands","30/3/1853","No","dutch french"
"Q164712","Paul","Delvaux","Q31","België","Belgium","23/9/1897","Yes","french"
"Q46408","Georgia ","O'Keeffe","Q30","Verenigde Staten","United States","15/11/1887","Yes","english"

"Q46408","Georgia ","O'Keeffe","Q30","Verenigde Staten","United States","15/11/1887","Yes","english"

0 comments on commit 542503f

Please sign in to comment.