Skip to content

Commit

Permalink
Merge branch 'dh2-i18n-rc1' of https://github.com/cidgoh/DataHarmonizer
Browse files Browse the repository at this point in the history
… into dh2-i18n-rc1
  • Loading branch information
kennethbruskiewicz committed Dec 16, 2024
2 parents a033308 + e5d16db commit 8677828
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 59 deletions.
36 changes: 30 additions & 6 deletions lib/DataHarmonizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,6 @@ class DataHarmonizer {
attributes[name],
this.context.template.current.schema.slots[name]
);
// console.log(field);

//let field = attributes[name];
let section_title = null;
Expand Down Expand Up @@ -345,9 +344,12 @@ class DataHarmonizer {
// FUTURE: figure out how to accommodate newlines?
*/
switch (range) {
case 'string':
case 'WhitespaceMinimizedString':
new_field.datatype = 'xsd:token';
break;
case 'string':
new_field.datatype = 'string';
break;
case 'Provenance':
new_field.datatype = 'Provenance';
break;
Expand Down Expand Up @@ -563,6 +565,7 @@ class DataHarmonizer {
}
}

// Called by toolbox.js validate()
async validate() {
// const data = this.getTrimmedData();
const rowStart = 0;
Expand All @@ -571,8 +574,11 @@ class DataHarmonizer {
const colEnd = this.hot.countCols() - 1;
const data = this.hot.getData(rowStart, colStart, rowEnd, colEnd);

await this.doPreValidationRepairs(data);
// doPreValidationRepairs(data) cleans some kinds of [row][column] value without
// signaling changes.
await this.doPreValidationRepairs(data); // Screen refresh requires await().
this.invalid_cells = this.getInvalidCells(data);
//console.log("INVALID CELLS", this.invalid_cells);
this.hot.render();
}

Expand Down Expand Up @@ -2649,6 +2655,7 @@ class DataHarmonizer {
if (number >= 0) {
// Here we have the 3 field call, with units sandwiched in the middle
if (binOffset === 2) {

const unit = matrix[row][hotRowNextCol];
// Host age unit is interpreted by default to be year.
// If user selects month, value is converted into years for binning.
Expand Down Expand Up @@ -2684,8 +2691,9 @@ class DataHarmonizer {
selection = bin_value; // Default value is itself.

const bin_values = fields[hotRowBinCol].flatVocabulary;
if (!bin_value || (bin_value === '' && value in bin_values)) {
if (value in bin_values && (!bin_value || bin_value === '')) {
selection = value;
console.log("no bin value", value);
}
// If a unit field exists, then set that to metadata too.
if (binOffset == 2) {
Expand All @@ -2694,7 +2702,7 @@ class DataHarmonizer {
? matrix[hotRowPtr][hotRowNextCol]
: null;
const unit_values = fields[col + 1].flatVocabulary;
if (!unit_value || (unit_value === '' && value in unit_values)) {
if (value in unit_values && (!unit_value || unit_value === '')) {
triggered_changes.push([
hotRowPtr,
hotRowNextCol,
Expand Down Expand Up @@ -2727,9 +2735,14 @@ class DataHarmonizer {
return this.validator.validate(data, fieldNames);
}

/**
* Cleans up some data[row][column] values silently.
* xsd:token values are normalized: each run of whitespace is collapsed to a
* single space and leading/trailing whitespace is trimmed.
*/
doPreValidationRepairs(data) {
return new Promise((resolve) => {
const cellChanges = [];
const whitespace_minimized_re = new RegExp(/\s+/,'g');
let fullVersion =
VERSION_TEXT +
', ' +
Expand All @@ -2744,10 +2757,21 @@ class DataHarmonizer {
continue;
}
for (let col = 0; col < data[row].length; col++) {
const cellVal = data[row][col];
let cellVal = data[row][col];
const field = this.fields[col];
const datatype = field.datatype;

if (cellVal && datatype === 'xsd:token') {

const minimized = cellVal.replace(whitespace_minimized_re, ' ').trim();
// Update cellVal in advance of validateVal(s) below
if (minimized !== cellVal) {
cellVal = minimized;
data[row][col] = cellVal;
cellChanges.push([row, col, minimized, 'thisChange']);
}
};

if (datatype === 'Provenance') {
checkProvenance(cellChanges, fullVersion, cellVal, row, col);
} else if (field.flatVocabulary) {
Expand Down
135 changes: 86 additions & 49 deletions lib/Validator.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,13 @@ class Validator {
if (!todos || !todos.length) {
return;
}
// todos is an array of strings.
const slotType = this.getSlotType(slotDefinition);// LinkML schema.type object

const slotType = this.getSlotType(slotDefinition);

// Slot type could be:
// a number, string, date ...
// null - if it is only a menu
// both date and NullValueList
if (slotType?.uri === 'xsd:date') {
for (const todo of todos) {
if (todo.substring(0, 2) === '>=') {
Expand All @@ -70,16 +74,11 @@ class Validator {
}
}

for (const def of slotDefinition.any_of || []) {
processTodos(def, todos);
}
for (const def of slotDefinition.all_of || []) {
processTodos(def, todos);
}
for (const def of slotDefinition.exactly_one_of || []) {
processTodos(def, todos);
}
for (const def of slotDefinition.none_of || []) {
// Cycle through each slotDefinition any_of etc. object entries and get
// the datatype of its .range (or recurse) and in LinkML fashion attach
// minimum_value and maximum_value to the slotDefinition OR its any_of
// etc array BASED ON top level todos. E.g. inheriting min/max criteria.
for (const def of slotDefinition.any_of ?? slotDefinition.all_of ?? slotDefinition.exactly_one_of ?? slotDefinition.none_of ?? []) {
processTodos(def, todos);
}
};
Expand All @@ -94,13 +93,15 @@ class Validator {
*/
this.#dependantMinimumValuesMap = new Map();
this.#dependantMaximumValuesMap = new Map();
const RE_TODOS = new RegExp(/^([><])={(.*?)}$/);

for (const slotDefinition of Object.values(this.#targetClassInducedSlots)) {
const { todos } = slotDefinition;
if (!todos || !todos.length) {
continue;
}
for (const todo of todos) {
const match = todo.match(/^([><])={(.*?)}$/);
const match = todo.match(RE_TODOS);
if (match == null) {
continue;
}
Expand Down Expand Up @@ -140,7 +141,7 @@ class Validator {
this.#valueValidatorMap = new Map();
}

/* This returns a single primitve data type for a slot - a decimal, date,
/* This returns a single primitive data type for a slot - a decimal, date,
string etc. or possibly an enumeration. Enumerations are handled
separately however (by const slotEnum = ...). Slots either use "range"
attribute, OR they use any_of or exactly_one_of etc. attribute expression
Expand All @@ -164,6 +165,12 @@ class Validator {
return slotType;
}

/* A validation function is prepared and cached for every slot presented, so
that validation through rows of data can make use of the already established
validator for each column. Particular columns may be sensitive to other
column values (via min/max references) or to special values, e.g. {today}.
*/
getValidatorForSlot(slot, options = {}) {
const { cacheKey, inheritedRange } = options;
if (typeof cacheKey === 'string' && this.#valueValidatorMap.has(cacheKey)) {
Expand All @@ -177,11 +184,13 @@ class Validator {
slotDefinition = slot;
}

// This digs down into any_of, all_of etc to find first date, number etc.
// slotType is LinkML schema.type object
if (!slotDefinition.range && inheritedRange) {
slotDefinition.range = inheritedRange;
}

const slotType = this.getSlotType(slotDefinition);
const slotType = this.getSlotType(slotDefinition); // LinkML schema.type object

const slotEnum = this.#schema.enums?.[slotDefinition.range];
const slotPermissibleValues = Object.values(
Expand All @@ -194,11 +203,13 @@ class Validator {
// TEST CASE:
// if (slotDefinition.name == "sample_collection_date")
// console.log("any_of", DEBUG INFO)
// inheritedRange comes from original slot, so it might be a date or number + menu
const anyOfValidators = (slotDefinition.any_of ?? []).map((subSlot) =>
this.getValidatorForSlot(subSlot, {
inheritedRange: slotDefinition.range,
})
);

const allOfValidators = (slotDefinition.all_of ?? []).map((subSlot) =>
this.getValidatorForSlot(subSlot, {
inheritedRange: slotDefinition.range,
Expand Down Expand Up @@ -236,20 +247,18 @@ class Validator {
);

const validate = (value) => {
if (slotDefinition.required && !value) {
return 'This field is required';
}

if (slotDefinition.value_presence === 'PRESENT' && !value) {
return 'Value is not present';
} else if (slotDefinition.value_presence === 'ABSENT' && value) {
return 'Value is not absent';
}

if (!value) {
if (slotDefinition.required)
return 'This field is required';
// value_presence is subject to dynamic rules?
if (slotDefinition.value_presence === 'PRESENT')
return 'Value is not present';
return;
}

if (slotDefinition.value_presence === 'ABSENT')
return 'Value is not absent';

let splitValues;
if (slotDefinition.multivalued) {
splitValues = value.split(this.#multivaluedDelimiter);
Expand All @@ -269,39 +278,62 @@ class Validator {
splitValues = [value];
}

// For any of the value(s), whether they are valid depends on either
// an ok primitive data type parsing, OR a categorical menu match.
// Message needs
for (const value of splitValues) {
if (slotType) {
let parse_error = false;
if (slotType) {// Doesn't pertain to slots which are ONLY enumerations.
const parsed = this.#parser.parse(value, slotType.uri);

// Issue: parse can fail on decimal but menu has "Missing"
if (parsed === undefined) {
return `Value does not match format for ${slotType.uri}`;
}
parse_error = `Value does not match format for ${slotType.uri}`;

if (slotMinimumValue !== undefined && parsed < slotMinimumValue) {
return 'Value is less than minimum value';
//if (!(anyOfValidators.length || allOfValidators.length || exactlyOneOfValidators.length || noneOfValidators.length)) {
// return parse_error;
//}
}
// All these cases have encountered an item which matches the basic
// datatype, so returning immediately on the first failed check is ok.
else {

if (slotMaximumValue !== undefined && parsed > slotMaximumValue) {
return 'Value is greater than maximum value';
}
if (slotMinimumValue !== undefined && parsed < slotMinimumValue) {
return 'Value is less than minimum value';
}

if (slotMaximumValue !== undefined && parsed > slotMaximumValue) {
return 'Value is greater than maximum value';
}

if (
(slotDefinition.equals_string !== undefined &&
parsed !== slotDefinition.equals_string) ||
(slotDefinition.equals_number !== undefined &&
parsed !== slotDefinition.equals_number)
) {
return 'Value does not match constant';
}

if (
(slotDefinition.equals_string !== undefined &&
parsed !== slotDefinition.equals_string) ||
(slotDefinition.equals_number !== undefined &&
parsed !== slotDefinition.equals_number)
) {
return 'Value does not match constant';
}
if (
slotDefinition.pattern !== undefined &&
!value.match(slotDefinition.pattern)
) {
return 'Value does not match pattern';
}

if (
slotDefinition.pattern !== undefined &&
!value.match(slotDefinition.pattern)
) {
return 'Value does not match pattern';
// Here slotType value tested and is ok!
continue;
}

// Here value didn't parse to slotType

}
else {
// No basic slot type here so only enumeration handling.
}

// Single range for slot given so no need to evaluate all_of, any_of etc.
if (slotEnum && !slotPermissibleValues.includes(value)) {
return 'Value is not allowed';
}
Expand Down Expand Up @@ -345,12 +377,17 @@ class Validator {
return 'One or more expressions of none_of held';
}
}

if (anyOfValidators.length || allOfValidators.length || exactlyOneOfValidators.length || noneOfValidators.length) {
// We passed validation here which means a parse error can be overridden
}
else if (parse_error.length) {
// There were no other ranges besides the basic slotType, so report the parse error.
return parse_error;
}
}
};

if (typeof cacheKey === 'string') {
this.#valueValidatorMap.set(cacheKey, validate);
}

return validate;
}
Expand Down
11 changes: 10 additions & 1 deletion lib/utils/datatypes.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ const DEFAULT_OPTIONS = {
datetimeFormat: 'yyyy-MM-dd HH:mm',
timeFormat: 'HH:mm',
};
const RE_WHITESPACE_MINIMIZED = new RegExp(/\s+/,'g');
const RE_DECIMAL = new RegExp(/^[+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)$/);

export class Datatypes {
PARSERS = {
'xsd:integer': this.parseInteger,
Expand All @@ -17,6 +20,7 @@ export class Datatypes {
'xsd:date': this.parseDate,
'xsd:dateTime': this.parseDateTime,
'xsd:time': this.parseTime,
'xsd:token': this.parseToken,
};
STRINGIFIERS = {
'xsd:date': this.stringifyDate,
Expand Down Expand Up @@ -50,7 +54,7 @@ export class Datatypes {
}

parseDecimal(value) {
const correctPattern = /^[+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)$/.test(value);
const correctPattern = RE_DECIMAL.test(value);
return correctPattern ? Number(value) : undefined;
}

Expand Down Expand Up @@ -88,6 +92,11 @@ export class Datatypes {
return parsed;
}

// For xsd:token which returns reduced whitespace version
parseToken(value) {
return value.replace(RE_WHITESPACE_MINIMIZED, ' ').trim();
}

stringifyDate(value) {
return format(value, this.dateFormat);
}
Expand Down
Loading

0 comments on commit 8677828

Please sign in to comment.