From 2eda0740b08652fd35e34f80b9ec51670d5a5fdc Mon Sep 17 00:00:00 2001 From: Soefgi Date: Tue, 30 Jan 2024 15:30:30 +0100 Subject: [PATCH 01/11] implemented constancy matcher --- .../matching/metadata/ConstancyMatcher.java | 68 +++++++++++++++++++ src/main/resources/first_line_matchers.yaml | 4 ++ 2 files changed, 72 insertions(+) create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/ConstancyMatcher.java diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/ConstancyMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/ConstancyMatcher.java new file mode 100644 index 000000000..a4f239a23 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/ConstancyMatcher.java @@ -0,0 +1,68 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.HashMap; +import java.util.List; + +@Data +@EqualsAndHashCode(callSuper = true) +public class ConstancyMatcher extends TablePairMatcher { + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + for (int j = 0; j < targetTable.getNumColumns(); j++) { + simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); + } + } + return simMatrix; + } + + private float calculateScore(List source, List target) { + HashMap source_map = new HashMap<>(); + HashMap target_map = new HashMap<>(); + + for (String entry : source) { + source_map.computeIfPresent( + entry, + (key, count) -> count + 1 + ); + source_map.putIfAbsent(entry, 1); + } + + for (String entry : 
target) { + target_map.computeIfPresent( + entry, + (key, count) -> count + 1 + ); + target_map.putIfAbsent(entry, 1); + } + + int maxSource = 0; + int maxTarget = 0; + + for (Integer count : source_map.values()) { + if (count > maxSource) maxSource = count; + } + + for (Integer count : target_map.values()) { + if (count > maxTarget) maxTarget = count; + } + + //highest frequent value divided by number of rows + float sourcePercentage = (float) maxSource / source.size(); + float targetPercentage = (float) maxTarget / target.size(); + + //normalize to value between 0 and 1 + return (float) (1.0 - Math.round(Math.abs(sourcePercentage - targetPercentage) * 100.0) / 100.0); + } + +} diff --git a/src/main/resources/first_line_matchers.yaml b/src/main/resources/first_line_matchers.yaml index 97c95edd5..bfd156875 100644 --- a/src/main/resources/first_line_matchers.yaml +++ b/src/main/resources/first_line_matchers.yaml @@ -24,6 +24,10 @@ packageName: "metadata" --- name: "ExtremaMatcher" packageName: "metadata" +--- +name: "ConstancyMatcher" +packageName: "metadata" + # Similarity matchers --- From d20907884bbfa4e454b7dedf66a63f1667d5b891 Mon Sep 17 00:00:00 2001 From: Soefgi Date: Tue, 30 Jan 2024 16:46:04 +0100 Subject: [PATCH 02/11] implemented firstDigitMatcher --- .../matching/metadata/FirstDigitMatcher.java | 86 +++++++++++++ src/main/resources/first_line_matchers.yaml | 121 +++++++++--------- src/main/resources/general.yaml | 6 +- 3 files changed, 151 insertions(+), 62 deletions(-) create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/FirstDigitMatcher.java diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/FirstDigitMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/FirstDigitMatcher.java new file mode 100644 index 000000000..3c673b308 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/FirstDigitMatcher.java @@ -0,0 +1,86 @@ +package de.uni_marburg.schematch.matching.metadata; + 
+import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +@Data +@EqualsAndHashCode(callSuper = true) +public class FirstDigitMatcher extends TablePairMatcher { + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + Datatype sourceType = sourceTable.getColumn(i).getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + Datatype targetType = targetTable.getColumn(j).getDatatype(); + if (sourceType != Datatype.INTEGER && sourceType != Datatype.FLOAT) { + simMatrix[i][j] = 0.0f; + continue; + } + if (targetType != Datatype.INTEGER && targetType != Datatype.FLOAT) { + simMatrix[i][j] = 0.0f; + continue; + } + simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); + } + } + return simMatrix; + } + + private float calculateScore(List sourceColumn, List targetColumn) { + HashMap sourceDistribution = new HashMap<>(); + HashMap targetDistribution = new HashMap<>(); + + + + //count Benford's distribution + for (int i = 0; i <= 9; i++) { + sourceDistribution.put(i, 0); + targetDistribution.put(i, 0); + } + + getDistributions(sourceColumn, sourceDistribution); + + getDistributions(targetColumn, targetDistribution); + + //calculate similarity of first digit frequency + int sourceElements = sourceColumn.size(); + int targetElements = targetColumn.size(); + List similarities = new ArrayList<>(); + for (int i = 0; i <= 9; i++) { + float sourcePercentage = (float) sourceDistribution.get(i) / 
sourceElements; + float targetPercentage = (float) targetDistribution.get(i) / targetElements; + + float similarity = (float) (Math.round(Math.abs(sourcePercentage - targetPercentage) * 100.0) / 100.0); + similarities.add(similarity); + } + + //now average the similarities - Σ(similarities) / 10 + return (float) (similarities.stream().reduce(0.0f, Float::sum) / 10.0); + } + + private void getDistributions(List targetColumn, HashMap targetDistribution) { + for (String entry : targetColumn) { + try { + Integer.parseInt(entry); + int index = 0; + if (entry.charAt(0) == '-') index = 1; + Integer firstDigit = Integer.parseInt(String.valueOf(entry.charAt(index))); + targetDistribution.computeIfPresent(firstDigit, (key, count) -> count + 1); + } catch (NumberFormatException ignored) { + } + } + } + +} diff --git a/src/main/resources/first_line_matchers.yaml b/src/main/resources/first_line_matchers.yaml index bfd156875..37b2bbc14 100644 --- a/src/main/resources/first_line_matchers.yaml +++ b/src/main/resources/first_line_matchers.yaml @@ -6,66 +6,69 @@ params: seed: [42, 2023] # Metadata matchers ---- -name: "DatatypeMatcher" -packageName: "metadata" ---- -name: "NumberOfRowsMatcher" -packageName: "metadata" ---- -name: "NullPercentageMatcher" -packageName: "metadata" ---- -name: "DistinctCountMatcher" -packageName: "metadata" ---- -name: "UniquenessMatcher" -packageName: "metadata" ---- -name: "ExtremaMatcher" -packageName: "metadata" ---- -name: "ConstancyMatcher" +#--- +#name: "DatatypeMatcher" +#packageName: "metadata" +#--- +#name: "NumberOfRowsMatcher" +#packageName: "metadata" +#--- +#name: "NullPercentageMatcher" +#packageName: "metadata" +#--- +#name: "DistinctCountMatcher" +#packageName: "metadata" +#--- +#name: "UniquenessMatcher" +#packageName: "metadata" +#--- +#name: "ExtremaMatcher" +#packageName: "metadata" +#--- +#name: "ConstancyMatcher" +#packageName: "metadata" +--- +name: "FirstDigitMatcher" packageName: "metadata" # Similarity matchers ---- -name: 
"LevenshteinMatcher" -packageName: "similarity.label" ---- -name: "CosineMatcher" -packageName: "similarity.label" ---- -name: "HammingMatcher" -packageName: "similarity.label" ---- -name: "JaroWinklerMatcher" -packageName: "similarity.label" ---- -name: "LongestCommonSubsequenceMatcher" -packageName: "similarity.label" ---- -name: "JaccardInstanceMatcher" -packageName: "similarity.tokenizedinstance" ---- -name: "DiceInstanceMatcher" -packageName: "similarity.tokenizedinstance" ---- -name: "OverlapInstanceMatcher" -packageName: "similarity.tokenizedinstance" ---- -name: "SetCosineInstanceMatcher" -packageName: "similarity.tokenizedinstance" ---- -name: "JaccardLabelMatcher" -packageName: "similarity.tokenizedlabel" ---- -name: "DiceLabelMatcher" -packageName: "similarity.tokenizedlabel" ---- -name: "OverlapLabelMatcher" -packageName: "similarity.tokenizedlabel" ---- -name: "SetCosineLabelMatcher" -packageName: "similarity.tokenizedlabel" \ No newline at end of file +#--- +#name: "LevenshteinMatcher" +#packageName: "similarity.label" +#--- +#name: "CosineMatcher" +#packageName: "similarity.label" +#--- +#name: "HammingMatcher" +#packageName: "similarity.label" +#--- +#name: "JaroWinklerMatcher" +#packageName: "similarity.label" +#--- +#name: "LongestCommonSubsequenceMatcher" +#packageName: "similarity.label" +#--- +#name: "JaccardInstanceMatcher" +#packageName: "similarity.tokenizedinstance" +#--- +#name: "DiceInstanceMatcher" +#packageName: "similarity.tokenizedinstance" +#--- +#name: "OverlapInstanceMatcher" +#packageName: "similarity.tokenizedinstance" +#--- +#name: "SetCosineInstanceMatcher" +#packageName: "similarity.tokenizedinstance" +#--- +#name: "JaccardLabelMatcher" +#packageName: "similarity.tokenizedlabel" +#--- +#name: "DiceLabelMatcher" +#packageName: "similarity.tokenizedlabel" +#--- +#name: "OverlapLabelMatcher" +#packageName: "similarity.tokenizedlabel" +#--- +#name: "SetCosineLabelMatcher" +#packageName: "similarity.tokenizedlabel" \ No newline at 
end of file diff --git a/src/main/resources/general.yaml b/src/main/resources/general.yaml index cad69e3e7..83469d0a5 100644 --- a/src/main/resources/general.yaml +++ b/src/main/resources/general.yaml @@ -52,19 +52,19 @@ evaluateFirstLineMatchers: True readCacheFirstLineMatchers: False writeCacheFirstLineMatchers: False # Step 3: run similarity matrix boosting on the output of first line matchers -runSimMatrixBoostingOnFirstLineMatchers: True +runSimMatrixBoostingOnFirstLineMatchers: False saveOutputSimMatrixBoostingOnFirstLineMatchers: True evaluateSimMatrixBoostingOnFirstLineMatchers: True readCacheSimMatrixBoostingOnFirstLineMatchers: False writeCacheSimMatrixBoostingOnFirstLineMatchers: False # Step 4: run second line matchers (ensemble matchers and other matchers using output of first line matchers) -runSecondLineMatchers: True +runSecondLineMatchers: False saveOutputSecondLineMatchers: True evaluateSecondLineMatchers: True readCacheSecondLineMatchers: False writeCacheSecondLineMatchers: False # Step 5: run similarity matrix boosting on the output of second line matchers -runSimMatrixBoostingOnSecondLineMatchers: True +runSimMatrixBoostingOnSecondLineMatchers: False saveOutputSimMatrixBoostingOnSecondLineMatchers: True evaluateSimMatrixBoostingOnSecondLineMatchers: True readCacheSimMatrixBoostingOnSecondLineMatchers: False From 792e58a24a7dd83711fbc72aefe81b6f9c1db55d Mon Sep 17 00:00:00 2001 From: Reuterl5 Date: Fri, 2 Feb 2024 21:10:21 +0100 Subject: [PATCH 03/11] Implemented size- and decimal-Matcher --- .../matching/metadata/DecimalsMatcher.java | 66 +++++++++++++++++ .../matching/metadata/SizeMatcher.java | 72 +++++++++++++++++++ src/main/resources/first_line_matchers.yaml | 10 ++- 3 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/DecimalsMatcher.java create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/SizeMatcher.java diff --git 
a/src/main/java/de/uni_marburg/schematch/matching/metadata/DecimalsMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/DecimalsMatcher.java new file mode 100644 index 000000000..3236697ce --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/DecimalsMatcher.java @@ -0,0 +1,66 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; +import java.util.List; + +@Data +@EqualsAndHashCode(callSuper = true) +public class DecimalsMatcher extends TablePairMatcher { + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + Datatype sourceType = sourceTable.getColumn(i).getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + Datatype targetType = targetTable.getColumn(j).getDatatype(); + if (sourceType != Datatype.FLOAT) { + simMatrix[i][j] = 0.0f; + continue; + } + if (targetType != Datatype.FLOAT) { + simMatrix[i][j] = 0.0f; + continue; + } + simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); + } + } + return simMatrix; + } + + private float calculateScore(List sourceColumn, List targetColumn) { + + int maxSourceDecimals = 0; + int maxTargetDecimals = 0; + int numSourceDecimals; + int numTargetDecimals; + + for (String s : sourceColumn) { + + numSourceDecimals = s.length() - (s.indexOf('.') + 1); + if (numSourceDecimals > maxSourceDecimals) { + maxSourceDecimals = numSourceDecimals; + } + } + + for (String s : targetColumn) { + + numTargetDecimals = s.length() - 
(s.indexOf('.') + 1); + if (numTargetDecimals > maxTargetDecimals) { + maxTargetDecimals = numTargetDecimals; + } + } + + if (maxTargetDecimals > maxSourceDecimals) { + return (float)(maxSourceDecimals / maxTargetDecimals); + } + else return (float)(maxTargetDecimals / maxSourceDecimals); + } +} diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/SizeMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/SizeMatcher.java new file mode 100644 index 000000000..822cead31 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/SizeMatcher.java @@ -0,0 +1,72 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Column; +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.List; + +@Data +@EqualsAndHashCode(callSuper = true) +public class SizeMatcher extends TablePairMatcher { + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + Column sourceColumn = sourceTable.getColumn(i); + Datatype sourceType = sourceColumn.getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + Column targetColumn = targetTable.getColumn(j); + Datatype targetType = targetColumn.getDatatype(); + if (sourceType != Datatype.INTEGER && sourceType != Datatype.FLOAT) { + simMatrix[i][j] = 0.0f; + continue; + } + if (targetType != Datatype.INTEGER && targetType != Datatype.FLOAT) { + simMatrix[i][j] = 0.0f; + continue; + } + float test = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); 
+ simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); + } + } + return simMatrix; + } + + private float calculateScore(List sourceColumn, List targetColumn) { + + int maxSourceDigits = 0; + int maxTargetDigits = 0; + int numSourceDigits; + int numTargetDigits; + + for (String s : sourceColumn) { + if (s.isEmpty()) continue; + numSourceDigits = s.contains(".") ? s.length() - 1 : s.length(); + if (numSourceDigits > maxSourceDigits) { + maxSourceDigits = numSourceDigits; + } + } + + for (String t : targetColumn) { + if (t.isEmpty()) continue; + numTargetDigits = t.contains(".") ? t.length() - 1 : t.length(); + if (numTargetDigits > maxTargetDigits) { + maxTargetDigits = numTargetDigits; + } + } + + if (maxTargetDigits > maxSourceDigits) { + return (float) (maxSourceDigits / maxTargetDigits); + } + if (maxSourceDigits == 0) return 0.0f; + return (float) maxTargetDigits / maxSourceDigits; + } +} \ No newline at end of file diff --git a/src/main/resources/first_line_matchers.yaml b/src/main/resources/first_line_matchers.yaml index 37b2bbc14..50fc91ce6 100644 --- a/src/main/resources/first_line_matchers.yaml +++ b/src/main/resources/first_line_matchers.yaml @@ -27,11 +27,15 @@ params: #--- #name: "ConstancyMatcher" #packageName: "metadata" +#--- +#name: "FirstDigitMatcher" +#packageName: "metadata" +#--- +#name: "DecimalsMatcher" +#packageName: "metadata" --- -name: "FirstDigitMatcher" +name: "SizeMatcher" packageName: "metadata" - - # Similarity matchers #--- #name: "LevenshteinMatcher" From 10f2e9f6aac3a992182fe986115a0ff723fc1b06 Mon Sep 17 00:00:00 2001 From: Reuterl5 Date: Mon, 5 Feb 2024 22:06:21 +0100 Subject: [PATCH 04/11] Implemented Length Matchers --- .../matching/metadata/MaximumLength.java | 65 ++++++++++++++ .../matching/metadata/MeanLength.java | 72 +++++++++++++++ .../matching/metadata/MedianLength.java | 90 +++++++++++++++++++ .../matching/metadata/MinimumLength.java | 65 ++++++++++++++ 
.../matching/metadata/SizeMatcher.java | 1 - src/main/resources/first_line_matchers.yaml | 14 ++- 6 files changed, 305 insertions(+), 2 deletions(-) create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumLength.java create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/MeanLength.java create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/MedianLength.java create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumLength.java diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumLength.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumLength.java new file mode 100644 index 000000000..7d3fdc2d4 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumLength.java @@ -0,0 +1,65 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Column; +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.List; + +@Data +@EqualsAndHashCode(callSuper = true) + +public class MaximumLength extends TablePairMatcher{ + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + Column sourceColumn = sourceTable.getColumn(i); + Datatype sourceType = sourceColumn.getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + Column targetColumn = targetTable.getColumn(j); + Datatype targetType = targetColumn.getDatatype(); + simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), 
targetTable.getColumn(j).getValues()); + } + } + return simMatrix; + } + + private float calculateScore(List sourceColumn, List targetColumn) { + + int maxSourceDigits = 0; + int maxTargetDigits = 0; + int numSourceDigits; + int numTargetDigits; + + for (String s : sourceColumn) { + if (s.isEmpty()) continue; + numSourceDigits = s.length(); + if (numSourceDigits > maxSourceDigits) { + maxSourceDigits = numSourceDigits; + } + } + + for (String t : targetColumn) { + if (t.isEmpty()) continue; + numTargetDigits = t.length(); + if (numTargetDigits > maxTargetDigits) { + maxTargetDigits = numTargetDigits; + } + } + + if (maxTargetDigits > maxSourceDigits) { + return (float) (maxSourceDigits / maxTargetDigits); + } + if (maxSourceDigits == 0) return 0.0f; + return (float) maxTargetDigits / maxSourceDigits; + } +} + diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MeanLength.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MeanLength.java new file mode 100644 index 000000000..424c0f2b9 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MeanLength.java @@ -0,0 +1,72 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Column; +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.List; + +@Data +@EqualsAndHashCode(callSuper = true) + +public class MeanLength extends TablePairMatcher{ + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + Column sourceColumn = sourceTable.getColumn(i); + Datatype sourceType = 
sourceColumn.getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + Column targetColumn = targetTable.getColumn(j); + Datatype targetType = targetColumn.getDatatype(); + simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); + } + } + return simMatrix; + } + + private float calculateScore(List sourceColumn, List targetColumn) { + + int i = 1; + int j = 1; + int SourceLength = 0; + int TargetLength = 0; + + for (String s : sourceColumn) { + if (s.isEmpty()){ + ++i; + continue; + } + SourceLength = SourceLength + s.length(); + ++i; + } + int SourceMean = SourceLength / i; + + for (String t : targetColumn) { + if (t.isEmpty()){ + ++j; + continue; + } + TargetLength = TargetLength + t.length(); + ++j; + } + int TargetMean = TargetLength / j; + + if (TargetMean == 0) return 0.0f; + + if (TargetMean > SourceMean) { + return (float) (SourceMean / TargetMean); + } + if (SourceMean == 0) return 0.0f; + return (float) TargetMean / SourceMean; + } +} + + diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MedianLength.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MedianLength.java new file mode 100644 index 000000000..6f73856f5 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MedianLength.java @@ -0,0 +1,90 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Column; +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.Arrays; +import java.util.List; + +@Data +@EqualsAndHashCode(callSuper = true) + +public class MedianLength extends TablePairMatcher{ + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = 
tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + Column sourceColumn = sourceTable.getColumn(i); + Datatype sourceType = sourceColumn.getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + Column targetColumn = targetTable.getColumn(j); + Datatype targetType = targetColumn.getDatatype(); + simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); + } + } + return simMatrix; + } + + private float calculateScore(List sourceColumn, List targetColumn) { + + int i = 0; + int j = 0; + if(sourceColumn.size() == 0) return 0.0f; + if(targetColumn.size() == 0) return 0.0f; + + int sourceLengths[] = new int[sourceColumn.size()]; + int targetLengths[] = new int[targetColumn.size()]; + + for (String s : sourceColumn) { + if (s.isEmpty()){ + sourceLengths[i] = 0; + } + else{ + sourceLengths[i] = s.length(); + } + ++i; + } + + for (String t : targetColumn) { + if (t.isEmpty()){ + targetLengths[j] = 0; + } + else{ + targetLengths[j] = t.length(); + } + ++j; + } + + float sourceMedian; + float targetMedian; + Arrays.sort(sourceLengths); + Arrays.sort(targetLengths); + + if (sourceLengths.length % 2 == 0) + sourceMedian = ((float)sourceLengths[sourceLengths.length/2] + (float)sourceLengths[sourceLengths.length/2 - 1])/2; + else + sourceMedian = (float) sourceLengths[sourceLengths.length/2]; + + if (targetLengths.length % 2 == 0) + targetMedian = ((float)targetLengths[targetLengths.length/2] + (float)targetLengths[targetLengths.length/2 - 1])/2; + else + targetMedian = (float) targetLengths[targetLengths.length/2]; + + + if (targetMedian == 0) return 0.0f; + + if (targetMedian > sourceMedian) { + return (float) (sourceMedian / targetMedian); + } + if (sourceMedian == 0) return 0.0f; + return (float) targetMedian / sourceMedian; + } +} diff --git 
a/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumLength.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumLength.java new file mode 100644 index 000000000..e0fad6a64 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumLength.java @@ -0,0 +1,65 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Column; +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.List; + +@Data +@EqualsAndHashCode(callSuper = true) + +public class MinimumLength extends TablePairMatcher{ + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + Column sourceColumn = sourceTable.getColumn(i); + Datatype sourceType = sourceColumn.getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + Column targetColumn = targetTable.getColumn(j); + Datatype targetType = targetColumn.getDatatype(); + simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); + } + } + return simMatrix; + } + + private float calculateScore(List sourceColumn, List targetColumn) { + + int minSourceDigits = 100000; + int minTargetDigits = 100000; + int numSourceDigits; + int numTargetDigits; + + for (String s : sourceColumn) { + if (s.isEmpty()) continue; + numSourceDigits = s.length(); + if (numSourceDigits < minSourceDigits) { + minSourceDigits = numSourceDigits; + } + } + + for (String t : targetColumn) { + if (t.isEmpty()) continue; + numTargetDigits = t.length(); + if (numTargetDigits < 
minTargetDigits) { + minTargetDigits = numTargetDigits; + } + } + + if (minTargetDigits > minSourceDigits) { + return (float) (minSourceDigits / minTargetDigits); + } + if (minSourceDigits == 0) return 0.0f; + return (float) minTargetDigits / minSourceDigits; + } + +} diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/SizeMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/SizeMatcher.java index 822cead31..94d7fbe3e 100644 --- a/src/main/java/de/uni_marburg/schematch/matching/metadata/SizeMatcher.java +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/SizeMatcher.java @@ -33,7 +33,6 @@ public float[][] match(TablePair tablePair) { simMatrix[i][j] = 0.0f; continue; } - float test = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); } } diff --git a/src/main/resources/first_line_matchers.yaml b/src/main/resources/first_line_matchers.yaml index 50fc91ce6..38722ed67 100644 --- a/src/main/resources/first_line_matchers.yaml +++ b/src/main/resources/first_line_matchers.yaml @@ -33,8 +33,20 @@ params: #--- #name: "DecimalsMatcher" #packageName: "metadata" +#--- +#name: "SizeMatcher" +#packageName: "metadata" +#--- +#name: "MinimumLength" +#packageName: "metadata" +#--- +#name: "MaximumLength" +#packageName: "metadata" +#--- +#name: "MeanLength" +#packageName: "metadata" --- -name: "SizeMatcher" +name: "MedianLength" packageName: "metadata" # Similarity matchers #--- From 396fafee65062f85acaf863b3e26bfdc3b09b55c Mon Sep 17 00:00:00 2001 From: Soefgi Date: Mon, 5 Feb 2024 22:23:28 +0100 Subject: [PATCH 05/11] implemented label matcher --- .../matching/metadata/LabelMatcher.java | 30 +++++++++++++++++++ src/main/resources/first_line_matchers.yaml | 19 +++++++----- 2 files changed, 42 insertions(+), 7 deletions(-) create mode 100644 
src/main/java/de/uni_marburg/schematch/matching/metadata/LabelMatcher.java diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/LabelMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/LabelMatcher.java new file mode 100644 index 000000000..e149370ee --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/LabelMatcher.java @@ -0,0 +1,30 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +@Data +@EqualsAndHashCode(callSuper = true) +public class LabelMatcher extends TablePairMatcher { + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + for (int j = 0; j < targetTable.getNumColumns(); j++) { + if (sourceTable.getColumn(i).getLabel().equalsIgnoreCase(targetTable.getColumn(j).getLabel())) { + simMatrix[i][j] = 1.0f; + } else { + simMatrix[i][j] = 0.0f; + } + } + } + return simMatrix; + } + +} diff --git a/src/main/resources/first_line_matchers.yaml b/src/main/resources/first_line_matchers.yaml index 37b2bbc14..cd2c6af53 100644 --- a/src/main/resources/first_line_matchers.yaml +++ b/src/main/resources/first_line_matchers.yaml @@ -1,9 +1,9 @@ # State-of-the-Art matchers ---- -name: "RandomMatcher" -packageName: "sota" -params: - seed: [42, 2023] +#--- +#name: "RandomMatcher" +#packageName: "sota" +#params: +# seed: [42, 2023] # Metadata matchers #--- @@ -27,9 +27,14 @@ params: #--- #name: "ConstancyMatcher" #packageName: "metadata" +#--- +#name: "FirstDigitMatcher" +#packageName: "metadata" +#--- +#name: "LabelMatcher" +#packageName: "metadata" --- -name: 
"FirstDigitMatcher" -packageName: "metadata" +name: # Similarity matchers From a496488776b4a471bc6b768dd5765da6b4054d95 Mon Sep 17 00:00:00 2001 From: Soefgi Date: Mon, 5 Feb 2024 23:27:39 +0100 Subject: [PATCH 06/11] implemented Quartiles Matcher --- .../matching/metadata/QuartilesMatcher.java | 111 ++++++++++++++++++ src/main/resources/first_line_matchers.yaml | 4 +- 2 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/QuartilesMatcher.java diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/QuartilesMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/QuartilesMatcher.java new file mode 100644 index 000000000..182e5c7e2 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/QuartilesMatcher.java @@ -0,0 +1,111 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Column; +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.*; + +@Data +@EqualsAndHashCode(callSuper = true) +public class QuartilesMatcher extends TablePairMatcher { + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + + Datatype sourceType = sourceTable.getColumn(i).getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + + Datatype targetType = targetTable.getColumn(j).getDatatype(); + + if (sourceType != Datatype.INTEGER && sourceType != Datatype.FLOAT) { + simMatrix[i][j] = 0.0f; + continue; + } + if (targetType != Datatype.INTEGER && 
targetType != Datatype.FLOAT) { + simMatrix[i][j] = 0.0f; + continue; + } + + simMatrix[i][j] = calculateScore(sourceTable.getColumn(i), targetTable.getColumn(j)); + } + } + return simMatrix; + } + + private float calculateScore(Column source, Column target) { + List sourceFloat; + if (source.getDatatype() == Datatype.INTEGER) { + List sourceInt = Datatype.castToInt(source); + sourceFloat = new ArrayList<>(); + for (Integer s : sourceInt) { + if (s == null) continue; + sourceFloat.add((float) s); + } + } else { + sourceFloat = Datatype.castToFloat(source); + } + + List targetFloat; + if (target.getDatatype() == Datatype.INTEGER) { + List targetInt = Datatype.castToInt(target); + targetFloat = new ArrayList<>(); + for (Integer t : targetInt) { + if (t == null) continue; + targetFloat.add((float) t); + } + } else { + targetFloat = Datatype.castToFloat(target); + } + + //quartiles only matter if more than 4 data values are present + if (sourceFloat.size() < 4) return 0.0f; + if (targetFloat.size() < 4) return 0.0f; + + try { + Collections.sort(sourceFloat); + Collections.sort(targetFloat); + } catch (NullPointerException ignore) { + return 0.0f; + } + + + //(n + 1) * 0,25 + double sourceQ1 = calculateQuartile(sourceFloat, 0.25f); + double sourceQ2 = calculateQuartile(sourceFloat, 0.5f); + double sourceQ3 = calculateQuartile(sourceFloat, 0.75f); + double targetQ1 = calculateQuartile(targetFloat, 0.25f); + double targetQ2 = calculateQuartile(targetFloat, 0.5f); + double targetQ3 = calculateQuartile(targetFloat, 0.75f); + + double q1; + if (sourceQ1 == 0 || targetQ1 == 0) q1 = 0; + else q1 = Math.min(sourceQ1, targetQ1) / Math.max(sourceQ1, targetQ1); + + double q2; + if (sourceQ2 == 0 || targetQ2 == 0) q2 = 0; + else q2 = Math.min(sourceQ2, targetQ2) / Math.max(sourceQ2, targetQ2); + + double q3; + if (sourceQ3 == 0 || targetQ3 == 0) q3 = 0; + else q3 = Math.min(sourceQ3, targetQ3) / Math.max(sourceQ3, targetQ3); + + return (float) (q1 + q2 + q3) / 3.0f; + } + + private 
double calculateQuartile(List values, float p) { // expects values sorted ascending and non-empty; returns the p-quantile + double pC = (values.size() + 1) * p; // 1-based position of the quantile, (n + 1) * p rule + int upper = Math.min(values.size(), (int) Math.ceil(pC)); // rank at or above pC, clamped so upper - 1 never runs past the last element + int lower = Math.max(1, (int) Math.floor(pC)); // rank at or below pC, clamped so lower - 1 never drops below index 0 + return (values.get(upper - 1) + values.get(lower - 1)) / 2; // mean of the two neighbouring order statistics; the "- 1" belongs to the index, not the value + } + +} diff --git a/src/main/resources/first_line_matchers.yaml b/src/main/resources/first_line_matchers.yaml index cd2c6af53..4e0406d1c 100644 --- a/src/main/resources/first_line_matchers.yaml +++ b/src/main/resources/first_line_matchers.yaml @@ -34,8 +34,8 @@ #name: "LabelMatcher" #packageName: "metadata" --- -name: - +name: "QuartilesMatcher" +packageName: "metadata" # Similarity matchers #--- From c10177e2511d64ce3944f1e19fda4177ca70c038 Mon Sep 17 00:00:00 2001 From: Soefgi Date: Mon, 5 Feb 2024 23:38:39 +0100 Subject: [PATCH 07/11] fixed formatting across several matchers --- .../matching/metadata/ExtremaMatcher.java | 4 -- .../matching/metadata/MaximumLength.java | 8 +--- .../matching/metadata/MeanLength.java | 12 ++---- .../matching/metadata/MedianLength.java | 38 ++++++++----------- .../matching/metadata/MinimumLength.java | 13 ++----- 5 files changed, 22 insertions(+), 53 deletions(-) diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/ExtremaMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/ExtremaMatcher.java index cd5b3af99..a03dcbc89 100644 --- a/src/main/java/de/uni_marburg/schematch/matching/metadata/ExtremaMatcher.java +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/ExtremaMatcher.java @@ -9,10 +9,6 @@ import lombok.Data; import lombok.EqualsAndHashCode; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Date; import java.util.List; diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumLength.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumLength.java index 7d3fdc2d4..2770a1c6d 100644 ---
a/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumLength.java +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumLength.java @@ -1,8 +1,6 @@ package de.uni_marburg.schematch.matching.metadata; -import de.uni_marburg.schematch.data.Column; import de.uni_marburg.schematch.data.Table; -import de.uni_marburg.schematch.data.metadata.Datatype; import de.uni_marburg.schematch.matching.TablePairMatcher; import de.uni_marburg.schematch.matchtask.tablepair.TablePair; import lombok.Data; @@ -13,7 +11,7 @@ @Data @EqualsAndHashCode(callSuper = true) -public class MaximumLength extends TablePairMatcher{ +public class MaximumLength extends TablePairMatcher { @Override public float[][] match(TablePair tablePair) { @@ -21,11 +19,7 @@ public float[][] match(TablePair tablePair) { Table targetTable = tablePair.getTargetTable(); float[][] simMatrix = tablePair.getEmptySimMatrix(); for (int i = 0; i < sourceTable.getNumColumns(); i++) { - Column sourceColumn = sourceTable.getColumn(i); - Datatype sourceType = sourceColumn.getDatatype(); for (int j = 0; j < targetTable.getNumColumns(); j++) { - Column targetColumn = targetTable.getColumn(j); - Datatype targetType = targetColumn.getDatatype(); simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); } } diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MeanLength.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MeanLength.java index 424c0f2b9..abf398380 100644 --- a/src/main/java/de/uni_marburg/schematch/matching/metadata/MeanLength.java +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MeanLength.java @@ -1,8 +1,6 @@ package de.uni_marburg.schematch.matching.metadata; -import de.uni_marburg.schematch.data.Column; import de.uni_marburg.schematch.data.Table; -import de.uni_marburg.schematch.data.metadata.Datatype; import de.uni_marburg.schematch.matching.TablePairMatcher; import 
de.uni_marburg.schematch.matchtask.tablepair.TablePair; import lombok.Data; @@ -13,7 +11,7 @@ @Data @EqualsAndHashCode(callSuper = true) -public class MeanLength extends TablePairMatcher{ +public class MeanLength extends TablePairMatcher { @Override public float[][] match(TablePair tablePair) { @@ -21,11 +19,7 @@ public float[][] match(TablePair tablePair) { Table targetTable = tablePair.getTargetTable(); float[][] simMatrix = tablePair.getEmptySimMatrix(); for (int i = 0; i < sourceTable.getNumColumns(); i++) { - Column sourceColumn = sourceTable.getColumn(i); - Datatype sourceType = sourceColumn.getDatatype(); for (int j = 0; j < targetTable.getNumColumns(); j++) { - Column targetColumn = targetTable.getColumn(j); - Datatype targetType = targetColumn.getDatatype(); simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); } } @@ -40,7 +34,7 @@ private float calculateScore(List sourceColumn, List targetColum int TargetLength = 0; for (String s : sourceColumn) { - if (s.isEmpty()){ + if (s.isEmpty()) { ++i; continue; } @@ -50,7 +44,7 @@ private float calculateScore(List sourceColumn, List targetColum int SourceMean = SourceLength / i; for (String t : targetColumn) { - if (t.isEmpty()){ + if (t.isEmpty()) { ++j; continue; } diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MedianLength.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MedianLength.java index 6f73856f5..739214956 100644 --- a/src/main/java/de/uni_marburg/schematch/matching/metadata/MedianLength.java +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MedianLength.java @@ -1,8 +1,6 @@ package de.uni_marburg.schematch.matching.metadata; -import de.uni_marburg.schematch.data.Column; import de.uni_marburg.schematch.data.Table; -import de.uni_marburg.schematch.data.metadata.Datatype; import de.uni_marburg.schematch.matching.TablePairMatcher; import de.uni_marburg.schematch.matchtask.tablepair.TablePair; import 
lombok.Data; @@ -14,7 +12,7 @@ @Data @EqualsAndHashCode(callSuper = true) -public class MedianLength extends TablePairMatcher{ +public class MedianLength extends TablePairMatcher { @Override public float[][] match(TablePair tablePair) { @@ -22,11 +20,7 @@ public float[][] match(TablePair tablePair) { Table targetTable = tablePair.getTargetTable(); float[][] simMatrix = tablePair.getEmptySimMatrix(); for (int i = 0; i < sourceTable.getNumColumns(); i++) { - Column sourceColumn = sourceTable.getColumn(i); - Datatype sourceType = sourceColumn.getDatatype(); for (int j = 0; j < targetTable.getNumColumns(); j++) { - Column targetColumn = targetTable.getColumn(j); - Datatype targetType = targetColumn.getDatatype(); simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); } } @@ -37,27 +31,25 @@ private float calculateScore(List sourceColumn, List targetColum int i = 0; int j = 0; - if(sourceColumn.size() == 0) return 0.0f; - if(targetColumn.size() == 0) return 0.0f; + if (sourceColumn.isEmpty()) return 0.0f; + if (targetColumn.isEmpty()) return 0.0f; - int sourceLengths[] = new int[sourceColumn.size()]; - int targetLengths[] = new int[targetColumn.size()]; + int[] sourceLengths = new int[sourceColumn.size()]; + int[] targetLengths = new int[targetColumn.size()]; for (String s : sourceColumn) { - if (s.isEmpty()){ + if (s.isEmpty()) { sourceLengths[i] = 0; - } - else{ + } else { sourceLengths[i] = s.length(); } ++i; } for (String t : targetColumn) { - if (t.isEmpty()){ + if (t.isEmpty()) { targetLengths[j] = 0; - } - else{ + } else { targetLengths[j] = t.length(); } ++j; @@ -69,22 +61,22 @@ private float calculateScore(List sourceColumn, List targetColum Arrays.sort(targetLengths); if (sourceLengths.length % 2 == 0) - sourceMedian = ((float)sourceLengths[sourceLengths.length/2] + (float)sourceLengths[sourceLengths.length/2 - 1])/2; + sourceMedian = ((float) sourceLengths[sourceLengths.length / 2] + (float) 
sourceLengths[sourceLengths.length / 2 - 1]) / 2; else - sourceMedian = (float) sourceLengths[sourceLengths.length/2]; + sourceMedian = (float) sourceLengths[sourceLengths.length / 2]; if (targetLengths.length % 2 == 0) - targetMedian = ((float)targetLengths[targetLengths.length/2] + (float)targetLengths[targetLengths.length/2 - 1])/2; + targetMedian = ((float) targetLengths[targetLengths.length / 2] + (float) targetLengths[targetLengths.length / 2 - 1]) / 2; else - targetMedian = (float) targetLengths[targetLengths.length/2]; + targetMedian = (float) targetLengths[targetLengths.length / 2]; if (targetMedian == 0) return 0.0f; if (targetMedian > sourceMedian) { - return (float) (sourceMedian / targetMedian); + return sourceMedian / targetMedian; } if (sourceMedian == 0) return 0.0f; - return (float) targetMedian / sourceMedian; + return targetMedian / sourceMedian; } } diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumLength.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumLength.java index e0fad6a64..4aeb6e96b 100644 --- a/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumLength.java +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumLength.java @@ -1,8 +1,6 @@ package de.uni_marburg.schematch.matching.metadata; -import de.uni_marburg.schematch.data.Column; import de.uni_marburg.schematch.data.Table; -import de.uni_marburg.schematch.data.metadata.Datatype; import de.uni_marburg.schematch.matching.TablePairMatcher; import de.uni_marburg.schematch.matchtask.tablepair.TablePair; import lombok.Data; @@ -13,7 +11,7 @@ @Data @EqualsAndHashCode(callSuper = true) -public class MinimumLength extends TablePairMatcher{ +public class MinimumLength extends TablePairMatcher { @Override public float[][] match(TablePair tablePair) { @@ -21,11 +19,7 @@ public float[][] match(TablePair tablePair) { Table targetTable = tablePair.getTargetTable(); float[][] simMatrix = tablePair.getEmptySimMatrix(); for 
(int i = 0; i < sourceTable.getNumColumns(); i++) { - Column sourceColumn = sourceTable.getColumn(i); - Datatype sourceType = sourceColumn.getDatatype(); for (int j = 0; j < targetTable.getNumColumns(); j++) { - Column targetColumn = targetTable.getColumn(j); - Datatype targetType = targetColumn.getDatatype(); simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues()); } } @@ -34,8 +28,8 @@ public float[][] match(TablePair tablePair) { private float calculateScore(List sourceColumn, List targetColumn) { - int minSourceDigits = 100000; - int minTargetDigits = 100000; + int minSourceDigits = Integer.MAX_VALUE; + int minTargetDigits = Integer.MAX_VALUE; int numSourceDigits; int numTargetDigits; @@ -58,7 +52,6 @@ private float calculateScore(List sourceColumn, List targetColum if (minTargetDigits > minSourceDigits) { return (float) (minSourceDigits / minTargetDigits); } - if (minSourceDigits == 0) return 0.0f; return (float) minTargetDigits / minSourceDigits; } From d92e4f01fa635593c4774cb8666b6e637085fc3c Mon Sep 17 00:00:00 2001 From: Soefgi Date: Sat, 16 Mar 2024 16:30:29 +0100 Subject: [PATCH 08/11] extended null check for data type detection --- .../schematch/data/metadata/Datatype.java | 24 ++++++++++--------- src/main/resources/general.yaml | 12 +++++----- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/main/java/de/uni_marburg/schematch/data/metadata/Datatype.java b/src/main/java/de/uni_marburg/schematch/data/metadata/Datatype.java index f1fe27ad0..50a9e7a0f 100644 --- a/src/main/java/de/uni_marburg/schematch/data/metadata/Datatype.java +++ b/src/main/java/de/uni_marburg/schematch/data/metadata/Datatype.java @@ -18,18 +18,20 @@ public enum Datatype { GEO_LOCATION; // boolean values - final static String[] t = {"1", "true", "t", "yes", "y", "ja", "j"}; - final static String[] f = {"0", "false", "f", "no", "n", "nein"}; - final static List patternsT = Arrays.stream(t).toList(); - final static 
List patternsF = Arrays.stream(f).toList(); - final static List booleanPatterns = Stream.concat(patternsT.stream(), patternsF.stream()).toList(); + private final static String[] nulls = {"-", "nan", "\"\"", "\\", "/", "null"}; + private final static List nullPatterns = Arrays.stream(nulls).toList(); + private final static String[] t = {"1", "true", "t", "yes", "y", "ja", "j"}; + private final static String[] f = {"0", "false", "f", "no", "n", "nein"}; + private final static List patternsT = Arrays.stream(t).toList(); + private final static List patternsF = Arrays.stream(f).toList(); + private final static List booleanPatterns = Stream.concat(patternsT.stream(), patternsF.stream()).toList(); // geolocation pattern - final static Pattern geoLocationPattern = Pattern.compile("-?[0-9]+.[0-9]+,-?[0-9]+.[0-9]+"); + private final static Pattern geoLocationPattern = Pattern.compile("-?[0-9]+.[0-9]+,-?[0-9]+.[0-9]+"); // date formats - final static SimpleDateFormat sdfDashes = new SimpleDateFormat("dd-MM"); - final static SimpleDateFormat sdfSlashes = new SimpleDateFormat("dd/MM"); - final static SimpleDateFormat sdfDots = new SimpleDateFormat("dd.MM"); - final static SimpleDateFormat[] sdfs = {sdfDashes, sdfSlashes, sdfDots}; + private final static SimpleDateFormat sdfDashes = new SimpleDateFormat("dd-MM"); + private final static SimpleDateFormat sdfSlashes = new SimpleDateFormat("dd/MM"); + private final static SimpleDateFormat sdfDots = new SimpleDateFormat("dd.MM"); + private final static SimpleDateFormat[] sdfs = {sdfDashes, sdfSlashes, sdfDots}; /** * Determines the definitive data type from a list of scores @@ -322,7 +324,7 @@ private static double isGeoLocation(Column column) { } private static boolean isNull(String value) { - return value.equals("\"\"") || value.isEmpty(); + return nullPatterns.contains(value.toLowerCase()) || value.isEmpty(); } } diff --git a/src/main/resources/general.yaml b/src/main/resources/general.yaml index 83469d0a5..78cbae8c8 100644 --- 
a/src/main/resources/general.yaml +++ b/src/main/resources/general.yaml @@ -34,7 +34,7 @@ evaluateAttributes: True saveOutputPerTablePair: False # adds header and index with attribute names to output files # applies to all matching steps for which output saving is enabled (see below) -saveOutputVerbose: True +saveOutputVerbose: False # MatchSteps configuration # run: execute this match step @@ -53,20 +53,20 @@ readCacheFirstLineMatchers: False writeCacheFirstLineMatchers: False # Step 3: run similarity matrix boosting on the output of first line matchers runSimMatrixBoostingOnFirstLineMatchers: False -saveOutputSimMatrixBoostingOnFirstLineMatchers: True -evaluateSimMatrixBoostingOnFirstLineMatchers: True +saveOutputSimMatrixBoostingOnFirstLineMatchers: False +evaluateSimMatrixBoostingOnFirstLineMatchers: False readCacheSimMatrixBoostingOnFirstLineMatchers: False writeCacheSimMatrixBoostingOnFirstLineMatchers: False # Step 4: run second line matchers (ensemble matchers and other matchers using output of first line matchers) runSecondLineMatchers: False -saveOutputSecondLineMatchers: True -evaluateSecondLineMatchers: True +saveOutputSecondLineMatchers: False +evaluateSecondLineMatchers: False readCacheSecondLineMatchers: False writeCacheSecondLineMatchers: False # Step 5: run similarity matrix boosting on the output of second line matchers runSimMatrixBoostingOnSecondLineMatchers: False saveOutputSimMatrixBoostingOnSecondLineMatchers: True -evaluateSimMatrixBoostingOnSecondLineMatchers: True +evaluateSimMatrixBoostingOnSecondLineMatchers: False readCacheSimMatrixBoostingOnSecondLineMatchers: False writeCacheSimMatrixBoostingOnSecondLineMatchers: False From 6f3e3f0b5835f09f5ecb5c3325368e21d7a948ef Mon Sep 17 00:00:00 2001 From: Soefgi Date: Sat, 16 Mar 2024 16:43:23 +0100 Subject: [PATCH 09/11] fixed uniqueness and constancy matcher scores --- .../schematch/matching/metadata/ConstancyMatcher.java | 2 +- .../schematch/matching/metadata/UniquenessMatcher.java | 6 +++--- 
2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/ConstancyMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/ConstancyMatcher.java index a4f239a23..9bf4465bd 100644 --- a/src/main/java/de/uni_marburg/schematch/matching/metadata/ConstancyMatcher.java +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/ConstancyMatcher.java @@ -62,7 +62,7 @@ private float calculateScore(List source, List target) { float targetPercentage = (float) maxTarget / target.size(); //normalize to value between 0 and 1 - return (float) (1.0 - Math.round(Math.abs(sourcePercentage - targetPercentage) * 100.0) / 100.0); + return Math.min(sourcePercentage, targetPercentage) / Math.max(sourcePercentage, targetPercentage); } } diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/UniquenessMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/UniquenessMatcher.java index 450c985aa..f6efd1af0 100644 --- a/src/main/java/de/uni_marburg/schematch/matching/metadata/UniquenessMatcher.java +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/UniquenessMatcher.java @@ -37,9 +37,9 @@ private float calculateSimilarity(ArrayList sourceValues, ArrayList sourceElements = new HashSet<>(sourceValues); HashSet targetElements = new HashSet<>(targetValues); - float sourceDistinctPercentage = (float) sourceElements.size() / sourceValues.size(); - float targetDistinctPercentage = (float) targetElements.size() / targetValues.size(); + float sourcePercentage = (float) sourceElements.size() / sourceValues.size(); + float targetPercentage = (float) targetElements.size() / targetValues.size(); - return (float) (1.0 - Math.round(Math.abs(sourceDistinctPercentage - targetDistinctPercentage) * 100.0) / 100.0); + return Math.min(sourcePercentage, targetPercentage) / Math.max(sourcePercentage, targetPercentage); } } From d2e77184bee77f8dd5354f0c5cc6f8f9c3f62842 Mon Sep 17 00:00:00 2001 
From: Soefgi Date: Sun, 17 Mar 2024 18:21:54 +0100 Subject: [PATCH 10/11] added first implementation of histogram earth movers comparison (not functional yet) --- data/Test/test1/source/authors.csv | 6 +- .../test2/ground_truth/authors___authors.csv | 8 -- .../test2/metadata/source-to-target-inds.txt | 0 .../metadata/source/authors/FD_results.txt | 0 .../metadata/source/authors/UCC_results.txt | 0 data/Test/test2/metadata/source/inds.txt | 0 .../test2/metadata/target-to-source-inds.txt | 0 .../metadata/target/authors/FD_results.txt | 0 .../metadata/target/authors/UCC_results.txt | 0 data/Test/test2/metadata/target/inds.txt | 0 data/Test/test2/source/authors.csv | 8 -- data/Test/test2/target/authors.csv | 4 - .../test3/ground_truth/authors___authors.csv | 2 - .../test3/ground_truth/books___buecher.csv | 2 - .../test3/metadata/source-to-target-inds.txt | 0 .../metadata/source/authors/FD_results.txt | 0 .../metadata/source/authors/UCC_results.txt | 0 .../metadata/source/books/FD_results.txt | 0 .../metadata/source/books/UCC_results.txt | 0 data/Test/test3/metadata/source/inds.txt | 0 .../test3/metadata/target-to-source-inds.txt | 0 .../metadata/target/authors/FD_results.txt | 0 .../metadata/target/authors/UCC_results.txt | 0 .../metadata/target/buecher/FD_results.txt | 0 .../metadata/target/buecher/UCC_results.txt | 0 data/Test/test3/metadata/target/inds.txt | 0 data/Test/test3/source/authors.csv | 4 - data/Test/test3/source/books.csv | 4 - data/Test/test3/target/authors.csv | 4 - data/Test/test3/target/buecher.csv | 3 - .../matching/metadata/HistogramMatcher.java | 87 +++++++++++++++++++ 31 files changed, 90 insertions(+), 42 deletions(-) delete mode 100644 data/Test/test2/ground_truth/authors___authors.csv delete mode 100644 data/Test/test2/metadata/source-to-target-inds.txt delete mode 100644 data/Test/test2/metadata/source/authors/FD_results.txt delete mode 100644 data/Test/test2/metadata/source/authors/UCC_results.txt delete mode 100644 
data/Test/test2/metadata/source/inds.txt delete mode 100644 data/Test/test2/metadata/target-to-source-inds.txt delete mode 100644 data/Test/test2/metadata/target/authors/FD_results.txt delete mode 100644 data/Test/test2/metadata/target/authors/UCC_results.txt delete mode 100644 data/Test/test2/metadata/target/inds.txt delete mode 100644 data/Test/test2/source/authors.csv delete mode 100644 data/Test/test2/target/authors.csv delete mode 100644 data/Test/test3/ground_truth/authors___authors.csv delete mode 100644 data/Test/test3/ground_truth/books___buecher.csv delete mode 100644 data/Test/test3/metadata/source-to-target-inds.txt delete mode 100644 data/Test/test3/metadata/source/authors/FD_results.txt delete mode 100644 data/Test/test3/metadata/source/authors/UCC_results.txt delete mode 100644 data/Test/test3/metadata/source/books/FD_results.txt delete mode 100644 data/Test/test3/metadata/source/books/UCC_results.txt delete mode 100644 data/Test/test3/metadata/source/inds.txt delete mode 100644 data/Test/test3/metadata/target-to-source-inds.txt delete mode 100644 data/Test/test3/metadata/target/authors/FD_results.txt delete mode 100644 data/Test/test3/metadata/target/authors/UCC_results.txt delete mode 100644 data/Test/test3/metadata/target/buecher/FD_results.txt delete mode 100644 data/Test/test3/metadata/target/buecher/UCC_results.txt delete mode 100644 data/Test/test3/metadata/target/inds.txt delete mode 100644 data/Test/test3/source/authors.csv delete mode 100644 data/Test/test3/source/books.csv delete mode 100644 data/Test/test3/target/authors.csv delete mode 100644 data/Test/test3/target/buecher.csv create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/HistogramMatcher.java diff --git a/data/Test/test1/source/authors.csv b/data/Test/test1/source/authors.csv index b3968c2f2..1be4860cc 100644 --- a/data/Test/test1/source/authors.csv +++ b/data/Test/test1/source/authors.csv @@ -1,4 +1,4 @@ aid,name -0,alice doe -1,bob smith -2,eve smith 
+1,alice doe +2,bob smith +3,eve smith diff --git a/data/Test/test2/ground_truth/authors___authors.csv b/data/Test/test2/ground_truth/authors___authors.csv deleted file mode 100644 index a0ed46cdd..000000000 --- a/data/Test/test2/ground_truth/authors___authors.csv +++ /dev/null @@ -1,8 +0,0 @@ -1,0 -0,1 -0,0 -0,0 -0,0 -0,0 -0,0 -0,0 diff --git a/data/Test/test2/metadata/source-to-target-inds.txt b/data/Test/test2/metadata/source-to-target-inds.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test2/metadata/source/authors/FD_results.txt b/data/Test/test2/metadata/source/authors/FD_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test2/metadata/source/authors/UCC_results.txt b/data/Test/test2/metadata/source/authors/UCC_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test2/metadata/source/inds.txt b/data/Test/test2/metadata/source/inds.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test2/metadata/target-to-source-inds.txt b/data/Test/test2/metadata/target-to-source-inds.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test2/metadata/target/authors/FD_results.txt b/data/Test/test2/metadata/target/authors/FD_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test2/metadata/target/authors/UCC_results.txt b/data/Test/test2/metadata/target/authors/UCC_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test2/metadata/target/inds.txt b/data/Test/test2/metadata/target/inds.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test2/source/authors.csv b/data/Test/test2/source/authors.csv deleted file mode 100644 index cbebd34f9..000000000 --- a/data/Test/test2/source/authors.csv +++ /dev/null @@ -1,8 +0,0 @@ -aid,name,username,income,famous,alive,dob,address -0,alice doe,42,43.3,1,true,12-01-93,"67.36,-43.04" 
-1,bob smith,bs,42.0,0,false,11/1/1934,"12.3,26.04" -2,eve smith,123,50.0,1,true,1.1.85,"160.4,-12.3" -3,tim doe,td,54.3,0,true,14-01-1960,"0.7,-0.7" -4,eve evil,ee,59.4,1,false,01-14-1960,"0.0,0.0" -5,bob b.,bb,60.2,1,true,1/11/1934,"-12.34,-12.34" -6,a. author,a6,14.3,0,false,4.3.60,"180.0,180.0" diff --git a/data/Test/test2/target/authors.csv b/data/Test/test2/target/authors.csv deleted file mode 100644 index 7d5adc6ec..000000000 --- a/data/Test/test2/target/authors.csv +++ /dev/null @@ -1,4 +0,0 @@ -authorid,full_name,dob -2,john doe,xyz -3,jane doe,12-01-93 -4,bob smith,20/01/93 diff --git a/data/Test/test3/ground_truth/authors___authors.csv b/data/Test/test3/ground_truth/authors___authors.csv deleted file mode 100644 index 1fe698519..000000000 --- a/data/Test/test3/ground_truth/authors___authors.csv +++ /dev/null @@ -1,2 +0,0 @@ -1,0 -0,0 diff --git a/data/Test/test3/ground_truth/books___buecher.csv b/data/Test/test3/ground_truth/books___buecher.csv deleted file mode 100644 index 1fe698519..000000000 --- a/data/Test/test3/ground_truth/books___buecher.csv +++ /dev/null @@ -1,2 +0,0 @@ -1,0 -0,0 diff --git a/data/Test/test3/metadata/source-to-target-inds.txt b/data/Test/test3/metadata/source-to-target-inds.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/source/authors/FD_results.txt b/data/Test/test3/metadata/source/authors/FD_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/source/authors/UCC_results.txt b/data/Test/test3/metadata/source/authors/UCC_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/source/books/FD_results.txt b/data/Test/test3/metadata/source/books/FD_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/source/books/UCC_results.txt b/data/Test/test3/metadata/source/books/UCC_results.txt deleted file mode 100644 index e69de29bb..000000000 diff 
--git a/data/Test/test3/metadata/source/inds.txt b/data/Test/test3/metadata/source/inds.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/target-to-source-inds.txt b/data/Test/test3/metadata/target-to-source-inds.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/target/authors/FD_results.txt b/data/Test/test3/metadata/target/authors/FD_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/target/authors/UCC_results.txt b/data/Test/test3/metadata/target/authors/UCC_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/target/buecher/FD_results.txt b/data/Test/test3/metadata/target/buecher/FD_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/target/buecher/UCC_results.txt b/data/Test/test3/metadata/target/buecher/UCC_results.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/metadata/target/inds.txt b/data/Test/test3/metadata/target/inds.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/Test/test3/source/authors.csv b/data/Test/test3/source/authors.csv deleted file mode 100644 index b3968c2f2..000000000 --- a/data/Test/test3/source/authors.csv +++ /dev/null @@ -1,4 +0,0 @@ -aid,name -0,alice doe -1,bob smith -2,eve smith diff --git a/data/Test/test3/source/books.csv b/data/Test/test3/source/books.csv deleted file mode 100644 index ae79801aa..000000000 --- a/data/Test/test3/source/books.csv +++ /dev/null @@ -1,4 +0,0 @@ -name,genre -AlgoDat,CS -Programming,CS -Java,CS diff --git a/data/Test/test3/target/authors.csv b/data/Test/test3/target/authors.csv deleted file mode 100644 index 80ac6ab38..000000000 --- a/data/Test/test3/target/authors.csv +++ /dev/null @@ -1,4 +0,0 @@ -authorid,age -2,60 -3,68 -4,42 diff --git a/data/Test/test3/target/buecher.csv 
b/data/Test/test3/target/buecher.csv deleted file mode 100644 index c9b2e6c1e..000000000 --- a/data/Test/test3/target/buecher.csv +++ /dev/null @@ -1,3 +0,0 @@ -titel,numPages -Programmierung,203 -JavaFuerDummies,494 diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/HistogramMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/HistogramMatcher.java new file mode 100644 index 000000000..6ccf2a5c7 --- /dev/null +++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/HistogramMatcher.java @@ -0,0 +1,87 @@ +package de.uni_marburg.schematch.matching.metadata; + +import de.uni_marburg.schematch.data.Column; +import de.uni_marburg.schematch.data.Table; +import de.uni_marburg.schematch.data.metadata.Datatype; +import de.uni_marburg.schematch.matching.TablePairMatcher; +import de.uni_marburg.schematch.matchtask.tablepair.TablePair; +import lombok.Data; +import lombok.EqualsAndHashCode; + +import java.util.*; + +@Data +@EqualsAndHashCode(callSuper = true) +public class HistogramMatcher extends TablePairMatcher { + + @Override + public float[][] match(TablePair tablePair) { + Table sourceTable = tablePair.getSourceTable(); + Table targetTable = tablePair.getTargetTable(); + float[][] simMatrix = tablePair.getEmptySimMatrix(); + for (int i = 0; i < sourceTable.getNumColumns(); i++) { + Column sourceColumn = sourceTable.getColumn(i); + Datatype sourceType = sourceColumn.getDatatype(); + for (int j = 0; j < targetTable.getNumColumns(); j++) { + Column targetColumn = targetTable.getColumn(j); + Datatype targetType = targetColumn.getDatatype(); + if (sourceType != targetType) { + simMatrix[i][j] = 0f; + continue; + } + simMatrix[i][j] = calculateScore(sourceColumn, targetColumn); + } + } + return simMatrix; + } + + + private float calculateScore(Column sourceColumn, Column targetColumn) { + List sourceList = sourceColumn.getValues(); + List targetList = targetColumn.getValues(); + + HashMap sourceMap = new HashMap<>(); + HashMap 
targetMap = new HashMap<>(); + + for (String item : sourceList) { + sourceMap.putIfAbsent(item, 0); + sourceMap.put(item, sourceMap.get(item) + 1); + } + + for (String item : targetList) { + targetMap.putIfAbsent(item, 0); + targetMap.put(item, targetMap.get(item) + 1); + } + + Set bins = new HashSet<>(sourceMap.keySet()); + bins.addAll(targetMap.keySet()); + + ArrayList binsList = new ArrayList<>(bins); + Collections.sort(binsList); // bins follow String order, which is lexicographic rather than numeric for number-like values + + double[] source = new double[bins.size()]; + double[] target = new double[bins.size()]; + + for (int i = 0; i < binsList.size(); i++) { + String currKey = binsList.get(i); + source[i] = sourceMap.getOrDefault(currKey, 0) / (double) Math.max(1, sourceList.size()); // relative frequency, so columns of different length stay comparable + target[i] = targetMap.getOrDefault(currKey, 0) / (double) Math.max(1, targetList.size()); // relative frequency, so columns of different length stay comparable + } + + double emd = EMD(source, target); + return (float) Math.max(0.0, 1.0 - emd / Math.max(1, binsList.size() - 1)); // EMD is a distance (0 = identical); scale by its upper bound and invert to get a similarity in [0, 1] + } + + + private double EMD(double[] source, double[] target) { // 1-D earth mover's distance: sum of the absolute running surplus carried from bin to bin + double lastDistance = 0; + double totalDistance = 0; + for (int i = 0; i < source.length; i++) { + final double currentDistance = source[i] + lastDistance - target[i]; + totalDistance += Math.abs(currentDistance); + lastDistance = currentDistance; + } + return totalDistance; + } + +} From 83aacceee53cfeaef2a18c6f40db059c95347c84 Mon Sep 17 00:00:00 2001 From: Soefgi Date: Sun, 17 Mar 2024 22:50:59 +0100 Subject: [PATCH 11/11] implemented Maxima and Minima Matchers --- .../matching/metadata/MaximumMatcher.java | 97 +++++++++++++++++++ .../matching/metadata/MinimumMatcher.java | 95 ++++++++++++++++++ src/main/resources/first_line_matchers.yaml | 6 ++ 3 files changed, 198 insertions(+) create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumMatcher.java create mode 100644 src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumMatcher.java diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumMatcher.java new file mode 100644 index 000000000..e2914ec38 --- /dev/null +++
b/src/main/java/de/uni_marburg/schematch/matching/metadata/MaximumMatcher.java
@@ -0,0 +1,137 @@
+package de.uni_marburg.schematch.matching.metadata;
+
+import de.uni_marburg.schematch.data.Column;
+import de.uni_marburg.schematch.data.Table;
+import de.uni_marburg.schematch.data.metadata.Datatype;
+import de.uni_marburg.schematch.matching.TablePairMatcher;
+import de.uni_marburg.schematch.matchtask.tablepair.TablePair;
+import de.uni_marburg.schematch.utils.GeoLocation;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+import java.util.Date;
+import java.util.List;
+
+/**
+ * Matcher that scores column pairs by how close their largest values are.
+ *
+ * <p>Only pairs whose columns share the {@code INTEGER} or {@code FLOAT} datatype are compared;
+ * every other pair is scored 0.
+ */
+@Data
+@EqualsAndHashCode(callSuper = true)
+public class MaximumMatcher extends TablePairMatcher {
+
+    /**
+     * Fills the similarity matrix of the given table pair with maximum-based scores.
+     *
+     * @param tablePair the source and target table to compare column by column
+     * @return matrix indexed as {@code [sourceColumn][targetColumn]}
+     */
+    @Override
+    public float[][] match(TablePair tablePair) {
+        Table sourceTable = tablePair.getSourceTable();
+        Table targetTable = tablePair.getTargetTable();
+
+        float[][] simMatrix = tablePair.getEmptySimMatrix();
+        for (int i = 0; i < sourceTable.getNumColumns(); i++) {
+            Column sourceColumn = sourceTable.getColumn(i);
+            Datatype sourceType = sourceColumn.getDatatype();
+            for (int j = 0; j < targetTable.getNumColumns(); j++) {
+                Column targetColumn = targetTable.getColumn(j);
+                if (sourceType != targetColumn.getDatatype()) {
+                    simMatrix[i][j] = 0;
+                    continue;
+                }
+                switch (sourceType) {
+                    case INTEGER -> simMatrix[i][j] = integerExtrema(sourceColumn, targetColumn);
+                    case FLOAT -> simMatrix[i][j] = floatExtrema(sourceColumn, targetColumn);
+                    default -> simMatrix[i][j] = 0;
+                }
+            }
+        }
+
+        return simMatrix;
+    }
+
+    /**
+     * Compares the largest non-null integer of each column.
+     *
+     * @return similarity in [0, 1]; 0 if either column has no non-null value
+     */
+    private float integerExtrema(Column sourceRaw, Column targetRaw) {
+        Integer sourceMax = maxInteger(Datatype.castToInt(sourceRaw));
+        Integer targetMax = maxInteger(Datatype.castToInt(targetRaw));
+        if (sourceMax == null || targetMax == null) {
+            return 0f;
+        }
+        return ratioSimilarity(sourceMax, targetMax);
+    }
+
+    /**
+     * Compares the largest non-null, non-NaN float of each column.
+     *
+     * @return similarity in [0, 1]; 0 if either column has no usable value
+     */
+    private float floatExtrema(Column sourceRaw, Column targetRaw) {
+        Float sourceMax = maxFloat(Datatype.castToFloat(sourceRaw));
+        Float targetMax = maxFloat(Datatype.castToFloat(targetRaw));
+        if (sourceMax == null || targetMax == null) {
+            return 0f;
+        }
+        return ratioSimilarity(sourceMax, targetMax);
+    }
+
+    /** Returns the largest non-null element, or {@code null} if there is none. */
+    private static Integer maxInteger(List<Integer> values) {
+        Integer max = null;
+        for (Integer value : values) {
+            if (value != null && (max == null || value > max)) {
+                max = value;
+            }
+        }
+        return max;
+    }
+
+    /**
+     * Returns the largest non-null, non-NaN element, or {@code null} if there is none.
+     *
+     * <p>Tracking "no value yet" as {@code null} avoids seeding the scan with
+     * {@code Float.MIN_VALUE}, which is the smallest positive float and therefore hides every
+     * zero or negative maximum.
+     */
+    private static Float maxFloat(List<Float> values) {
+        Float max = null;
+        for (Float value : values) {
+            if (value == null || value.isNaN()) {
+                continue;
+            }
+            if (max == null || value > max) {
+                max = value;
+            }
+        }
+        return max;
+    }
+
+    /**
+     * Maps two extrema to a similarity in [0, 1].
+     *
+     * <p>Equal values score 1, values of opposite sign score 0, and otherwise the score is the
+     * smaller magnitude divided by the larger one.
+     */
+    private static float ratioSimilarity(double a, double b) {
+        if (a == b) {
+            // Also covers 0 vs 0, where a plain ratio would be 0/0 = NaN.
+            return 1f;
+        }
+        if ((a < 0) != (b < 0)) {
+            // A plain min/max ratio would be negative or infinite here.
+            return 0f;
+        }
+        double smaller = Math.min(Math.abs(a), Math.abs(b));
+        double larger = Math.max(Math.abs(a), Math.abs(b));
+        // larger > 0: a != b, so at least one magnitude is non-zero.
+        return (float) (smaller / larger);
+    }
+
+}
diff --git a/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumMatcher.java b/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumMatcher.java
new file mode 100644
index 000000000..77b504fd3
--- /dev/null
+++ b/src/main/java/de/uni_marburg/schematch/matching/metadata/MinimumMatcher.java
@@ -0,0 +1,129 @@
+package de.uni_marburg.schematch.matching.metadata;
+
+import de.uni_marburg.schematch.data.Column;
+import de.uni_marburg.schematch.data.Table;
+import de.uni_marburg.schematch.data.metadata.Datatype;
+import de.uni_marburg.schematch.matching.TablePairMatcher;
+import de.uni_marburg.schematch.matchtask.tablepair.TablePair;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+import java.util.List;
+
+/**
+ * Matcher that scores column pairs by how close their smallest values are.
+ *
+ * <p>Only pairs whose columns share the {@code INTEGER} or {@code FLOAT} datatype are compared;
+ * every other pair is scored 0.
+ */
+@Data
+@EqualsAndHashCode(callSuper = true)
+public class MinimumMatcher extends TablePairMatcher {
+
+    /**
+     * Fills the similarity matrix of the given table pair with minimum-based scores.
+     *
+     * @param tablePair the source and target table to compare column by column
+     * @return matrix indexed as {@code [sourceColumn][targetColumn]}
+     */
+    @Override
+    public float[][] match(TablePair tablePair) {
+        Table sourceTable = tablePair.getSourceTable();
+        Table targetTable = tablePair.getTargetTable();
+
+        float[][] simMatrix = tablePair.getEmptySimMatrix();
+        for (int i = 0; i < sourceTable.getNumColumns(); i++) {
+            Column sourceColumn = sourceTable.getColumn(i);
+            Datatype sourceType = sourceColumn.getDatatype();
+            for (int j = 0; j < targetTable.getNumColumns(); j++) {
+                Column targetColumn = targetTable.getColumn(j);
+                if (sourceType != targetColumn.getDatatype()) {
+                    simMatrix[i][j] = 0;
+                    continue;
+                }
+                switch (sourceType) {
+                    case INTEGER -> simMatrix[i][j] = integerExtrema(sourceColumn, targetColumn);
+                    case FLOAT -> simMatrix[i][j] = floatExtrema(sourceColumn, targetColumn);
+                    default -> simMatrix[i][j] = 0;
+                }
+            }
+        }
+
+        return simMatrix;
+    }
+
+    /**
+     * Compares the smallest non-null integer of each column.
+     *
+     * @return similarity in [0, 1]; 0 if either column has no non-null value
+     */
+    private float integerExtrema(Column sourceRaw, Column targetRaw) {
+        Integer sourceMin = minInteger(Datatype.castToInt(sourceRaw));
+        Integer targetMin = minInteger(Datatype.castToInt(targetRaw));
+        if (sourceMin == null || targetMin == null) {
+            return 0f;
+        }
+        return ratioSimilarity(sourceMin, targetMin);
+    }
+
+    /**
+     * Compares the smallest non-null, non-NaN float of each column.
+     *
+     * @return similarity in [0, 1]; 0 if either column has no usable value
+     */
+    private float floatExtrema(Column sourceRaw, Column targetRaw) {
+        Float sourceMin = minFloat(Datatype.castToFloat(sourceRaw));
+        Float targetMin = minFloat(Datatype.castToFloat(targetRaw));
+        if (sourceMin == null || targetMin == null) {
+            return 0f;
+        }
+        return ratioSimilarity(sourceMin, targetMin);
+    }
+
+    /** Returns the smallest non-null element, or {@code null} if there is none. */
+    private static Integer minInteger(List<Integer> values) {
+        Integer min = null;
+        for (Integer value : values) {
+            if (value != null && (min == null || value < min)) {
+                min = value;
+            }
+        }
+        return min;
+    }
+
+    /** Returns the smallest non-null, non-NaN element, or {@code null} if there is none. */
+    private static Float minFloat(List<Float> values) {
+        Float min = null;
+        for (Float value : values) {
+            if (value == null || value.isNaN()) {
+                continue;
+            }
+            if (min == null || value < min) {
+                min = value;
+            }
+        }
+        return min;
+    }
+
+    /**
+     * Maps two extrema to a similarity in [0, 1].
+     *
+     * <p>Equal values score 1, values of opposite sign score 0, and otherwise the score is the
+     * smaller magnitude divided by the larger one.
+     */
+    private static float ratioSimilarity(double a, double b) {
+        if (a == b) {
+            // Also covers 0 vs 0, where a plain ratio would be 0/0 = NaN.
+            return 1f;
+        }
+        if ((a < 0) != (b < 0)) {
+            // A plain min/max ratio would be negative or infinite here.
+            return 0f;
+        }
+        double smaller = Math.min(Math.abs(a), Math.abs(b));
+        double larger = Math.max(Math.abs(a), Math.abs(b));
+        // larger > 0: a != b, so at least one magnitude is non-zero.
+        return (float) (smaller / larger);
+    }
+
+}
diff --git a/src/main/resources/first_line_matchers.yaml b/src/main/resources/first_line_matchers.yaml
index a8072e223..06c9ae39c 100644
--- a/src/main/resources/first_line_matchers.yaml
+++ b/src/main/resources/first_line_matchers.yaml
@@ -54,6 +54,12 @@ packageName: "metadata"
 ---
 name: "MedianLength"
 packageName: "metadata"
+---
+name: "MaximumMatcher"
+packageName: "metadata"
+---
+name: "MinimumMatcher"
+packageName: "metadata"