Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reuter reuter main PR #26

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
6 changes: 3 additions & 3 deletions data/Test/test1/source/authors.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
aid,name
0,alice doe
1,bob smith
2,eve smith
1,alice doe
2,bob smith
3,eve smith
8 changes: 0 additions & 8 deletions data/Test/test2/ground_truth/authors___authors.csv

This file was deleted.

Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
8 changes: 0 additions & 8 deletions data/Test/test2/source/authors.csv

This file was deleted.

4 changes: 0 additions & 4 deletions data/Test/test2/target/authors.csv

This file was deleted.

2 changes: 0 additions & 2 deletions data/Test/test3/ground_truth/authors___authors.csv

This file was deleted.

2 changes: 0 additions & 2 deletions data/Test/test3/ground_truth/books___buecher.csv

This file was deleted.

Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
4 changes: 0 additions & 4 deletions data/Test/test3/source/authors.csv

This file was deleted.

4 changes: 0 additions & 4 deletions data/Test/test3/source/books.csv

This file was deleted.

4 changes: 0 additions & 4 deletions data/Test/test3/target/authors.csv

This file was deleted.

3 changes: 0 additions & 3 deletions data/Test/test3/target/buecher.csv

This file was deleted.

24 changes: 13 additions & 11 deletions src/main/java/de/uni_marburg/schematch/data/metadata/Datatype.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,20 @@ public enum Datatype {
GEO_LOCATION;

// boolean values
final static String[] t = {"1", "true", "t", "yes", "y", "ja", "j"};
final static String[] f = {"0", "false", "f", "no", "n", "nein"};
final static List<String> patternsT = Arrays.stream(t).toList();
final static List<String> patternsF = Arrays.stream(f).toList();
final static List<String> booleanPatterns = Stream.concat(patternsT.stream(), patternsF.stream()).toList();
private final static String[] nulls = {"-", "nan", "\"\"", "\\", "/", "null"};
private final static List<String> nullPatterns = Arrays.stream(nulls).toList();
private final static String[] t = {"1", "true", "t", "yes", "y", "ja", "j"};
private final static String[] f = {"0", "false", "f", "no", "n", "nein"};
private final static List<String> patternsT = Arrays.stream(t).toList();
private final static List<String> patternsF = Arrays.stream(f).toList();
private final static List<String> booleanPatterns = Stream.concat(patternsT.stream(), patternsF.stream()).toList();
// geolocation pattern
final static Pattern geoLocationPattern = Pattern.compile("-?[0-9]+.[0-9]+,-?[0-9]+.[0-9]+");
private final static Pattern geoLocationPattern = Pattern.compile("-?[0-9]+.[0-9]+,-?[0-9]+.[0-9]+");
// date formats
final static SimpleDateFormat sdfDashes = new SimpleDateFormat("dd-MM");
final static SimpleDateFormat sdfSlashes = new SimpleDateFormat("dd/MM");
final static SimpleDateFormat sdfDots = new SimpleDateFormat("dd.MM");
final static SimpleDateFormat[] sdfs = {sdfDashes, sdfSlashes, sdfDots};
private final static SimpleDateFormat sdfDashes = new SimpleDateFormat("dd-MM");
private final static SimpleDateFormat sdfSlashes = new SimpleDateFormat("dd/MM");
private final static SimpleDateFormat sdfDots = new SimpleDateFormat("dd.MM");
private final static SimpleDateFormat[] sdfs = {sdfDashes, sdfSlashes, sdfDots};

/**
* Determines the definitive data type from a list of scores
Expand Down Expand Up @@ -322,7 +324,7 @@ private static double isGeoLocation(Column column) {
}

private static boolean isNull(String value) {
return value.equals("\"\"") || value.isEmpty();
return nullPatterns.contains(value.toLowerCase()) || value.isEmpty();
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package de.uni_marburg.schematch.matching.metadata;

import de.uni_marburg.schematch.data.Table;
import de.uni_marburg.schematch.matching.TablePairMatcher;
import de.uni_marburg.schematch.matchtask.tablepair.TablePair;
import lombok.Data;
import lombok.EqualsAndHashCode;

import java.util.HashMap;
import java.util.List;

/**
 * Matches columns by their "constancy": the share of rows occupied by the
 * single most frequent value of a column. Two columns are considered similar
 * when their constancies are close to each other.
 */
@Data
@EqualsAndHashCode(callSuper = true)
public class ConstancyMatcher extends TablePairMatcher {

    /**
     * Fills the similarity matrix of the table pair with one constancy score
     * per (source column, target column) combination.
     *
     * @param tablePair pair of source and target table to compare
     * @return matrix indexed as {@code [sourceColumn][targetColumn]}; every
     *         entry is in {@code [0, 1]}
     */
    @Override
    public float[][] match(TablePair tablePair) {
        Table sourceTable = tablePair.getSourceTable();
        Table targetTable = tablePair.getTargetTable();
        float[][] simMatrix = tablePair.getEmptySimMatrix();

        // The constancy of a column does not depend on the column it is compared
        // with, so compute it once per column instead of once per pair.
        float[] sourceConstancy = new float[sourceTable.getNumColumns()];
        for (int i = 0; i < sourceConstancy.length; i++) {
            sourceConstancy[i] = constancy(sourceTable.getColumn(i).getValues());
        }
        float[] targetConstancy = new float[targetTable.getNumColumns()];
        for (int j = 0; j < targetConstancy.length; j++) {
            targetConstancy[j] = constancy(targetTable.getColumn(j).getValues());
        }

        for (int i = 0; i < sourceConstancy.length; i++) {
            for (int j = 0; j < targetConstancy.length; j++) {
                simMatrix[i][j] = calculateScore(sourceConstancy[i], targetConstancy[j]);
            }
        }
        return simMatrix;
    }

    /**
     * Turns two constancy values into a similarity in {@code [0, 1]} by dividing
     * the smaller one by the larger one.
     *
     * @param sourceConstancy constancy of the source column
     * @param targetConstancy constancy of the target column
     * @return ratio smaller/larger, or {@code 0} if either column had no rows
     */
    private static float calculateScore(float sourceConstancy, float targetConstancy) {
        // A constancy of 0 only occurs for an empty column; without this guard the
        // ratio below would be 0/0 = NaN.
        if (sourceConstancy <= 0.0f || targetConstancy <= 0.0f) {
            return 0.0f;
        }
        return Math.min(sourceConstancy, targetConstancy) / Math.max(sourceConstancy, targetConstancy);
    }

    /**
     * Computes the frequency of the most common value divided by the number of rows.
     *
     * @param values all values of one column
     * @return value in {@code (0, 1]}, or {@code 0} for an empty column
     */
    private static float constancy(List<String> values) {
        if (values.isEmpty()) {
            return 0.0f;
        }
        HashMap<String, Integer> frequencies = new HashMap<>();
        int highestFrequency = 0;
        for (String value : values) {
            int frequency = frequencies.merge(value, 1, Integer::sum);
            if (frequency > highestFrequency) {
                highestFrequency = frequency;
            }
        }
        return (float) highestFrequency / values.size();
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package de.uni_marburg.schematch.matching.metadata;

import de.uni_marburg.schematch.data.Table;
import de.uni_marburg.schematch.data.metadata.Datatype;
import de.uni_marburg.schematch.matching.TablePairMatcher;
import de.uni_marburg.schematch.matchtask.tablepair.TablePair;
import lombok.Data;
import lombok.EqualsAndHashCode;
import java.util.List;

/**
 * Matches FLOAT columns by the maximum number of decimal places found in
 * their values. Column pairs where either side is not of type
 * {@link Datatype#FLOAT} always score {@code 0}.
 */
@Data
@EqualsAndHashCode(callSuper = true)
public class DecimalsMatcher extends TablePairMatcher {

    /**
     * Fills the similarity matrix of the table pair with one decimals score per
     * (source column, target column) combination.
     *
     * @param tablePair pair of source and target table to compare
     * @return matrix indexed as {@code [sourceColumn][targetColumn]}; every
     *         entry is in {@code [0, 1]}
     */
    @Override
    public float[][] match(TablePair tablePair) {
        Table sourceTable = tablePair.getSourceTable();
        Table targetTable = tablePair.getTargetTable();
        float[][] simMatrix = tablePair.getEmptySimMatrix();
        for (int i = 0; i < sourceTable.getNumColumns(); i++) {
            boolean sourceIsFloat = sourceTable.getColumn(i).getDatatype() == Datatype.FLOAT;
            for (int j = 0; j < targetTable.getNumColumns(); j++) {
                boolean targetIsFloat = targetTable.getColumn(j).getDatatype() == Datatype.FLOAT;
                if (!sourceIsFloat || !targetIsFloat) {
                    simMatrix[i][j] = 0.0f;
                    continue;
                }
                simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues());
            }
        }
        return simMatrix;
    }

    /**
     * Compares the maximum number of decimal places of two columns.
     *
     * @param sourceColumn values of the source column
     * @param targetColumn values of the target column
     * @return smaller maximum divided by larger maximum; {@code 1} when neither
     *         column contains any decimal places (both maxima are 0)
     */
    private float calculateScore(List<String> sourceColumn, List<String> targetColumn) {
        int maxSourceDecimals = maxDecimals(sourceColumn);
        int maxTargetDecimals = maxDecimals(targetColumn);

        int larger = Math.max(maxSourceDecimals, maxTargetDecimals);
        if (larger == 0) {
            // Both columns have zero decimal places: identical with respect to this
            // feature. Also avoids dividing by zero.
            return 1.0f;
        }
        // Cast before dividing; int / int would truncate the ratio to 0 or 1.
        return (float) Math.min(maxSourceDecimals, maxTargetDecimals) / larger;
    }

    /**
     * Determines the largest number of characters following the first '.' in any
     * value of the column.
     *
     * @param column values of one column
     * @return maximum count of characters after the decimal point; values that
     *         are {@code null} or contain no '.' contribute 0
     */
    private static int maxDecimals(List<String> column) {
        int max = 0;
        for (String value : column) {
            if (value == null) {
                continue;
            }
            int dot = value.indexOf('.');
            if (dot < 0) {
                // No decimal point means no decimals (not "the whole string").
                continue;
            }
            max = Math.max(max, value.length() - dot - 1);
        }
        return max;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@
import lombok.Data;
import lombok.EqualsAndHashCode;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package de.uni_marburg.schematch.matching.metadata;

import de.uni_marburg.schematch.data.Table;
import de.uni_marburg.schematch.data.metadata.Datatype;
import de.uni_marburg.schematch.matching.TablePairMatcher;
import de.uni_marburg.schematch.matchtask.tablepair.TablePair;
import lombok.Data;
import lombok.EqualsAndHashCode;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Matches numeric columns by the distribution of the first digit of their
 * values. Column pairs where either side is neither {@link Datatype#INTEGER}
 * nor {@link Datatype#FLOAT} always score {@code 0}.
 */
@Data
@EqualsAndHashCode(callSuper = true)
public class FirstDigitMatcher extends TablePairMatcher {

    /** Number of possible first digits (0 through 9). */
    private static final int NUM_DIGITS = 10;

    /**
     * Fills the similarity matrix of the table pair with one first-digit score
     * per (source column, target column) combination.
     *
     * @param tablePair pair of source and target table to compare
     * @return matrix indexed as {@code [sourceColumn][targetColumn]}; every
     *         entry is in {@code [0, 1]}
     */
    @Override
    public float[][] match(TablePair tablePair) {
        Table sourceTable = tablePair.getSourceTable();
        Table targetTable = tablePair.getTargetTable();
        float[][] simMatrix = tablePair.getEmptySimMatrix();
        for (int i = 0; i < sourceTable.getNumColumns(); i++) {
            boolean sourceIsNumeric = isNumeric(sourceTable.getColumn(i).getDatatype());
            for (int j = 0; j < targetTable.getNumColumns(); j++) {
                boolean targetIsNumeric = isNumeric(targetTable.getColumn(j).getDatatype());
                if (!sourceIsNumeric || !targetIsNumeric) {
                    simMatrix[i][j] = 0.0f;
                    continue;
                }
                simMatrix[i][j] = calculateScore(sourceTable.getColumn(i).getValues(), targetTable.getColumn(j).getValues());
            }
        }
        return simMatrix;
    }

    /** Returns whether the datatype is one this matcher handles (INTEGER or FLOAT). */
    private static boolean isNumeric(Datatype type) {
        return type == Datatype.INTEGER || type == Datatype.FLOAT;
    }

    /**
     * Compares the first-digit frequency distributions of two columns.
     *
     * <p>The result is {@code 1 - d/2}, where {@code d} is the sum over all ten
     * digits of the absolute difference of their relative frequencies. Identical
     * distributions therefore yield {@code 1} and disjoint ones yield {@code 0}.
     *
     * @param sourceColumn values of the source column
     * @param targetColumn values of the target column
     * @return similarity in {@code [0, 1]}; {@code 0} if either column contains
     *         no value with a recognizable first digit
     */
    private float calculateScore(List<String> sourceColumn, List<String> targetColumn) {
        int[] sourceCounts = countFirstDigits(sourceColumn);
        int[] targetCounts = countFirstDigits(targetColumn);

        int sourceTotal = 0;
        int targetTotal = 0;
        for (int digit = 0; digit < NUM_DIGITS; digit++) {
            sourceTotal += sourceCounts[digit];
            targetTotal += targetCounts[digit];
        }
        // Without any numeric value there is no distribution to compare (and the
        // relative frequencies below would be 0/0).
        if (sourceTotal == 0 || targetTotal == 0) {
            return 0.0f;
        }

        float totalDifference = 0.0f;
        for (int digit = 0; digit < NUM_DIGITS; digit++) {
            float sourceShare = (float) sourceCounts[digit] / sourceTotal;
            float targetShare = (float) targetCounts[digit] / targetTotal;
            totalDifference += Math.abs(sourceShare - targetShare);
        }

        // Two distributions that each sum to 1 differ by at most 2 in total, so
        // halving maps the distance to [0, 1]; subtracting from 1 turns the
        // distance into a similarity. Clamp to absorb float rounding.
        return Math.max(0.0f, 1.0f - totalDifference / 2.0f);
    }

    /**
     * Counts how often each digit 0-9 occurs as the first digit of a value.
     *
     * @param column values of one column
     * @return array of length 10 where index {@code d} holds the number of values
     *         whose first digit is {@code d}; non-numeric values are not counted
     */
    private static int[] countFirstDigits(List<String> column) {
        int[] counts = new int[NUM_DIGITS];
        for (String entry : column) {
            int digit = firstDigit(entry);
            if (digit >= 0) {
                counts[digit]++;
            }
        }
        return counts;
    }

    /**
     * Extracts the first digit character of a numeric string, skipping a sign or
     * leading decimal point.
     *
     * @param entry raw column value, may be {@code null}
     * @return digit in {@code 0..9}, or {@code -1} if the entry is {@code null},
     *         is not parseable as a number, or contains no digit
     */
    private static int firstDigit(String entry) {
        if (entry == null) {
            return -1;
        }
        try {
            // Validates integers and floats alike; Integer.parseInt would reject
            // every FLOAT value and every integer outside the int range.
            Double.parseDouble(entry);
        } catch (NumberFormatException ignored) {
            // Not a number (e.g. a null marker inside a numeric column): skip it.
            return -1;
        }
        for (int k = 0; k < entry.length(); k++) {
            char c = entry.charAt(k);
            if (c >= '0' && c <= '9') {
                return c - '0';
            }
        }
        // Parseable but digit-free, e.g. "NaN" or "Infinity".
        return -1;
    }

}
Loading