-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#434 - drafted libsvm/liblinear with super module
- Loading branch information
Showing
26 changed files
with
769 additions
and
1,384 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!--
        Maven descriptor of the dkpro-tc-io-libsvm module.
        The module declares no groupId or version of its own; Maven inherits
        both from the parent declared below.
    -->
    <modelVersion>4.0.0</modelVersion>
    <artifactId>dkpro-tc-io-libsvm</artifactId>
    <parent>
        <groupId>org.dkpro.tc</groupId>
        <artifactId>dkpro-tc</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>
    <!--
        No dependency versions are given here. Presumably they are supplied by
        the parent's dependencyManagement section (TODO confirm against the
        parent POM).
    -->
    <dependencies>
        <dependency>
            <groupId>org.dkpro.tc</groupId>
            <artifactId>dkpro-tc-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.dkpro.tc</groupId>
            <artifactId>dkpro-tc-ml</artifactId>
        </dependency>
    </dependencies>
</project>
34 changes: 34 additions & 0 deletions
34
dkpro-tc-io-libsvm/src/main/java/org/dkpro/tc/io/libsvm/AdapterFormat.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/******************************************************************************* | ||
* Copyright 2018 | ||
* Ubiquitous Knowledge Processing (UKP) Lab | ||
* Technische Universität Darmstadt | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
******************************************************************************/ | ||
package org.dkpro.tc.io.libsvm; | ||
|
||
/**
 * Provides the fixed names of auxiliary files used by the libsvm data format
 * module.
 * <p>
 * All accessors are static and return constant values.
 */
public class AdapterFormat {

    /** Name of the file holding the outcome mapping. */
    private static final String OUTCOME_MAPPING_FILENAME = "outcome-mapping.txt";

    /** Name of the file holding the feature-name mapping. */
    private static final String FEATURE_NAME_MAPPING_FILENAME = "feature-name-mapping.txt";

    /** Name of the file holding the feature names. */
    private static final String FEATURE_NAMES_FILENAME = "featurenames.txt";

    /**
     * Returns the name of the outcome mapping file.
     *
     * @return the constant file name {@code outcome-mapping.txt}
     */
    public static String getOutcomeMappingFilename() {
        return OUTCOME_MAPPING_FILENAME;
    }

    /**
     * Returns the name of the feature-name mapping file.
     *
     * @return the constant file name {@code feature-name-mapping.txt}
     */
    public static String getFeatureNameMappingFilename() {
        return FEATURE_NAME_MAPPING_FILENAME;
    }

    /**
     * Returns the name of the feature names file.
     *
     * @return the constant file name {@code featurenames.txt}
     */
    public static String getFeatureNames() {
        return FEATURE_NAMES_FILENAME;
    }

}
189 changes: 189 additions & 0 deletions
189
dkpro-tc-io-libsvm/src/main/java/org/dkpro/tc/io/libsvm/LibsvmDataFormatOutcomeIdReport.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
/******************************************************************************* | ||
* Copyright 2018 | ||
* Ubiquitous Knowledge Processing (UKP) Lab | ||
* Technische Universität Darmstadt | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
******************************************************************************/ | ||
package org.dkpro.tc.io.libsvm; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.UnsupportedEncodingException; | ||
import java.net.URLEncoder; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Properties; | ||
|
||
import org.apache.commons.io.FileUtils; | ||
import org.apache.commons.io.output.FileWriterWithEncoding; | ||
import org.dkpro.lab.reporting.ReportBase; | ||
import org.dkpro.lab.storage.StorageService.AccessMode; | ||
import org.dkpro.tc.core.Constants; | ||
import org.dkpro.tc.core.task.InitTask; | ||
import org.dkpro.tc.ml.report.util.SortedKeyProperties; | ||
|
||
/** | ||
* Creates id 2 outcome report | ||
*/ | ||
public class LibsvmDataFormatOutcomeIdReport extends ReportBase implements Constants { | ||
// constant dummy value for setting as threshold which is an expected field | ||
// in the evaluation | ||
// module but is not needed/provided by liblinear | ||
private static final String THRESHOLD_CONSTANT = "-1"; | ||
|
||
public LibsvmDataFormatOutcomeIdReport() { | ||
// required by groovy | ||
} | ||
|
||
@Override | ||
public void execute() throws Exception { | ||
boolean isRegression = isRegression(); | ||
|
||
boolean isUnit = getDiscriminators().get(InitTask.class.getName() + "|" + Constants.DIM_FEATURE_MODE) | ||
.equals(Constants.FM_UNIT); | ||
|
||
Map<Integer, String> id2label = getId2LabelMapping(isRegression); | ||
String header = buildHeader(id2label, isRegression); | ||
|
||
List<String> predictions = readPredictions(); | ||
Map<String, String> index2instanceIdMap = getMapping(isUnit); | ||
|
||
Properties prop = new SortedKeyProperties(); | ||
int lineCounter = 0; | ||
for (String line : predictions) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
String[] split = line.split(";"); | ||
String key = index2instanceIdMap.get(lineCounter + ""); | ||
|
||
if (isRegression) { | ||
prop.setProperty(key, split[0] + ";" + split[1] + ";" + THRESHOLD_CONSTANT); | ||
} else { | ||
int pred = Double.valueOf(split[0]).intValue(); | ||
int gold = Double.valueOf(split[1]).intValue(); | ||
prop.setProperty(key, pred + ";" + gold + ";" + THRESHOLD_CONSTANT); | ||
} | ||
lineCounter++; | ||
} | ||
|
||
File targetFile = getId2OutcomeFileLocation(); | ||
|
||
FileWriterWithEncoding fw = new FileWriterWithEncoding(targetFile, "utf-8"); | ||
prop.store(fw, header); | ||
fw.close(); | ||
|
||
} | ||
|
||
private boolean isRegression() { | ||
|
||
Collection<String> keys = getDiscriminators().keySet(); | ||
for (String k : keys) { | ||
if (k.endsWith("|" + Constants.DIM_LEARNING_MODE)) { | ||
return getDiscriminators().get(k).equals(Constants.LM_REGRESSION); | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
private Map<String, String> getMapping(boolean isUnit) throws IOException { | ||
|
||
File f; | ||
if (isUnit) { | ||
f = new File(getContext().getFolder(TEST_TASK_INPUT_KEY_TEST_DATA, AccessMode.READONLY), | ||
LibsvmDataFormatWriter.INDEX2INSTANCEID); | ||
} else { | ||
f = new File(getContext().getFolder(TEST_TASK_INPUT_KEY_TEST_DATA, AccessMode.READONLY), | ||
Constants.FILENAME_DOCUMENT_META_DATA_LOG); | ||
} | ||
|
||
Map<String, String> m = new HashMap<>(); | ||
|
||
int idx = 0; | ||
for (String l : FileUtils.readLines(f, "utf-8")) { | ||
if (l.startsWith("#")) { | ||
continue; | ||
} | ||
if (l.trim().isEmpty()) { | ||
continue; | ||
} | ||
String[] split = l.split("\t"); | ||
|
||
// if (isUnit) { | ||
m.put(idx + "", split[0]); | ||
idx++; | ||
// } else { | ||
// m.put(split[0], split[1]); | ||
// } | ||
|
||
} | ||
return m; | ||
} | ||
|
||
private File getId2OutcomeFileLocation() { | ||
File evaluationFolder = getContext().getFolder("", AccessMode.READWRITE); | ||
return new File(evaluationFolder, ID_OUTCOME_KEY); | ||
} | ||
|
||
private List<String> readPredictions() throws IOException { | ||
File predFolder = getContext().getFolder("", AccessMode.READWRITE); | ||
return FileUtils.readLines(new File(predFolder, Constants.FILENAME_PREDICTIONS), "utf-8"); | ||
} | ||
|
||
private String buildHeader(Map<Integer, String> id2label, boolean isRegression) | ||
throws UnsupportedEncodingException { | ||
StringBuilder header = new StringBuilder(); | ||
header.append("ID=PREDICTION;GOLDSTANDARD;THRESHOLD" + "\n" + "labels" + " "); | ||
|
||
if (isRegression) { | ||
// no label mapping for regression so that is all we have to do | ||
return header.toString(); | ||
} | ||
|
||
int numKeys = id2label.keySet().size(); | ||
List<Integer> keys = new ArrayList<Integer>(id2label.keySet()); | ||
for (int i = 0; i < numKeys; i++) { | ||
Integer key = keys.get(i); | ||
header.append(key + "=" + URLEncoder.encode(id2label.get(key), "UTF-8")); | ||
if (i + 1 < numKeys) { | ||
header.append(" "); | ||
} | ||
} | ||
return header.toString(); | ||
} | ||
|
||
private Map<Integer, String> getId2LabelMapping(boolean isRegression) throws Exception { | ||
if (isRegression) { | ||
// no map for regression; | ||
return new HashMap<>(); | ||
} | ||
|
||
File folder = getContext().getFolder(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY); | ||
String fileName = AdapterFormat.getOutcomeMappingFilename(); | ||
File file = new File(folder, fileName); | ||
Map<Integer, String> map = new HashMap<Integer, String>(); | ||
|
||
List<String> lines = FileUtils.readLines(file, "utf-8"); | ||
for (String line : lines) { | ||
String[] split = line.split("\t"); | ||
map.put(Integer.valueOf(split[1]), split[0]); | ||
} | ||
|
||
return map; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.