Skip to content

Commit

Permalink
#434 - drafted libsvm/liblinear with super module
Browse files Browse the repository at this point in the history
  • Loading branch information
Horsmann committed Feb 3, 2018
1 parent df93dbe commit d366ee3
Show file tree
Hide file tree
Showing 26 changed files with 769 additions and 1,384 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,16 @@ public void setOutputFolder(File outputFolder)
}


public void writeModelConfiguration(TaskContext aContext, String mlAdapter) throws Exception{
protected void writeModelConfiguration(TaskContext aContext) throws Exception{

SaveModelUtils.writeModelParameters(aContext, outputFolder, featureSet);
SaveModelUtils.writeFeatureMode(outputFolder, featureMode);
SaveModelUtils.writeLearningMode(outputFolder, learningMode);
SaveModelUtils.writeModelAdapterInformation(outputFolder, mlAdapter);
SaveModelUtils.writeCurrentVersionOfDKProTC(outputFolder);

writeAdapter();
}

protected abstract void writeAdapter() throws Exception;

}
4 changes: 4 additions & 0 deletions dkpro-tc-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
<groupId>org.dkpro.tc</groupId>
<artifactId>dkpro-tc-features-pair-similarity</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.tc</groupId>
<artifactId>dkpro-tc-io-libsvm</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.tc</groupId>
<artifactId>dkpro-tc-ml</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
import org.dkpro.tc.features.length.NrOfTokens;
import org.dkpro.tc.features.ngram.LuceneCharacterNGram;
import org.dkpro.tc.features.ngram.LuceneNGram;
import org.dkpro.tc.io.libsvm.AdapterFormat;
import org.dkpro.tc.ml.ExperimentSaveModel;
import org.dkpro.tc.ml.liblinear.LiblinearAdapter;
import org.dkpro.tc.ml.uima.TcAnnotator;
Expand Down Expand Up @@ -165,7 +166,7 @@ private void documentVerifyCreatedModelFiles(File modelFolder)
assertTrue(learningMode.exists());

File id2outcomeMapping = new File(
modelFolder.getAbsolutePath() + "/" + LiblinearAdapter.getOutcomeMappingFilename());
modelFolder.getAbsolutePath() + "/" + AdapterFormat.getOutcomeMappingFilename());
assertTrue(id2outcomeMapping.exists());
}

Expand Down Expand Up @@ -294,6 +295,7 @@ private static void unitLoadAndUseModel(File modelFolder)
possibleOutcomes.add("JJ");
possibleOutcomes.add("VBD");
possibleOutcomes.add("NNS");
possibleOutcomes.add("TO");
possibleOutcomes.add("VBN");
possibleOutcomes.add("IN");
possibleOutcomes.add("CC");
Expand All @@ -304,7 +306,6 @@ private static void unitLoadAndUseModel(File modelFolder)

assertEquals(31, outcomes.size());
for(TextClassificationOutcome o : outcomes){
System.out.println(o.getOutcome());
assertTrue(possibleOutcomes.contains(o.getOutcome()));
}

Expand Down Expand Up @@ -343,7 +344,7 @@ private void unitVerifyCreatedModelFiles(File modelFolder)
assertTrue(learningMode.exists());

File id2outcomeMapping = new File(
modelFolder.getAbsolutePath() + "/" + LiblinearAdapter.getOutcomeMappingFilename());
modelFolder.getAbsolutePath() + "/" + AdapterFormat.getOutcomeMappingFilename());
assertTrue(id2outcomeMapping.exists());
}
}
19 changes: 19 additions & 0 deletions dkpro-tc-io-libsvm/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!--
  Module holding the libsvm data format I/O classes (package org.dkpro.tc.io.libsvm)
  so that they can be shared instead of living in a single adapter module.
  Version and groupId are inherited from the dkpro-tc parent; the versions of the
  dependencies below are not declared here and are presumably managed by the
  parent POM (confirm against the parent's dependencyManagement).
-->
<modelVersion>4.0.0</modelVersion>
<artifactId>dkpro-tc-io-libsvm</artifactId>
<parent>
<groupId>org.dkpro.tc</groupId>
<artifactId>dkpro-tc</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<dependencies>
<dependency>
<groupId>org.dkpro.tc</groupId>
<artifactId>dkpro-tc-core</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.tc</groupId>
<artifactId>dkpro-tc-ml</artifactId>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*******************************************************************************
* Copyright 2018
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.tc.io.libsvm;

/**
 * Central place for the file names used by the libsvm data format classes.
 */
public class AdapterFormat {

    private static final String OUTCOME_MAPPING_FILE = "outcome-mapping.txt";
    private static final String FEATURE_NAME_MAPPING_FILE = "feature-name-mapping.txt";
    private static final String FEATURE_NAMES_FILE = "featurenames.txt";

    /**
     * @return name of the file holding the outcome mapping
     */
    public static String getOutcomeMappingFilename() {
        return OUTCOME_MAPPING_FILE;
    }

    /**
     * @return name of the file holding the feature name mapping
     */
    public static String getFeatureNameMappingFilename() {
        return FEATURE_NAME_MAPPING_FILE;
    }

    /**
     * @return name of the file holding the feature names
     */
    public static String getFeatureNames() {
        return FEATURE_NAMES_FILE;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
/*******************************************************************************
* Copyright 2018
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.tc.io.libsvm;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.output.FileWriterWithEncoding;
import org.dkpro.lab.reporting.ReportBase;
import org.dkpro.lab.storage.StorageService.AccessMode;
import org.dkpro.tc.core.Constants;
import org.dkpro.tc.core.task.InitTask;
import org.dkpro.tc.ml.report.util.SortedKeyProperties;

/**
 * Creates the id2outcome report from a predictions file.
 * <p>
 * Each non-comment line of the predictions file is expected to hold
 * <code>prediction;gold</code>. The n-th such line is matched to the n-th entry of the
 * index-to-instance-id mapping found in the test data folder. The result is stored as a
 * properties file in which every entry has the form
 * <code>instanceId=PREDICTION;GOLDSTANDARD;THRESHOLD</code>. For non-regression experiments the
 * header additionally lists the numeric-id-to-label mapping read from the training data folder.
 */
public class LibsvmDataFormatOutcomeIdReport extends ReportBase implements Constants {
    // Constant dummy value that is written as threshold. The threshold is an expected field in
    // the evaluation module, but the predictions read by this report do not provide one.
    private static final String THRESHOLD_CONSTANT = "-1";

    public LibsvmDataFormatOutcomeIdReport() {
        // required by groovy
    }

    /**
     * Reads predictions and mappings from the task context and writes the id2outcome file.
     *
     * @throws IllegalStateException
     *             if a prediction has no corresponding entry in the index-to-instance-id mapping
     * @throws Exception
     *             if one of the input files cannot be read or the report cannot be written
     */
    @Override
    public void execute() throws Exception {
        boolean isRegression = isRegression();

        boolean isUnit = getDiscriminators().get(InitTask.class.getName() + "|" + Constants.DIM_FEATURE_MODE)
                .equals(Constants.FM_UNIT);

        Map<Integer, String> id2label = getId2LabelMapping(isRegression);
        String header = buildHeader(id2label, isRegression);

        List<String> predictions = readPredictions();
        Map<String, String> index2instanceIdMap = getMapping(isUnit);

        Properties prop = new SortedKeyProperties();
        // counts only actual prediction lines, comment lines do not advance the index
        int lineCounter = 0;
        for (String line : predictions) {
            if (line.startsWith("#")) {
                continue;
            }
            String[] split = line.split(";");
            String key = index2instanceIdMap.get(lineCounter + "");
            if (key == null) {
                // Properties rejects null keys with a bare NullPointerException; fail with a
                // message that points to the actual inconsistency instead
                throw new IllegalStateException(
                        "No instance id found for the prediction with index [" + lineCounter + "]");
            }

            if (isRegression) {
                // regression values are passed through unchanged
                prop.setProperty(key, split[0] + ";" + split[1] + ";" + THRESHOLD_CONSTANT);
            } else {
                // classification outcomes are numeric ids that may be written as floating
                // point values (e.g. "1.0"), hence the detour via Double
                int pred = Double.valueOf(split[0]).intValue();
                int gold = Double.valueOf(split[1]).intValue();
                prop.setProperty(key, pred + ";" + gold + ";" + THRESHOLD_CONSTANT);
            }
            lineCounter++;
        }

        File targetFile = getId2OutcomeFileLocation();

        // try-with-resources ensures the writer is closed even if storing fails
        try (FileWriterWithEncoding fw = new FileWriterWithEncoding(targetFile, "utf-8")) {
            prop.store(fw, header);
        }
    }

    /**
     * Looks up the learning mode among the discriminators.
     *
     * @return <code>true</code> if a learning mode discriminator exists and equals
     *         {@link Constants#LM_REGRESSION}, <code>false</code> otherwise (also if no
     *         learning mode discriminator is found)
     */
    private boolean isRegression() {

        Collection<String> keys = getDiscriminators().keySet();
        for (String k : keys) {
            if (k.endsWith("|" + Constants.DIM_LEARNING_MODE)) {
                return getDiscriminators().get(k).equals(Constants.LM_REGRESSION);
            }
        }
        return false;
    }

    /**
     * Builds the mapping from the running index of an instance to its id.
     * <p>
     * The mapping is read from the test data folder. In unit mode the index-to-instance-id file
     * of the data writer is used, otherwise the document meta data log. Comment lines (starting
     * with '#') and blank lines are skipped; the first tab-separated column of every remaining
     * line is taken as the instance id.
     *
     * @param isUnit
     *            whether the experiment runs in unit feature mode
     * @return map from the zero-based index (as string) to the instance id
     * @throws IOException
     *             if the mapping file cannot be read
     */
    private Map<String, String> getMapping(boolean isUnit) throws IOException {

        File f;
        if (isUnit) {
            f = new File(getContext().getFolder(TEST_TASK_INPUT_KEY_TEST_DATA, AccessMode.READONLY),
                    LibsvmDataFormatWriter.INDEX2INSTANCEID);
        } else {
            f = new File(getContext().getFolder(TEST_TASK_INPUT_KEY_TEST_DATA, AccessMode.READONLY),
                    Constants.FILENAME_DOCUMENT_META_DATA_LOG);
        }

        Map<String, String> m = new HashMap<>();

        int idx = 0;
        for (String l : FileUtils.readLines(f, "utf-8")) {
            if (l.startsWith("#")) {
                continue;
            }
            if (l.trim().isEmpty()) {
                continue;
            }
            String[] split = l.split("\t");

            m.put(idx + "", split[0]);
            idx++;
        }
        return m;
    }

    /**
     * @return the file in the context's root folder to which the id2outcome report is written
     */
    private File getId2OutcomeFileLocation() {
        File evaluationFolder = getContext().getFolder("", AccessMode.READWRITE);
        return new File(evaluationFolder, ID_OUTCOME_KEY);
    }

    /**
     * @return all lines of the predictions file located in the context's root folder
     * @throws IOException
     *             if the predictions file cannot be read
     */
    private List<String> readPredictions() throws IOException {
        File predFolder = getContext().getFolder("", AccessMode.READWRITE);
        return FileUtils.readLines(new File(predFolder, Constants.FILENAME_PREDICTIONS), "utf-8");
    }

    /**
     * Builds the header of the id2outcome file.
     * <p>
     * The header consists of the column description followed by a second line that starts with
     * "labels" and, for non-regression experiments, lists the space-separated
     * <code>id=label</code> pairs. Labels are URL-encoded.
     *
     * @param id2label
     *            mapping from numeric outcome id to label, ignored for regression
     * @param isRegression
     *            whether the experiment is a regression experiment
     * @return the header text
     * @throws UnsupportedEncodingException
     *             if UTF-8 is not supported for URL-encoding the labels
     */
    private String buildHeader(Map<Integer, String> id2label, boolean isRegression)
            throws UnsupportedEncodingException {
        StringBuilder header = new StringBuilder();
        header.append("ID=PREDICTION;GOLDSTANDARD;THRESHOLD" + "\n" + "labels" + " ");

        if (isRegression) {
            // no label mapping for regression so that is all we have to do
            return header.toString();
        }

        boolean first = true;
        for (Map.Entry<Integer, String> entry : id2label.entrySet()) {
            if (!first) {
                header.append(" ");
            }
            header.append(entry.getKey()).append("=").append(URLEncoder.encode(entry.getValue(), "UTF-8"));
            first = false;
        }
        return header.toString();
    }

    /**
     * Reads the outcome mapping written next to the training data.
     * <p>
     * Every line is expected to hold <code>label&lt;TAB&gt;id</code>. Blank lines are skipped.
     *
     * @param isRegression
     *            whether the experiment is a regression experiment
     * @return map from numeric outcome id to label; empty for regression
     * @throws Exception
     *             if the mapping file cannot be read or contains a malformed line
     */
    private Map<Integer, String> getId2LabelMapping(boolean isRegression) throws Exception {
        if (isRegression) {
            // no map for regression;
            return new HashMap<>();
        }

        File folder = getContext().getFolder(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY);
        String fileName = AdapterFormat.getOutcomeMappingFilename();
        File file = new File(folder, fileName);
        Map<Integer, String> map = new HashMap<Integer, String>();

        List<String> lines = FileUtils.readLines(file, "utf-8");
        for (String line : lines) {
            if (line.trim().isEmpty()) {
                // tolerate blank lines in the same way the index mapping reader does
                continue;
            }
            String[] split = line.split("\t");
            map.put(Integer.valueOf(split[1]), split[0]);
        }

        return map;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.tc.ml.libsvm.writer;
package org.dkpro.tc.io.libsvm;

import java.io.BufferedReader;
import java.io.BufferedWriter;
Expand All @@ -38,7 +38,6 @@
import org.dkpro.tc.api.features.Instance;
import org.dkpro.tc.core.Constants;
import org.dkpro.tc.core.io.DataWriter;
import org.dkpro.tc.ml.libsvm.LibsvmAdapter;

import com.google.gson.Gson;

Expand All @@ -49,7 +48,7 @@
*
* For example: 1 1:1 3:1 4:1 6:1 2 2:1 3:1 5:1 7:1 1 3:1 5:1
*/
public class LibsvmDataWriter implements DataWriter {
public class LibsvmDataFormatWriter implements DataWriter {

public static final String INDEX2INSTANCEID = "index2Instanceid.txt";

Expand Down Expand Up @@ -161,8 +160,8 @@ public void writeClassifierFormat(Collection<Instance> in) throws Exception {
bw = null;

writeMapping(outputDirectory, INDEX2INSTANCEID, index2instanceId);
writeFeatureName2idMapping(outputDirectory, LibsvmAdapter.getFeatureNameMappingFilename(), featureNames2id);
writeOutcomeMapping(outputDirectory, LibsvmAdapter.getOutcomeMappingFilename(), outcomeMap);
writeFeatureName2idMapping(outputDirectory, AdapterFormat.getFeatureNameMappingFilename(), featureNames2id);
writeOutcomeMapping(outputDirectory, AdapterFormat.getOutcomeMappingFilename(), outcomeMap);
}

private void writeOutcomeMapping(File outputDirectory, String file, Map<String, Integer> map) throws IOException {
Expand Down
Loading

0 comments on commit d366ee3

Please sign in to comment.