From 06e060a76fcf6cf1b8d9da7c4d9b6d6d4a98fa56 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sun, 4 Aug 2024 19:33:57 -0300 Subject: [PATCH] #1587 - Drop NLP4J module - Removed module --- dkpro-core-asl/pom.xml | 1 - dkpro-core-bom-asl/pom.xml | 5 - dkpro-core-nlp4j-asl/LICENSE.txt | 202 -------------- dkpro-core-nlp4j-asl/pom.xml | 165 ------------ .../core/nlp4j/Nlp4JDependencyParser.java | 253 ------------------ .../org/dkpro/core/nlp4j/Nlp4JLemmatizer.java | 121 --------- .../nlp4j/Nlp4JNamedEntityRecognizer.java | 224 ---------------- .../org/dkpro/core/nlp4j/Nlp4JPosTagger.java | 237 ---------------- .../org/dkpro/core/nlp4j/Nlp4JSegmenter.java | 114 -------- .../core/nlp4j/internal/EmoryNlp2Uima.java | 112 -------- .../core/nlp4j/internal/EmoryNlpUtils.java | 164 ------------ ...ineComponentTagsetDescriptionProvider.java | 73 ----- .../core/nlp4j/internal/Uima2EmoryNlp.java | 59 ---- .../core/nlp4j/lib/ner-default-variants.map | 1 - dkpro-core-nlp4j-asl/src/scripts/build.xml | 94 ------- .../core/nlp4j/EnglishTokenizerTest.java | 41 --- .../core/nlp4j/Nlp4JDependencyParserTest.java | 93 ------- .../dkpro/core/nlp4j/Nlp4JLemmatizerTest.java | 60 ----- .../nlp4j/Nlp4JNamedEntityRecognizerTest.java | 73 ----- .../dkpro/core/nlp4j/Nlp4JPosTaggerTest.java | 84 ------ .../dkpro/core/nlp4j/Nlp4JSegmenterTest.java | 43 --- .../src/test/resources/log4j2.xml | 16 -- 22 files changed, 2235 deletions(-) delete mode 100644 dkpro-core-nlp4j-asl/LICENSE.txt delete mode 100644 dkpro-core-nlp4j-asl/pom.xml delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JDependencyParser.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JLemmatizer.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JPosTagger.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JSegmenter.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlpUtils.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java delete mode 100644 dkpro-core-nlp4j-asl/src/main/resources/org/dkpro/core/nlp4j/lib/ner-default-variants.map delete mode 100644 dkpro-core-nlp4j-asl/src/scripts/build.xml delete mode 100644 dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/EnglishTokenizerTest.java delete mode 100644 dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java delete mode 100644 dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java delete mode 100644 dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java delete mode 100644 dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java delete mode 100644 dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JSegmenterTest.java delete mode 100644 dkpro-core-nlp4j-asl/src/test/resources/log4j2.xml diff --git a/dkpro-core-asl/pom.xml b/dkpro-core-asl/pom.xml index b23f948aa7..586f04bafc 100644 --- a/dkpro-core-asl/pom.xml +++ b/dkpro-core-asl/pom.xml @@ -135,7 +135,6 @@ ../dkpro-core-mstparser-asl 
../dkpro-core-mystem-asl ../dkpro-core-ngrams-asl - ../dkpro-core-nlp4j-asl ../dkpro-core-norvig-asl ../dkpro-core-opennlp-asl ../dkpro-core-performance-asl diff --git a/dkpro-core-bom-asl/pom.xml b/dkpro-core-bom-asl/pom.xml index 80a2671872..a99e1bb249 100644 --- a/dkpro-core-bom-asl/pom.xml +++ b/dkpro-core-bom-asl/pom.xml @@ -479,11 +479,6 @@ dkpro-core-ngrams-asl 3.0.0-SNAPSHOT - - org.dkpro.core - dkpro-core-nlp4j-asl - 3.0.0-SNAPSHOT - org.dkpro.core dkpro-core-norvig-asl diff --git a/dkpro-core-nlp4j-asl/LICENSE.txt b/dkpro-core-nlp4j-asl/LICENSE.txt deleted file mode 100644 index d645695673..0000000000 --- a/dkpro-core-nlp4j-asl/LICENSE.txt +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/dkpro-core-nlp4j-asl/pom.xml b/dkpro-core-nlp4j-asl/pom.xml deleted file mode 100644 index 9efd176531..0000000000 --- a/dkpro-core-nlp4j-asl/pom.xml +++ /dev/null @@ -1,165 +0,0 @@ - - - 4.0.0 - - - dkpro-core-asl - org.dkpro.core - 3.0.0-SNAPSHOT - ../dkpro-core-asl - - - dkpro-core-nlp4j-asl - jar - DKPro Core ASL - NLP4J - https://dkpro.github.io/dkpro-core/ - - - 1.1.3 - - - - - org.apache.uima - uimaj-core - - - org.apache.uima - uimafit-core - - - org.apache.commons - commons-lang3 - - - edu.emory.mathcs.nlp - nlp4j-api - ${nlp4j.version} - - - - org.dkpro.core - dkpro-core-api-metadata-asl - ${project.version} - - - org.dkpro.core - dkpro-core-api-resources-asl - ${project.version} - - - org.dkpro.core - dkpro-core-api-segmentation-asl - ${project.version} - - - org.dkpro.core - dkpro-core-api-lexmorph-asl - ${project.version} - - - org.dkpro.core - dkpro-core-api-syntax-asl - ${project.version} - - - org.dkpro.core - dkpro-core-api-ner-asl - ${project.version} - - - org.dkpro.core - dkpro-core-api-io-asl - ${project.version} - - - org.dkpro.core - dkpro-core-api-parameter-asl - ${project.version} - - - eu.openminted.share.annotations - omtd-share-annotations-api - - - org.dkpro.core - dkpro-core-testing-asl - ${project.version} - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.nlp4j-model-tagger-en-default - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.nlp4j-model-ner-en-default - test - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.nlp4j-model-parser-en-default - test - - - - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.nlp4j-model-tagger-en-default - 20160802.0 - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.nlp4j-model-ner-en-default - 20160802.0 - - - de.tudarmstadt.ukp.dkpro.core - de.tudarmstadt.ukp.dkpro.core.nlp4j-model-parser-en-default - 20160802.0 - - - - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.nlp4j-model-tagger-en-default - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.nlp4j-model-ner-en-default - de.tudarmstadt.ukp.dkpro.core:de.tudarmstadt.ukp.dkpro.core.nlp4j-model-parser-en-default - - - - - - - diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JDependencyParser.java 
b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JDependencyParser.java deleted file mode 100644 index 5d907558d3..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JDependencyParser.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; -import static org.dkpro.core.api.resources.MappingProviderFactory.createDependencyMappingProvider; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.dkpro.core.api.parameter.ComponentParameters; -import org.dkpro.core.api.resources.MappingProvider; -import org.dkpro.core.api.resources.ModelProviderBase; -import org.dkpro.core.nlp4j.internal.EmoryNlp2Uima; -import org.dkpro.core.nlp4j.internal.EmoryNlpUtils; -import org.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; -import org.dkpro.core.nlp4j.internal.Uima2EmoryNlp; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import edu.emory.mathcs.nlp.common.util.NLPUtils; -import edu.emory.mathcs.nlp.component.dep.DEPState; -import edu.emory.mathcs.nlp.component.template.OnlineComponent; -import edu.emory.mathcs.nlp.component.template.node.NLPNode; -import eu.openminted.share.annotations.api.Component; -import eu.openminted.share.annotations.api.DocumentationResource; -import eu.openminted.share.annotations.api.constants.OperationType; - -/** - * Emory NLP4J dependency parser. 
- */ -@Component(OperationType.DEPENDENCY_PARSER) -@ResourceMetaData(name = "NLP4J Dependency Parser") -@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency"}) -public class Nlp4JDependencyParser - extends JCasAnnotator_ImplBase -{ - /** - * Log the tag set(s) when a model is loaded. - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - private boolean printTagSet; - - /** - * Use this language instead of the document language to resolve the model and tag set mapping. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - private String language; - - /** - * Variant of a model the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - private String variant; - - /** - * URI of the model artifact. This can be used to override the default model resolving - * mechanism and directly address a particular model. - * - *
The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set - * the variant parameter to match the artifact. If the artifact contains the model in - * a non-default location, you also have to specify the model location parameter, e.g. - * {@code classpath:/model/path/in/artifact/model.bin}.
- */ - public static final String PARAM_MODEL_ARTIFACT_URI = - ComponentParameters.PARAM_MODEL_ARTIFACT_URI; - @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) - protected String modelArtifactUri; - - /** - * Location from which the model is read. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - private String modelLocation; - - /** - * Enable/disable type mapping. - */ - public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; - @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = - ComponentParameters.DEFAULT_MAPPING_ENABLED) - protected boolean mappingEnabled; - - /** - * Location of the mapping file for part-of-speech tags to UIMA types. - */ - public static final String PARAM_DEPENDENCY_MAPPING_LOCATION = - ComponentParameters.PARAM_DEPENDENCY_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_DEPENDENCY_MAPPING_LOCATION, mandatory = false) - private String dependencyMappingLocation; - - /** - * Process anyway, even if the model relies on features that are not supported by this - * component. - */ - public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; - @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") - protected boolean ignoreMissingFeatures; - - private Nlp4JDependencyParserModelProvider modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new Nlp4JDependencyParserModelProvider(this); - - mappingProvider = createDependencyMappingProvider(this, dependencyMappingLocation, language, - modelProvider); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - modelProvider.configure(cas); - mappingProvider.configure(cas); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List tokens = selectCovered(aJCas, Token.class, sentence); - NLPNode[] nodes = Uima2EmoryNlp.convertSentence(tokens); - - // Process the sentences - new results will be stored in the existing NLPNodes - modelProvider.getResource().process(nodes); - - EmoryNlp2Uima.convertDependencies(aJCas, tokens, nodes, mappingProvider); - } - } - - private class Nlp4JDependencyParserModelProvider - extends ModelProviderBase>> - { - public Nlp4JDependencyParserModelProvider(Object aObject) - { - super(aObject, "nlp4j", "parser"); - - setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); - setDefault(LOCATION, - "classpath:/de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/parser-${language}-${variant}.properties"); - } - - @Override - protected OnlineComponent> produceResource(InputStream aStream) - throws Exception - { - String language = getAggregatedProperties().getProperty(LANGUAGE); - - if (!language.equals("en")) { - throw new IllegalArgumentException(new Throwable( - "Emory NLP4J supports only English")); - } - - EmoryNlpUtils.initGlobalLexica(); - - // Load the POS tagger model from the location the model provider offers - OnlineComponent> component = (OnlineComponent) - NLPUtils.getComponent(aStream); - - // Extract tagset information from the model - OnlineComponentTagsetDescriptionProvider> tsdp = - new OnlineComponentTagsetDescriptionProvider>( - 
getResourceMetaData().getProperty("dependency.tagset"), Dependency.class, - component) - { - @Override - public Set listTags(String aLayer, String aTagsetName) - { - Set cleanTags = new TreeSet(); - - for (String tag : super.listTags(aLayer, aTagsetName)) { - String t = StringUtils.substringAfterLast(tag, "_"); - if (t.length() > 0) { - cleanTags.add(t); - } - } - - return cleanTags; - } - }; - addTagset(tsdp); - - if (printTagSet) { - getContext().getLogger().log(INFO, tsdp.toString()); - } - - Set features = EmoryNlpUtils.extractFeatures(component); - getLogger().info("Model uses these features: " + features); - - - Set unsupportedFeatures = EmoryNlpUtils.extractUnsupportedFeatures(component, - "dependency_label", "valency"); - if (!unsupportedFeatures.isEmpty()) { - String message = "Model these uses unsupported features: " + unsupportedFeatures; - if (ignoreMissingFeatures) { - getLogger().warn(message); - } - else { - throw new IOException(message); - } - } - - return component; - } - }; -} diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JLemmatizer.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JLemmatizer.java deleted file mode 100644 index 0aabc1e872..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JLemmatizer.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.util.JCasUtil.select; - -import java.io.IOException; -import java.net.URL; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.dkpro.core.api.parameter.ComponentParameters; -import org.dkpro.core.api.resources.ModelProviderBase; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.emory.mathcs.nlp.common.util.StringUtils; -import edu.emory.mathcs.nlp.component.morph.MorphAnalyzer; -import edu.emory.mathcs.nlp.component.morph.english.EnglishMorphAnalyzer; -import eu.openminted.share.annotations.api.Component; -import eu.openminted.share.annotations.api.DocumentationResource; -import eu.openminted.share.annotations.api.constants.OperationType; - -/** - * Emory NLP4J lemmatizer. This is a lower-casing lemmatizer. 
- */ -@Component(OperationType.LEMMATIZER) -@ResourceMetaData(name = "NLP4J Lemmatizer") -@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"}, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) -public class Nlp4JLemmatizer - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - private ModelProviderBase modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase() { - { - setContextObject(Nlp4JLemmatizer.this); - setDefault(LOCATION, NOT_REQUIRED + "-${language}"); - setOverride(LANGUAGE, language); - } - - @Override - protected MorphAnalyzer produceResource(URL aUrl) - throws IOException - { - String language = getAggregatedProperties().getProperty(LANGUAGE); - - if (!language.equals("en")) { - throw new IllegalArgumentException(new Throwable( - "Emory NLP4J supports only English")); - } - - return new EnglishMorphAnalyzer(); - } - }; - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - modelProvider.configure(aJCas.getCas()); - - MorphAnalyzer lemmatizer = modelProvider.getResource(); - - for (Token t : select(aJCas, Token.class)) { - String pos = null; - if (t.getPos() != null) { - pos = t.getPos().getPosValue(); - } - - Lemma lemma = new Lemma(aJCas, t.getBegin(), t.getEnd()); - lemma.setValue(lemmatizer.lemmatize(StringUtils.toSimplifiedForm(t.getText()), - pos)); - lemma.addToIndexes(); - - t.setLemma(lemma); - } - } -} diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java deleted file mode 100644 index 3da583e65e..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizer.java +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Set; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.dkpro.core.api.parameter.ComponentParameters; -import org.dkpro.core.api.resources.MappingProvider; -import org.dkpro.core.api.resources.MappingProviderFactory; -import org.dkpro.core.api.resources.ModelProviderBase; -import org.dkpro.core.nlp4j.internal.EmoryNlp2Uima; -import org.dkpro.core.nlp4j.internal.EmoryNlpUtils; -import org.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; -import org.dkpro.core.nlp4j.internal.Uima2EmoryNlp; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.emory.mathcs.nlp.common.util.NLPUtils; -import edu.emory.mathcs.nlp.component.ner.NERState; -import edu.emory.mathcs.nlp.component.template.OnlineComponent; -import edu.emory.mathcs.nlp.component.template.node.NLPNode; -import eu.openminted.share.annotations.api.Component; -import eu.openminted.share.annotations.api.DocumentationResource; -import eu.openminted.share.annotations.api.constants.OperationType; - -/** - * Emory NLP4J name finder wrapper. - */ -@Component(OperationType.NAMED_ENTITITY_RECOGNIZER) -@ResourceMetaData(name = "NLP4J Named Entity Recognizer") -@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) -public class Nlp4JNamedEntityRecognizer - extends JCasAnnotator_ImplBase -{ - /** - * Log the tag set(s) when a model is loaded. - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Variant of a model the model. Used to address a specific model if here are multiple models - * for one language. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * URI of the model artifact. 
This can be used to override the default model resolving - * mechanism and directly address a particular model. - * - *
The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set - * the variant parameter to match the artifact. If the artifact contains the model in - * a non-default location, you also have to specify the model location parameter, e.g. - * {@code classpath:/model/path/in/artifact/model.bin}.
- */ - public static final String PARAM_MODEL_ARTIFACT_URI = - ComponentParameters.PARAM_MODEL_ARTIFACT_URI; - @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) - protected String modelArtifactUri; - - /** - * Location from which the model is read. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Location of the mapping file for named entity tags to UIMA types. - */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = - ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) - protected String mappingLocation; - - /** - * Process anyway, even if the model relies on features that are not supported by this - * component. - */ - public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; - @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") - protected boolean ignoreMissingFeatures; - - private Nlp4JNamedEntityRecognizerModelProvider modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new Nlp4JNamedEntityRecognizerModelProvider(this); - - mappingProvider = MappingProviderFactory.createNerMappingProvider(this, mappingLocation, - language, variant, modelProvider); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - modelProvider.configure(cas); - mappingProvider.configure(cas); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List tokens = selectCovered(aJCas, Token.class, sentence); - NLPNode[] nodes = Uima2EmoryNlp.convertSentence(tokens); - - // Process the sentences - new results will be stored in the existing NLPNodes - modelProvider.getResource().process(nodes); - - EmoryNlp2Uima.convertNamedEntities(cas, tokens, nodes, mappingProvider); - } - } - - private class Nlp4JNamedEntityRecognizerModelProvider - extends ModelProviderBase>> - { - public Nlp4JNamedEntityRecognizerModelProvider(Object aOwner) - { - super(aOwner, "nlp4j", "ner"); - setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); - setDefault(LOCATION, - "classpath:/de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/ner-${language}-${variant}.properties"); - } - - @Override - protected OnlineComponent> produceResource(InputStream aStream) - throws Exception - { - String language = getAggregatedProperties().getProperty(LANGUAGE); - - if (!language.equals("en")) { - throw new IllegalArgumentException(new Throwable( - "Emory NLP4J supports only English")); - } - - EmoryNlpUtils.initGlobalLexica(); - - // Load the POS tagger model from the location the model provider offers - OnlineComponent> component = (OnlineComponent) NLPUtils - .getComponent(aStream); - - // Extract tagset information from the model - OnlineComponentTagsetDescriptionProvider> tsdp = - new OnlineComponentTagsetDescriptionProvider>( - getResourceMetaData().getProperty("ner.tagset"), POS.class, component); - // addTagset(tsdp); - - if (printTagSet) { - getContext().getLogger().log(INFO, tsdp.toString()); - } - - Set features = EmoryNlpUtils.extractFeatures(component); - getLogger().info("Model uses these features: " + features); - - Set unsupportedFeatures = 
EmoryNlpUtils.extractUnsupportedFeatures(component, - "named_entity_tag"); - if (!unsupportedFeatures.isEmpty()) { - String message = "Model these uses unsupported features: " + unsupportedFeatures; - if (ignoreMissingFeatures) { - getLogger().warn(message); - } - else { - throw new IOException(message); - } - } - - return component; - } - }; -} diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JPosTagger.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JPosTagger.java deleted file mode 100644 index ad66128277..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JPosTagger.java +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; -import static org.apache.uima.util.Level.INFO; -import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Set; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.CAS; -import org.apache.uima.fit.component.JCasAnnotator_ImplBase; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.dkpro.core.api.parameter.ComponentParameters; -import org.dkpro.core.api.resources.MappingProvider; -import org.dkpro.core.api.resources.ModelProviderBase; -import org.dkpro.core.nlp4j.internal.EmoryNlp2Uima; -import org.dkpro.core.nlp4j.internal.EmoryNlpUtils; -import org.dkpro.core.nlp4j.internal.OnlineComponentTagsetDescriptionProvider; -import org.dkpro.core.nlp4j.internal.Uima2EmoryNlp; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.emory.mathcs.nlp.common.util.NLPUtils; -import edu.emory.mathcs.nlp.component.pos.POSState; -import edu.emory.mathcs.nlp.component.template.OnlineComponent; -import edu.emory.mathcs.nlp.component.template.node.NLPNode; -import eu.openminted.share.annotations.api.Component; -import eu.openminted.share.annotations.api.DocumentationResource; -import eu.openminted.share.annotations.api.constants.OperationType; - -/** - * Part-of-Speech annotator using Emory NLP4J. Requires {@link Sentence}s to be annotated before. 
- */ -@Component(OperationType.PART_OF_SPEECH_TAGGER) -@ResourceMetaData(name = "NLP4J POS-Tagger") -@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) -public class Nlp4JPosTagger - extends JCasAnnotator_ImplBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - /** - * Override the default variant used to locate the model. - */ - public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; - @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) - protected String variant; - - /** - * URI of the model artifact. This can be used to override the default model resolving - * mechanism and directly address a particular model. - * - *
The URI format is {@code mvn:${groupId}:${artifactId}:${version}}. Remember to set - * the variant parameter to match the artifact. If the artifact contains the model in - * a non-default location, you also have to specify the model location parameter, e.g. - * {@code classpath:/model/path/in/artifact/model.bin}.
- */ - public static final String PARAM_MODEL_ARTIFACT_URI = - ComponentParameters.PARAM_MODEL_ARTIFACT_URI; - @ConfigurationParameter(name = PARAM_MODEL_ARTIFACT_URI, mandatory = false) - protected String modelArtifactUri; - - /** - * Load the model from this location instead of locating the model automatically. - */ - public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; - @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) - protected String modelLocation; - - /** - * Enable/disable type mapping. - */ - public static final String PARAM_MAPPING_ENABLED = ComponentParameters.PARAM_MAPPING_ENABLED; - @ConfigurationParameter(name = PARAM_MAPPING_ENABLED, mandatory = true, defaultValue = - ComponentParameters.DEFAULT_MAPPING_ENABLED) - protected boolean mappingEnabled; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = - ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Log the tag set(s) when a model is loaded. - */ - public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; - @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") - protected boolean printTagSet; - - /** - * Process anyway, even if the model relies on features that are not supported by this - * component. - */ - public static final String PARAM_IGNORE_MISSING_FEATURES = "ignoreMissingFeatures"; - @ConfigurationParameter(name = PARAM_IGNORE_MISSING_FEATURES, mandatory = true, defaultValue = "false") - protected boolean ignoreMissingFeatures; - - private Nlp4JPosTaggerModelProvider modelProvider; - private MappingProvider mappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new Nlp4JPosTaggerModelProvider(this); - - // General setup of the mapping provider in initialize() - mappingProvider = createPosMappingProvider(this, posMappingLocation, language, - modelProvider); - } - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - CAS cas = aJCas.getCas(); - - // Document-specific configuration of model and mapping provider in process() - modelProvider.configure(cas); - - // Mind the mapping provider must be configured after the model provider as it uses the - // model metadata - mappingProvider.configure(cas); - - for (Sentence sentence : select(aJCas, Sentence.class)) { - List tokens = selectCovered(aJCas, Token.class, sentence); - NLPNode[] nodes = Uima2EmoryNlp.convertSentence(tokens); - - // Process the sentences - new results will be stored in the existing NLPNodes - modelProvider.getResource().process(nodes); - - EmoryNlp2Uima.convertPos(cas, tokens, nodes, mappingProvider); - } - } - - private class Nlp4JPosTaggerModelProvider - extends ModelProviderBase>> - { - public Nlp4JPosTaggerModelProvider(Object aOwner) - { - super(aOwner, "nlp4j", "tagger"); - setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core"); - setDefault(LOCATION, - "classpath:/de/tudarmstadt/ukp/dkpro/core/nlp4j/lib/tagger-${language}-${variant}.properties"); - } - - @Override - protected OnlineComponent> produceResource(InputStream aStream) - throws Exception - { - String language = 
getAggregatedProperties().getProperty(LANGUAGE); - - if (!language.equals("en")) { - throw new IllegalArgumentException(new Throwable( - "Emory NLP4J supports only English")); - } - - EmoryNlpUtils.initGlobalLexica(); - - // Load the POS tagger model from the location the model provider offers - OnlineComponent> component = (OnlineComponent) - NLPUtils.getComponent(aStream); - - // Extract tagset information from the model - OnlineComponentTagsetDescriptionProvider> tsdp = - new OnlineComponentTagsetDescriptionProvider<>( - getResourceMetaData().getProperty("pos.tagset"), POS.class, component); - addTagset(tsdp); - - if (printTagSet) { - getContext().getLogger().log(INFO, tsdp.toString()); - } - - Set features = EmoryNlpUtils.extractFeatures(component); - getLogger().info("Model uses these features: " + features); - - - Set unsupportedFeatures = EmoryNlpUtils.extractUnsupportedFeatures(component); - if (!unsupportedFeatures.isEmpty()) { - String message = "Model these uses unsupported features: " + unsupportedFeatures; - if (ignoreMissingFeatures) { - getLogger().warn(message); - } - else { - throw new IOException(message); - } - } - - // Create a new POS tagger instance from the loaded model - return component; - } - }; -} diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JSegmenter.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JSegmenter.java deleted file mode 100644 index 1c158c10ad..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/Nlp4JSegmenter.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import java.io.IOException; -import java.net.URL; -import java.util.List; - -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.dkpro.core.api.parameter.ComponentParameters; -import org.dkpro.core.api.resources.CasConfigurableProviderBase; -import org.dkpro.core.api.resources.ModelProviderBase; -import org.dkpro.core.api.segmentation.SegmenterBase; - -import edu.emory.mathcs.nlp.component.tokenizer.EnglishTokenizer; -import edu.emory.mathcs.nlp.component.tokenizer.Tokenizer; -import edu.emory.mathcs.nlp.component.tokenizer.token.Token; -import eu.openminted.share.annotations.api.DocumentationResource; - -/** - * Segmenter using Emory NLP4J. 
- */ -@ResourceMetaData(name = "NLP4J Segmenter") -@DocumentationResource("${docbase}/component-reference.html#engine-${shortClassName}") -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }) -public class Nlp4JSegmenter - extends SegmenterBase -{ - /** - * Use this language instead of the document language to resolve the model. - */ - public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; - @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) - protected String language; - - private CasConfigurableProviderBase modelProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - modelProvider = new ModelProviderBase() - { - { - setContextObject(Nlp4JSegmenter.this); - setDefault(LOCATION, NOT_REQUIRED + "-${language}"); - setOverride(LANGUAGE, language); - } - - @Override - protected Tokenizer produceResource(URL aUrl) - throws IOException - { - String language = getAggregatedProperties().getProperty(LANGUAGE); - - if (!language.equals("en")) { - throw new IllegalArgumentException(new Throwable( - "Emory NLP4J supports only English")); - } - - return new EnglishTokenizer(); - } - }; - } - - @Override - protected void process(JCas aJCas, String aText, int aZoneBegin) - throws AnalysisEngineProcessException - { - modelProvider.configure(aJCas.getCas()); - Tokenizer segmenter = modelProvider.getResource(); - - List> sentences = segmenter.segmentize(aText); - - for (List sentence : sentences) { - // Tokens actually start only at index 1 - the 0 index is some odd "@#r$%" - for (Token token : sentence) { - createToken(aJCas, aZoneBegin + token.getStartOffset(), - aZoneBegin + token.getEndOffset()); - } - - int sentBegin = aZoneBegin + sentence.get(0).getStartOffset(); - int sentEnd = aZoneBegin + sentence.get(sentence.size() - 1).getEndOffset(); - - createSentence(aJCas, sentBegin, sentEnd); - } - } -} diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java deleted file mode 100644 index a821577675..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlp2Uima.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.dkpro.core.nlp4j.internal; - -import java.util.List; - -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.api.io.BilouDecoder; -import org.dkpro.core.api.lexmorph.pos.POSUtils; -import org.dkpro.core.api.resources.MappingProvider; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; -import edu.emory.mathcs.nlp.component.template.node.NLPNode; - -public class EmoryNlp2Uima -{ - public static void convertPos(CAS aCas, List aTokens, NLPNode[] aNodes, - MappingProvider aMappingProvider) - { - // EmoryNLP tokens start at 1 - int i = 1; - for (Token t : aTokens) { - String tag = aNodes[i].getPartOfSpeechTag(); - - // Convert the tag produced by the tagger to an UIMA type, create an annotation - // of this type, and add it to the document. - Type posTag = aMappingProvider.getTagType(tag); - POS posAnno = (POS) aCas.createAnnotation(posTag, t.getBegin(), t.getEnd()); - // To save memory, we typically intern() tag strings - posAnno.setPosValue(tag != null ? tag.intern() : null); - POSUtils.assignCoarseValue(posAnno); - posAnno.addToIndexes(); - - // Connect the POS annotation to the respective token annotation - t.setPos(posAnno); - i++; - } - } - - public static void convertDependencies(JCas aJCas, List aTokens, NLPNode[] aNodes, - MappingProvider aMappingProvider) - { - for (int i = 1; i < aNodes.length; i++) { - NLPNode depNode = aNodes[i]; - NLPNode govNode = depNode.getDependencyHead(); - String label = depNode.getDependencyLabel(); - - // FIXME Also extract the semantic heads and store them with dependency flavor - // ENHANCED - - if (govNode.getID() != 0) { - Type depRel = aMappingProvider.getTagType(label); - Dependency dep = (Dependency) aJCas.getCas().createFS(depRel); - dep.setDependencyType(label != null ? 
label.intern() : null); - dep.setDependent(aTokens.get(depNode.getID() - 1)); - dep.setGovernor(aTokens.get(govNode.getID() - 1)); - dep.setBegin(dep.getDependent().getBegin()); - dep.setEnd(dep.getDependent().getEnd()); - dep.setFlavor(DependencyFlavor.BASIC); - dep.addToIndexes(); - } - else { - Dependency dep = new ROOT(aJCas); - dep.setDependencyType(label); - dep.setDependent(aTokens.get(depNode.getID() - 1)); - dep.setGovernor(aTokens.get(depNode.getID() - 1)); - dep.setBegin(dep.getDependent().getBegin()); - dep.setEnd(dep.getDependent().getEnd()); - dep.setFlavor(DependencyFlavor.BASIC); - dep.addToIndexes(); - } - } - } - - public static void convertNamedEntities(CAS aCas, List aTokens, NLPNode[] aNodes, - MappingProvider aMappingProvider) - { - Type neType = aCas.getTypeSystem().getType(NamedEntity.class.getName()); - Feature valueFeat = neType.getFeatureByBaseName("value"); - - String[] neTags = new String[aNodes.length - 1]; - for (int i = 1; i < aNodes.length; i++) { - neTags[i - 1] = aNodes[i].getNamedEntityTag(); - } - - BilouDecoder decoder = new BilouDecoder(aCas, valueFeat, aMappingProvider); - decoder.decode(aTokens, neTags); - } -} diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlpUtils.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlpUtils.java deleted file mode 100644 index acb0249f2a..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/EmoryNlpUtils.java +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j.internal; - -import static java.util.Arrays.asList; - -import java.io.IOException; -import java.io.ObjectInputStream; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; - -import org.dkpro.core.api.resources.ResourceUtils; -import org.w3c.dom.Document; -import org.w3c.dom.Element; - -import edu.emory.mathcs.nlp.common.collection.tree.PrefixTree; -import edu.emory.mathcs.nlp.common.util.IOUtils; -import edu.emory.mathcs.nlp.component.template.OnlineComponent; -import edu.emory.mathcs.nlp.component.template.feature.FeatureItem; -import edu.emory.mathcs.nlp.component.template.feature.Field; -import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexica; -import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexicon; -import edu.emory.mathcs.nlp.component.template.node.NLPNode; - -public class EmoryNlpUtils -{ - private static GlobalLexica lexica; - - public static synchronized void initGlobalLexica() - throws IOException, ParserConfigurationException - { - if (lexica != null) { - return; - } - - // Cf. 
-
-        String LEXICA_PREFIX = "classpath:/edu/emory/mathcs/nlp/lexica/";
-
-        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
-        Document xmlDoc = builder.newDocument();
-        Element root = xmlDoc.createElement("dummy");
-
-        lexica = new GlobalLexica<>(root);
-
-        lexica.setAmbiguityClasses(new GlobalLexicon>>(
-                loadLexicon(LEXICA_PREFIX + "en-ambiguity-classes-simplified-lowercase.xz"),
-                Field.word_form_simplified_lowercase, "en-ambiguity-classes-simplified-lowercase"));
-
-        lexica.setWordClusters(new GlobalLexicon>>(
-                loadLexicon(LEXICA_PREFIX + "en-brown-clusters-simplified-lowercase.xz"),
-                Field.word_form_simplified_lowercase, "en-brown-clusters-simplified-lowercase"));
-
-        lexica.setNamedEntityGazetteers(new GlobalLexicon>>(
-                loadLexicon(LEXICA_PREFIX + "en-named-entity-gazetteers-simplified.xz"),
-                Field.word_form_simplified, "en-named-entity-gazetteers-simplified"));
-
-        lexica.setWordEmbeddings(new GlobalLexicon>(
-                loadLexicon(LEXICA_PREFIX + "en-word-embeddings-undigitalized.xz"),
-                Field.word_form_undigitalized, "en-word-embeddings-undigitalized"));
-
-//        lexica.setStopWords(
-//                loadLexicon(LEXICA_PREFIX + "en-stop-words-simplified-lowercase.xz"));
-    }
-
-    public static void assignGlobalLexica(NLPNode[] aNodes)
-    {
-        lexica.process(aNodes);
-    }
-
-    @SuppressWarnings("unchecked")
-    private static T loadLexicon(String aLocation)
-        throws IOException
-    {
-        try (ObjectInputStream is = IOUtils.createObjectXZBufferedInputStream(
-                ResourceUtils.resolveLocation(aLocation).openStream())) {
-            return (T) is.readObject();
-        }
-        catch (ClassNotFoundException e) {
-            throw new IOException(e);
-        }
-    }
-
-    public static Set extractFeatures(OnlineComponent component)
-        throws IllegalAccessException
-    {
-        Set features = new HashSet();
-
-        for (FeatureItem f : component.getFeatureTemplate().getSetFeatureList()) {
-            features.add(f.field.name());
-        }
-
-        for (FeatureItem f : component.getFeatureTemplate().getEmbeddingFeatureList()) {
-            features.add(f.field.name());
-        }
-
-        for (FeatureItem[] fl : component.getFeatureTemplate().getFeatureList()) {
-            for (FeatureItem f : fl) {
-                features.add(f.field.name());
-            }
-        }
-
-        return features;
-    }
-
-    public static Set extractUnsupportedFeatures(
-            OnlineComponent component, String... aExtra)
-        throws IllegalAccessException
-    {
-        Set features = extractFeatures(component);
-
-        Set unsupportedFeatures = new HashSet(features);
-        // This is generated in FeatureTemplate.getPositionFeatures
-        unsupportedFeatures.remove("positional");
-        // This is generated in FeatureTemplate.getOrthographicFeatures
-        // FIXME There is a special handling for hyperlinks which we likely do not support!
-        unsupportedFeatures.remove("orthographic");
-        unsupportedFeatures.remove("orthographic_lowercase");
-        // This is generated in FeatureTemplate.getPrefix / getSuffix
-        unsupportedFeatures.remove("prefix");
-        unsupportedFeatures.remove("suffix");
-        // The following are created internally in NLPNode.setWordForm()
-        unsupportedFeatures.remove("word_form");
-        unsupportedFeatures.remove("word_form_simplified");
-        unsupportedFeatures.remove("word_form_undigitalized");
-        unsupportedFeatures.remove("word_form_simplified_lowercase");
-        // These are handled internally in NLPNode
-        unsupportedFeatures.remove("word_shape");
-        // These are handled by GlobalLexica.assignGlobalLexica()
-        unsupportedFeatures.remove("ambiguity_classes");
-        unsupportedFeatures.remove("word_clusters");
-        unsupportedFeatures.remove("named_entity_gazetteers");
-        unsupportedFeatures.remove("word_embedding");
-        // We know POS tag if POS tagger ran before
-        unsupportedFeatures.remove("part_of_speech_tag");
-        // We know the lemma if we ran a lemmatizer before
-        unsupportedFeatures.remove("lemma");
-
-        unsupportedFeatures.removeAll(asList(aExtra));
-
-        return unsupportedFeatures;
-    }
-}
diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java
deleted file mode 100644
index 2a8cb22227..0000000000
--- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/OnlineComponentTagsetDescriptionProvider.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright 2017
- * Ubiquitous Knowledge Processing (UKP) Lab
- * Technische Universität Darmstadt
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.dkpro.core.nlp4j.internal;
-
-import static java.util.Collections.singletonMap;
-
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.dkpro.core.api.metadata.TagsetBase;
-
-import edu.emory.mathcs.nlp.component.template.OnlineComponent;
-import edu.emory.mathcs.nlp.component.template.node.AbstractNLPNode;
-import edu.emory.mathcs.nlp.component.template.state.NLPState;
-import edu.emory.mathcs.nlp.learning.optimization.OnlineOptimizer;
-
-public class OnlineComponentTagsetDescriptionProvider
-    , S extends NLPState>
-    extends TagsetBase
-{
-    private String name;
-    private String layer;
-    private OnlineComponent model;
-
-    public OnlineComponentTagsetDescriptionProvider(String aName, Class aLayer,
-            OnlineComponent aModel)
-    {
-        name = aName;
-        layer = aLayer.getName();
-        model = aModel;
-    }
-
-    @Override
-    public Map getLayers()
-    {
-        return singletonMap(layer, name);
-    }
-
-    @Override
-    public Set listTags(String aLayer, String aTagsetName)
-    {
-        OnlineOptimizer optimizer = model.getOptimizer();
-
-        Set tagSet = new TreeSet();
-        for (int i = 0; i < optimizer.getLabelSize(); i++) {
-            String tag = optimizer.getLabel(i);
-            tagSet.add(tag);
-        }
-
-        return tagSet;
-    }
-
-    public OnlineComponent getModel()
-    {
-        return model;
-    }
-}
diff --git a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java b/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java
deleted file mode 100644
index 7728b97cd8..0000000000
--- a/dkpro-core-nlp4j-asl/src/main/java/org/dkpro/core/nlp4j/internal/Uima2EmoryNlp.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2017
- * Ubiquitous Knowledge Processing (UKP) Lab
- * Technische Universität Darmstadt
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ -package org.dkpro.core.nlp4j.internal; - -import java.util.List; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import edu.emory.mathcs.nlp.component.template.node.NLPNode; - -public class Uima2EmoryNlp -{ - public static NLPNode[] convertSentence(List aTokens) - { - NLPNode[] nodes = new NLPNode[aTokens.size() + 1]; - - nodes[0] = new NLPNode(); - nodes[0].toRoot(); - - int i = 1; - for (Token t : aTokens) { - nodes[i] = new NLPNode(i, t.getText()); - nodes[i].setStartOffset(t.getBegin()); - nodes[i].setEndOffset(t.getEnd()); - - if (t.getPos() != null) { - nodes[i].setPartOfSpeechTag(t.getPos().getPosValue()); - } - // FIXME should throw an exception if POS not set but is a required feature and - // ignoreMissingFeatures is not enabled - - if (t.getLemma() != null) { - nodes[i].setLemma(t.getLemma().getValue()); - } - // FIXME should throw an exception if lemma not set but is a required feature and - // ignoreMissingFeatures is not enabled - - i++; - } - - EmoryNlpUtils.assignGlobalLexica(nodes); - - return nodes; - } -} diff --git a/dkpro-core-nlp4j-asl/src/main/resources/org/dkpro/core/nlp4j/lib/ner-default-variants.map b/dkpro-core-nlp4j-asl/src/main/resources/org/dkpro/core/nlp4j/lib/ner-default-variants.map deleted file mode 100644 index 5db5b4322a..0000000000 --- a/dkpro-core-nlp4j-asl/src/main/resources/org/dkpro/core/nlp4j/lib/ner-default-variants.map +++ /dev/null @@ -1 +0,0 @@ -en=default \ No newline at end of file diff --git a/dkpro-core-nlp4j-asl/src/scripts/build.xml b/dkpro-core-nlp4j-asl/src/scripts/build.xml deleted file mode 100644 index 7d7aa5db4f..0000000000 --- a/dkpro-core-nlp4j-asl/src/scripts/build.xml +++ /dev/null @@ -1,94 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/EnglishTokenizerTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/EnglishTokenizerTest.java deleted file mode 100644 index 7b831451d2..0000000000 --- a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/EnglishTokenizerTest.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import java.util.List; - -import org.junit.jupiter.api.Test; - -import edu.emory.mathcs.nlp.component.tokenizer.EnglishTokenizer; -import edu.emory.mathcs.nlp.component.tokenizer.Tokenizer; -import edu.emory.mathcs.nlp.component.tokenizer.token.Token; - -public class EnglishTokenizerTest -{ - @Test - public void test() { - Tokenizer tokenizer = new EnglishTokenizer(); - List> sentences = tokenizer.segmentize("A a a a . 
B b b b -"); - for (List sentence : sentences) { - for (Token token : sentence) { - System.out.printf("%d %d %s%n", token.getStartOffset(), token.getEndOffset(), - token.getWordForm()); - } - } - } -} diff --git a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java deleted file mode 100644 index 7bf1340740..0000000000 --- a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JDependencyParserTest.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.jupiter.api.Assumptions.assumeTrue; - -import org.apache.commons.lang3.ArrayUtils; -import org.apache.uima.fit.factory.AggregateBuilder; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.testing.AssertAnnotations; -import org.dkpro.core.testing.TestRunner; -import org.junit.jupiter.api.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; - -public class Nlp4JDependencyParserTest -{ - private static final String[] ENGLISH_DEPENDENCY_TAGS = { "acl", "acomp", "advcl", "advmod", - "agent", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", - "csubj", "csubjpass", "dative", "dep", "det", "discourse", "dobj", "expl", "mark", - "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass", "oprd", "parataxis", "pcomp", - "pobj", "poss", "preconj", "predet", "prep", "prt", "punct", "qmod", "relcl", "root", - "vocative", "xcomp" }; - - @Test - public void testEnglish() - throws Exception - { - long maxMemory = Runtime.getRuntime().maxMemory(); - assumeTrue(maxMemory > 3700000000l, "Insufficient max memory: " + maxMemory); - - JCas jcas = runTest("en", null, "We need a very complicated example sentence , which " - + "contains as many constituents and dependencies as possible ."); - - String[] dependencies = { - "[ 0, 2]NSUBJ(nsubj,basic) D[0,2](We) G[3,7](need)", - "[ 3, 7]ROOT(root,basic) D[3,7](need) G[3,7](need)", - "[ 8, 9]DET(det,basic) D[8,9](a) G[35,43](sentence)", - "[ 10, 14]ADVMOD(advmod,basic) D[10,14](very) G[15,26](complicated)", - "[ 15, 26]Dependency(nmod,basic) D[15,26](complicated) G[35,43](sentence)", - "[ 27, 34]Dependency(compound,basic) D[27,34](example) G[35,43](sentence)", - "[ 35, 43]DOBJ(dobj,basic) D[35,43](sentence) G[3,7](need)", - "[ 44, 45]PUNCT(punct,basic) D[44,45](,) G[35,43](sentence)", - "[ 46, 51]NSUBJ(nsubj,basic) D[46,51](which) G[52,60](contains)", - "[ 52, 60]Dependency(relcl,basic) D[52,60](contains) G[35,43](sentence)", - "[ 61, 63]PREP(prep,basic) D[61,63](as) G[52,60](contains)", - "[ 64, 68]Dependency(nmod,basic) D[64,68](many) G[69,81](constituents)", - "[ 69, 
81]POBJ(pobj,basic) D[69,81](constituents) G[61,63](as)", - "[ 82, 85]CC(cc,basic) D[82,85](and) G[69,81](constituents)", - "[ 86, 98]CONJ(conj,basic) D[86,98](dependencies) G[69,81](constituents)", - "[ 99,101]PREP(prep,basic) D[99,101](as) G[69,81](constituents)", - "[102,110]PCOMP(pcomp,basic) D[102,110](possible) G[99,101](as)", - "[111,112]PUNCT(punct,basic) D[111,112](.) G[3,7](need)" }; - - String[] unmappedDep = {}; - - AssertAnnotations.assertDependencies(dependencies, select(jcas, Dependency.class)); - AssertAnnotations.assertTagset(Dependency.class, "emory", ENGLISH_DEPENDENCY_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(Dependency.class, "emory", unmappedDep, jcas); - } - - private JCas runTest(String aLanguage, String aVariant, String aText, Object... aExtraParams) - throws Exception - { - AggregateBuilder aggregate = new AggregateBuilder(); - - Object[] params = new Object[] { - Nlp4JDependencyParser.PARAM_VARIANT, aVariant, - Nlp4JDependencyParser.PARAM_PRINT_TAGSET, true}; - params = ArrayUtils.addAll(params, aExtraParams); - aggregate.add(createEngineDescription(Nlp4JPosTagger.class)); - aggregate.add(createEngineDescription(Nlp4JDependencyParser.class, params)); - - return TestRunner.runTest(aggregate.createAggregateDescription(), aLanguage, aText); - } -} diff --git a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java deleted file mode 100644 index 7bd9da09d6..0000000000 --- a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JLemmatizerTest.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.testing.AssertAnnotations; -import org.dkpro.core.testing.TestRunner; -import org.junit.jupiter.api.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; - -public class Nlp4JLemmatizerTest -{ - @Test - public void testEnglish() - throws Exception - { - runTest("en", "This is a test .", - new String[] { "this", "be", "a", "test", "." }); - - runTest("en", "A neural net .", - new String[] { "a", "neural", "net", "." }); - - runTest("en", "John is purchasing oranges .", - new String[] { "john", "be", "purchase", "orange", "." 
}); - } - - private JCas runTest(String language, String testDocument, String[] aLemma) - throws Exception - { - AnalysisEngineDescription engine = createEngineDescription( - createEngineDescription(Nlp4JPosTagger.class), - createEngineDescription(Nlp4JLemmatizer.class)); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertLemma(aLemma, select(jcas, Lemma.class)); - - return jcas; - } -} diff --git a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java deleted file mode 100644 index 67ab3575c0..0000000000 --- a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JNamedEntityRecognizerTest.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.junit.jupiter.api.Assumptions.assumeTrue; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.testing.AssertAnnotations; -import org.dkpro.core.testing.TestRunner; -import org.junit.jupiter.api.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; - -public class Nlp4JNamedEntityRecognizerTest -{ - @Test - public void testEnglish() - throws Exception - { - long maxMemory = Runtime.getRuntime().maxMemory(); - assumeTrue(maxMemory > 3700000000l, "Insufficient max memory: " + maxMemory); - - // Run the test pipeline. Note the full stop at the end of a sentence is preceded by a - // whitespace. This is necessary for it to be detected as a separate token! - JCas jcas = runTest("en", null, "SAP where John Doe works is in Germany ."); - - // Define the reference data that we expect to get back from the test - String[] namedEntity = { - "[ 10, 18]NamedEntity(PERSON) (John Doe)", - "[ 31, 38]NamedEntity(GPE) (Germany)" }; - - // Compare the annotations created in the pipeline to the reference data - AssertAnnotations.assertNamedEntity(namedEntity, select(jcas, NamedEntity.class)); - } - - // Auxiliary method that sets up the analysis engine or pipeline used in the test. - // Typically, we have multiple tests per unit test file that each invoke this method. 
- private JCas runTest(String language, String variant, String testDocument) - throws Exception - { - AnalysisEngineDescription postagger = createEngineDescription(Nlp4JPosTagger.class); - AnalysisEngineDescription lemmatizer = createEngineDescription(Nlp4JLemmatizer.class); - AnalysisEngineDescription ner = createEngineDescription(Nlp4JNamedEntityRecognizer.class, - Nlp4JNamedEntityRecognizer.PARAM_VARIANT, variant, - Nlp4JNamedEntityRecognizer.PARAM_PRINT_TAGSET, true); - - AnalysisEngineDescription engine = createEngineDescription(postagger, lemmatizer, ner); - - // Here we invoke the TestRunner which performs basic whitespace tokenization and - // sentence splitting, creates a CAS, runs the pipeline, etc. TestRunner explicitly - // disables automatic model loading. Thus, models used in unit tests must be explicitly - // made dependencies in the pom.xml file. - return TestRunner.runTest(engine, language, testDocument); - } -} diff --git a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java deleted file mode 100644 index 0aa403cf83..0000000000 --- a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JPosTaggerTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.apache.uima.fit.util.JCasUtil.select; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.testing.AssertAnnotations; -import org.dkpro.core.testing.TestRunner; -import org.junit.jupiter.api.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; - -public class Nlp4JPosTaggerTest -{ - private static final String[] ENGLISH_POS_TAGS = { "$", "''", ",", "-LRB-", "-RRB-", ".", ":", - "ADD", "AFX", "CC", "CD", "DT", "EX", "FW", "GW", "HYPH", "IN", "JJ", "JJR", "JJS", - "LS", "MD", "NFP", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", - "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", - "WP", "WP$", "WRB", "XX", "``" }; - - private static final String[] ENGLISH_POS_UNMAPPED = {}; - - @Test - public void testEnglishDetail() - throws Exception - { - JCas jcas = runTest("en", null, "This is a test .", - new String[] { "DT", "VBZ", "DT", "NN", "." }, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - AssertAnnotations.assertTagset(POS.class, "ptb-emory", ENGLISH_POS_TAGS, jcas); - AssertAnnotations.assertTagsetMapping(POS.class, "ptb-emory", ENGLISH_POS_UNMAPPED, jcas); - } - - @Test - public void testEnglish() - throws Exception - { - runTest("en", null, "This is a test .", - new String[] { "DT", "VBZ", "DT", "NN", "." 
}, - new String[] { "POS_DET", "POS_VERB", "POS_DET", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "A neural net .", - new String[] { "DT", "JJ", "NN", "." }, - new String[] { "POS_DET", "POS_ADJ", "POS_NOUN", "POS_PUNCT" }); - - runTest("en", null, "John is purchasing oranges .", - new String[] { "NNP", "VBZ", "VBG", "NNS", "." }, - new String[] { "POS_PROPN", "POS_VERB", "POS_VERB", "POS_NOUN", "POS_PUNCT" }); - } - - private JCas runTest(String language, String variant, String testDocument, String[] tags, - String[] tagClasses) - throws Exception - { - AnalysisEngine engine = createEngine(Nlp4JPosTagger.class, - Nlp4JPosTagger.PARAM_VARIANT, variant, - Nlp4JPosTagger.PARAM_PRINT_TAGSET, true); - - JCas jcas = TestRunner.runTest(engine, language, testDocument); - - AssertAnnotations.assertPOS(tagClasses, tags, select(jcas, POS.class)); - - return jcas; - } -} diff --git a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JSegmenterTest.java b/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JSegmenterTest.java deleted file mode 100644 index 73272cd4fb..0000000000 --- a/dkpro-core-nlp4j-asl/src/test/java/org/dkpro/core/nlp4j/Nlp4JSegmenterTest.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2017 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.nlp4j; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; - -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.dkpro.core.testing.harness.SegmenterHarness; -import org.junit.jupiter.api.Test; - -public class Nlp4JSegmenterTest -{ - @Test - public void runHarness() - throws Throwable - { - AnalysisEngineDescription aed = createEngineDescription(Nlp4JSegmenter.class); - - SegmenterHarness.run(aed, "de.1", "de.2", "de.3", "de.4", "en.1", "en.7", "en.9", - "ar.1", "zh.1", "zh.2"); - } - - @Test - public void testZoning() throws Exception - { - SegmenterHarness.testZoning(Nlp4JSegmenter.class); - } -} diff --git a/dkpro-core-nlp4j-asl/src/test/resources/log4j2.xml b/dkpro-core-nlp4j-asl/src/test/resources/log4j2.xml deleted file mode 100644 index 31c71b9dc4..0000000000 --- a/dkpro-core-nlp4j-asl/src/test/resources/log4j2.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - - - - - - - - - - -