diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java index 5f958e6b..b985e890 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java @@ -55,8 +55,8 @@ public void setParameters(List parameters) } /** - * Returns the Position Span of this Template refering to the ContentElement in which the - * Template occures. This is mainly the same like Link.getPos(), but a Template does�n know it�s + * Returns the Position Span of this Template referring to the ContentElement in which the + * Template occurs. This is mainly the same as {@link Link#getPos()}, but a Template doesn't know it's * HomeElement. */ public Span getPos() @@ -72,14 +72,15 @@ public void setPos(Span pos) this.pos = pos; } + @Override public String toString() { StringBuilder result = new StringBuilder(); - result.append("TE_NAME: \"" + name + "\""); - result.append("\nTE_PARAMETERS: " + parameters.size()); + result.append("TE_NAME: \"").append(name).append("\""); + result.append("\nTE_PARAMETERS: ").append(parameters.size()); for (String parameter : parameters) - result.append("\nTE_PARAMETER: \"" + parameter + "\""); - result.append("\nTE_POS: " + pos); + result.append("\nTE_PARAMETER: \"").append(parameter).append("\""); + result.append("\nTE_POS: ").append(pos); return result.toString(); } } diff --git a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/BZip2Decompressor.java b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/BZip2Decompressor.java index 041b6513..de984c8e 100644 --- a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/BZip2Decompressor.java +++ b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/BZip2Decompressor.java @@ -18,9 +18,11 @@ package 
org.dkpro.jwpl.wikimachine.decompression; import java.io.BufferedInputStream; -import java.io.FileInputStream; +import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; @@ -35,19 +37,17 @@ public class BZip2Decompressor @Override public InputStream getInputStream(String fileName) throws IOException { - InputStream outputStream; - - BufferedInputStream inputStream = new BufferedInputStream(new FileInputStream(fileName)); - /* - * skip 2 first bytes (see the documentation of CBZip2InputStream) e.g. here - * http://lucene.apache.org/tika/xref/org/apache/tika/parser /pkg/bzip2 - * /CBZip2InputStream.html - */ - inputStream.skip(2); - outputStream = new BZip2CompressorInputStream(inputStream); - - return outputStream; - + final InputStream in; + if (fileName.contains(File.separator)) { + in = Files.newInputStream(Path.of(fileName).toAbsolutePath()); + } else { + in = getContextClassLoader().getResourceAsStream(fileName); + } + return new BZip2CompressorInputStream(new BufferedInputStream(in)); } + private ClassLoader getContextClassLoader() + { + return Thread.currentThread().getContextClassLoader(); + } } diff --git a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/GZipDecompressor.java b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/GZipDecompressor.java index 440bac13..3ff2d89e 100644 --- a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/GZipDecompressor.java +++ b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/GZipDecompressor.java @@ -17,9 +17,12 @@ */ package org.dkpro.jwpl.wikimachine.decompression; -import java.io.FileInputStream; +import java.io.BufferedInputStream; +import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Files; +import 
java.nio.file.Path; import java.util.zip.GZIPInputStream; /** @@ -33,7 +36,18 @@ public class GZipDecompressor @Override public InputStream getInputStream(String fileName) throws IOException { - return new GZIPInputStream(new FileInputStream(fileName)); + final InputStream in; + if (fileName.contains(File.separator)) { + in = Files.newInputStream(Path.of(fileName).toAbsolutePath()); + } else { + in = getContextClassLoader().getResourceAsStream(fileName); + } + return new GZIPInputStream(new BufferedInputStream(in)); + } + + private ClassLoader getContextClassLoader() + { + return Thread.currentThread().getContextClassLoader(); } } diff --git a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/UniversalDecompressor.java b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/UniversalDecompressor.java index 04deb284..53f107a6 100644 --- a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/UniversalDecompressor.java +++ b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/UniversalDecompressor.java @@ -22,22 +22,26 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; import java.util.HashMap; +import java.util.Map; import java.util.Properties; /** - * Factory to create java.io.InputStream depending on the filename's extension. If - * there are a supported archive type we decorate FileInputStream with special - * InputStream derivatives to uncompress it on the fly. Otherwise the possible + * Factory to create {@link InputStream} depending on the filename's extension. If + * there are a supported archive type we decorate {@link FileInputStream} with special + * {@link InputStream} derivatives to decompress it on the fly. Otherwise, the possible * compression will be ignored and the plain unmodified byte stream will be returned.
*
*

* Current supported archives are: GZip, BZip2. Each other archive type can be added using the file * "decompressor.xml" where you should specify the file extension as a key and the according utility * (incl. parameters), that have to be started. Please note that the unpack utility has to use the - * standard output and external unpack utilities are in preference to the internal. Also there could + * standard output and external unpack utilities are in preference to the internal. Also, there could * be more heap memory necessary to use start external programs. The compressed file should be - * specified with the place holder %f.
+ * specified with the placeholder %f.
* E.g. the entry for the 7z utility could look like that:
* {@code 7z e -so %f}. The properties file should conform to @@ -53,26 +57,23 @@ public class UniversalDecompressor * Placeholder for compressed file path in external command */ public static final String FILEPLACEHOLDER = "%f"; - /** - * File path to decompressor properties files - */ - private static final String PROPERTIES_PATH = "src/main/resources/decompressor.xml"; /** * Archive extensions which are supported by external utilities */ - private final HashMap externalSupport; + private final Map externalSupport; /** * Archive extensions which are supported by ReaderFactory */ - private final HashMap internalSupport; + private final Map internalSupport; /** * Check if the file extension is supported by the external utility * - * @param extension - * @return true if this extension is supported with external utilities + * @param extension The file extension to check for. + * @return {@code true} if this extension is supported with external utilities, + * {@code false} otherwise. */ private boolean isExternalSupported(String extension) { @@ -80,50 +81,83 @@ private boolean isExternalSupported(String extension) } /** - * Check if the file extension is supported by the internal IDecompressor + * Check if the file extension is supported by the internal {@link IDecompressor}. * - * @param extension - * @return + * @param extension The file extension to check for. + * @return {@code true} if supported with internal utilities, + * {@code false} otherwise. */ private boolean isInternalSupported(String extension) { return internalSupport.containsKey(extension); } - /** - * Don't let anyone instantiate this class - set the constructor to private - */ + /** + * Instantiates a {@link UniversalDecompressor} via bundled "decompressor.xml" + * file which is expected in the classpath of the surrounding environment.
+ */ public UniversalDecompressor() { internalSupport = new HashMap<>(); internalSupport.put("bz2", new BZip2Decompressor()); internalSupport.put("gz", new GZipDecompressor()); + externalSupport = new HashMap<>(); + ClassLoader cl = Thread.currentThread().getContextClassLoader(); + loadExternal(cl.getResourceAsStream("decompressor.xml")); + } + /** + * Instantiates a {@link UniversalDecompressor} via an external + * {@link Path} reference to a custom "decompressor.xml" file. + * + * @param externalXML A valid {@link Path} reference to a + * "decompressor.xml" file. + */ + public UniversalDecompressor(Path externalXML) + { + internalSupport = new HashMap<>(); + internalSupport.put("bz2", new BZip2Decompressor()); + internalSupport.put("gz", new GZipDecompressor()); externalSupport = new HashMap<>(); - loadExternal(); + loadExternal(externalXML); } /** - * Load the properties for external utilities from a XML file + * Load the properties for external utilities from an XML file. */ - private void loadExternal() + private void loadExternal(Path externalConfig) + { + try { + loadExternal(Files.newInputStream(externalConfig, StandardOpenOption.READ)); + } + catch (IOException ignore) { + // silently ignore it + } + } + + /** + * Load the properties for external utilities from an XML file + */ + private void loadExternal(InputStream externalConfig) { Properties properties = new Properties(); try { - properties.loadFromXML(new FileInputStream(PROPERTIES_PATH)); - for (String key : properties.stringPropertyNames()) { - externalSupport.put(key, properties.getProperty(key)); - } + properties.loadFromXML(externalConfig); + for (String key : properties.stringPropertyNames()) { + externalSupport.put(key, properties.getProperty(key)); + } } catch (IOException ignore) { + // silently ignore it } } + /** * Return the extension of the filename * * @param fileName - * that should be inputed + * that should be input * @return file extension or null */ private String 
getExtension(String fileName) @@ -145,7 +179,7 @@ private String getExtension(String fileName) * Check if the file is supported by the internal or external decompressor * * @param fileName - * @return true if the file extension is supported + * @return {@code true} if the file extension is supported, {@code false} otherwise. */ public boolean isSupported(String fileName) { @@ -170,6 +204,7 @@ private InputStream startExternal(String fileName) result = externalProcess.getInputStream(); } catch (IOException ignore) { + ignore.printStackTrace(); } return result; } @@ -193,36 +228,34 @@ private InputStream getDefault(String fileName) } /** - * Creates a InputStream where the unpacked data could be read from. Internal GZip and BZip2 + * Creates an {@link InputStream} where the unpacked data could be read from. Internal GZip and BZip2 * archive formats are supported. The list archive formats can be increased with settings file * decompressor.xml. Thereby *

* External decompression utilities are in preference to the internal. If there is nether * external nor internal possibilities to unpack the file - the standard - * FileInputSteam will be returned + * {@link FileInputStream} will be returned * * @see UniversalDecompressor */ @Override public InputStream getInputStream(String fileName) throws IOException { - InputStream inputStream = null; - if (fileExists(fileName)) { - String extension = getExtension(fileName); + InputStream inputStream; + String extension = getExtension(fileName); - if (isExternalSupported(extension)) { - inputStream = startExternal(fileName); - } - else if (isInternalSupported(extension)) { - inputStream = internalSupport.get(extension).getInputStream(fileName); - } - else { - inputStream = getDefault(fileName); - } + if (isExternalSupported(extension) && fileExists(fileName)) { + inputStream = startExternal(fileName); + } + else if (isInternalSupported(extension)) { + inputStream = internalSupport.get(extension).getInputStream(fileName); + } + else { + inputStream = getDefault(fileName); } return inputStream; } diff --git a/dkpro-jwpl-wikimachine/src/test/java/org/dkpro/jwpl/wikimachine/decompression/UniversalDecompressorTest.java b/dkpro-jwpl-wikimachine/src/test/java/org/dkpro/jwpl/wikimachine/decompression/UniversalDecompressorTest.java new file mode 100644 index 00000000..d7a25cdf --- /dev/null +++ b/dkpro-jwpl-wikimachine/src/test/java/org/dkpro/jwpl/wikimachine/decompression/UniversalDecompressorTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
+ * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.jwpl.wikimachine.decompression; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.condition.DisabledOnOs; +import org.junit.jupiter.api.condition.OS; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class UniversalDecompressorTest { + + private static final String EXPECTED_CONTENT = "This file is here to test decompression types."; + + @TempDir + private Path tmpDir; + + private UniversalDecompressor udc; + + @BeforeEach + public void setup() { + udc = new UniversalDecompressor(); + assertNotNull(udc); + } + + @ParameterizedTest + @ValueSource(strings = {"decompressor-ar.xml"}) + public void testInitializeWithExternalConfig(String input) throws IOException { + Path defaultTestConfig = Path.of("src/test/resources/" + input); + Path externalConfig = tmpDir.resolve(input); + /* Copy project local xml config file to external tmp path */ + Files.copy(defaultTestConfig, externalConfig, StandardCopyOption.REPLACE_EXISTING); + UniversalDecompressor udc = new UniversalDecompressor(externalConfig); + assertNotNull(udc); + 
assertTrue(udc.isSupported("archive.txt.ar")); + } + + @ParameterizedTest + @ValueSource(strings = {"archive.txt.gz", "archive.txt.bz2", "archive.txt.7z", + "src/test/resources/archive.txt.gz"}) + public void testIsSupported(String input) + { + assertTrue(udc.isSupported(input)); + } + + @ParameterizedTest + @ValueSource(strings = {"archive.txt.gz", "archive.txt.bz2", + "src/test/resources/archive.txt.gz", "src/test/resources/archive.txt.bz2"}) + public void testGetInputStream(String input) throws IOException { + try (BufferedInputStream in = new BufferedInputStream(udc.getInputStream(input))) { + assertNotNull(in); + String content = new String(in.readAllBytes(), StandardCharsets.UTF_8); + assertNotNull(content); + assertEquals(EXPECTED_CONTENT, content); + } + } + + @ParameterizedTest + @ValueSource(strings = {"src/test/resources/archive.txt.ar"}) + @DisabledOnOs(OS.WINDOWS) + public void testGetInputStreamWithExternalConfig(String input) throws IOException { + Path arConfig = Path.of("src/test/resources/decompressor-ar.xml"); + UniversalDecompressor udc = new UniversalDecompressor(arConfig); + assertNotNull(udc); + assertTrue(udc.isSupported(input)); + + try (InputStream in = udc.getInputStream(input)) { + assertNotNull(in); + String content = new String(in.readAllBytes(), StandardCharsets.UTF_8).trim(); + assertNotNull(content); + assertEquals(EXPECTED_CONTENT, content); + } + } +} diff --git a/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.ar b/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.ar new file mode 100644 index 00000000..f77be4dd --- /dev/null +++ b/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.ar @@ -0,0 +1,4 @@ +! +archive.txt 1699298743 501 20 100644 47 ` +This file is here to test decompression types. 
+ diff --git a/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.bz2 b/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.bz2 new file mode 100644 index 00000000..e1dc230e Binary files /dev/null and b/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.bz2 differ diff --git a/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.gz b/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.gz new file mode 100644 index 00000000..9a79a116 Binary files /dev/null and b/dkpro-jwpl-wikimachine/src/test/resources/archive.txt.gz differ diff --git a/dkpro-jwpl-wikimachine/src/test/resources/decompressor-ar.xml b/dkpro-jwpl-wikimachine/src/test/resources/decompressor-ar.xml new file mode 100755 index 00000000..1c0ff1e0 --- /dev/null +++ b/dkpro-jwpl-wikimachine/src/test/resources/decompressor-ar.xml @@ -0,0 +1,24 @@ + + + + + + Please define a supported extension with a key and the utility's path as a value. Use %f to define a compressed file path + ar p %f archive.txt + diff --git a/dkpro-jwpl-wikimachine/src/test/resources/decompressor.xml b/dkpro-jwpl-wikimachine/src/test/resources/decompressor.xml new file mode 100755 index 00000000..7a77520a --- /dev/null +++ b/dkpro-jwpl-wikimachine/src/test/resources/decompressor.xml @@ -0,0 +1,25 @@ + + + + + + Please define a supported extension with a key and the utility's path as a value. Use %f to define a compressed file path + 7z e -so %f + +