diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 397c1cc8f..e4f401e5a 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest] + os: [ubuntu-latest, windows-latest] jdk: [17] runs-on: ${{ matrix.os }} diff --git a/dkpro-core-api-embeddings-asl/src/main/java/org/dkpro/core/api/embeddings/binary/BinaryVectorizer.java b/dkpro-core-api-embeddings-asl/src/main/java/org/dkpro/core/api/embeddings/binary/BinaryVectorizer.java index 457b885ef..fb1a9da25 100644 --- a/dkpro-core-api-embeddings-asl/src/main/java/org/dkpro/core/api/embeddings/binary/BinaryVectorizer.java +++ b/dkpro-core-api-embeddings-asl/src/main/java/org/dkpro/core/api/embeddings/binary/BinaryVectorizer.java @@ -15,6 +15,8 @@ */ package org.dkpro.core.api.embeddings.binary; +import static java.nio.channels.FileChannel.MapMode.READ_ONLY; + import java.io.DataInput; import java.io.DataOutputStream; import java.io.File; @@ -23,7 +25,6 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.FloatBuffer; -import java.nio.channels.FileChannel; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Locale; @@ -38,7 +39,7 @@ * @see BinaryWordVectorUtils */ public class BinaryVectorizer - implements Vectorizer + implements Vectorizer { private static final Logger LOG = LoggerFactory.getLogger(BinaryVectorizer.class); private final String[] words; @@ -51,7 +52,7 @@ public class BinaryVectorizer private BinaryVectorizer(Header aHeader, RandomAccessFile aFile, String[] aWords, long vectorStartOffset, float[] aUnk) - throws IOException + throws IOException { file = aFile; header = aHeader; @@ -71,7 +72,7 @@ private BinaryVectorizer(Header aHeader, RandomAccessFile aFile, String[] aWords } parts = new FloatBuffer[neededPartitions]; - FileChannel channel = aFile.getChannel(); + var channel = aFile.getChannel(); for (int i = 0; i < neededPartitions; i++) { long start = vectorStartOffset + ((long) i * maxPartitionSizeBytes); long length = maxPartitionSizeBytes; @@ -79,12 +80,13 @@ private BinaryVectorizer(Header aHeader, RandomAccessFile aFile, String[] aWords length = (aWords.length % maxVectorsPerPartition) * header.getVectorLength() * Float.BYTES; } - parts[i] = channel.map(FileChannel.MapMode.READ_ONLY, start, length).asFloatBuffer(); + parts[i] = channel.map(READ_ONLY, start, length).asFloatBuffer(); } } - + @Override - public void close() throws IOException { + public void close() throws IOException + { if (file != null) { file.close(); } @@ -93,12 +95,13 @@ public void close() throws IOException { /** * Load a binary embeddings file and return a new {@link BinaryVectorizer} object. * - * @param f a {@link File} + * @param f + * a {@link File} * @return a new {@link BinaryVectorizer} - * @throws IOException if an I/O error occurs + * @throws IOException + * if an I/O error occurs */ - public static BinaryVectorizer load(File f) - throws IOException + public static BinaryVectorizer load(File f) throws IOException { var file = new RandomAccessFile(f, "rw"); @@ -115,7 +118,8 @@ public static BinaryVectorizer load(File f) // Load UNK vector byte[] buffer = new byte[header.getVectorLength() * Float.BYTES]; file.readFully(buffer); - ByteBuffer byteBuffer = ByteBuffer.wrap(buffer); + + var byteBuffer = ByteBuffer.wrap(buffer); float[] unk = new float[header.getVectorLength()]; for (int i = 0; i < unk.length; i++) { unk[i] = byteBuffer.getFloat(i * Float.BYTES); @@ -126,8 +130,8 @@ public static BinaryVectorizer load(File f) return new BinaryVectorizer(header, file, words, offset, unk); } - @Override public float[] vectorize(String aWord) - throws IOException + @Override + public float[] vectorize(String aWord) throws IOException { String word = aWord; if (header.isCaseless()) { @@ -156,7 +160,8 @@ public static BinaryVectorizer load(File f) return vector; } - @Override public boolean contains(String aWord) + @Override + public boolean contains(String aWord) { String word = aWord; if (header.isCaseless()) { @@ -166,22 +171,26 @@ public static BinaryVectorizer load(File f) return Arrays.binarySearch(words, word) >= 0; } - @Override public float[] unknownVector() + @Override + public float[] unknownVector() { return unknownVector; } - @Override public int dimensions() + @Override + public int dimensions() { return header.getVectorLength(); } - @Override public int size() + @Override + public int size() { return header.getWordCount(); } - @Override public boolean isCaseless() + @Override + public boolean isCaseless() { return header.isCaseless(); } @@ -195,8 +204,7 @@ static class Header private boolean caseless; private String locale; - public static Header read(DataInput aInput) - throws IOException + public static Header read(DataInput aInput) throws IOException { byte[] magicBytes = new byte[MAGIC.length()]; aInput.readFully(magicBytes); @@ -273,8 +281,7 @@ public void setVectorLength(int vectorLength) this.vectorLength = vectorLength; } - public void write(OutputStream aOutput) - throws IOException + public void write(OutputStream aOutput) throws IOException { DataOutputStream out = new DataOutputStream(aOutput); diff --git a/dkpro-core-api-embeddings-asl/src/main/java/org/dkpro/core/api/embeddings/binary/BinaryWordVectorUtils.java b/dkpro-core-api-embeddings-asl/src/main/java/org/dkpro/core/api/embeddings/binary/BinaryWordVectorUtils.java index ef856e33d..9507197cc 100644 --- a/dkpro-core-api-embeddings-asl/src/main/java/org/dkpro/core/api/embeddings/binary/BinaryWordVectorUtils.java +++ b/dkpro-core-api-embeddings-asl/src/main/java/org/dkpro/core/api/embeddings/binary/BinaryWordVectorUtils.java @@ -21,7 +21,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.FloatBuffer; import java.util.Locale; import java.util.Map; @@ -48,13 +47,16 @@ public class BinaryWordVectorUtils * Write a map of token embeddings into binary format. Uses the default locale {@link Locale#US} * and assume case-sensitivity iff there is any token containing an uppercase letter. * - * @param vectors a {@code Map} holding all tokens with embeddings - * @param binaryTarget the target file {@link File} - * @throws IOException if an I/O error occurs + * @param vectors + * a {@code Map} holding all tokens with embeddings + * @param binaryTarget + * the target file {@link File} + * @throws IOException + * if an I/O error occurs * @see #convertWordVectorsToBinary(Map, boolean, Locale, File) */ public static void convertWordVectorsToBinary(Map vectors, File binaryTarget) - throws IOException + throws IOException { boolean caseless = vectors.keySet().stream() .allMatch(token -> token.equals(token.toLowerCase())); @@ -64,15 +66,20 @@ public static void convertWordVectorsToBinary(Map vectors, File /** * Write a map of token embeddings into binary format. * - * @param vectors a {@code Map} holding all tokens with embeddings - * @param aCaseless if true, tokens are expected to be caseless - * @param aLocale the {@link Locale} - * @param binaryTarget the target file {@link File} - * @throws IOException if an I/O error occurs + * @param vectors + * a {@code Map} holding all tokens with embeddings + * @param aCaseless + * if true, tokens are expected to be caseless + * @param aLocale + * the {@link Locale} + * @param binaryTarget + * the target file {@link File} + * @throws IOException + * if an I/O error occurs */ public static void convertWordVectorsToBinary(Map vectors, boolean aCaseless, Locale aLocale, File binaryTarget) - throws IOException + throws IOException { if (vectors.isEmpty()) { throw new IllegalArgumentException("Word embeddings map must not be empty."); @@ -82,47 +89,44 @@ public static void convertWordVectorsToBinary(Map vectors, bool assert vectors.values().stream().allMatch(v -> v.length == vectorLength); Header header = prepareHeader(aCaseless, aLocale, vectors.size(), vectorLength); - DataOutputStream output = new DataOutputStream( - new BufferedOutputStream(new FileOutputStream(binaryTarget))); - header.write(output); + try (var output = new DataOutputStream( + new BufferedOutputStream(new FileOutputStream(binaryTarget)))) { + header.write(output); - LOG.info("Sorting data..."); - String[] words = vectors.keySet().stream() - .sorted() - .toArray(String[]::new); + LOG.info("Sorting data..."); + String[] words = vectors.keySet().stream().sorted().toArray(String[]::new); - LOG.info("Writing strings..."); - for (String word : words) { - output.writeUTF(word); - } + LOG.info("Writing strings..."); + for (String word : words) { + output.writeUTF(word); + } - LOG.info("Writing UNK vector..."); - { - float[] vector = VectorizerUtils.randomVector(header.getVectorLength()); - writeVector(output, vector); - } + LOG.info("Writing UNK vector..."); + { + float[] vector = VectorizerUtils.randomVector(header.getVectorLength()); + writeVector(output, vector); + } - LOG.info("Writing vectors..."); - for (String word : words) { - float[] vector = vectors.get(word); - writeVector(output, vector); + LOG.info("Writing vectors..."); + for (String word : words) { + float[] vector = vectors.get(word); + writeVector(output, vector); + } } - output.close(); } - private static void writeVector(DataOutputStream output, float[] vector) - throws IOException + private static void writeVector(DataOutputStream output, float[] vector) throws IOException { - ByteBuffer buffer = ByteBuffer.allocate(vector.length * Float.BYTES); - FloatBuffer floatBuffer = buffer.asFloatBuffer(); + var buffer = ByteBuffer.allocate(vector.length * Float.BYTES); + var floatBuffer = buffer.asFloatBuffer(); floatBuffer.put(vector); output.write(buffer.array()); } - private static Header prepareHeader(boolean aCaseless, - Locale aLocale, int wordCount, int vectorLength) + private static Header prepareHeader(boolean aCaseless, Locale aLocale, int wordCount, + int vectorLength) { - Header header = new Header(); + var header = new Header(); header.setVersion(1); header.setWordCount(wordCount); header.setVectorLength(vectorLength); diff --git a/dkpro-core-api-embeddings-asl/src/test/java/org/dkpro/core/api/embeddings/binary/BinaryWordVectorUtilsTest.java b/dkpro-core-api-embeddings-asl/src/test/java/org/dkpro/core/api/embeddings/binary/BinaryWordVectorUtilsTest.java index 6b51fff40..900daadd4 100644 --- a/dkpro-core-api-embeddings-asl/src/test/java/org/dkpro/core/api/embeddings/binary/BinaryWordVectorUtilsTest.java +++ b/dkpro-core-api-embeddings-asl/src/test/java/org/dkpro/core/api/embeddings/binary/BinaryWordVectorUtilsTest.java @@ -32,12 +32,16 @@ import org.dkpro.core.api.embeddings.VectorizerUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.DisabledOnOs; +import org.junit.jupiter.api.condition.OS; import org.junit.jupiter.api.io.TempDir; +@DisabledOnOs(value = OS.WINDOWS, // + disabledReason = "mmapped buffers cannot be unmapped explicitly, so we cannot delete the temp dir on Windows") public class BinaryWordVectorUtilsTest { private @TempDir File tempDir; - + // TODO: test for very large data (>2GB should be chunked) private Map vectors; @@ -50,30 +54,28 @@ public void setUp() } @Test - public void testConvertWordVectorsToBinary() - throws Exception + public void testConvertWordVectorsToBinary() throws Exception { var binaryTarget = writeBinaryFile(vectors); - + try (var vec = BinaryVectorizer.load(binaryTarget)) { assertThat(vec.contains("t1")).isTrue(); assertThat(vec.contains("t2")).isTrue(); assertThat(vec.dimensions()).isEqualTo(3); assertThat(vec.size()).isEqualTo(2); assertThat(vec.isCaseless()).isTrue(); - + for (var word : vectors.keySet()) { var orig = vectors.get(word); var conv = vec.vectorize(word); - + assertThat(conv).containsExactly(orig); } } } @Test - public void testConvertWordVectorsToBinaryCaseSensitive() - throws Exception + public void testConvertWordVectorsToBinaryCaseSensitive() throws Exception { vectors.put("T1", new float[] { 0.1f, 0.2f, 0.3f }); var binaryTarget = writeBinaryFile(vectors); @@ -97,20 +99,19 @@ public void testConvertWordVectorsToBinaryCaseSensitive() } @Test - public void testRandomVector() - throws IOException + public void testRandomVector() throws IOException { var binaryTarget = writeBinaryFile(vectors); try (var vec = BinaryVectorizer.load(binaryTarget)) { var randVector = VectorizerUtils.randomVector(3); - + var unk1 = vec.vectorize("unk1"); var unk2 = vec.vectorize("unk2"); assertTrue(Arrays.equals(randVector, unk1)); assertTrue(Arrays.equals(randVector, unk2)); - assertTrue( - Arrays.equals(unk1, unk2), "Vectors or unknown words should always be the same."); + assertTrue(Arrays.equals(unk1, unk2), + "Vectors or unknown words should always be the same."); } } @@ -118,10 +119,10 @@ public void testRandomVector() * Write a binary vectors file to a testContext-dependent location. * * @return the binary vectors {@link File} - * @throws IOException if an I/O error occurs + * @throws IOException + * if an I/O error occurs */ - private File writeBinaryFile(Map vectors) - throws IOException + private File writeBinaryFile(Map vectors) throws IOException { var binaryTarget = new File(tempDir, "binaryTarget"); convertWordVectorsToBinary(vectors, binaryTarget); diff --git a/dkpro-core-api-embeddings-asl/src/test/java/org/dkpro/core/api/embeddings/text/TextFormatVectorizerTest.java b/dkpro-core-api-embeddings-asl/src/test/java/org/dkpro/core/api/embeddings/text/TextFormatVectorizerTest.java index cfa4d7bda..e6f82d9f2 100644 --- a/dkpro-core-api-embeddings-asl/src/test/java/org/dkpro/core/api/embeddings/text/TextFormatVectorizerTest.java +++ b/dkpro-core-api-embeddings-asl/src/test/java/org/dkpro/core/api/embeddings/text/TextFormatVectorizerTest.java @@ -27,25 +27,28 @@ import org.dkpro.core.api.embeddings.Vectorizer; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.DisabledOnOs; +import org.junit.jupiter.api.condition.OS; +@DisabledOnOs(value = OS.WINDOWS, // + disabledReason = "mmapped buffers cannot be unmapped explicitly, so we cannot delete the temp dir on Windows") public class TextFormatVectorizerTest { @Test - public void testVectorizer() - throws Exception + public void testVectorizer() throws Exception { File modelFile = new File("src/test/resources/dummy.vec"); Vectorizer vectorizer = TextFormatVectorizer.load(modelFile); int expectedSize = 699; int expectedDimensions = 50; - float[] expectedVectorHer = new float[] { -0.003060f, 0.003507f, - -0.008743f, -0.002152f, -0.004767f, -0.007613f, 0.004302f, 0.002171f, -0.002029f, - 0.001279f, 0.002584f, 0.002896f, 0.006834f, 0.000398f, 0.005685f, -0.006861f, - -0.005104f, -0.006102f, 0.001795f, -0.005347f, 0.006562f, -0.009437f, -0.005975f, - -0.007835f, 0.000151f, 0.008032f, -0.004748f, 0.006110f, -0.008335f, -0.005110f, - -0.004147f, 0.005215f, -0.009278f, -0.008693f, -0.004793f, -0.006631f, 0.005200f, - 0.003343f, -0.002542f, 0.006161f, 0.009828f, -0.001308f, 0.004804f, 0.001710f, - 0.005781f, 0.002312f, -0.002556f, 0.007643f, 0.003270f, -0.000747f }; + float[] expectedVectorHer = new float[] { -0.003060f, 0.003507f, -0.008743f, -0.002152f, + -0.004767f, -0.007613f, 0.004302f, 0.002171f, -0.002029f, 0.001279f, 0.002584f, + 0.002896f, 0.006834f, 0.000398f, 0.005685f, -0.006861f, -0.005104f, -0.006102f, + 0.001795f, -0.005347f, 0.006562f, -0.009437f, -0.005975f, -0.007835f, 0.000151f, + 0.008032f, -0.004748f, 0.006110f, -0.008335f, -0.005110f, -0.004147f, 0.005215f, + -0.009278f, -0.008693f, -0.004793f, -0.006631f, 0.005200f, 0.003343f, -0.002542f, + 0.006161f, 0.009828f, -0.001308f, 0.004804f, 0.001710f, 0.005781f, 0.002312f, + -0.002556f, 0.007643f, 0.003270f, -0.000747f }; float[] expectedVectorPartiality = new float[] { 0.003056f, -0.004063f, 0.008095f, 0.008563f, -0.004409f, -0.000555f, 0.002892f, -0.003428f, -0.009526f, 0.005398f, 0.005198f, 0.000784f, 0.000739f, -0.002909f, -0.000911f, 0.001754f, 0.000432f, @@ -63,8 +66,7 @@ public void testVectorizer() } @Test - public void testCaseless() - throws IOException + public void testCaseless() throws IOException { File modelFile = new File("src/test/resources/dummy_lowercased.vec"); Vectorizer vectorizer = TextFormatVectorizer.load(modelFile); diff --git a/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java index 99c08a69e..c855f9bb9 100644 --- a/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java +++ b/dkpro-core-mallet-asl/src/main/java/org/dkpro/core/mallet/wordembeddings/MalletEmbeddingsAnnotator.java @@ -133,6 +133,19 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti } } + @Override + public void destroy() + { + if (vectorizer != null) { + try { + vectorizer.close(); + } + catch (Exception e) { + getLogger().error("Error while closing vectorizer", e); + } + } + } + @Override public void process(JCas aJCas) throws AnalysisEngineProcessException {