From 0988de80dc5dccdedd369f9901eb0f65dadb430b Mon Sep 17 00:00:00 2001 From: kmannthe Date: Sun, 30 Jan 2022 22:18:06 -0800 Subject: [PATCH] Set String and File IO types to be UTF_8 Define UTF_8 to be used for all GKL string and file related things Unit tests show no negative side effects of these changes. Signed-off-by: Keith Mannthey --- .../gkl/smithwaterman/IntelSmithWaterman.java | 3 +- .../gkl/compression/DeflaterProfile.java | 9 ++- .../gkl/compression/InflaterUnitTest.java | 5 +- .../intel/gkl/pairhmm/PairHmmUnitTest.java | 76 +++++++++++-------- .../smithwaterman/SmithWatermanUnitTest.java | 18 ++--- 5 files changed, 62 insertions(+), 49 deletions(-) diff --git a/src/main/java/com/intel/gkl/smithwaterman/IntelSmithWaterman.java b/src/main/java/com/intel/gkl/smithwaterman/IntelSmithWaterman.java index d7e35cab..cf383a20 100644 --- a/src/main/java/com/intel/gkl/smithwaterman/IntelSmithWaterman.java +++ b/src/main/java/com/intel/gkl/smithwaterman/IntelSmithWaterman.java @@ -35,6 +35,7 @@ import org.broadinstitute.gatk.nativebindings.smithwaterman.SWNativeAlignerResult; import java.io.File; +import java.nio.charset.StandardCharsets; /** * Provides a native SmithWaterman implementation accelerated for the Intel Architecture. 
@@ -157,7 +158,7 @@ public SWNativeAlignerResult align(byte[] refArray, byte[] altArray, SWParameter throw new IllegalArgumentException("Ran into invalid argument issue"); } - return new SWNativeAlignerResult(new String(cigar).trim(), offset); + return new SWNativeAlignerResult(new String(cigar,StandardCharsets.UTF_8).trim(), offset); } public byte getStrategy(SWOverhangStrategy strategy) diff --git a/src/test/java/com/intel/gkl/compression/DeflaterProfile.java b/src/test/java/com/intel/gkl/compression/DeflaterProfile.java index 91868ecc..5c170655 100644 --- a/src/test/java/com/intel/gkl/compression/DeflaterProfile.java +++ b/src/test/java/com/intel/gkl/compression/DeflaterProfile.java @@ -14,7 +14,8 @@ import java.util.Iterator; import java.util.List; import java.util.zip.Deflater; - +import java.io.FileOutputStream; +import java.io.OutputStreamWriter; /** * Integration and performance/compression profiling test for IntelDeflater */ @@ -52,7 +53,9 @@ public Deflater makeDeflater(final int compressionLevel, final boolean nowrap) { deflaterFactories.add(javaDeflaterFactory); // create profile log file - final FileWriter fileWriter = new FileWriter(profileFile); + final FileOutputStream fileStream = new FileOutputStream(profileFile); + final OutputStreamWriter fileWriter = new OutputStreamWriter(fileStream, "UTF-8"); +// final FileWriter fileWriter = new FileWriter(profileFile); try { fileWriter.write("level, time(sec), filesize\n"); } catch (IOException e) {System.err.println("Caught IOException: " + e.getMessage());} @@ -84,7 +87,7 @@ public Deflater makeDeflater(final int compressionLevel, final boolean nowrap) { } } try { - fileWriter.write(String.format("%d, %.3f, %d\n", + fileWriter.write(String.format("%d, %.3f, %d%n", compressionLevel, (totalTime/1000.0/loopCount), outputFile.length())); fileWriter.flush(); } catch (IOException e) {System.err.println("Caught IOException: " + e.getMessage());} diff --git 
a/src/test/java/com/intel/gkl/compression/InflaterUnitTest.java b/src/test/java/com/intel/gkl/compression/InflaterUnitTest.java index a888a168..a8a954c3 100644 --- a/src/test/java/com/intel/gkl/compression/InflaterUnitTest.java +++ b/src/test/java/com/intel/gkl/compression/InflaterUnitTest.java @@ -11,6 +11,7 @@ import java.io.File; import java.io.IOException; + import java.util.Arrays; import java.util.zip.DataFormatException; import java.util.zip.Deflater; @@ -129,7 +130,7 @@ public void inflateOutputBufferOverflowShortTest() throws DataFormatException, j int resultLength = inflater.inflate(result, 0 , result.length); inflater.end(); - String seq2 = new String(result, 0, resultLength); + String seq2 = new String(result, 0, resultLength, "UTF8"); log.info(String.format("UnCompressed length : %d Seq : %s" , seq2.length() , seq2)); Assert.assertEquals(sequence, seq2); } @@ -186,7 +187,7 @@ public void inflateNowrapFalseJavaTest() throws DataFormatException, java.io.Uns int resultLength = inflater.inflate(result, 0 , 1024); inflater.end(); - String seq2 = new String(result, 0, resultLength); + String seq2 = new String(result, 0, resultLength, "UTF8"); log.info(String.format("UnCompressed length : %d Seq : %s" , seq2.length() , seq2)); Assert.assertEquals(sequence, seq2); diff --git a/src/test/java/com/intel/gkl/pairhmm/PairHmmUnitTest.java b/src/test/java/com/intel/gkl/pairhmm/PairHmmUnitTest.java index 34e519bd..87e9a6f8 100644 --- a/src/test/java/com/intel/gkl/pairhmm/PairHmmUnitTest.java +++ b/src/test/java/com/intel/gkl/pairhmm/PairHmmUnitTest.java @@ -17,6 +17,10 @@ import java.io.FileNotFoundException; import java.io.FileReader; import java.util.Scanner; +import java.nio.file.Files; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.nio.file.Paths; public class PairHmmUnitTest { static final String pairHMMTestData = IntelGKLUtils.pathToTestResource("pairhmm-testdata.txt"); @@ -68,13 +72,13 @@ public void simpleTest() { // read 
data from file haplotypeDataArray[0] = new HaplotypeDataHolder(); - haplotypeDataArray[0].haplotypeBases = "ACGT".getBytes(); + haplotypeDataArray[0].haplotypeBases = "ACGT".getBytes(StandardCharsets.UTF_8); readDataArray[0] = new ReadDataHolder(); - readDataArray[0].readBases = "ACGT".getBytes(); - readDataArray[0].readQuals = "++++".getBytes(); - readDataArray[0].insertionGOP = "++++".getBytes(); - readDataArray[0].deletionGOP = "++++".getBytes(); - readDataArray[0].overallGCP = "++++".getBytes(); + readDataArray[0].readBases = "ACGT".getBytes(StandardCharsets.UTF_8); + readDataArray[0].readQuals = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].insertionGOP = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].deletionGOP = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].overallGCP = "++++".getBytes(StandardCharsets.UTF_8); double expectedResult = -6.022797e-01; // call pairHMM @@ -116,13 +120,13 @@ public void omp_Test() { // read data from file haplotypeDataArray[0] = new HaplotypeDataHolder(); - haplotypeDataArray[0].haplotypeBases = "ACGT".getBytes(); + haplotypeDataArray[0].haplotypeBases = "ACGT".getBytes(StandardCharsets.UTF_8); readDataArray[0] = new ReadDataHolder(); - readDataArray[0].readBases = "ACGT".getBytes(); - readDataArray[0].readQuals = "++++".getBytes(); - readDataArray[0].insertionGOP = "++++".getBytes(); - readDataArray[0].deletionGOP = "++++".getBytes(); - readDataArray[0].overallGCP = "++++".getBytes(); + readDataArray[0].readBases = "ACGT".getBytes(StandardCharsets.UTF_8); + readDataArray[0].readQuals = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].insertionGOP = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].deletionGOP = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].overallGCP = "++++".getBytes(StandardCharsets.UTF_8); double expectedResult = -6.022797e-01; // call pairHMM @@ -147,13 +151,13 @@ public void omp_Test() { // read data from file haplotypeDataArray[0] 
= new HaplotypeDataHolder(); - haplotypeDataArray[0].haplotypeBases = "ACGT".getBytes(); + haplotypeDataArray[0].haplotypeBases = "ACGT".getBytes(StandardCharsets.UTF_8); readDataArray[0] = new ReadDataHolder(); - readDataArray[0].readBases = "ACGT".getBytes(); - readDataArray[0].readQuals = "++++".getBytes(); - readDataArray[0].insertionGOP = "++++".getBytes(); - readDataArray[0].deletionGOP = "++++".getBytes(); - readDataArray[0].overallGCP = "++++".getBytes(); + readDataArray[0].readBases = "ACGT".getBytes(StandardCharsets.UTF_8); + readDataArray[0].readQuals = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].insertionGOP = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].deletionGOP = "++++".getBytes(StandardCharsets.UTF_8); + readDataArray[0].overallGCP = "++++".getBytes(StandardCharsets.UTF_8); double expectedResult = -6.022797e-01; // call pairHMM @@ -189,8 +193,11 @@ public void dataFileTest() { // read test data from file Scanner s = null; + BufferedReader r = null; try { - s = new Scanner(new BufferedReader(new FileReader(pairHMMTestData))); + Path Data = Paths.get(pairHMMTestData); + r = new BufferedReader(Files.newBufferedReader(Data, StandardCharsets.UTF_8)); + s = new Scanner(r); while (s.hasNext()) { // skip comment lines @@ -199,12 +206,12 @@ public void dataFileTest() { continue; } - haplotypeDataArray[0].haplotypeBases = s.next().getBytes(); - readDataArray[0].readBases = s.next().getBytes(); - readDataArray[0].readQuals = normalize(s.next().getBytes(), 6); - readDataArray[0].insertionGOP = normalize(s.next().getBytes()); - readDataArray[0].deletionGOP = normalize(s.next().getBytes()); - readDataArray[0].overallGCP = normalize(s.next().getBytes()); + haplotypeDataArray[0].haplotypeBases = s.next().getBytes(StandardCharsets.UTF_8); + readDataArray[0].readBases = s.next().getBytes(StandardCharsets.UTF_8); + readDataArray[0].readQuals = normalize(s.next().getBytes(StandardCharsets.UTF_8), 6); + readDataArray[0].insertionGOP = 
normalize(s.next().getBytes(StandardCharsets.UTF_8)); + readDataArray[0].deletionGOP = normalize(s.next().getBytes(StandardCharsets.UTF_8)); + readDataArray[0].overallGCP = normalize(s.next().getBytes(StandardCharsets.UTF_8)); double expectedResult = s.nextDouble(); // call pairHMM @@ -250,9 +257,12 @@ public void testDataFileBatchTest() { // read test data from file Scanner s = null; - try { - s = new Scanner(new BufferedReader(new FileReader(pairHMMTestData))); + BufferedReader r = null; + try { + Path Data = Paths.get(pairHMMTestData); + r = new BufferedReader(Files.newBufferedReader(Data, StandardCharsets.UTF_8)); + s = new Scanner(r); int batchSize = 0; while (s.hasNext()) { // skip comment lines @@ -261,12 +271,12 @@ public void testDataFileBatchTest() { continue; } - haplotypeDataArray[batchSize].haplotypeBases = s.next().getBytes(); - readDataArray[batchSize].readBases = s.next().getBytes(); - readDataArray[batchSize].readQuals = normalize(s.next().getBytes(), 6); - readDataArray[batchSize].insertionGOP = normalize(s.next().getBytes()); - readDataArray[batchSize].deletionGOP = normalize(s.next().getBytes()); - readDataArray[batchSize].overallGCP = normalize(s.next().getBytes()); + haplotypeDataArray[batchSize].haplotypeBases = s.next().getBytes(StandardCharsets.UTF_8); + readDataArray[batchSize].readBases = s.next().getBytes(StandardCharsets.UTF_8); + readDataArray[batchSize].readQuals = normalize(s.next().getBytes(StandardCharsets.UTF_8), 6); + readDataArray[batchSize].insertionGOP = normalize(s.next().getBytes(StandardCharsets.UTF_8)); + readDataArray[batchSize].deletionGOP = normalize(s.next().getBytes(StandardCharsets.UTF_8)); + readDataArray[batchSize].overallGCP = normalize(s.next().getBytes(StandardCharsets.UTF_8)); expectedResult[batchSize] = s.nextDouble(); log.info(String.format("expected[%d] = %e ", batchSize, expectedResult[batchSize])); batchSize++; diff --git a/src/test/java/com/intel/gkl/smithwaterman/SmithWatermanUnitTest.java 
b/src/test/java/com/intel/gkl/smithwaterman/SmithWatermanUnitTest.java index 0c545d48..8cca3bfe 100644 --- a/src/test/java/com/intel/gkl/smithwaterman/SmithWatermanUnitTest.java +++ b/src/test/java/com/intel/gkl/smithwaterman/SmithWatermanUnitTest.java @@ -11,6 +11,10 @@ import java.io.*; import java.util.Arrays; +import java.nio.file.Files; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.nio.file.Paths; import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.Log; @@ -35,10 +39,8 @@ public void inputDataTest() { } try { - - final File inputFile = new File(smithwatermanData); - final FileReader input = new FileReader(inputFile); - final BufferedReader in = new BufferedReader(input); + final Path Data = Paths.get(smithwatermanData); + final BufferedReader in = new BufferedReader(Files.newBufferedReader(Data, StandardCharsets.UTF_8)); String refString = new String(""), altString = new String(""); SWParameters SWparameters = new SWParameters(200, -150, -260, -11); @@ -213,12 +215,9 @@ public void simpleTest() { smithWaterman.close(); throw new SkipException(err); } - try { - - final File inputFile = new File(smithwatermanData); - final FileReader input = new FileReader(inputFile); - final BufferedReader in = new BufferedReader(input); + final Path Data = Paths.get(smithwatermanData); + final BufferedReader in = new BufferedReader(Files.newBufferedReader(Data, StandardCharsets.UTF_8)); byte[] ref; byte[] alt; @@ -250,7 +249,6 @@ public void simpleTest() { } in.close(); - input.close(); } catch (java.io.IOException e) { e.printStackTrace(); }