diff --git a/catalog/transformer/catalog-transformer-pdf/src/main/java/ddf/catalog/transformer/input/pdf/PdfInputTransformer.java b/catalog/transformer/catalog-transformer-pdf/src/main/java/ddf/catalog/transformer/input/pdf/PdfInputTransformer.java index ced91d0cb2ef..252118d23734 100644 --- a/catalog/transformer/catalog-transformer-pdf/src/main/java/ddf/catalog/transformer/input/pdf/PdfInputTransformer.java +++ b/catalog/transformer/catalog-transformer-pdf/src/main/java/ddf/catalog/transformer/input/pdf/PdfInputTransformer.java @@ -17,6 +17,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.Calendar; import java.util.Collection; import java.util.Collections; @@ -167,15 +168,18 @@ private Metacard transformWithExtractors(InputStream input, String id) e); } - String plainText = null; + InputStream plainText = null; try (InputStream isCopy = fbos.asByteSource() .openStream()) { Parser parser = new AutoDetectParser(); - ContentHandler contentHandler = new ToTextContentHandler(); - TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser, - contentHandler); - tikaMetadataExtractor.parseMetadata(isCopy, new ParseContext()); - plainText = contentHandler.toString(); + try (TemporaryFileBackedOutputStream contentHandlerStream = new TemporaryFileBackedOutputStream()) { + ContentHandler contentHandler = new ToTextContentHandler(contentHandlerStream, + StandardCharsets.UTF_8.toString()); + TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser, + contentHandler); + tikaMetadataExtractor.parseMetadata(isCopy, new ParseContext()); + plainText = contentHandlerStream.asByteSource().openStream(); + } } catch (CatalogTransformerException e) { LOGGER.warn("Cannot extract metadata from pdf", e); } @@ -195,10 +199,16 @@ private MetacardImpl initializeMetacard(String id) { return initializeMetacard(id, null); } - private MetacardImpl initializeMetacard(String id, String contentInput) { + private MetacardImpl initializeMetacard(String id, InputStream contentInput) { MetacardImpl metacard; - if (StringUtils.isNotBlank(contentInput)) { + if (contentInput != null && !contentMetadataExtractors.isEmpty()) { + String content = null; + try { + content = IOUtils.toString(contentInput, StandardCharsets.UTF_8); + } catch (IOException e) { + LOGGER.debug("Unable to read content for PDF.", e); + } Set attributes = contentMetadataExtractors.values() .stream() .map(ContentMetadataExtractor::getMetacardAttributes) @@ -210,7 +220,7 @@ private MetacardImpl initializeMetacard(String id, String contentInput) { attributes)); for (ContentMetadataExtractor contentMetadataExtractor : contentMetadataExtractors.values()) { - contentMetadataExtractor.process(contentInput, metacard); + contentMetadataExtractor.process(content, metacard); } } else { metacard = new MetacardImpl(metacardType); @@ -228,7 +238,7 @@ private Metacard transformPdf(String id, PDDocument pdfDocument) throws IOExcept return transformPdf(id, pdfDocument, null); } - private Metacard transformPdf(String id, PDDocument pdfDocument, String contentInput) + private Metacard transformPdf(String id, PDDocument pdfDocument, InputStream contentInput) throws IOException { MetacardImpl metacard = initializeMetacard(id, contentInput); diff --git a/catalog/transformer/catalog-transformer-tika-input/src/main/java/ddf/catalog/transformer/input/tika/TikaInputTransformer.java b/catalog/transformer/catalog-transformer-tika-input/src/main/java/ddf/catalog/transformer/input/tika/TikaInputTransformer.java index adf6d4c553d2..e1c351b86794 100644 --- a/catalog/transformer/catalog-transformer-tika-input/src/main/java/ddf/catalog/transformer/input/tika/TikaInputTransformer.java +++ b/catalog/transformer/catalog-transformer-tika-input/src/main/java/ddf/catalog/transformer/input/tika/TikaInputTransformer.java @@ -19,9 +19,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.StringReader; -import java.io.StringWriter; -import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -354,42 +352,55 @@ public Metacard transform(InputStream input, String id) } Parser parser = new AutoDetectParser(); - ToXMLContentHandler xmlContentHandler = new ToXMLContentHandler(); - ToTextContentHandler textContentHandler = null; - ContentHandler contentHandler; - if (!contentExtractors.isEmpty()) { - textContentHandler = new ToTextContentHandler(); - contentHandler = new TeeContentHandler(xmlContentHandler, textContentHandler); - } else { - contentHandler = xmlContentHandler; - } - - TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser, - contentHandler); - Metadata metadata; - try (InputStream inputStreamCopy = fileBackedOutputStream.asByteSource() - .openStream()) { - metadata = tikaMetadataExtractor.parseMetadata(inputStreamCopy, new ParseContext()); - } - - String metadataText = xmlContentHandler.toString(); - if (templates != null) { - metadataText = transformToXml(metadataText); - } - - String contentType = metadata.get(Metadata.CONTENT_TYPE); - MetacardType metacardType = mergeAttributes(getMetacardType(contentType)); - Metacard metacard = MetacardCreator.createMetacard(metadata, - id, - metadataText, - metacardType, - useResourceTitleAsTitle); + String metadataText; + ToTextContentHandler textContentHandler = null; + Metacard metacard; + String contentType; + try (TemporaryFileBackedOutputStream textContentHandlerOutStream = new TemporaryFileBackedOutputStream()) { + try (TemporaryFileBackedOutputStream xmlContentHandlerOutStream = new TemporaryFileBackedOutputStream()) { + ToXMLContentHandler xmlContentHandler = new ToXMLContentHandler( + xmlContentHandlerOutStream, + StandardCharsets.UTF_8.toString()); + ContentHandler contentHandler; + if (!contentExtractors.isEmpty()) { + textContentHandler = new ToTextContentHandler(textContentHandlerOutStream, + StandardCharsets.UTF_8.toString()); + contentHandler = new TeeContentHandler(xmlContentHandler, + textContentHandler); + } else { + contentHandler = xmlContentHandler; + } + + TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser, + contentHandler); + + try (InputStream inputStreamCopy = fileBackedOutputStream.asByteSource() + .openStream()) { + metadata = tikaMetadataExtractor.parseMetadata(inputStreamCopy, + new ParseContext()); + } + + if (templates != null) { + metadataText = transformToXml(xmlContentHandlerOutStream); + } else { + metadataText = xmlContentHandler.toString(); + } + } - if (textContentHandler != null) { - String plainText = textContentHandler.toString(); - for (ContentMetadataExtractor contentMetadataExtractor : contentExtractors.values()) { - contentMetadataExtractor.process(plainText, metacard); + contentType = metadata.get(Metadata.CONTENT_TYPE); + MetacardType metacardType = mergeAttributes(getMetacardType(contentType)); + metacard = MetacardCreator.createMetacard(metadata, + id, + metadataText, + metacardType, + useResourceTitleAsTitle); + + if (textContentHandler != null && !contentExtractors.isEmpty()) { + String plainText = textContentHandler.toString(); + for (ContentMetadataExtractor contentMetadataExtractor : contentExtractors.values()) { + contentMetadataExtractor.process(plainText, metacard); + } } } @@ -628,7 +639,7 @@ private void createThumbnail(InputStream input, Metacard metacard) { } } - private String transformToXml(String xhtml) { + private String transformToXml(TemporaryFileBackedOutputStream xhtml) { LOGGER.debug("Transforming xhtml to xml."); XMLReader xmlReader = null; @@ -639,17 +650,27 @@ private String transformToXml(String xhtml) { LOGGER.debug(e.getMessage(), e); } if (xmlReader != null) { - try { - Writer xml = new StringWriter(); + try (TemporaryFileBackedOutputStream xmlOutStream = new TemporaryFileBackedOutputStream(); + InputStream xhtmlInStream = xhtml.asByteSource() + .openStream()) { Transformer transformer = templates.newTransformer(); - transformer.transform(new SAXSource(xmlReader, - new InputSource(new StringReader(xhtml))), new StreamResult(xml)); - return xml.toString(); - } catch (TransformerException e) { + transformer.transform(new SAXSource(xmlReader, new InputSource(xhtmlInStream)), + new StreamResult(xmlOutStream)); + //we should not be doing this and should be returning the stream instead + try (InputStream resultStream = xmlOutStream.asByteSource() + .openStream()) { + return IOUtils.toString(resultStream, StandardCharsets.UTF_8); + } + } catch (IOException | TransformerException e) { LOGGER.debug("Unable to transform metadata from XHTML to XML.", e); } } - return xhtml; + try (InputStream xhtmlStream = xhtml.asByteSource().openStream()) { + return IOUtils.toString(xhtmlStream, StandardCharsets.UTF_8); + } catch (IOException e) { + LOGGER.debug("Unable to read data from XHTML stream.", e); + } + return ""; } Bundle getBundle() {