Skip to content

Commit

Permalink
DDF-3178 reduced the number of redundant memory copies that the tika input transformer was doing (#2164)
Browse files Browse the repository at this point in the history
  • Loading branch information
stustison authored Jul 31, 2017
1 parent 27e24bc commit 4c0fdec
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -167,15 +168,18 @@ private Metacard transformWithExtractors(InputStream input, String id)
e);
}

String plainText = null;
InputStream plainText = null;
try (InputStream isCopy = fbos.asByteSource()
.openStream()) {
Parser parser = new AutoDetectParser();
ContentHandler contentHandler = new ToTextContentHandler();
TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser,
contentHandler);
tikaMetadataExtractor.parseMetadata(isCopy, new ParseContext());
plainText = contentHandler.toString();
try (TemporaryFileBackedOutputStream contentHandlerStream = new TemporaryFileBackedOutputStream()) {
ContentHandler contentHandler = new ToTextContentHandler(contentHandlerStream,
StandardCharsets.UTF_8.toString());
TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser,
contentHandler);
tikaMetadataExtractor.parseMetadata(isCopy, new ParseContext());
plainText = contentHandlerStream.asByteSource().openStream();
}
} catch (CatalogTransformerException e) {
LOGGER.warn("Cannot extract metadata from pdf", e);
}
Expand All @@ -195,10 +199,16 @@ private MetacardImpl initializeMetacard(String id) {
return initializeMetacard(id, null);
}

private MetacardImpl initializeMetacard(String id, String contentInput) {
private MetacardImpl initializeMetacard(String id, InputStream contentInput) {
MetacardImpl metacard;

if (StringUtils.isNotBlank(contentInput)) {
if (contentInput != null && !contentMetadataExtractors.isEmpty()) {
String content = null;
try {
content = IOUtils.toString(contentInput, StandardCharsets.UTF_8);
} catch (IOException e) {
LOGGER.debug("Unable to read content for PDF.", e);
}
Set<AttributeDescriptor> attributes = contentMetadataExtractors.values()
.stream()
.map(ContentMetadataExtractor::getMetacardAttributes)
Expand All @@ -210,7 +220,7 @@ private MetacardImpl initializeMetacard(String id, String contentInput) {
attributes));

for (ContentMetadataExtractor contentMetadataExtractor : contentMetadataExtractors.values()) {
contentMetadataExtractor.process(contentInput, metacard);
contentMetadataExtractor.process(content, metacard);
}
} else {
metacard = new MetacardImpl(metacardType);
Expand All @@ -228,7 +238,7 @@ private Metacard transformPdf(String id, PDDocument pdfDocument) throws IOExcept
return transformPdf(id, pdfDocument, null);
}

private Metacard transformPdf(String id, PDDocument pdfDocument, String contentInput)
private Metacard transformPdf(String id, PDDocument pdfDocument, InputStream contentInput)
throws IOException {
MetacardImpl metacard = initializeMetacard(id, contentInput);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -354,42 +352,55 @@ public Metacard transform(InputStream input, String id)
}

Parser parser = new AutoDetectParser();
ToXMLContentHandler xmlContentHandler = new ToXMLContentHandler();
ToTextContentHandler textContentHandler = null;
ContentHandler contentHandler;
if (!contentExtractors.isEmpty()) {
textContentHandler = new ToTextContentHandler();
contentHandler = new TeeContentHandler(xmlContentHandler, textContentHandler);
} else {
contentHandler = xmlContentHandler;
}

TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser,
contentHandler);

Metadata metadata;
try (InputStream inputStreamCopy = fileBackedOutputStream.asByteSource()
.openStream()) {
metadata = tikaMetadataExtractor.parseMetadata(inputStreamCopy, new ParseContext());
}

String metadataText = xmlContentHandler.toString();
if (templates != null) {
metadataText = transformToXml(metadataText);
}

String contentType = metadata.get(Metadata.CONTENT_TYPE);
MetacardType metacardType = mergeAttributes(getMetacardType(contentType));
Metacard metacard = MetacardCreator.createMetacard(metadata,
id,
metadataText,
metacardType,
useResourceTitleAsTitle);
String metadataText;
ToTextContentHandler textContentHandler = null;
Metacard metacard;
String contentType;
try (TemporaryFileBackedOutputStream textContentHandlerOutStream = new TemporaryFileBackedOutputStream()) {
try (TemporaryFileBackedOutputStream xmlContentHandlerOutStream = new TemporaryFileBackedOutputStream()) {
ToXMLContentHandler xmlContentHandler = new ToXMLContentHandler(
xmlContentHandlerOutStream,
StandardCharsets.UTF_8.toString());
ContentHandler contentHandler;
if (!contentExtractors.isEmpty()) {
textContentHandler = new ToTextContentHandler(textContentHandlerOutStream,
StandardCharsets.UTF_8.toString());
contentHandler = new TeeContentHandler(xmlContentHandler,
textContentHandler);
} else {
contentHandler = xmlContentHandler;
}

TikaMetadataExtractor tikaMetadataExtractor = new TikaMetadataExtractor(parser,
contentHandler);

try (InputStream inputStreamCopy = fileBackedOutputStream.asByteSource()
.openStream()) {
metadata = tikaMetadataExtractor.parseMetadata(inputStreamCopy,
new ParseContext());
}

if (templates != null) {
metadataText = transformToXml(xmlContentHandlerOutStream);
} else {
metadataText = xmlContentHandler.toString();
}
}

if (textContentHandler != null) {
String plainText = textContentHandler.toString();
for (ContentMetadataExtractor contentMetadataExtractor : contentExtractors.values()) {
contentMetadataExtractor.process(plainText, metacard);
contentType = metadata.get(Metadata.CONTENT_TYPE);
MetacardType metacardType = mergeAttributes(getMetacardType(contentType));
metacard = MetacardCreator.createMetacard(metadata,
id,
metadataText,
metacardType,
useResourceTitleAsTitle);

if (textContentHandler != null && !contentExtractors.isEmpty()) {
String plainText = textContentHandler.toString();
for (ContentMetadataExtractor contentMetadataExtractor : contentExtractors.values()) {
contentMetadataExtractor.process(plainText, metacard);
}
}
}

Expand Down Expand Up @@ -628,7 +639,7 @@ private void createThumbnail(InputStream input, Metacard metacard) {
}
}

private String transformToXml(String xhtml) {
private String transformToXml(TemporaryFileBackedOutputStream xhtml) {
LOGGER.debug("Transforming xhtml to xml.");

XMLReader xmlReader = null;
Expand All @@ -639,17 +650,27 @@ private String transformToXml(String xhtml) {
LOGGER.debug(e.getMessage(), e);
}
if (xmlReader != null) {
try {
Writer xml = new StringWriter();
try (TemporaryFileBackedOutputStream xmlOutStream = new TemporaryFileBackedOutputStream();
InputStream xhtmlInStream = xhtml.asByteSource()
.openStream()) {
Transformer transformer = templates.newTransformer();
transformer.transform(new SAXSource(xmlReader,
new InputSource(new StringReader(xhtml))), new StreamResult(xml));
return xml.toString();
} catch (TransformerException e) {
transformer.transform(new SAXSource(xmlReader, new InputSource(xhtmlInStream)),
new StreamResult(xmlOutStream));
//we should not be doing this and should be returning the stream instead
try (InputStream resultStream = xmlOutStream.asByteSource()
.openStream()) {
return IOUtils.toString(resultStream, StandardCharsets.UTF_8);
}
} catch (IOException | TransformerException e) {
LOGGER.debug("Unable to transform metadata from XHTML to XML.", e);
}
}
return xhtml;
try (InputStream xhtmlStream = xhtml.asByteSource().openStream()) {
return IOUtils.toString(xhtmlStream, StandardCharsets.UTF_8);
} catch (IOException e) {
LOGGER.debug("Unable to read data from XHTML stream.", e);
}
return "";
}

Bundle getBundle() {
Expand Down

0 comments on commit 4c0fdec

Please sign in to comment.