From 57df4b4e8bd3bfbcd5ac02a145813b762aa89781 Mon Sep 17 00:00:00 2001 From: psainics Date: Mon, 8 Jul 2024 10:19:28 +0530 Subject: [PATCH] apache.poi ByteArrayMaxOverride --- core-plugins/pom.xml | 4 ++-- .../io/cdap/plugin/batch/source/ExcelInputFormat.java | 9 ++++++++- .../io/cdap/plugin/batch/source/ExcelInputReader.java | 7 ++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/core-plugins/pom.xml b/core-plugins/pom.xml index c4185bbfd..8a62ee01c 100644 --- a/core-plugins/pom.xml +++ b/core-plugins/pom.xml @@ -186,12 +186,12 @@ org.apache.poi poi - 5.2.4 + 5.2.5 org.apache.poi poi-ooxml - 5.2.4 + 5.2.5 com.github.pjfanning diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java index d5f0ef5d1..a7e46a30f 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java @@ -41,6 +41,7 @@ import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.WorkbookFactory; import org.apache.poi.ss.util.CellReference; +import org.apache.poi.util.IOUtils; import java.io.IOException; import java.io.InputStream; @@ -66,6 +67,8 @@ public class ExcelInputFormat extends TextInputFormat { public static final String FILE_PATTERN = "filePattern"; public static final String SHEET = "sheet"; public static final String SHEET_VALUE = "sheetValue"; + public static final String EXCEL_BYTE_ARRAY_MAX_OVERRIDE = "excel.byteArrayMaxOverride"; + public static final int EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT = Integer.MAX_VALUE / 2; @Override public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) { @@ -80,7 +83,7 @@ public boolean isSplitable(JobContext context, Path file) { public static void setConfigurations(Job job, String filePattern, String sheet, boolean reprocess, String sheetValue, String columnList, boolean skipFirstRow, String terminateIfEmptyRow, String rowLimit, String ifErrorRecord, - String processedFiles) { + String processedFiles, int byteArrayMaxOverride) { Configuration configuration = job.getConfiguration(); configuration.set(FILE_PATTERN, filePattern); @@ -100,6 +103,7 @@ public static void setConfigurations(Job job, String filePattern, String sheet, configuration.set(IF_ERROR_RECORD, ifErrorRecord); configuration.set(PROCESSED_FILES, processedFiles); + configuration.set(EXCEL_BYTE_ARRAY_MAX_OVERRIDE, String.valueOf(byteArrayMaxOverride)); } @@ -175,6 +179,9 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro isStreaming = true; break; case OLE2: + // workaround for large files + IOUtils.setByteArrayMaxOverride(job.getInt(EXCEL_BYTE_ARRAY_MAX_OVERRIDE, + ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT)); workbook = WorkbookFactory.create(is); break; default: diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java index e6e549bf0..38157d85e 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java @@ -318,11 +318,16 @@ public void prepareRun(BatchSourceContext batchSourceContext) throws Exception { processFiles = GSON.toJson(getAllProcessedFiles(batchSourceContext), ARRAYLIST_PREPROCESSED_FILES); } + Map arguments = new HashMap<>(batchSourceContext.getArguments().asMap()); + int byteArrayMaxOverride = arguments.containsKey(ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE) ? + Integer.parseInt(arguments.get(ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE)) : + ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT; + ExcelInputFormat.setConfigurations(job, excelInputreaderConfig.filePattern, excelInputreaderConfig.sheet, excelInputreaderConfig.reprocess, excelInputreaderConfig.sheetValue, excelInputreaderConfig.columnList, excelInputreaderConfig.skipFirstRow, excelInputreaderConfig.terminateIfEmptyRow, excelInputreaderConfig.rowsLimit, - excelInputreaderConfig.ifErrorRecord, processFiles); + excelInputreaderConfig.ifErrorRecord, processFiles, byteArrayMaxOverride); // Sets the input path(s). ExcelInputFormat.addInputPaths(job, excelInputreaderConfig.filePath);