diff --git a/core-plugins/pom.xml b/core-plugins/pom.xml
index c4185bbfd..8a62ee01c 100644
--- a/core-plugins/pom.xml
+++ b/core-plugins/pom.xml
@@ -186,12 +186,12 @@
org.apache.poi
poi
- 5.2.4
+ 5.2.5
org.apache.poi
poi-ooxml
- 5.2.4
+ 5.2.5
com.github.pjfanning
diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java
index d5f0ef5d1..a7e46a30f 100644
--- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java
+++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java
@@ -41,6 +41,7 @@
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.ss.util.CellReference;
+import org.apache.poi.util.IOUtils;
import java.io.IOException;
import java.io.InputStream;
@@ -66,6 +67,8 @@ public class ExcelInputFormat extends TextInputFormat {
public static final String FILE_PATTERN = "filePattern";
public static final String SHEET = "sheet";
public static final String SHEET_VALUE = "sheetValue";
+ public static final String EXCEL_BYTE_ARRAY_MAX_OVERRIDE = "excel.byteArrayMaxOverride";
+ public static final int EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT = Integer.MAX_VALUE / 2;
@Override
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) {
@@ -80,7 +83,7 @@ public boolean isSplitable(JobContext context, Path file) {
public static void setConfigurations(Job job, String filePattern, String sheet, boolean reprocess,
String sheetValue, String columnList, boolean skipFirstRow,
String terminateIfEmptyRow, String rowLimit, String ifErrorRecord,
- String processedFiles) {
+ String processedFiles, int byteArrayMaxOverride) {
Configuration configuration = job.getConfiguration();
configuration.set(FILE_PATTERN, filePattern);
@@ -100,6 +103,7 @@ public static void setConfigurations(Job job, String filePattern, String sheet,
configuration.set(IF_ERROR_RECORD, ifErrorRecord);
configuration.set(PROCESSED_FILES, processedFiles);
+ configuration.set(EXCEL_BYTE_ARRAY_MAX_OVERRIDE, String.valueOf(byteArrayMaxOverride));
}
@@ -175,6 +179,9 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
isStreaming = true;
break;
case OLE2:
+ // Workaround for large OLE2 files: override POI's byte-array max size before creating the workbook.
+ IOUtils.setByteArrayMaxOverride(job.getInt(EXCEL_BYTE_ARRAY_MAX_OVERRIDE,
+ ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT));
workbook = WorkbookFactory.create(is);
break;
default:
diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java
index e6e549bf0..38157d85e 100644
--- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java
+++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputReader.java
@@ -318,11 +318,16 @@ public void prepareRun(BatchSourceContext batchSourceContext) throws Exception {
processFiles = GSON.toJson(getAllProcessedFiles(batchSourceContext), ARRAYLIST_PREPROCESSED_FILES);
}
+ Map arguments = new HashMap<>(batchSourceContext.getArguments().asMap());
+ int byteArrayMaxOverride = arguments.containsKey(ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE) ?
+ Integer.parseInt(arguments.get(ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE)) :
+ ExcelInputFormat.EXCEL_BYTE_ARRAY_MAX_OVERRIDE_DEFAULT;
+
ExcelInputFormat.setConfigurations(job, excelInputreaderConfig.filePattern, excelInputreaderConfig.sheet,
excelInputreaderConfig.reprocess, excelInputreaderConfig.sheetValue,
excelInputreaderConfig.columnList, excelInputreaderConfig.skipFirstRow,
excelInputreaderConfig.terminateIfEmptyRow, excelInputreaderConfig.rowsLimit,
- excelInputreaderConfig.ifErrorRecord, processFiles);
+ excelInputreaderConfig.ifErrorRecord, processFiles, byteArrayMaxOverride);
// Sets the input path(s).
ExcelInputFormat.addInputPaths(job, excelInputreaderConfig.filePath);