From 098c0080c007ec50d38951975925b96758faefa1 Mon Sep 17 00:00:00 2001 From: psainics Date: Thu, 30 Nov 2023 13:44:42 +0530 Subject: [PATCH] Formula Cache miss, re-eval ! --- .../format/xls/input/XlsInputFormat.java | 33 +++++++++++++++++-- .../xls/input/XlsInputFormatProvider.java | 1 + 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java index f53d66d33..c2d207959 100644 --- a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java +++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java @@ -17,6 +17,7 @@ package io.cdap.plugin.format.xls.input; import com.google.common.base.Preconditions; +import com.google.common.base.Strings; import io.cdap.cdap.api.data.format.StructuredRecord; import io.cdap.cdap.api.data.schema.Schema; import org.apache.hadoop.conf.Configuration; @@ -31,6 +32,7 @@ import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.CellType; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.FormulaEvaluator; import org.apache.poi.ss.usermodel.Row; @@ -66,7 +68,7 @@ public RecordReader createRecordReader(InputSpli public static class XlsRecordReader extends RecordReader { // DataFormatter to format and get each cell's value as String DataFormatter formatter; - FormulaEvaluator evaluator; + FormulaEvaluator formulaEvaluator; // Map key that represents the row index. 
private LongWritable key; // Map value that represents an excel row @@ -91,6 +93,7 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro Path file = split.getPath(0); String schema = context.getConfiguration().get("schema"); formatter = new DataFormatter(); + formatter.setUseCachedValuesForFormulaCells(true); outputSchema = schema != null ? Schema.parseJson(schema) : null; FileSystem fs = file.getFileSystem(job); fileIn = fs.open(split.getPath(0)); @@ -100,7 +103,8 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro terminateIfEmptyRow = job.getBoolean(TERMINATE_IF_EMPTY_ROW, false); try (Workbook workbook = WorkbookFactory.create(fileIn)) { - evaluator = workbook.getCreationHelper().createFormulaEvaluator(); + formulaEvaluator = workbook.getCreationHelper().createFormulaEvaluator(); + formulaEvaluator.setIgnoreMissingWorkbooks(true); // Check if user wants to access with name or number if (sheet.equals(XlsInputFormatConfig.SHEET_NUMBER)) { workSheet = workbook.getSheetAt(Integer.parseInt(sheetValue)); @@ -143,7 +147,30 @@ public boolean nextKeyValue() { } isRowNull = false; Schema.Field field = fields.get(cellIndex); - builder.convertAndSet(field.getName(), formatter.formatCellValue(cell, evaluator)); + String result; + // Handle Formulas + if (cell.getCellType() == CellType.FORMULA) { + try { + // Try to get cached value of formula if it exists + // Skip if cell has an error as the pipeline may fail due to schema mismatch + if (cell.getCachedFormulaResultType() == CellType.ERROR) { + continue; + } + // Use cached value if it exists + result = formatter.formatCellValue(cell); + } catch (Exception e) { + // If cached value does not exist, then evaluate result + result = formatter.formatCellValue(cell, formulaEvaluator); + } + } else { + // If cell is not a formula, then get the cell value + result = formatter.formatCellValue(cell); + // Skip empty cells. 
+ if (Strings.isNullOrEmpty(result)) { + continue; + } + } + builder.convertAndSet(field.getName(), result); } value = builder.build(); rowIndex++; diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java index 48d58b4b0..13eec1fc0 100644 --- a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java +++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java @@ -109,6 +109,7 @@ public Schema detectSchema(FormatContext context, InputFiles inputFiles) throws DataFormatter formatter = new DataFormatter(); try (Workbook workbook = WorkbookFactory.create(inputFile.open())) { formulaEvaluator = workbook.getCreationHelper().createFormulaEvaluator(); + formulaEvaluator.setIgnoreMissingWorkbooks(true); Sheet workSheet; // Check if user wants to access with name or number if (conf.getSheet() != null && conf.getSheet().equals(XlsInputFormatConfig.SHEET_NUMBER)) {