Skip to content

Commit

Permalink
Formula Cache miss, re-eval !
Browse files Browse the repository at this point in the history
  • Loading branch information
psainics committed Nov 30, 2023
1 parent 09e59ae commit 098c008
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package io.cdap.plugin.format.xls.input;

import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import org.apache.hadoop.conf.Configuration;
Expand All @@ -31,6 +32,7 @@
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.FormulaEvaluator;
import org.apache.poi.ss.usermodel.Row;
Expand Down Expand Up @@ -66,7 +68,7 @@ public RecordReader<LongWritable, StructuredRecord> createRecordReader(InputSpli
public static class XlsRecordReader extends RecordReader<LongWritable, StructuredRecord> {
// DataFormatter to format and get each cell's value as String
DataFormatter formatter;
FormulaEvaluator evaluator;
FormulaEvaluator formulaEvaluator;
// Map key that represents the row index.
private LongWritable key;
// Map value that represents an excel row
Expand All @@ -91,6 +93,7 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
Path file = split.getPath(0);
String schema = context.getConfiguration().get("schema");
formatter = new DataFormatter();
formatter.setUseCachedValuesForFormulaCells(true);
outputSchema = schema != null ? Schema.parseJson(schema) : null;
FileSystem fs = file.getFileSystem(job);
fileIn = fs.open(split.getPath(0));
Expand All @@ -100,7 +103,8 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
terminateIfEmptyRow = job.getBoolean(TERMINATE_IF_EMPTY_ROW, false);

try (Workbook workbook = WorkbookFactory.create(fileIn)) {
evaluator = workbook.getCreationHelper().createFormulaEvaluator();
formulaEvaluator = workbook.getCreationHelper().createFormulaEvaluator();
formulaEvaluator.setIgnoreMissingWorkbooks(true);
// Check if user wants to access with name or number
if (sheet.equals(XlsInputFormatConfig.SHEET_NUMBER)) {
workSheet = workbook.getSheetAt(Integer.parseInt(sheetValue));
Expand Down Expand Up @@ -143,7 +147,30 @@ public boolean nextKeyValue() {
}
isRowNull = false;
Schema.Field field = fields.get(cellIndex);
builder.convertAndSet(field.getName(), formatter.formatCellValue(cell, evaluator));
String result;
// Handle formula cells
if (cell.getCellType() == CellType.FORMULA) {
try {
// Try to get cached value of formula if it exists
// Skip if cell has an error as the pipeline may fail due to schema mismatch
if (cell.getCachedFormulaResultType() == CellType.ERROR) {
continue;
}
// Use cached value if it exists
result = formatter.formatCellValue(cell);
} catch (Exception e) {
// If cached value does not exist, then evaluate result
result = formatter.formatCellValue(cell, formulaEvaluator);
}
} else {
// If cell is not a formula, then get the cell value
result = formatter.formatCellValue(cell);
// Skip empty cells.
if (Strings.isNullOrEmpty(result)) {
continue;
}
}
builder.convertAndSet(field.getName(), result);
}
value = builder.build();
rowIndex++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ public Schema detectSchema(FormatContext context, InputFiles inputFiles) throws
DataFormatter formatter = new DataFormatter();
try (Workbook workbook = WorkbookFactory.create(inputFile.open())) {
formulaEvaluator = workbook.getCreationHelper().createFormulaEvaluator();
formulaEvaluator.setIgnoreMissingWorkbooks(true);
Sheet workSheet;
// Check if user wants to access with name or number
if (conf.getSheet() != null && conf.getSheet().equals(XlsInputFormatConfig.SHEET_NUMBER)) {
Expand Down

0 comments on commit 098c008

Please sign in to comment.