From fbf74b84ef393f7bef36f9b97d3189b15a75c5c8 Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Fri, 22 Nov 2024 17:08:18 -0500 Subject: [PATCH 1/5] Implement row limit for lookup tables --- corehq/apps/fixtures/exceptions.py | 2 ++ corehq/apps/fixtures/upload/const.py | 1 + corehq/apps/fixtures/upload/workbook.py | 10 +++++++-- corehq/util/workbook_json/const.py | 1 + corehq/util/workbook_json/excel.py | 29 ++++++++++++++++++++++--- 5 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 corehq/util/workbook_json/const.py diff --git a/corehq/apps/fixtures/exceptions.py b/corehq/apps/fixtures/exceptions.py index c7ce7c102f6b..9b54fb09598f 100644 --- a/corehq/apps/fixtures/exceptions.py +++ b/corehq/apps/fixtures/exceptions.py @@ -14,6 +14,8 @@ class FixtureUploadError(FixtureException): def __init__(self, errors): self.errors = errors +class FixtureTooManyRows(FixtureException): + """Raised when an uploaded fixture exceeds MAX_FIXTURE_ROWS""" class FixtureTypeCheckError(Exception): pass diff --git a/corehq/apps/fixtures/upload/const.py b/corehq/apps/fixtures/upload/const.py index 66771685d137..68fedd0535de 100644 --- a/corehq/apps/fixtures/upload/const.py +++ b/corehq/apps/fixtures/upload/const.py @@ -1,4 +1,5 @@ DELETE_HEADER = "Delete(Y/N)" +MAX_FIXTURE_ROWS = 500_000 class MULTIPLE: diff --git a/corehq/apps/fixtures/upload/workbook.py b/corehq/apps/fixtures/upload/workbook.py index 306a9a50e878..9ad6c78cccfe 100644 --- a/corehq/apps/fixtures/upload/workbook.py +++ b/corehq/apps/fixtures/upload/workbook.py @@ -3,11 +3,12 @@ from django.utils.translation import gettext as _, gettext_lazy from corehq.apps.fixtures.exceptions import FixtureUploadError -from corehq.apps.fixtures.upload.const import DELETE_HEADER, INVALID, MULTIPLE +from corehq.apps.fixtures.upload.const import DELETE_HEADER, INVALID, MAX_FIXTURE_ROWS, MULTIPLE from corehq.apps.fixtures.upload.failure_messages import FAILURE_MESSAGES from corehq.apps.fixtures.utils import is_identifier_invalid from corehq.util.workbook_json.excel import ( WorkbookJSONError, + WorkbookTooManyRows, WorksheetNotFound, ) from corehq.util.workbook_json.excel import ( @@ -36,9 +37,14 @@ class _FixtureWorkbook(object): def __init__(self, file_or_filename): try: - self.workbook = excel_get_workbook(file_or_filename) + self.workbook = excel_get_workbook(file_or_filename, max_row_count=MAX_FIXTURE_ROWS) except WorkbookJSONError as e: raise FixtureUploadError([str(e)]) + except WorkbookTooManyRows as e: + raise FixtureUploadError([ + f"Lookup tables can contain a maximum of {e.max_row_count} rows. " + f"The uploaded file contains {e.actual_row_count} rows." + ]) self._rows = {} self.item_keys = WeakKeyDictionary() self.ownership = WeakKeyDictionary() diff --git a/corehq/util/workbook_json/const.py b/corehq/util/workbook_json/const.py new file mode 100644 index 000000000000..7f401bebad52 --- /dev/null +++ b/corehq/util/workbook_json/const.py @@ -0,0 +1 @@ +MAX_WORKBOOK_ROWS = 1_000_000 diff --git a/corehq/util/workbook_json/excel.py b/corehq/util/workbook_json/excel.py index 72a140138100..6e5c25f60c8c 100644 --- a/corehq/util/workbook_json/excel.py +++ b/corehq/util/workbook_json/excel.py @@ -6,6 +6,7 @@ from django.core.files.uploadedfile import UploadedFile from django.utils.translation import gettext as _ +from corehq.util.workbook_json.const import MAX_WORKBOOK_ROWS class InvalidExcelFileException(Exception): pass @@ -27,6 +28,15 @@ class WorkbookJSONError(Exception): pass +class WorkbookTooManyRows(Exception): + """Workbook row count exceeds MAX_WORKBOOK_ROWS""" + + def __init__(self, max_row_count, actual_row_count): + super().__init__() + self.max_row_count = max_row_count + self.actual_row_count = actual_row_count + + class IteratorJSONReader(object): """ >>> def normalize(it): @@ -145,9 +155,9 @@ def set_field_value(cls, obj, field, value): obj[field] = value -def get_workbook(file_or_filename): +def get_workbook(file_or_filename, max_row_count=MAX_WORKBOOK_ROWS): try: - return WorkbookJSONReader(file_or_filename) + return WorkbookJSONReader(file_or_filename, max_row_count=max_row_count) except (HeaderValueError, InvalidExcelFileException) as e: raise WorkbookJSONError(_( "Upload failed! " @@ -226,10 +236,19 @@ def _convert_float(value): yield cell_values super(WorksheetJSONReader, self).__init__(iterator()) + def row_count(self): + def parse_dimension(dimension): + import re + match = re.search(r'(\d+)', dimension) + if match: + return int(match.group(1)) + return 0 + return parse_dimension(self.worksheet.calculate_dimension()) + class WorkbookJSONReader(object): - def __init__(self, file_or_filename): + def __init__(self, file_or_filename, max_row_count=MAX_WORKBOOK_ROWS): check_types = (UploadedFile, io.RawIOBase, io.BufferedIOBase) if isinstance(file_or_filename, check_types): tmp = NamedTemporaryFile(mode='wb', suffix='.xlsx', delete=False) @@ -246,12 +265,16 @@ def __init__(self, file_or_filename): self.worksheets = [] try: + total_row_count = 0 for worksheet in self.wb.worksheets: try: ws = WorksheetJSONReader(worksheet, title=worksheet.title) except IndexError: raise JSONReaderError('This Excel file has unrecognised formatting. Please try downloading ' 'the lookup table first, and then add data to it.') + total_row_count += ws.row_count() + if total_row_count > max_row_count: + raise WorkbookTooManyRows(max_row_count, total_row_count) self.worksheets_by_title[worksheet.title] = ws self.worksheets.append(ws) finally: From caefc0ed4ad26746cce3676f4d7502d1bc2f0d3a Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Fri, 22 Nov 2024 17:15:42 -0500 Subject: [PATCH 2/5] Put exceptions in separate file --- corehq/util/workbook_json/excel.py | 35 +++++---------------- corehq/util/workbook_json/excel_importer.py | 4 +-- corehq/util/workbook_json/exceptions.py | 31 ++++++++++++++++++ 3 files changed, 40 insertions(+), 30 deletions(-) create mode 100644 corehq/util/workbook_json/exceptions.py diff --git a/corehq/util/workbook_json/excel.py b/corehq/util/workbook_json/excel.py index 6e5c25f60c8c..3c1587df35f3 100644 --- a/corehq/util/workbook_json/excel.py +++ b/corehq/util/workbook_json/excel.py @@ -8,33 +8,14 @@ from corehq.util.workbook_json.const import MAX_WORKBOOK_ROWS -class InvalidExcelFileException(Exception): - pass - - -class JSONReaderError(Exception): - pass - - -class HeaderValueError(Exception): - pass - - -class StringTypeRequiredError(Exception): - pass - - -class WorkbookJSONError(Exception): - pass - - -class WorkbookTooManyRows(Exception): - """Workbook row count exceeds MAX_WORKBOOK_ROWS""" - - def __init__(self, max_row_count, actual_row_count): - super().__init__() - self.max_row_count = max_row_count - self.actual_row_count = actual_row_count +from .exceptions import ( + HeaderValueError, + InvalidExcelFileException, + JSONReaderError, + StringTypeRequiredError, + WorkbookJSONError, + WorkbookTooManyRows, +) class IteratorJSONReader(object): diff --git a/corehq/util/workbook_json/excel_importer.py b/corehq/util/workbook_json/excel_importer.py index d54d1bc8b375..36a669bf3b48 100644 --- a/corehq/util/workbook_json/excel_importer.py +++ b/corehq/util/workbook_json/excel_importer.py @@ -6,9 +6,7 @@ from corehq.util.workbook_json.excel import WorkbookJSONReader - -class UnknownFileRefException(Exception): - pass +from .exceptions import UnknownFileRefException class ExcelImporter(object): diff --git a/corehq/util/workbook_json/exceptions.py b/corehq/util/workbook_json/exceptions.py new file mode 100644 index 000000000000..a3fc529dd1b5 --- /dev/null +++ b/corehq/util/workbook_json/exceptions.py @@ -0,0 +1,31 @@ +class HeaderValueError(Exception): + pass + + +class InvalidExcelFileException(Exception): + pass + + +class JSONReaderError(Exception): + pass + + +class StringTypeRequiredError(Exception): + pass + + +class UnknownFileRefException(Exception): + pass + + +class WorkbookJSONError(Exception): + pass + + +class WorkbookTooManyRows(Exception): + """Workbook row count exceeds MAX_WORKBOOK_ROWS""" + + def __init__(self, max_row_count, actual_row_count): + super().__init__() + self.max_row_count = max_row_count + self.actual_row_count = actual_row_count From 3a06254a9e988e20ebf075a98d5e3f49c8e6fe1f Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Fri, 22 Nov 2024 17:47:02 -0500 Subject: [PATCH 3/5] Fix linting errors --- corehq/apps/fixtures/exceptions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/corehq/apps/fixtures/exceptions.py b/corehq/apps/fixtures/exceptions.py index 9b54fb09598f..0975282dea94 100644 --- a/corehq/apps/fixtures/exceptions.py +++ b/corehq/apps/fixtures/exceptions.py @@ -11,12 +11,15 @@ class FixtureAPIRequestError(FixtureException): class FixtureUploadError(FixtureException): + def __init__(self, errors): self.errors = errors + class FixtureTooManyRows(FixtureException): """Raised when an uploaded fixture exceeds MAX_FIXTURE_ROWS""" + class FixtureTypeCheckError(Exception): pass From 27b63a5990814c4631e3087aaafa5a750d10b1eb Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Tue, 3 Dec 2024 13:55:51 -0500 Subject: [PATCH 4/5] Use worksheet.max_row --- corehq/util/workbook_json/excel.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/corehq/util/workbook_json/excel.py b/corehq/util/workbook_json/excel.py index 3c1587df35f3..8f9affcf85f9 100644 --- a/corehq/util/workbook_json/excel.py +++ b/corehq/util/workbook_json/excel.py @@ -217,15 +217,6 @@ def _convert_float(value): yield cell_values super(WorksheetJSONReader, self).__init__(iterator()) - def row_count(self): - def parse_dimension(dimension): - import re - match = re.search(r'(\d+)', dimension) - if match: - return int(match.group(1)) - return 0 - return parse_dimension(self.worksheet.calculate_dimension()) - class WorkbookJSONReader(object): @@ -253,7 +244,7 @@ def __init__(self, file_or_filename, max_row_count=MAX_WORKBOOK_ROWS): except IndexError: raise JSONReaderError('This Excel file has unrecognised formatting. Please try downloading ' 'the lookup table first, and then add data to it.') - total_row_count += ws.row_count() + total_row_count += worksheet.max_row if total_row_count > max_row_count: raise WorkbookTooManyRows(max_row_count, total_row_count) self.worksheets_by_title[worksheet.title] = ws From dc58ae4fa337cf777e428b99cba269ffc669eeb8 Mon Sep 17 00:00:00 2001 From: Graham Herceg Date: Tue, 3 Dec 2024 13:59:33 -0500 Subject: [PATCH 5/5] Add comment explaining why we force calculating dimensions --- corehq/util/workbook_json/excel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/corehq/util/workbook_json/excel.py b/corehq/util/workbook_json/excel.py index 8f9affcf85f9..613c1d9e60c0 100644 --- a/corehq/util/workbook_json/excel.py +++ b/corehq/util/workbook_json/excel.py @@ -194,6 +194,8 @@ def __init__(self, worksheet, title=None): break else: width += 1 + + # ensure _max_row and _max_column properties are set self.worksheet.calculate_dimension(force=True) def iterator():