Merge pull request #543 from oakdbca/ocr-bulk-import

Ocr bulk import
dbca-wa · Dec 3, 2024 · d30f3e8 · d30f3e8
2 parents 4a4ec2d + c2a83fa
commit d30f3e8
Show file tree

Hide file tree

Showing 5 changed files with 250 additions and 86 deletions.
diff --git a/boranga/components/occurrence/models.py b/boranga/components/occurrence/models.py
@@ -2455,7 +2455,7 @@ class Meta:
         app_label = "boranga"
 
     def __str__(self):
-        return str(self.occurrence_report)
+        return f"OCRHabitat Condition: {self.id} for Occurrence Report: {self.occurrence_report}"
 
 
 class OCRVegetationStructure(models.Model):
@@ -2486,7 +2486,7 @@ class Meta:
         app_label = "boranga"
 
     def __str__(self):
-        return str(self.occurrence_report)
+        return f"OCR Vegetation Structure: {self.id} for Occurrence Report: {self.occurrence_report}"
 
 
 class Intensity(ArchivableModel):
@@ -2544,7 +2544,7 @@ class Meta:
         app_label = "boranga"
 
     def __str__(self):
-        return str(self.occurrence_report)
+        return f"OCR Fire History: {self.id} for Occurrence Report: {self.occurrence_report}"
 
 
 class OCRAssociatedSpecies(models.Model):
@@ -2573,7 +2573,7 @@ class Meta:
         app_label = "boranga"
 
     def __str__(self):
-        return str(self.occurrence_report)
+        return f"OCR Associated Species {self.id} for Occurrence Report {self.occurrence_report}"
 
 
 class ObservationMethod(ArchivableModel):
@@ -2631,7 +2631,7 @@ class Meta:
         app_label = "boranga"
 
     def __str__(self):
-        return str(self.occurrence_report)
+        return f"OCR Observation Detail: {self.id} for Occurrence Report: {self.occurrence_report}"
 
 
 class PlantCountMethod(ArchivableModel):
@@ -2815,7 +2815,7 @@ class Meta:
         app_label = "boranga"
 
     def __str__(self):
-        return str(self.occurrence_report)
+        return f"OCR Plant Count: {self.id} for Occurrence Report: {self.occurrence_report}"
 
 
 # used for Animal Observation(MultipleSelect)
@@ -3019,7 +3019,7 @@ class Meta:
         app_label = "boranga"
 
     def __str__(self):
-        return str(self.occurrence_report)
+        return f"OCR Animal Observation: {self.id} for Occurrence Report: {self.occurrence_report}"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -3179,7 +3179,7 @@ class Meta:
         app_label = "boranga"
 
     def __str__(self):
-        return str(self.occurrence_report)
+        return f"OCR Identification: {self.id} for Occurrence Report: {self.occurrence_report}"
 
 
 class OccurrenceReportDocument(Document):
@@ -5626,6 +5626,20 @@ def process_row(self, ocr_migrated_from_ids, index, headers, row, errors):
             return
 
         ocr_migrated_from_id = row[0]
+
+        # If the migrated from id is None then complain and return
+        if not ocr_migrated_from_id:
+            error_message = "Row does not have an Occurrence Report migrated from id"
+            errors.append(
+                {
+                    "row_index": index,
+                    "error_type": "missing_migrated_from_id",
+                    "data": row,
+                    "error_message": error_message,
+                }
+            )
+            return
+
         mode = "create"
         if (
             ocr_migrated_from_id in ocr_migrated_from_ids
@@ -5647,8 +5661,31 @@ def process_row(self, ocr_migrated_from_ids, index, headers, row, errors):
         geometries = {}
         many_to_many_fields = {}
 
-        # Validate each cell
+        # Find any sets of columns (models) where the fields are all empty
+        # so that we can exclude them from the row data (and therefor not bother validating them)
+        required_content_types = []
         for column_index, column in enumerate(self.schema.columns.all()):
+            cell_value = row[column_index]
+            if (
+                cell_value is not None
+                and column.django_import_content_type not in required_content_types
+            ):
+                required_content_types.append(column.django_import_content_type)
+
+        indexes_to_remove = []
+        for column_index, column in enumerate(self.schema.columns.all()):
+            if column.django_import_content_type not in required_content_types:
+                indexes_to_remove.append(column_index)
+
+        row = [i for j, i in enumerate(row) if j not in indexes_to_remove]
+        headers = [i for j, i in enumerate(headers) if j not in indexes_to_remove]
+
+        # Validate each cell
+        for column_index, column in enumerate(
+            self.schema.columns.filter(
+                django_import_content_type__in=required_content_types
+            )
+        ):
             column_error_count = 0
 
             cell_value = row[column_index]
@@ -5688,9 +5725,6 @@ def process_row(self, ocr_migrated_from_ids, index, headers, row, errors):
                     {"field": column.django_import_field_name, "value": cell_value}
                 )
 
-                # Continue to the next column without adding the cell value to the model data
-                continue
-
             column_error_count += errors_added
 
             row_error_count += column_error_count
@@ -5705,6 +5739,12 @@ def process_row(self, ocr_migrated_from_ids, index, headers, row, errors):
                 models[model_name] = {"field_names": [], "values": []}
 
             models[model_name]["field_names"].append(column.django_import_field_name)
+
+            # Attempting to assign the cell value directly to the m2m field
+            # will raise an error, so we skip it here and handle it later
+            if type(field) is ManyToManyField:
+                continue
+
             models[model_name]["values"].append(cell_value)
 
         if row_error_count > 0:
@@ -5785,10 +5825,42 @@ def process_row(self, ocr_migrated_from_ids, index, headers, row, errors):
                         )
                         return
                 else:
+                    if (
+                        model_data.get("occurrence_name", None)
+                        and Occurrence.objects.filter(
+                            occurrence_name=model_data["occurrence_name"]
+                        ).exists()
+                    ):
+                        error_message = (
+                            f"An occurrence with the name '{model_data['occurrence_name']}' "
+                            "already exists in the database. Please populate the Occurrence "
+                            "Number or OCC Occurrence Migrated From ID field instead."
+                        )
+                        errors.append(
+                            {
+                                "row_index": index,
+                                "error_type": "duplicate_occurrence_name",
+                                "data": model_data,
+                                "error_message": error_message,
+                            }
+                        )
+                        return
+
                     if not occ_migrated_from_id:
                         if not model_data.get("group_type"):
                             model_data["group_type"] = self.schema.group_type
+
                         current_model_instance = Occurrence(**model_data)
+                        current_model_instance.occurrence_source = (
+                            Occurrence.OCCURRENCE_CHOICE_OCR
+                        )
+                        ocr_instance = model_instances[
+                            OccurrenceReport._meta.model_name
+                        ]
+                        if ocr_instance.species:
+                            current_model_instance.species = ocr_instance.species
+                        else:
+                            current_model_instance.community = ocr_instance.community
                     else:
                         if Occurrence.objects.filter(
                             migrated_from_id=occ_migrated_from_id
@@ -5800,11 +5872,17 @@ def process_row(self, ocr_migrated_from_ids, index, headers, row, errors):
                             current_model_instance = Occurrence.objects.create(
                                 migrated_from_id=occ_migrated_from_id,
                                 group_type=self.schema.group_type,
-                                species=model_instances[
-                                    OccurrenceReport._meta.model_name
-                                ].species,
+                                occurrence_source=Occurrence.OCCURRENCE_CHOICE_OCR,
                             )
-
+                            ocr_instance = model_instances[
+                                OccurrenceReport._meta.model_name
+                            ]
+                            if ocr_instance.species:
+                                current_model_instance.species = ocr_instance.species
+                            else:
+                                current_model_instance.community = (
+                                    ocr_instance.community
+                                )
                         if (
                             not current_model_instance.group_type
                             == self.schema.group_type
@@ -5827,7 +5905,8 @@ def process_row(self, ocr_migrated_from_ids, index, headers, row, errors):
                             OccurrenceReport._meta.model_name
                         ]
                         if (
-                            not current_model_instance.species
+                            current_model_instance.species
+                            and not current_model_instance.species
                             == occurrence_report.species
                         ):
                             error_message = (
@@ -5844,6 +5923,25 @@ def process_row(self, ocr_migrated_from_ids, index, headers, row, errors):
                             )
                             return
 
+                        if (
+                            current_model_instance.community
+                            and not current_model_instance.community
+                            == occurrence_report.community
+                        ):
+                            error_message = (
+                                "The community of the occurrence does not match "
+                                "the community of the occurrence report"
+                            )
+                            errors.append(
+                                {
+                                    "row_index": index,
+                                    "error_type": "invalid_occurrence_community",
+                                    "data": model_data,
+                                    "error_message": error_message,
+                                }
+                            )
+                            return
+
                         for field, value in model_data.items():
                             setattr(current_model_instance, field, value)
 
@@ -7409,7 +7507,7 @@ def validate(self, task, cell_value, mode, index, headers, row, errors):
         if isinstance(field, gis_models.GeometryField):
             try:
                 geom_json = json.loads(cell_value)
-            except json.JSONDecodeError:
+            except (json.JSONDecodeError, TypeError):
                 error_message = f"Value {cell_value} in column {self.xlsx_column_header_name} is not a valid JSON"
                 errors.append(
                     {
@@ -7658,6 +7756,28 @@ def validate(self, task, cell_value, mode, index, headers, row, errors):
                 errors_added += 1
                 return cell_value, errors_added
 
+            if related_model_instances.count() == 0:
+                error_message = (
+                    f"Can't find any {self.django_import_field_name} records by looking up "
+                    f"{lookup_field} with value {cell_value} "
+                    f"for column {self.xlsx_column_header_name}"
+                )
+                if "," in cell_value[0]:
+                    error_message += (
+                        " (Hint: The delimiter for many to many fields is '"
+                        f"{settings.OCR_BULK_IMPORT_M2M_DELIMITER}' not ',')"
+                    )
+                errors.append(
+                    {
+                        "row_index": index,
+                        "error_type": "column",
+                        "data": cell_value,
+                        "error_message": error_message,
+                    }
+                )
+                errors_added += 1
+                return cell_value, errors_added
+
             # Replace the lookup cell_value with a list of model instances to be assigned
             cell_value = list(related_model_instances)
             return cell_value, errors_added
@@ -7680,6 +7800,33 @@ def validate(self, task, cell_value, mode, index, headers, row, errors):
                     errors_added += 1
                 return cell_value, errors_added
 
+            # Convert the cell value to a boolean as the application that was used
+            # to create the xlsx may have stored it as a string. For example, it often
+            # automatically formats cells with a text value of 'TRUE' or 'FALSE' as a boolean
+            # then when the file is saved the value in the cell will be converted to '=TRUE()' or '=FALSE()'
+            if isinstance(cell_value, str):
+                cell_value = cell_value.lower()
+                if cell_value in ["=true()", "true", "1", "yes", "y"]:
+                    cell_value = True
+                elif cell_value in ["=false()", "false", "0", "no", "n"]:
+                    cell_value = False
+                else:
+                    error_message = (
+                        f"Value {cell_value} in column {self.xlsx_column_header_name} "
+                        "is not a valid boolean. The bulk importer is able to convert "
+                        "the following values to booleans: 'True', '1', 'yes', 'y', "
+                        "'=TRUE()', 'False', '0', 'no', 'n', '=False()'"
+                    )
+                    errors.append(
+                        {
+                            "row_index": index,
+                            "error_type": "column",
+                            "data": cell_value,
+                            "error_message": error_message,
+                        }
+                    )
+                    errors_added += 1
+
             if cell_value not in [True, False]:
                 error_message = (
                     f"Value {cell_value} in column {self.xlsx_column_header_name} "