From b000b3099118837dc72e532f65aded76227338c0 Mon Sep 17 00:00:00 2001 From: Matt Riley Date: Wed, 11 Dec 2024 19:27:22 -0500 Subject: [PATCH 1/6] Add the ability to return the deleted form ids during deletion --- corehq/form_processor/models/forms.py | 15 ++++++++++----- corehq/form_processor/tests/test_forms.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/corehq/form_processor/models/forms.py b/corehq/form_processor/models/forms.py index cdf4f2ae5440..b53dde8db9ec 100644 --- a/corehq/form_processor/models/forms.py +++ b/corehq/form_processor/models/forms.py @@ -389,7 +389,8 @@ def soft_undelete_forms(self, domain, form_ids): return count - def hard_delete_forms(self, domain, form_ids, delete_attachments=True, *, publish_changes=True): + def hard_delete_forms( + self, domain, form_ids, return_ids=False, delete_attachments=True, *, publish_changes=True): """Delete forms permanently :param publish_changes: Flag for change feed publication. @@ -398,12 +399,16 @@ def hard_delete_forms(self, domain, form_ids, delete_attachments=True, *, publis assert isinstance(form_ids, list) deleted_count = 0 + deleted_ids = [] for db_name, split_form_ids in split_list_by_db_partition(form_ids): # cascade should delete the operations - _, deleted_models = self.using(db_name).filter( - domain=domain, form_id__in=split_form_ids - ).delete() + query = self.using(db_name).filter(domain=domain, form_id__in=split_form_ids) + if return_ids: + found_forms = list(query.values_list('form_id', flat=True)) + _, deleted_models = query.delete() deleted_count += deleted_models.get(self.model._meta.label, 0) + if return_ids: + deleted_ids.extend(found_forms) if delete_attachments and deleted_count: if deleted_count != len(form_ids): @@ -421,7 +426,7 @@ def hard_delete_forms(self, domain, form_ids, delete_attachments=True, *, publis if publish_changes: self.publish_deleted_forms(domain, form_ids) - return deleted_count + return deleted_ids if return_ids else deleted_count @staticmethod def publish_deleted_forms(domain, form_ids): diff --git a/corehq/form_processor/tests/test_forms.py b/corehq/form_processor/tests/test_forms.py index 07810d2d97fc..1e8ad386cf1e 100644 --- a/corehq/form_processor/tests/test_forms.py +++ b/corehq/form_processor/tests/test_forms.py @@ -354,6 +354,22 @@ def test_hard_delete_forms(self): self.assertEqual(1, len(forms)) self.assertEqual(form_ids[0], forms[0].form_id) + def test_hard_delete_forms_returns_forms_found(self): + for i in range(3): + create_form_for_test(DOMAIN, form_id=str(i)) + + deleted_form_ids = set(XFormInstance.objects.hard_delete_forms(DOMAIN, ['0', '1', '2'], return_ids=True)) + + self.assertEqual(deleted_form_ids, {'0', '1', '2'}) + + def test_hard_delete_forms_does_not_include_missing_form_ids(self): + create_form_for_test(DOMAIN, form_id='1') + create_form_for_test(DOMAIN, form_id='3') + + deleted_form_ids = set(XFormInstance.objects.hard_delete_forms(DOMAIN, ['1', '2', '3'], return_ids=True)) + + self.assertEqual(deleted_form_ids, {'1', '3'}) + def assert_form_xml_attachment(self, form): attachments = XFormInstance.objects.get_attachments(form.form_id) self.assertEqual([a.name for a in attachments], ["form.xml"]) From 9ca88fde78f5dbeb3f7ea32a277ccb5fe897df76 Mon Sep 17 00:00:00 2001 From: Matt Riley Date: Wed, 11 Dec 2024 19:28:09 -0500 Subject: [PATCH 2/6] Added form deletion script --- .../management/commands/delete_forms.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 corehq/apps/cleanup/management/commands/delete_forms.py diff --git a/corehq/apps/cleanup/management/commands/delete_forms.py b/corehq/apps/cleanup/management/commands/delete_forms.py new file mode 100644 index 000000000000..b97f42109e69 --- /dev/null +++ b/corehq/apps/cleanup/management/commands/delete_forms.py @@ -0,0 +1,52 @@ +from django.core.management.base import BaseCommand +import csv +import itertools +from dimagi.utils.chunked import chunked +from corehq.form_processor.models import XFormInstance + + +INDEX_FORM_ID = 1 +CHUNK_SIZE = 100 + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument('domain', help='Domain name that owns the forms to be deleted') + parser.add_argument('filename', help='path to the CSV file') + parser.add_argument('--resume_id', help='form ID to start at, within the CSV file') + + def handle(self, domain, filename, resume_id=None, **options): + # expects the filename to have a CSV with a header containing "Deletion Status" and "Form ID" fields + with open(filename) as csvfile: + reader = csv.reader(csvfile, delimiter=',') + self._process_rows(reader, domain, resume_id) + + def _process_rows(self, rows, domain, resume_id): + next(rows) # skip header line + + num_deleted = 0 + + if resume_id: + print('resuming at: ', resume_id) + rows = itertools.dropwhile(lambda row: row[INDEX_FORM_ID] != resume_id, rows) + + print('Starting form deletion') + for chunk in chunked(rows, CHUNK_SIZE): + form_ids = [row[INDEX_FORM_ID] for row in chunk] + + try: + deleted_form_ids = set(XFormInstance.objects.hard_delete_forms( + domain, form_ids, return_ids=True)) + except Exception: + print('failed during processing of: ', form_ids) + raise + + for form_id in form_ids: + if form_id in deleted_form_ids: + print('Deleted: ', form_id) + else: + print('Not found:', form_id) + + num_deleted += len(deleted_form_ids) + + print(f'Complete -- removed {num_deleted} forms') From 4fccff9483682f5ae3f13ae902db5b8fa069ccbe Mon Sep 17 00:00:00 2001 From: Matt Riley Date: Thu, 12 Dec 2024 11:52:23 -0500 Subject: [PATCH 3/6] Renamed deletion script to indicate hard deletion --- .../management/commands/{delete_forms.py => hard_delete_forms.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename corehq/apps/cleanup/management/commands/{delete_forms.py => hard_delete_forms.py} (100%) diff --git a/corehq/apps/cleanup/management/commands/delete_forms.py b/corehq/apps/cleanup/management/commands/hard_delete_forms.py similarity index 100% rename from corehq/apps/cleanup/management/commands/delete_forms.py rename to corehq/apps/cleanup/management/commands/hard_delete_forms.py From 0537ba7f6e7cac2ba992cb91bf0ae8075d16aa8c Mon Sep 17 00:00:00 2001 From: Matt Riley Date: Thu, 12 Dec 2024 11:55:19 -0500 Subject: [PATCH 4/6] Remove extraneous column and have better checks on data validity --- .../cleanup/management/commands/hard_delete_forms.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/corehq/apps/cleanup/management/commands/hard_delete_forms.py b/corehq/apps/cleanup/management/commands/hard_delete_forms.py index b97f42109e69..e9a33cd6655d 100644 --- a/corehq/apps/cleanup/management/commands/hard_delete_forms.py +++ b/corehq/apps/cleanup/management/commands/hard_delete_forms.py @@ -1,11 +1,11 @@ -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandError import csv import itertools from dimagi.utils.chunked import chunked from corehq.form_processor.models import XFormInstance -INDEX_FORM_ID = 1 +INDEX_FORM_ID = 0 CHUNK_SIZE = 100 @@ -17,12 +17,16 @@ def add_arguments(self, parser): def handle(self, domain, filename, resume_id=None, **options): # expects the filename to have a CSV with a header containing "Deletion Status" and "Form ID" fields - with open(filename) as csvfile: + with open(filename, mode='r', encoding='utf-8-sig') as csvfile: reader = csv.reader(csvfile, delimiter=',') self._process_rows(reader, domain, resume_id) def _process_rows(self, rows, domain, resume_id): - next(rows) # skip header line + header_row = next(rows) # skip header line + if header_row[INDEX_FORM_ID] != 'Form ID': + raise CommandError( + f'Expected Column {INDEX_FORM_ID} to be "Form ID", found "{header_row[INDEX_FORM_ID]}". Exiting' + ) num_deleted = 0 From 620c669ddc67603c0e46a98f9b6ae4e7ceb3a9f8 Mon Sep 17 00:00:00 2001 From: Matt Riley Date: Thu, 12 Dec 2024 11:58:16 -0500 Subject: [PATCH 5/6] make hard deletion actions atomic --- corehq/form_processor/models/forms.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/corehq/form_processor/models/forms.py b/corehq/form_processor/models/forms.py index b53dde8db9ec..52b12e52299e 100644 --- a/corehq/form_processor/models/forms.py +++ b/corehq/form_processor/models/forms.py @@ -403,9 +403,10 @@ def hard_delete_forms( for db_name, split_form_ids in split_list_by_db_partition(form_ids): # cascade should delete the operations query = self.using(db_name).filter(domain=domain, form_id__in=split_form_ids) - if return_ids: - found_forms = list(query.values_list('form_id', flat=True)) - _, deleted_models = query.delete() + with transaction.atomic(): + if return_ids: + found_forms = list(query.values_list('form_id', flat=True)) + _, deleted_models = query.delete() deleted_count += deleted_models.get(self.model._meta.label, 0) if return_ids: deleted_ids.extend(found_forms) From 1d208a7249c83795429ce037e5c6075cf763695d Mon Sep 17 00:00:00 2001 From: Matt Riley Date: Thu, 12 Dec 2024 12:50:21 -0500 Subject: [PATCH 6/6] fixed comment --- corehq/apps/cleanup/management/commands/hard_delete_forms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corehq/apps/cleanup/management/commands/hard_delete_forms.py b/corehq/apps/cleanup/management/commands/hard_delete_forms.py index e9a33cd6655d..9adf19be1511 100644 --- a/corehq/apps/cleanup/management/commands/hard_delete_forms.py +++ b/corehq/apps/cleanup/management/commands/hard_delete_forms.py @@ -16,7 +16,7 @@ def add_arguments(self, parser): parser.add_argument('--resume_id', help='form ID to start at, within the CSV file') def handle(self, domain, filename, resume_id=None, **options): - # expects the filename to have a CSV with a header containing "Deletion Status" and "Form ID" fields + # expects the filename to have a CSV with a header containing a "Form ID" field with open(filename, mode='r', encoding='utf-8-sig') as csvfile: reader = csv.reader(csvfile, delimiter=',') self._process_rows(reader, domain, resume_id)