Skip to content

Commit

Permalink
Merge pull request #33999 from dimagi/gh/data-dump/use-natural-foreig…
Browse files Browse the repository at this point in the history
…n-keys

Use natural keys for foreign key relationships during serialization/deserialization
  • Loading branch information
gherceg authored Jan 26, 2024
2 parents 7ba4d2d + 52a7011 commit d3468c9
Show file tree
Hide file tree
Showing 8 changed files with 240 additions and 9 deletions.
17 changes: 16 additions & 1 deletion corehq/apps/dump_reload/sql/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,20 @@ class SqlDataDumper(DataDumper):
slug = 'sql'

def dump(self, output_stream):
"""
When serializing data using JsonLinesSerializer().serialize(...), the additional parameters are set for
the following reasons:
use_natural_primary_keys is necessary for sharded models to ensure the primary key, which will not be
unique across shards, is not used. This can be thought of as "use natural primary keys when defined".
use_natural_foreign_keys is necessary for foreign keys that reference primary keys on models that have a
natural_key method defined. This can be thought of as "use natural foreign keys when defined". For example,
SQLUserData has a foreign key to User based on the primary key. However a natural_key method is defined on
the User model, so its primary key will not be serialized when use_natural_primary_keys=True. To resolve,
we set use_natural_foreign_keys=True which will result in natural keys being serialized as part of the
foreign key field when referencing a model with natural_key defined.
"""
stats = Counter()
objects = get_objects_to_dump(
self.domain,
Expand All @@ -248,9 +262,10 @@ def dump(self, output_stream):
stats_counter=stats,
stdout=self.stdout,
)

JsonLinesSerializer().serialize(
objects,
use_natural_foreign_keys=False,
use_natural_foreign_keys=True,
use_natural_primary_keys=True,
stream=output_stream
)
Expand Down
84 changes: 80 additions & 4 deletions corehq/apps/dump_reload/tests/test_serialization.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,97 @@
import json
from io import StringIO
from unittest.mock import patch

from django.contrib.auth.models import User
from django.core.serializers.python import Deserializer
from django.test import SimpleTestCase

from corehq.apps.dump_reload.sql.dump import SqlDataDumper
from corehq.apps.users.models import SQLUserData
from corehq.apps.users.models_role import Permission, RolePermission, UserRole
from corehq.form_processor.models.cases import CaseTransaction, CommCareCase
from corehq.form_processor.models.forms import XFormInstance, XFormOperation


class TestJSONFieldSerialization(SimpleTestCase):
"""See https://github.com/bradjasper/django-jsonfield/pull/173"""
"""
See https://github.com/bradjasper/django-jsonfield/pull/173
We just need to test that a model that uses jsonfield.JSONField is serialized correctly
"""

def test(self):
serialized_model_with_primary_key = {
'model': 'form_processor.XFormInstance', 'pk': 1, 'fields': {'auth_context': '{}'}
'model': 'accounting.BillingContactInfo', 'pk': 1, 'fields': {'email_list': '{}'}
}
serialized_model_with_natural_key = {
'model': 'form_processor.XFormInstance', 'fields': {'auth_context': '{}'}
'model': 'accounting.BillingContactInfo', 'fields': {'email_list': '{}'}
}

def _test_json_field_after_serialization(serialized):
for obj in Deserializer([serialized]):
self.assertIsInstance(obj.object.auth_context, dict)
self.assertIsInstance(obj.object.email_list, dict)

_test_json_field_after_serialization(serialized_model_with_primary_key)
_test_json_field_after_serialization(serialized_model_with_natural_key)


class TestForeignKeyFieldSerialization(SimpleTestCase):
"""
We use natural foreign keys when dumping SQL data, but CommCareCase and XFormInstance have natural_key methods
that intentionally return a string for the case_id or form_id, rather than a tuple as Django recommends for
all natural_key methods. We made this decision to optimize loading deserialized data back into a database. If
the natural_key method returns a tuple, it will use the get_by_natural_key method on the foreign key model's
default object manager to fetch the foreign keyed object, resulting in a database lookup everytime we write
a model that foreign keys to cases or forms in SqlDataLoader.
"""

def test_natural_foreign_key_returns_iterable_when_serialized(self):
user = User(username='testuser')
user_data = SQLUserData(django_user=user, data={'test': 1})

output_stream = StringIO()
with patch('corehq.apps.dump_reload.sql.dump.get_objects_to_dump', return_value=[user_data]):
SqlDataDumper('test', [], []).dump(output_stream)

deserialized_model = json.loads(output_stream.getvalue())
fk_field = deserialized_model['fields']['django_user']
self.assertEqual(fk_field, ['testuser'])

def test_foreign_key_on_model_without_natural_key_returns_primary_key_when_serialized(self):
role = UserRole(pk=10, domain='test', name='test-role')
permission = Permission(pk=500, value='test')
role_permission = RolePermission(role=role, permission_fk=permission)

output_stream = StringIO()
with patch('corehq.apps.dump_reload.sql.dump.get_objects_to_dump', return_value=[role_permission]):
SqlDataDumper('test', [], []).dump(output_stream)

deserialized_model = json.loads(output_stream.getvalue())
role_field = deserialized_model['fields']['role']
self.assertEqual(role_field, 10)
permission_field = deserialized_model['fields']['permission_fk']
self.assertEqual(permission_field, 500)

def test_natural_foreign_key_for_CommCareCase_returns_str_when_serialized(self):
cc_case = CommCareCase(domain='test', case_id='abc123')
transaction = CaseTransaction(case=cc_case)

output_stream = StringIO()
with patch('corehq.apps.dump_reload.sql.dump.get_objects_to_dump', return_value=[transaction]):
SqlDataDumper('test', [], []).dump(output_stream)

deserialized_model = json.loads(output_stream.getvalue())
fk_field = deserialized_model['fields']['case']
self.assertEqual(fk_field, 'abc123')

def test_natural_foreign_key_for_XFormInstance_returns_str_when_serialized(self):
xform = XFormInstance(domain='test', form_id='abc123')
operation = XFormOperation(form=xform)

output_stream = StringIO()
with patch('corehq.apps.dump_reload.sql.dump.get_objects_to_dump', return_value=[operation]):
SqlDataDumper('test', [], []).dump(output_stream)

deserialized_model = json.loads(output_stream.getvalue())
fk_field = deserialized_model['fields']['form']
self.assertEqual(fk_field, 'abc123')
71 changes: 71 additions & 0 deletions corehq/apps/dump_reload/tests/test_sql_data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import json

from django.contrib.auth.models import User
from django.test import TestCase

from corehq.apps.dump_reload.sql.load import SqlDataLoader
from corehq.apps.users.models import SQLUserData
from corehq.apps.users.models_role import Permission, RolePermission, UserRole
from corehq.form_processor.models.cases import CaseTransaction
from corehq.form_processor.tests.utils import create_case


class TestSqlDataLoader(TestCase):

def test_loading_foreign_keys_using_iterable_natural_key(self):
user = User.objects.create(username='testuser')
model = {
"model": "users.sqluserdata",
"fields": {
"domain": "test",
"user_id": "testuser",
"django_user": ["testuser"],
"modified_on": "2024-01-01T12:00:00.000000Z",
"profile": None,
"data": {"test": "1"},
},
}
serialized_model = json.dumps(model)

SqlDataLoader().load_objects([serialized_model])

user_data = SQLUserData.objects.get(django_user=user)
self.assertEqual(user_data.django_user.pk, user.pk)

def test_loading_foreign_keys_using_non_iterable_natural_key(self):
# create_case will create a CaseTransaction too so test verifies the serialized one is saved properly
cc_case = create_case('test', case_id='abc123', save=True)
model = {
"model": "form_processor.casetransaction",
"fields": {
"case": "abc123",
"form_id": "fk-test",
"sync_log_id": None,
"server_date": "2024-01-01T12:00:00.000000Z",
"_client_date": None,
"type": 1,
"revoked": False,
"details": {},
},
}
serialized_model = json.dumps(model)

SqlDataLoader().load_objects([serialized_model])

transaction = CaseTransaction.objects.partitioned_query('abc123').get(case=cc_case, form_id='fk-test')
self.assertEqual(transaction.case_id, 'abc123')

def test_loading_foreign_keys_using_primary_key(self):
role = UserRole.objects.create(domain='test', name='test-role')
permission = Permission.objects.create(value='test')
model = {
"model": "users.rolepermission",
"pk": 1,
"fields": {"role": role.pk, "permission_fk": permission.pk, "allow_all": True, "allowed_items": []},
}
serialized_model = json.dumps(model)

SqlDataLoader().load_objects([serialized_model])

role_permission = RolePermission.objects.get(role=role, permission_fk=permission)
self.assertEqual(role_permission.pk, 1)
20 changes: 20 additions & 0 deletions corehq/apps/dump_reload/tests/test_sql_dump_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,26 @@ def test_users(self):

self._dump_and_load(expected_object_counts)

def test_sqluserdata(self):
from corehq.apps.users.models import SQLUserData, WebUser
from django.contrib.auth.models import User

expected_object_counts = Counter({User: 1, SQLUserData: 1})

web_user = WebUser.create(
domain=self.domain_name,
username='webuser_t1',
password='secret',
created_by=None,
created_via=None,
email='[email protected]',
)
self.addCleanup(web_user.delete, self.domain_name, deleted_by=None)
user = web_user.get_django_user()
SQLUserData.objects.create(domain=self.domain_name, data={'test': 1}, django_user=user)

self._dump_and_load(expected_object_counts)

def test_dump_roles(self):
from corehq.apps.users.models import UserRole, HqPermissions, RoleAssignableBy, RolePermission

Expand Down
11 changes: 9 additions & 2 deletions corehq/blobs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)
from memoized import memoized

from corehq.sql_db.models import PartitionedModel
from corehq.sql_db.models import PartitionedModel, RequireDBManager
from corehq.util.models import NullJsonField

from .util import get_content_md5
Expand All @@ -23,10 +23,17 @@ def uuid4_hex():
return uuid4().hex


class BlobMetaManager(RequireDBManager):

def get_by_natural_key(self, parent_id, key):
return self.partitioned_query(parent_id).get(key=key)


class BlobMeta(PartitionedModel, Model):
"""Metadata about an object stored in the blob db"""

partition_attr = "parent_id"
objects = BlobMetaManager()

domain = CharField(max_length=255)
parent_id = CharField(
Expand Down Expand Up @@ -119,7 +126,7 @@ def content_md5(self):
def natural_key(self):
# necessary for dumping models from a sharded DB so that we exclude the
# SQL 'id' field which won't be unique across all the DB's
return self.key
return self.parent_id, self.key


class DeletedBlobMeta(PartitionedModel, Model):
Expand Down
18 changes: 17 additions & 1 deletion corehq/form_processor/models/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,13 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def natural_key(self):
"""
Django requires returning a tuple in natural_key methods:
https://docs.djangoproject.com/en/3.2/topics/serialization/#serialization-of-natural-keys
We intentionally do not follow this to optimize corehq.apps.dump_reload.sql.load.SqlDataLoader when other
models reference CommCareCase or XFormInstance via a foreign key. This means our loader code may break in
future Django upgrades.
"""
# necessary for dumping models from a sharded DB so that we exclude the
# SQL 'id' field which won't be unique across all the DB's
return self.case_id
Expand Down Expand Up @@ -836,6 +843,9 @@ def get_index_map(indices):

class CaseAttachmentManager(RequireDBManager):

def get_by_natural_key(self, case_id, attachment_id):
return self.partitioned_query(case_id).get(attachment_id=attachment_id)

def get_attachments(self, case_id):
return list(self.partitioned_query(case_id).filter(case_id=case_id))

Expand Down Expand Up @@ -903,7 +913,7 @@ def key(self, value):
def natural_key(self):
# necessary for dumping models from a sharded DB so that we exclude the
# SQL 'id' field which won't be unique across all the DB's
return self.attachment_id
return self.case_id, self.attachment_id

def from_form_attachment(self, attachment, attachment_src):
"""
Expand Down Expand Up @@ -966,6 +976,9 @@ class Meta(object):

class CommCareCaseIndexManager(RequireDBManager):

def get_by_natural_key(self, domain, case_id, identifier):
return self.partitioned_query(case_id).get(domain=domain, case_id=case_id, identifier=identifier)

def get_indices(self, domain, case_id):
query = self.partitioned_query(case_id)
return list(query.filter(case_id=case_id, domain=domain))
Expand Down Expand Up @@ -1176,6 +1189,9 @@ class Meta(object):

class CaseTransactionManager(RequireDBManager):

def get_by_natural_key(self, case_id, form_id, transaction_type):
return self.partitioned_query(case_id).get(case_id=case_id, form_id=form_id, type=transaction_type)

def get_transactions(self, case_id):
return list(
self.partitioned_query(case_id)
Expand Down
10 changes: 10 additions & 0 deletions corehq/form_processor/models/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,9 @@ def publish_deleted_forms(domain, form_ids):

class XFormOperationManager(RequireDBManager):

def get_by_natural_key(self, form_id, user_id, date):
return self.partitioned_query(form_id).get(form_id=form_id, user_id=user_id, date=date)

def get_form_operations(self, form_id):
return list(self.partitioned_query(form_id).filter(form_id=form_id).order_by('date'))

Expand Down Expand Up @@ -538,6 +541,13 @@ def original_operations(self):
return XFormOperation.objects.get_form_operations(self.__original_form_id)

def natural_key(self):
"""
Django requires returning a tuple in natural_key methods:
https://docs.djangoproject.com/en/3.2/topics/serialization/#serialization-of-natural-keys
We intentionally do not follow this to optimize corehq.apps.dump_reload.sql.load.SqlDataLoader when other
models reference CommCareCase or XFormInstance via a foreign key. This means our loader code may break in
future Django upgrades.
"""
# necessary for dumping models from a sharded DB so that we exclude the
# SQL 'id' field which won't be unique across all the DB's
return self.form_id
Expand Down
18 changes: 17 additions & 1 deletion corehq/form_processor/models/ledgers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,25 @@

from memoized import memoized

from corehq.sql_db.models import PartitionedModel
from corehq.sql_db.models import PartitionedModel, RequireDBManager
from corehq.util.models import TruncatingCharField

from ..track_related import TrackRelatedChanges
from .mixin import SaveStateMixin


class LedgerValueManager(RequireDBManager):

def get_by_natural_key(self, case_id, section_id, entry_id):
return self.partitioned_query(case_id).get(case_id=case_id, section_id=section_id, entry_id=entry_id)


class LedgerValue(PartitionedModel, SaveStateMixin, models.Model, TrackRelatedChanges):
"""
Represents the current state of a ledger.
"""
partition_attr = 'case_id'
objects = LedgerValueManager()

domain = models.CharField(max_length=255, null=False, default=None)
case = models.ForeignKey(
Expand Down Expand Up @@ -99,8 +106,17 @@ class Meta(object):
unique_together = ("case", "section_id", "entry_id")


class LedgerTransactionManager(RequireDBManager):

def get_by_natural_key(self, case_id, form_id, section_id, entry_id):
return self.partitioned_query(case_id).get(
case_id=case_id, form_id=form_id, section_id=section_id, entry_id=entry_id
)


class LedgerTransaction(PartitionedModel, SaveStateMixin, models.Model):
partition_attr = 'case_id'
objects = LedgerTransactionManager()

TYPE_BALANCE = 1
TYPE_TRANSFER = 2
Expand Down

0 comments on commit d3468c9

Please sign in to comment.