Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/onspd header checks and db flag #33

Merged
merged 2 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,15 @@ def root(*x):
"PASSWORD": "",
"HOST": "localhost",
"PORT": "",
}
},
"other": {
"ENGINE": "django.contrib.gis.db.backends.postgis",
"NAME": "test_other",
"USER": "postgres",
"PASSWORD": "",
"HOST": "localhost",
"PORT": "",
},
},
INSTALLED_APPS=("uk_geo_utils",),
)
Expand Down
10 changes: 7 additions & 3 deletions uk_geo_utils/base_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import psutil
from django.core.management.base import BaseCommand
from django.db import connection, transaction
from django.db import DEFAULT_DB_ALIAS, connections, transaction


def unzip(filepath):
Expand Down Expand Up @@ -37,14 +37,15 @@ def __init__(self, *args, **kwargs):
self.primary_key_constraint = None
self.tempdir = None
self.data_path = None
self.cursor = connection.cursor()
self.cursor = None
self.table_name = self.get_table_name()
self.temp_table_name = self.table_name + "_temp"

def add_arguments(self, parser):
    """Register the importer's command-line options on ``parser``.

    Exactly one of ``--url`` or ``--data-path`` must be supplied; they are
    mutually exclusive. ``--database`` selects the database alias and falls
    back to ``DEFAULT_DB_ALIAS`` when omitted.
    """
    data_source = parser.add_mutually_exclusive_group(required=True)
    for flag in ("--url", "--data-path"):
        data_source.add_argument(flag, action="store")

    parser.add_argument("--database", default=DEFAULT_DB_ALIAS)

@abc.abstractmethod
def get_table_name(self) -> str:
Expand Down Expand Up @@ -314,12 +315,15 @@ def get_constraints_and_index_statements(self):
self.foreign_key_constraints = self.get_foreign_key_constraints()
self.check_for_other_constraints()

def handle(self, **options):
def handle(self, *args, **options):
if not check_memory():
raise Exception(
"This instance has less than the recommended memory. Try running the import from a larger instance."
)

db_name = options["database"]
self.cursor = connections[db_name].cursor()

self.get_data_path(options)

self.get_constraints_and_index_statements()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pcd,pcd2,pcds,dointr,doterm,oscty,ced,oslaua,osward,parish,usertype,oseast1m,osnrth1m,osgrdind,oshlthau,nhser,ctry,rgn,streg,pcon,eer,teclec,ttwa,pct,itl,statsward,oa01,casward,park,lsoa01,msoa01,ur01ind,oac01,oa11,lsoa11,msoa11,wz11,ccg,bua11,buasd11,ru11ind,oac11,lat,long,lep1,lep2,pfa,imd,calncv,stp,oa21,lsoa21,msoa21
"AL1 1AA","AL1 1AA","AL1 1AA","199002","199405","E10000015","E58000666","E07000240","E05013966","E43000079","1","514600","0206900","8","E18000006","E40000007","E92000001","E12000006","5","E14000960","E15000006","E24000005","E30000237","E16000150","E07000240","26UGGR","E00120568","26UGGR","E99999999","E01023743","E02004937","5","2B2","E00120568","E01023743","E02004937","E33024349","E38000079","E34004707","E35001183","C1","5A3",51.749084,-0.341337,"E37000048","","E23000027",30564,"E56000023","E54000025","E00120568","E01023743","E02004937"
"AL1 1AB","AL1 1AB","AL1 1AB","200401","200401","E10000015","E58000664","E07000240","E05013956","E43000079","1","517328","0206823","1","E18000006","E40000007","E92000001","E12000006","5","E14000960","E15000006","E24000005","E30000237","E16000150","E07000240","26UGFX","E00120196","26UGFX","E99999999","E01023667","E02004936","5","6D1","E00120196","E01023667","E02004936","E33027453","E38000079","E34004707","E35001183","C1","5A2",51.747828,-0.301865,"E37000048","","E23000027",30628,"E56000023","E54000025","E00120196","E01023667","E02004936"
"IM1 1AA","IM1 1AA","IM1 1AA","199405","199912","M99999999","M99999999","M99999999","M99999999","M99999999","1","","","9","M00000001","M99999999","M83000003","M99999999","0","M99999999","M99999999","M99999999","M99999999","M01000001","M99999999","99ZZ00","M99999999","99ZZ00","M99999999","M99999999","M99999999","9","9Z9","M99999999","M99999999","M99999999","M99999999","M01000001","M99999999","M99999999","Z9","9Z9",99.999999,0.000000,"M99999999","M99999999","M99999999",0,"M99999999","M99999999","M99999999","M99999999","M99999999"
"IM1 1AD","IM1 1AD","IM1 1AD","199405","","M99999999","M99999999","M99999999","M99999999","M99999999","0","","","9","M00000001","M99999999","M83000003","M99999999","0","M99999999","M99999999","M99999999","M99999999","M01000001","M99999999","99ZZ00","M99999999","99ZZ00","M99999999","M99999999","M99999999","9","9Z9","M99999999","M99999999","M99999999","M99999999","M01000001","M99999999","M99999999","Z9","9Z9",99.999999,0.000000,"M99999999","M99999999","M99999999",0,"M99999999","M99999999","M99999999","M99999999","M99999999"
pcd,pcd2,pcds,dointr,doterm,oscty,ced,oslaua,osward,parish,usertype,oseast1m,osnrth1m,osgrdind,oshlthau,nhser,ctry,rgn,streg,pcon,eer,teclec,ttwa,pct,itl,statsward,oa01,casward,npark,lsoa01,msoa01,ur01ind,oac01,oa11,lsoa11,msoa11,wz11,sicbl,bua22,ru11ind,oac11,lat,long,lep1,lep2,pfa,imd,calncv,icb,oa21,lsoa21,msoa21
"AL1 1AA","AL1 1AA","AL1 1AA","199002","199405","E10000015","E58000666","E07000240","E05013966","E43000290","1","514600","0206900","8","E18000006","E40000007","E92000001","E12000006","5","E14001507","E15000006","E24000005","E30000237","E16000150","E07000240","26UGGR","E00120568","26UGGR","E65000001","E01023743","E02004937","5","2B2","E00120568","E01023743","E02004937","E33024349","E38000079","E63004441","C1","5A3",51.749084,-0.341337,"E37000048","","E23000027",30564,"E56000035","E54000025","E00120568","E01023743","E02004937"
"AL1 1AB","AL1 1AB","AL1 1AB","200401","200401","E10000015","E58000664","E07000240","E05013956","E43000290","1","517328","0206823","1","E18000006","E40000007","E92000001","E12000006","5","E14001507","E15000006","E24000005","E30000237","E16000150","E07000240","26UGFX","E00120196","26UGFX","E65000001","E01023667","E02004936","5","6D1","E00120196","E01023667","E02004936","E33027453","E38000079","E63004441","C1","5A2",51.747828,-0.301865,"E37000048","","E23000027",30628,"E56000035","E54000025","E00120196","E01023667","E02004936"
"IM1 1AA","IM1 1AA","IM1 1AA","199405","199912","M99999999","M99999999","M99999999","M99999999","M99999999","1","","","9","M00000001","M99999999","M83000003","M99999999","0","M99999999","M99999999","M99999999","M99999999","M01000001","M99999999","99ZZ00","M99999999","99ZZ00","M99999999","M99999999","M99999999","9","9Z9","M99999999","M99999999","M99999999","M99999999","M01000001","M99999999","Z9","9Z9",99.999999,0.000000,"M99999999","M99999999","M99999999",0,"M99999999","M99999999","M99999999","M99999999","M99999999"
"IM1 1AD","IM1 1AD","IM1 1AD","199405","","M99999999","M99999999","M99999999","M99999999","M99999999","0","","","9","M00000001","M99999999","M83000003","M99999999","0","M99999999","M99999999","M99999999","M99999999","M01000001","M99999999","99ZZ00","M99999999","99ZZ00","M99999999","M99999999","M99999999","9","9Z9","M99999999","M99999999","M99999999","M99999999","M01000001","M99999999","Z9","9Z9",99.999999,0.000000,"M99999999","M99999999","M99999999",0,"M99999999","M99999999","M99999999","M99999999","M99999999"
6 changes: 1 addition & 5 deletions uk_geo_utils/management/commands/import_cleaned_addresses.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import os

from django.db import connection

from uk_geo_utils.base_importer import BaseImporter
from uk_geo_utils.helpers import get_address_model

Expand All @@ -25,10 +23,8 @@ def import_addressbase(self, table_name):
)

with open(cleaned_file_path, "r") as fp:
cursor = connection.cursor()

self.stdout.write("importing from %s.." % (cleaned_file_path))
cursor.copy_expert(
self.cursor.copy_expert(
"""
COPY %s (UPRN,address,postcode,location,addressbase_postal)
FROM STDIN (FORMAT CSV, DELIMITER ',', quote '"');
Expand Down
92 changes: 53 additions & 39 deletions uk_geo_utils/management/commands/import_onspd.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,11 @@
import glob
import os

from django.db import connection
from django.core.management import CommandError

from uk_geo_utils.base_importer import BaseImporter
from uk_geo_utils.helpers import get_onspd_model

HEADERS = {
"may2018": """
pcd, pcd2, pcds, dointr, doterm, oscty, ced, oslaua, osward,
parish, usertype, oseast1m, osnrth1m, osgrdind, oshlthau,
nhser, ctry, rgn, streg, pcon, eer, teclec, ttwa, pct, nuts,
statsward, oa01, casward, park, lsoa01, msoa01, ur01ind,
oac01, oa11, lsoa11, msoa11, wz11, ccg, bua11, buasd11,
ru11ind, oac11, lat, long, lep1, lep2, pfa, imd, calncv, stp
""",
"aug2022": """
pcd, pcd2, pcds, dointr, doterm, oscty, ced, oslaua, osward,
parish, usertype, oseast1m, osnrth1m, osgrdind, oshlthau,
nhser, ctry, rgn, streg, pcon, eer, teclec, ttwa, pct, nuts,
statsward, oa01, casward, park, lsoa01, msoa01, ur01ind,
oac01, oa11, lsoa11, msoa11, wz11, ccg, bua11, buasd11,
ru11ind, oac11, lat, long, lep1, lep2, pfa, imd, calncv, stp,
oa21, lsoa21, msoa21
""",
}


class Command(BaseImporter):
"""
Expand All @@ -35,22 +15,59 @@ class Command(BaseImporter):
python manage.py update_onspd --data-path /path/to/ONSPD_MAY_2024/Data
"""

def add_arguments(self, parser):
super().add_arguments(parser)

parser.add_argument(
"--header",
help="Specify which header the csv has",
default="aug2022",
choices=["may2018", "aug2022"],
)
def __init__(self, *args, **kwargs):
    """Set up the base importer, then record the model's derived fields."""
    super().__init__(*args, **kwargs)
    # Model fields that are not columns of the CSV; check_header leaves
    # them out when comparing the file header with the model.
    self.derived_fields = ["location"]

def get_table_name(self):
    """Return the database table name of the ONSPD model."""
    onspd_model = get_onspd_model()
    return onspd_model._meta.db_table

def import_data_to_temp_table(self):
    """Import the ONSPD data into the importer's temporary table."""
    target_table = self.temp_table_name
    self.import_onspd(target_table)

def check_header(self, f):
    """Validate the header row of one ONSPD CSV file against the model.

    The comma-separated names on the first line of ``f`` are compared,
    order-insensitively, with the field names of the ONSPD model, ignoring
    the fields listed in ``self.derived_fields``.

    Args:
        f: Path of the CSV file to check.

    Returns:
        The raw first line of the file (trailing newline included), which
        the caller uses as the column list of its COPY statement.

    Raises:
        CommandError: If the names differ. The message lists the fields
            that are missing from, unexpected in, or duplicated in the file.
    """
    self.stdout.write(f"checking header of {f}")
    with open(f, "r") as fp:
        # get field names from file
        header_row = fp.readline()
    # A distinct loop name avoids shadowing the ``f`` path parameter.
    file_header = sorted(column.strip() for column in header_row.split(","))

    # get field names from db excluding derived fields (i.e. location)
    expected_header = sorted(
        field.name
        for field in get_onspd_model()._meta.get_fields()
        if field.name not in self.derived_fields
    )

    if file_header == expected_header:
        self.stdout.write(self.style.SUCCESS("✓ Headers match"))
        return header_row

    # find missing, unexpected and duplicated fields
    file_fields = set(file_header)
    expected_fields = set(expected_header)
    missing_fields = expected_fields - file_fields
    unexpected_fields = file_fields - expected_fields
    # A repeated column makes the sorted lists differ even when both set
    # differences are empty, so it has to be reported separately.
    duplicated_fields = {
        column for column in file_fields if file_header.count(column) > 1
    }

    error_msg = [
        f"\nProblem with the fields in {f}",
        f" File header: {file_header}",
        f" Expected header: {expected_header}",
    ]
    if missing_fields:
        error_msg.append(" Fields missing from file:")
        error_msg.extend(f" - {field}" for field in sorted(missing_fields))

    if unexpected_fields:
        error_msg.append(" Unexpected fields found in file:")
        error_msg.extend(f" + {field}" for field in sorted(unexpected_fields))

    if duplicated_fields:
        error_msg.append(" Fields duplicated in file:")
        error_msg.extend(f" * {field}" for field in sorted(duplicated_fields))

    error_msg.append(
        "This probably means ONSPD has changed their csv format and we need to update our model."
    )
    raise CommandError("\n".join(error_msg))

def import_onspd(self, table_name):
glob_str = os.path.join(self.data_path, "*.csv")
files = glob.glob(glob_str)
Expand All @@ -59,24 +76,23 @@ def import_onspd(self, table_name):
"No CSV files found in %s" % (self.data_path)
)

cursor = connection.cursor()

self.stdout.write("importing from files..")
for f in files:
self.stdout.write(f)
header = self.check_header(f)
self.stdout.write(f"Importing {f}")
with open(f, "r") as fp:
cursor.copy_expert(
self.cursor.copy_expert(
"""
COPY %s (
%s
) FROM STDIN (FORMAT CSV, DELIMITER ',', quote '"', HEADER);
) FROM STDIN (FORMAT CSV, DELIMITER ',', quote '"', HEADER MATCH);
GeoWill marked this conversation as resolved.
Show resolved Hide resolved
"""
% (table_name, self.header),
% (table_name, header),
fp,
)

# turn text lng/lat into a Point() field
cursor.execute(
self.cursor.execute(
"""
UPDATE %s SET location=CASE
WHEN ("long"='0.000000' AND lat='99.999999')
Expand All @@ -90,6 +106,4 @@ def import_onspd(self, table_name):
self.stdout.write("...done")

def handle(self, **options):
self.header = HEADERS[options.get("header", "aug2022")]

super().handle(**options)
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Generated by Django 5.1.3 on 2024-11-08 16:54

from django.db import migrations, models


# Schema migration for the "onspd" model: three field renames, three field
# removals, two new CharFields, and three AlterField operations.
class Migration(migrations.Migration):
# Runs after migration 0009 of the uk_geo_utils app.
dependencies = [
("uk_geo_utils", "0009_onspd_lsoa21_onspd_msoa21_onspd_oa21"),
]

# Operations are applied in the order listed.
operations = [
# Renames: bua11 -> bua22, nuts -> itl, park -> npark.
migrations.RenameField(
model_name="onspd",
old_name="bua11",
new_name="bua22",
),
migrations.RenameField(
model_name="onspd",
old_name="nuts",
new_name="itl",
),
migrations.RenameField(
model_name="onspd",
old_name="park",
new_name="npark",
),
# Removals: buasd11, ccg and stp are dropped from the model.
migrations.RemoveField(
model_name="onspd",
name="buasd11",
),
migrations.RemoveField(
model_name="onspd",
name="ccg",
),
migrations.RemoveField(
model_name="onspd",
name="stp",
),
# Additions: icb and sicbl, both optional 9-character text fields.
migrations.AddField(
model_name="onspd",
name="icb",
field=models.CharField(blank=True, max_length=9),
),
migrations.AddField(
model_name="onspd",
name="sicbl",
field=models.CharField(blank=True, max_length=9),
),
# lsoa21/msoa21/oa21 become blank-able CharFields. default="" combined with
# preserve_default=False supplies a one-off value for this migration only.
# NOTE(review): presumably these were nullable before and the default
# backfills existing NULL rows - confirm against migration 0009.
migrations.AlterField(
model_name="onspd",
name="lsoa21",
field=models.CharField(blank=True, default="", max_length=9),
preserve_default=False,
),
migrations.AlterField(
model_name="onspd",
name="msoa21",
field=models.CharField(blank=True, default="", max_length=9),
preserve_default=False,
),
migrations.AlterField(
model_name="onspd",
name="oa21",
field=models.CharField(blank=True, default="", max_length=9),
preserve_default=False,
),
]
20 changes: 9 additions & 11 deletions uk_geo_utils/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,11 @@ class AbstractOnspd(models.Model):
teclec = models.CharField(blank=True, max_length=9)
ttwa = models.CharField(blank=True, max_length=9)
pct = models.CharField(blank=True, max_length=9)
nuts = models.CharField(blank=True, max_length=10)
itl = models.CharField(blank=True, max_length=10)
statsward = models.CharField(blank=True, max_length=6)
oa01 = models.CharField(blank=True, max_length=10)
casward = models.CharField(blank=True, max_length=6)
park = models.CharField(blank=True, max_length=9)
npark = models.CharField(blank=True, max_length=9)
lsoa01 = models.CharField(blank=True, max_length=9)
msoa01 = models.CharField(blank=True, max_length=9)
ur01ind = models.CharField(blank=True, max_length=1)
Expand All @@ -158,9 +158,7 @@ class AbstractOnspd(models.Model):
lsoa11 = models.CharField(blank=True, max_length=9)
msoa11 = models.CharField(blank=True, max_length=9)
wz11 = models.CharField(blank=True, max_length=9)
ccg = models.CharField(blank=True, max_length=9)
bua11 = models.CharField(blank=True, max_length=9)
buasd11 = models.CharField(blank=True, max_length=9)
bua22 = models.CharField(blank=True, max_length=9)
ru11ind = models.CharField(blank=True, max_length=2)
oac11 = models.CharField(blank=True, max_length=3)
lat = models.CharField(blank=True, max_length=10)
Expand All @@ -170,12 +168,12 @@ class AbstractOnspd(models.Model):
pfa = models.CharField(blank=True, max_length=9)
imd = models.CharField(blank=True, max_length=5)
calncv = models.CharField(blank=True, max_length=9)
stp = models.CharField(blank=True, max_length=9)

# The following three fields are nullable because they won't appear in pre-Aug2022 releases
oa21 = models.CharField(null=True, max_length=9)
lsoa21 = models.CharField(null=True, max_length=9)
msoa21 = models.CharField(null=True, max_length=9)
oa21 = models.CharField(blank=True, max_length=9)
lsoa21 = models.CharField(blank=True, max_length=9)
msoa21 = models.CharField(blank=True, max_length=9)
icb = models.CharField(blank=True, max_length=9)
# NOTE(review): presumably the ONSPD "sicbl" column, which sits where "ccg"
# did in the older CSV header - confirm. Declared once; a second identical
# assignment of the same class attribute is redundant.
sicbl = models.CharField(blank=True, max_length=9)

location = models.PointField(null=True, blank=True)
objects = GeoManager()
Expand Down
Loading