Skip to content

Commit

Permalink
Decapitalize as part of normalization; add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
dmannarino committed Dec 27, 2024
1 parent 0c6d541 commit ed5f2cd
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 18 deletions.
24 changes: 12 additions & 12 deletions app/routes/thematic/geoencoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ async def geoencode(
None,
description="Name of the subregion to match.",
),
search_unaccented: bool = Query(
normalize_search: bool = Query(
True,
description="Whether or not to unaccent names in request.",
description="Whether or not to perform a case- and accent-insensitive search.",
),
):
"""Look up administrative boundary IDs matching a specified country name
Expand All @@ -64,13 +64,13 @@ async def geoencode(
await version_is_valid(dataset, version_str)

names: List[str | None] = sanitize_names(
search_unaccented, country, region, subregion
normalize_search, country, region, subregion
)

adm_level: str = determine_admin_level(*names)

sql: str = _admin_boundary_lookup_sql(
adm_level, search_unaccented, admin_source, *names
adm_level, normalize_search, admin_source, *names
)

json_data: List[Dict[str, Any]] = await _query_dataset_json(
Expand Down Expand Up @@ -111,13 +111,13 @@ async def geoencode(


def sanitize_names(
search_unaccented: bool,
normalize_search: bool,
country: str | None,
region: str | None,
subregion: str | None,
) -> List[str | None]:
"""Turn any empty strings into Nones, enforce the admin level hierarchy,
and optionally unaccent names.
"""Turn any empty strings into Nones, enforces the admin level hierarchy,
and optionally unaccents and decapitalizes names.
"""
names = []

Expand All @@ -128,8 +128,8 @@ def sanitize_names(
)

for name in (country, region, subregion):
if name and search_unaccented:
names.append(unidecode(name))
if name and normalize_search:
names.append(unidecode(name).lower())
elif name:
names.append(name)
else:
Expand All @@ -155,7 +155,7 @@ def determine_admin_level(

def _admin_boundary_lookup_sql(
adm_level: str,
search_unaccented: bool,
normalize_search: bool,
dataset: str,
country_name: str,
region_name: str | None,
Expand All @@ -165,8 +165,8 @@ def _admin_boundary_lookup_sql(
IDs by name.
"""
name_fields: List[str] = ["country", "name_1", "name_2"]
if search_unaccented:
match_name_fields = [name_field + "_unaccented" for name_field in name_fields]
if normalize_search:
match_name_fields = [name_field + "_normalized" for name_field in name_fields]
else:
match_name_fields = name_fields

Expand Down
63 changes: 57 additions & 6 deletions tests_v2/unit/app/routes/thematic/geoencoder/test_geoencoder.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,63 @@
from typing import Any, Dict, List, Optional

import pytest
from fastapi import HTTPException
from httpx import AsyncClient

from app.models.pydantic.geostore import GeostoreCommon
from app.routes.datasets.versions import get_version
from app.routes.thematic import geoencoder
from app.routes.thematic.geoencoder import _admin_boundary_lookup_sql
from app.routes.thematic.geoencoder import _admin_boundary_lookup_sql, sanitize_names


@pytest.mark.asyncio
async def test_sanitize_names_pass_through() -> None:
    """With normalization disabled, non-empty names are returned as given."""
    raw_names = ["A Country", "Some region", "SUBREGION"]

    # First positional argument is the normalize flag; False means the
    # names are expected back with their original case.
    result = sanitize_names(False, *raw_names)

    assert result == raw_names


@pytest.mark.asyncio
async def test_sanitize_names_normalize() -> None:
    """With normalization enabled, names come back unaccented and lower-cased."""
    accented_names = ("Fictîcious de San México", "Söme Reğion", "SÜBREGION")
    expected = ["ficticious de san mexico", "some region", "subregion"]

    result = sanitize_names(True, *accented_names)

    assert result == expected


@pytest.mark.asyncio
async def test_sanitize_names_tolerate_empty() -> None:
    """An empty-string subregion is returned as None; other names are kept."""
    result = sanitize_names(False, "México", "Tijuana", "")

    assert result == ["México", "Tijuana", None]


@pytest.mark.asyncio
async def test_sanitize_names_tolerate_enforce_hierarchy() -> None:
    """A subregion given without a region must raise an HTTPException.

    Uses ``pytest.raises`` so the test fails if no exception is raised at
    all; a bare ``try/except`` would pass silently in that case.
    """
    country = "México"
    region = None
    subregion = "some subregion"
    normalize = False

    with pytest.raises(HTTPException) as exc_info:
        sanitize_names(normalize, country, region, subregion)

    assert (
        exc_info.value.detail
        == "If subregion is specified, region must be specified as well."
    )


@pytest.mark.asyncio
Expand Down Expand Up @@ -48,15 +99,15 @@ async def test__admin_boundary_lookup_sql_all() -> None:


@pytest.mark.asyncio
async def test__admin_boundary_lookup_sql_all_unaccented() -> None:
async def test__admin_boundary_lookup_sql_all_normalized() -> None:
sql = _admin_boundary_lookup_sql(
"2", True, "some_dataset", "some_country", "some_region", "some_subregion"
)
assert sql == (
"SELECT gid_0, gid_1, gid_2, country, name_1, name_2 FROM some_dataset"
" WHERE country_unaccented='some_country'"
" AND name_1_unaccented='some_region'"
" AND name_2_unaccented='some_subregion'"
" WHERE country_normalized='some_country'"
" AND name_1_normalized='some_region'"
" AND name_2_normalized='some_subregion'"
" AND adm_level='2'"
)

Expand Down

0 comments on commit ed5f2cd

Please sign in to comment.