def test_reconcile_post_extend():
    """POSTing an ``extend`` form field returns metadata and per-entity values."""
    entity_id = "Q7747"
    payload = {
        "ids": [entity_id],
        "properties": [{"id": "name"}, {"id": "birthDate"}],
    }
    form = {"extend": json.dumps(payload)}
    resp = client.post("/reconcile/default", data=form)
    assert resp.status_code == 200, resp.text
    data = resp.json()

    # One meta entry per requested property, in request order.
    assert len(data["meta"]) == 2
    assert data["meta"][0]["id"] == "name", data["meta"]

    # The requested entity gets a row carrying the requested property.
    assert entity_id in data["rows"], data
    assert "name" in data["rows"][entity_id], data
    names = data["rows"][entity_id]["name"]
    assert len(names) > 0, names
    joined = "".join(n["str"] for n in names)
    assert "putin" in joined.lower(), names
def test_reconcile_extend_properties():
    """The property proposal endpoint honours ``type`` and ``limit``."""
    url = "/reconcile/default/extend/property?limit=5&type=LegalEntity"
    resp = client.get(url)
    assert resp.status_code == 200, resp.text
    data = resp.json()
    assert "type" in data
    assert data["type"] == "LegalEntity", data
    assert data["limit"] == 5, data
    props = data["properties"]
    assert len(props) == 5
    # Collect the proposed property identifiers for membership checks.
    ids = [item["id"] for item in props]
    for expected in ("name", "country"):
        assert expected in ids


def test_reconcile_extend_properties_invalid_type():
    """An unknown schema name is rejected with a 400."""
    url = "/reconcile/default/extend/property?limit=5&type=Banana"
    resp = client.get(url)
    assert resp.status_code == 400, resp.text
class FreebaseExtendProperty(BaseModel):
    """A property proposed to the client as available for data extension."""

    id: str
    name: str


class FreebaseExtendPropertiesResponse(BaseModel):
    """Response of the property proposal endpoint for one entity type."""

    limit: int
    type: str
    properties: List[FreebaseExtendProperty]


class FreebaseRenderMethod(str, Enum):
    """How extension values are rendered in the response."""

    # Return the stored value unchanged.
    raw = "raw"
    # Return the caption produced by the property type, when there is one.
    caption = "caption"


class FreebaseExtendQueryPropertySettings(BaseModel):
    """Per-property settings a client may attach to an extension query."""

    # Maximum number of values per row; values <= 0 mean "no limit".
    limit: int = 0
    render: FreebaseRenderMethod = FreebaseRenderMethod.caption


class FreebaseExtendQueryProperty(BaseModel):
    """One requested property in an extension query."""

    id: str
    # A factory builds a fresh settings object per instance instead of
    # relying on a single module-level default instance.
    settings: FreebaseExtendQueryPropertySettings = Field(
        default_factory=FreebaseExtendQueryPropertySettings
    )


class FreebaseExtendQuery(BaseModel):
    """Decoded body of a data extension request."""

    ids: List[str]
    properties: List[FreebaseExtendQueryProperty]


class FreebaseExtendResponseMeta(BaseModel):
    """Describes one property column present in an extension response."""

    id: str
    name: str


class FreebaseExtendResponseValue(BaseModel):
    """A single string cell value in an extension response."""

    # Serialized under the key ``str``. The statement is annotation-only,
    # so the builtin ``str`` is not rebound here.
    str: str


class FreebaseExtendResponse(BaseModel):
    """Data extension response: column metadata plus values per entity ID."""

    meta: List[FreebaseExtendResponseMeta]
    # entity ID -> property ID -> list of values
    rows: Dict[str, Dict[str, List[FreebaseExtendResponseValue]]]


class FreebaseManifestExtendProposeProperties(BaseModel):
    """Location of the property proposal service advertised in the manifest."""

    service_url: AnyHttpUrl
    service_path: str


class FreebaseManifestExtendPropertySettingChoice(BaseModel):
    """One selectable option of a manifest property setting."""

    id: str
    name: str


class FreebaseManifestExtendPropertySetting(BaseModel):
    """A setting clients may offer to users for each extension property."""

    name: str
    label: str
    type: str
    # Typed as Any because it is declared that way; its shape is not fixed here.
    default: Any
    help_text: str
    choices: List[FreebaseManifestExtendPropertySettingChoice] = []


class FreebaseManifestExtend(BaseModel):
    """The ``extend`` section of the service manifest."""

    propose_properties: FreebaseManifestExtendProposeProperties
    propose_settings: List[FreebaseManifestExtendPropertySetting]
search_entities, + result_entities, + result_total, +) from yente.search.search import get_matchable_schemata from yente.provider import SearchProvider, get_provider from yente.scoring import score_results -from yente.util import match_prefix, limit_window, typed_url +from yente.util import EntityRedirect, match_prefix, limit_window, typed_url from yente.routers.util import PATH_DATASET, QUERY_PREFIX from yente.routers.util import TS_PATTERN, ALGO_HELP from yente.routers.util import get_algorithm_by_name, get_dataset @@ -82,6 +98,8 @@ async def reconcile( identifierSpace=typed_url("https://www.opensanctions.org/reference/#schema"), schemaSpace=typed_url("https://www.opensanctions.org/reference/#schema"), view=FreebaseManifestView(url="https://www.opensanctions.org/entities/{{id}}/"), + documentation=typed_url("https://www.opensanctions.org/docs/"), + batchSize=settings.DEFAULT_PAGE, preview=FreebaseManifestPreview( url="https://www.opensanctions.org/entities/preview/{{id}}/", width=430, @@ -98,6 +116,35 @@ async def reconcile( service_url=base_url, service_path=f"/suggest/property{query_string}" ), ), + extend=FreebaseManifestExtend( + propose_properties=FreebaseManifestExtendProposeProperties( + service_url=base_url, service_path=f"/extend/property{query_string}" + ), + propose_settings=[ + FreebaseManifestExtendPropertySetting( + name="limit", + label="Limit", + type="number", + default=0, + help_text="Maximum number of values to return per row (0 for no limit).", + ), + FreebaseManifestExtendPropertySetting( + name="render", + label="Value rendering", + type="select", + default=FreebaseRenderMethod.caption, + choices=[ + FreebaseManifestExtendPropertySettingChoice( + id=FreebaseRenderMethod.caption, name="User-readable value" + ), + FreebaseManifestExtendPropertySettingChoice( + id=FreebaseRenderMethod.raw, name="Machine-readable value" + ), + ], + help_text="Return readable value (e.g. 
'Russia') instead of raw value ('ru').", + ), + ], + ), defaultTypes=[FreebaseType.from_schema(s) for s in schemata], ) @@ -106,7 +153,7 @@ async def reconcile( "/reconcile/{dataset}", summary="Reconciliation queries", tags=["Reconciliation"], - response_model=Dict[str, FreebaseEntityResult], + response_model=Union[Dict[str, FreebaseEntityResult], FreebaseExtendResponse], responses={ 400: {"model": ErrorResponse, "description": "Invalid query"}, 500: {"model": ErrorResponse, "description": "Server error"}, @@ -117,6 +164,7 @@ async def reconcile_post( response: Response, dataset: str = PATH_DATASET, queries: str = Form(None, description="JSON-encoded reconciliation queries"), + extend: str = Form(None, description="JSON-encoded reconciliation queries"), algorithm: str = Query( settings.BEST_ALGORITHM, title=ALGO_HELP, @@ -127,9 +175,14 @@ async def reconcile_post( title="Match against entities that were updated since the given date", ), provider: SearchProvider = Depends(get_provider), -) -> Dict[str, FreebaseEntityResult]: +) -> Union[Dict[str, FreebaseEntityResult], FreebaseExtendResponse]: """Reconciliation API, emulates Google Refine API. 
async def reconcile_extend(
    provider: SearchProvider,
    data: str,
) -> FreebaseExtendResponse:
    """Answer a data extension request for a batch of entity IDs.

    Args:
        provider: Search provider used to fetch the entities.
        data: JSON-encoded extension query (``ids`` plus ``properties``).

    Returns:
        A response with one row per requested ID (empty for IDs that could
        not be fetched) and one meta entry per property that exists on at
        least one fetched entity's schema.

    Raises:
        HTTPException: with status 400 if the payload is not valid JSON, does
            not match ``FreebaseExtendQuery``, names more than
            ``settings.MAX_BATCH`` IDs, or names a non-canonical entity ID.
    """
    # Local import keeps this block self-contained.
    from pydantic import ValidationError

    try:
        extendq: Any = json.loads(data)
    except (TypeError, ValueError) as exc:
        raise HTTPException(400, detail="Cannot decode extension request") from exc

    # Well-formed JSON of the wrong shape is a client error; without this
    # guard the validation error would escape as an unhandled exception.
    try:
        query = FreebaseExtendQuery.model_validate(extendq)
    except ValidationError as exc:
        raise HTTPException(400, detail="Invalid extension request") from exc

    if len(query.ids) > settings.MAX_BATCH:
        msg = "Too many queries in one batch (limit: %d)" % settings.MAX_BATCH
        raise HTTPException(400, detail=msg)

    try:
        queries = [get_entity(provider, entity_id) for entity_id in query.ids]
        entities = await asyncio.gather(*queries)
    except EntityRedirect as exc:
        msg = "Please specify the canonical entity ID, not a referent"
        raise HTTPException(400, detail=msg) from exc

    metas: Dict[str, FreebaseExtendResponseMeta] = {}
    # Seed every requested ID so IDs without a fetched entity still get a row.
    resp = FreebaseExtendResponse(meta=[], rows={e: {} for e in query.ids})
    for entity in entities:
        if entity is None:
            continue
        row: Dict[str, List[FreebaseExtendResponseValue]] = {}
        for qprop in query.properties:
            prop = entity.schema.get(qprop.id)
            if prop is None:
                # Property does not exist on this entity's schema.
                continue
            if qprop.id not in metas:
                metas[qprop.id] = FreebaseExtendResponseMeta(
                    id=prop.name, name=prop.label
                )
            values = entity.get(prop.name)
            if qprop.settings.limit > 0:
                values = values[: qprop.settings.limit]
            if qprop.settings.render == FreebaseRenderMethod.caption:
                # Fall back to the raw value when no caption is produced.
                values = [prop.type.caption(v) or v for v in values]
            row[qprop.id] = [FreebaseExtendResponseValue(str=v) for v in values]
        resp.rows[entity.id] = row
    resp.meta = list(metas.values())
    return resp


@router.get(
    "/reconcile/{dataset}/extend/property",
    summary="Extend properties proposal",
    tags=["Reconciliation"],
    response_model=FreebaseExtendPropertiesResponse,
    include_in_schema=False,
)
async def reconcile_extend_properties(
    dataset: str = PATH_DATASET,
    type: str = Query(
        settings.BASE_SCHEMA,
        min_length=0,
        description="Type of the entity for which properties should be proposed.",
    ),
    limit: int = Query(
        0,
        description="Number of suggestions to return.",
    ),
) -> FreebaseExtendPropertiesResponse:
    """Given a type (schema), suggest a set of properties that could be retrieved
    for data extension.

    Featured properties are collected first, followed by the remaining
    properties that are neither hidden nor entity-typed. A positive ``limit``
    truncates that list, and the result is then sorted by label.

    Raises:
        HTTPException: with status 400 if ``type`` is not a known schema.
    """
    schema = model.get(type)
    if schema is None:
        raise HTTPException(400, detail="Invalid type: %s" % type)
    properties: List[FreebaseExtendProperty] = []
    # Featured properties go first so they survive truncation by ``limit``.
    for featured in schema.featured:
        prop = schema.get(featured)
        if prop is not None:
            properties.append(FreebaseExtendProperty(id=prop.name, name=prop.label))
    for prop in schema.properties.values():
        if prop.hidden or prop.type == registry.entity:
            continue
        if prop.name not in schema.featured:
            properties.append(FreebaseExtendProperty(id=prop.name, name=prop.label))
    if limit > 0:
        properties = properties[:limit]
    # Sorting happens after truncation: it orders the selected properties
    # and does not change which ones are selected.
    properties = sorted(properties, key=lambda p: p.name)
    return FreebaseExtendPropertiesResponse(
        limit=limit, type=schema.name, properties=properties
    )
(e.g. sanction, role.pep)" ), datasets: List[str] = Query([], title="Use `include_dataset` instead"), - limit: int = Query(10, title="Number of results to return", le=settings.MAX_PAGE), + limit: int = Query( + settings.DEFAULT_PAGE, title="Number of results to return", le=settings.MAX_PAGE + ), offset: int = Query( 0, title="Start at result with given offset", le=settings.MAX_OFFSET ), diff --git a/yente/settings.py b/yente/settings.py index 91c8203e..e9a37f3b 100644 --- a/yente/settings.py +++ b/yente/settings.py @@ -156,6 +156,9 @@ def random_cron() -> str: # How many results to return per page of search results max: MAX_PAGE = 500 +# How many results a /search-type endpoint returns by default (also advertised as the reconciliation batch size): +DEFAULT_PAGE = 10 + # How many entities to accept in a /match batch at most: MAX_BATCH = int(env_str("YENTE_MAX_BATCH", "100")) MAX_RESULTS = 9999