From 1e2d02527fdd44fe5a4ede7b65f2a875f5ee269f Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Tue, 19 Dec 2023 16:30:56 +0545 Subject: [PATCH 01/20] Added HDX exporter class and converted legacy yaml to json format --- API/hdx.py | 440 +++++++++++++++++++++++++++++++++++ API/main.py | 2 + API/stats.py | 2 +- requirements.txt | 5 +- src/app.py | 248 +++++++++++++++++++- src/config.py | 20 ++ src/query_builder/builder.py | 68 ++++++ src/validation/models.py | 36 ++- 8 files changed, 804 insertions(+), 17 deletions(-) create mode 100644 API/hdx.py diff --git a/API/hdx.py b/API/hdx.py new file mode 100644 index 00000000..3d8703b8 --- /dev/null +++ b/API/hdx.py @@ -0,0 +1,440 @@ +from enum import Enum +from typing import Dict, List + +from fastapi import APIRouter, Body, Query, Request +from fastapi_versioning import version +from pydantic import BaseModel, Field, validator + +from src.app import HDX +from src.config import LIMITER as limiter +from src.config import RATE_LIMIT_PER_MIN + +router = APIRouter(prefix="/hdx", tags=["HDX"]) + + +class HDXModel(BaseModel): + tags: List[str] = Field( + ..., + description="List of tags for the HDX model.", + example=["roads", "transportation", "geodata"], + ) + caveats: str = Field( + ..., + description="Caveats for the HDX model.", + example="OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + ) + + +class CategoryModel(BaseModel): + hdx: HDXModel + types: List[str] = Field( + ..., + description="List of feature types (points, lines, polygons).", + example=["lines"], + ) + select: List[str] = Field( + ..., + description="List of selected fields.", + example=["name", "highway"], + ) + where: str = Field( + ..., + description="SQL-like condition to filter features.", + example="highway IS NOT NULL", + ) + formats: List[str] = Field( + ..., + description="List of Export Formats (suffixes).", + example=["gpkg", "fgb"], + ) + + @validator("types") + def validate_types(cls, value): + allowed_types = {"points", "lines", "polygons"} + for item in value: + if item not in allowed_types: + raise ValueError( + f"Invalid type: {item}. 
Allowed types are {', '.join(allowed_types)}" + ) + return value + + @validator("formats") + def validate_export_types(cls, value): + for export_type in value: + if export_type not in EXPORT_TYPE_MAPPING: + raise ValueError(f"Unsupported export type: {export_type}") + return [EXPORT_TYPE_MAPPING[export_type] for export_type in value] + + +class ExportTypeInfo: + def __init__(self, suffix, driver_name, layer_creation_options, format_option): + self.suffix = suffix + self.driver_name = driver_name + self.layer_creation_options = layer_creation_options + self.format_option = format_option + + +EXPORT_TYPE_MAPPING = { + "geojson": ExportTypeInfo("geojson", "GeoJSON", [], "GDAL"), + "shp": ExportTypeInfo("shp", "ESRI Shapefile", [], "GDAL"), + "gpkg": ExportTypeInfo("gpkg", "GeoPackage", [], "GDAL"), + "sqlite": ExportTypeInfo("sqlite", "SQLite", [], "GDAL"), + "fgb": ExportTypeInfo("fgb", "FlatGeobuf", ["VERIFY_BUFFERS=NO"], "GDAL"), + "mvt": ExportTypeInfo("mvt", "MVT", [], "GDAL"), + "kl": ExportTypeInfo("kml", "KML", [], "GDAL"), + "gpx": ExportTypeInfo("gpx", "GPX", [], "GDAL"), + "parquet": ExportTypeInfo("parquet", "PARQUET", [], "PARQUET"), +} + + +class DynamicCategoriesModel(BaseModel): + iso3: str = Field( + ..., + description="ISO3 Country Code.", + min_length=3, + max_length=3, + example="USA", + ) + + categories: List[Dict[str, CategoryModel]] = Field( + ..., + description="List of dynamic categories.", + example=[ + { + "Roads": { + "hdx": { + "tags": ["roads", "transportation", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines", "polygons"], + "select": ["name", "highway"], + "where": "highway IS NOT NULL", + "formats": ["fgb"], + } + } + ], + ) + + +@router.post("/submit/") +@limiter.limit(f"{RATE_LIMIT_PER_MIN}/minute") +@version(1) +async def process_data( + request: Request, + params: DynamicCategoriesModel = Body( + ..., + description="Input parameters including ISO3 country code and dynamic categories.", + examples={ + "normal": { + "summary": "Example: Road extraction set", + "description": "Query to extract road in Nepal", + "value": { + "iso3": "NPL", + "categories": [ + { + "Roads": { + "hdx": { + "tags": ["roads", "transportation", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines"], + "select": ["name", "highway"], + "where": "tags['highway'][1] IS NOT NULL", + "formats": ["fgb"], + } + } + ], + }, + }, + "fullset": { + "summary": "Full HDX Dataset default", + "description": "Full yaml conversion for dataset", + "value": { + "iso3": "NPL", + "categories": [ + { + "Buildings": { + "hdx": { + "tags": [ + "facilities-infrastructure", + "geodata", + ], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["polygons"], + "select": [ + "name", + "building", + "building:levels", + "building:materials", + "addr:full", + "addr:housenumber", + "addr:street", + "addr:city", + "office", + "source", + ], + "where": "tags['building'][1] IS NOT NULL", + "formats": ["fgb"], + } + }, + { + "Roads": { + "hdx": { + "tags": ["transportation", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines"], + "select": [ + "name", + "highway", + "surface", + "smoothness", + "width", + "lanes", + "oneway", + "bridge", + "layer", + "source", + ], + "where": "tags['highway'][1] IS NOT NULL", + "formats": 
["fgb"], + } + }, + { + "Waterways": { + "hdx": { + "tags": ["hydrology", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines", "polygons"], + "select": [ + "name", + "waterway", + "covered", + "width", + "depth", + "layer", + "blockage", + "tunnel", + "natural", + "water", + "source", + ], + "where": "tags['waterway'][1] IS NOT NULL OR tags['water'][1] IS NOT NULL OR tags['natural'][1] IN ('water','wetland','bay')", + "formats": ["fgb"], + } + }, + { + "Points of Interest": { + "hdx": { + "tags": [ + "facilities-infrastructure", + "points-of-interest-poi", + "geodata", + ], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["points", "polygons"], + "select": [ + "name", + "amenity", + "man_made", + "shop", + "tourism", + "opening_hours", + "beds", + "rooms", + "addr:full", + "addr:housenumber", + "addr:street", + "addr:city", + "source", + ], + "where": "tags['amenity'][1] IS NOT NULL OR tags['man_made'][1] IS NOT NULL OR tags['shop'][1] IS NOT NULL OR tags['tourism'][1] IS NOT NULL", + "formats": ["fgb"], + } + }, + { + "Airports": { + "hdx": { + "tags": [ + "aviation", + "facilities-infrastructure", + "geodata", + ], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["points", "lines", "polygons"], + "select": [ + "name", + "aeroway", + "building", + "emergency", + "emergency:helipad", + "operator:type", + "capacity:persons", + "addr:full", + "addr:city", + "source", + ], + "where": "tags['aeroway'][1] IS NOT NULL OR tags['building'][1] = 'aerodrome' OR tags['emergency:helipad'][1] IS NOT NULL OR tags['emergency'][1] = 'landing_site'", + "formats": ["fgb"], + } + }, + { + "Sea Ports": { + "hdx": { + "tags": [ + "facilities-infrastructure", + "geodata", + ], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["points", "lines", "polygons"], + "select": [ + "name", + "amenity", + "building", + "port", + "operator:type", + "addr:full", + "addr:city", + "source", + ], + "where": "tags['amenity'][1] = 'ferry_terminal' OR tags['building'][1] = 'ferry_terminal' OR tags['port'][1] IS NOT NULL", + "formats": ["fgb"], + } + }, + { + "Education Facilities": { + "hdx": { + "tags": [ + "education-facilities-schools", + "geodata", + ], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["points", "polygons"], + "select": [ + "name", + "amenity", + "building", + "operator:type", + "capacity:persons", + "addr:full", + "addr:city", + "source", + ], + "where": "tags['amenity'][1] IN ('kindergarten', 'school', 'college', 'university') OR building IN ('kindergarten', 'school', 'college', 'university')", + "formats": ["fgb"], + } + }, + { + "Health Facilities": { + "hdx": { + "tags": ["geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["points", "polygons"], + "select": [ + "name", + "amenity", + "building", + "healthcare", + "healthcare:speciality", + "operator:type", + "capacity:persons", + "addr:full", + "addr:city", + "source", + ], + "where": "tags['healthcare'][1] IS NOT NULL OR tags['amenity'][1] IN ('doctors', 'dentist', 'clinic', 'hospital', 'pharmacy')", + "formats": ["fgb"], + } + }, + { + "Populated Places": { + "hdx": { + "tags": [ + "populated-places-settlements", + "geodata", + ], + "caveats": "OpenStreetMap data 
is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["points"], + "select": [ + "name", + "place", + "population", + "is_in", + "source", + ], + "where": "tags['place'][1] IN ('isolated_dwelling', 'town', 'village', 'hamlet', 'city')", + "formats": ["fgb"], + } + }, + { + "Financial Services": { + "hdx": { + "tags": ["economics", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["points", "polygons"], + "select": [ + "name", + "amenity", + "operator", + "network", + "addr:full", + "addr:city", + "source", + ], + "where": "tags['amenity'][1] IN ('mobile_money_agent','bureau_de_change','bank','microfinance','atm','sacco','money_transfer','post_office')", + "formats": ["fgb"], + } + }, + { + "Railways": { + "hdx": { + "tags": [ + "facilities-infrastructure", + "railways", + "transportation", + "geodata", + ], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines"], + "select": [ + "name", + "railway", + "ele", + "operator:type", + "layer", + "addr:full", + "addr:city", + "source", + ], + "where": "tags['railway'][1] IN ('rail','station')", + "formats": ["fgb"], + } + }, + ], + }, + }, + }, + ), +): + """ + Process data based on dynamic categories. + + Args: + request: FastAPI Request object. + params (DynamicCategoriesModel): Input parameters including ISO3 country code and dynamic categories. + + Returns: + dict: Result message. + """ + hdx_set = HDX(params.iso3).process_hdx_tags(params) + return {"message": "Data processed successfully"} diff --git a/API/main.py b/API/main.py index 4c9ed793..c72ec28b 100644 --- a/API/main.py +++ b/API/main.py @@ -40,6 +40,7 @@ from src.db_session import database_instance from .auth.routers import router as auth_router +from .hdx import router as hdx_router from .raw_data import router as raw_data_router from .tasks import router as tasks_router @@ -66,6 +67,7 @@ app.include_router(auth_router) app.include_router(raw_data_router) app.include_router(tasks_router) +app.include_router(hdx_router) if ENABLE_POLYGON_STATISTICS_ENDPOINTS: app.include_router(stats_router) diff --git a/API/stats.py b/API/stats.py index 8fa5ec24..302bc163 100644 --- a/API/stats.py +++ b/API/stats.py @@ -22,6 +22,6 @@ async def get_polygon_stats(request: Request, params: StatsRequestParams): Returns: dict: A dictionary containing statistics for the specified polygon. """ - generator = PolygonStats(params.geometry) + generator = PolygonStats(params.geometry, params.iso3) return generator.get_summary_stats() diff --git a/requirements.txt b/requirements.txt index ed1904dc..75a706ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,4 +34,7 @@ neoteroi-mkdocs==0.1.2 pdocs==1.0.1 ##sozipfile -sozipfile==0.3.2 \ No newline at end of file +sozipfile==0.3.2 + +##duckdb +duckdb==0.9.2 \ No newline at end of file diff --git a/src/app.py b/src/app.py index 7b002057..59698508 100644 --- a/src/app.py +++ b/src/app.py @@ -17,19 +17,27 @@ # 1100 13th Street NW Suite 800 Washington, D.C. 
20005 # """Page contains Main core logic of app""" +import concurrent.futures import os +import pathlib +import re +import shutil import subprocess import sys -import threading import time +import uuid from datetime import datetime +from datetime import datetime as dt +from datetime import timezone from json import dumps from json import loads as json_loads import boto3 +import duckdb import humanize import orjson import requests +import sozipfile.sozipfile as zipfile from area import area from fastapi import HTTPException from geojson import FeatureCollection @@ -40,6 +48,7 @@ AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, BUCKET_NAME, + ENABLE_POLYGON_STATISTICS_ENDPOINTS, ENABLE_TILES, EXPORT_MAX_AREA_SQKM, ) @@ -47,17 +56,21 @@ from src.config import INDEX_THRESHOLD as index_threshold from src.config import POLYGON_STATISTICS_API_URL from src.config import USE_CONNECTION_POOLING as use_connection_pooling -from src.config import get_db_connection_params, level +from src.config import USE_S3_TO_UPLOAD, get_db_connection_params, level from src.config import logger as logging from src.query_builder.builder import ( check_exisiting_country, check_last_updated_rawdata, + extract_features_duckdb, extract_geometry_type_query, generate_polygon_stats_graphql_query, get_countries_query, + get_country_from_iso, get_country_geojson, + get_country_geom_from_iso, get_country_id_query, get_osm_feature_query, + postgres2duckdb_query, raw_currentdata_extraction_query, raw_extract_plain_geojson, ) @@ -96,6 +109,11 @@ def print_psycopg2_exception(err): raise err +def convert_dict_to_conn_str(db_dict): + conn_str = " ".join([f"{key}={value}" for key, value in db_dict.items()]) + return conn_str + + def check_for_json(result_str): """Check if the Payload is a JSON document @@ -866,13 +884,14 @@ def get_bucket_location(self, bucket_name): raise ex return bucket_location or "us-east-1" - def upload(self, file_path, file_name, file_suffix="zip"): + def upload(self, file_path, file_name, file_suffix=None): """Used for transferring file to s3 after reading path from the user , It will wait for the upload to complete Parameters :file_path --- your local file path to upload , file_prefix -- prefix for the filename which is stored sample function call : S3FileTransfer.transfer(file_path="exports",file_prefix="upload_test")""" - file_name = f"{file_name}.{file_suffix}" + if file_suffix: + file_name = f"{file_name}.{file_suffix}" logging.debug("Started Uploading %s from %s", file_name, file_path) # instantiate upload start_time = time.time() @@ -894,7 +913,7 @@ def upload(self, file_path, file_name, file_suffix="zip"): class PolygonStats: """Generates stats for polygon""" - def __init__(self, geojson): + def __init__(self, geojson=None, iso3=None): """ Initialize PolygonStats with the provided GeoJSON. @@ -902,7 +921,22 @@ def __init__(self, geojson): geojson (dict): GeoJSON representation of the polygon. 
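            iso3 (str, optional): ISO3 country code; when supplied, the country
                geometry is looked up from the database instead of using geojson.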
""" self.API_URL = POLYGON_STATISTICS_API_URL - self.INPUT_GEOM = dumps(geojson) + if geojson is None and iso3 is None: + raise HTTPException( + status_code=404, detail="Either geojson or iso3 should be passed" + ) + + if iso3: + dbdict = get_db_connection_params() + d_b = Database(dbdict) + con, cur = d_b.connect() + cur.execute(get_country_geom_from_iso(iso3)) + result = cur.fetchone() + if result is None: + raise HTTPException(status_code=404, detail="Invalid iso3 code") + self.INPUT_GEOM = result[0] + else: + self.INPUT_GEOM = dumps(geojson) @staticmethod def get_building_pattern_statement( @@ -1063,3 +1097,205 @@ def get_summary_stats(self): } return return_stats + + +class DuckDB: + def __init__(self, db_path): + dbdict = get_db_connection_params() + self.db_con_str = convert_dict_to_conn_str(db_dict=dbdict) + self.db_path = db_path + if os.path.exists(self.db_path): + os.remove(self.db_path) + con = duckdb.connect(self.db_path) + con.sql(f"""ATTACH '{self.db_con_str}' AS postgres_db (TYPE POSTGRES)""") + con.install_extension("spatial") + con.install_extension("json") + con.load_extension("spatial") + con.load_extension("json") + + def run_query(self, query, attach_pgsql=False, load_spatial=False): + with duckdb.connect(self.db_path) as con: + if attach_pgsql: + con.execute( + f"""ATTACH '{self.db_con_str}' AS postgres_db (TYPE POSTGRES)""" + ) + load_spatial = True + if load_spatial: + con.load_extension("spatial") + # con.load_extension("json") + con.execute(query) + + +class HDXUploader: + def __init__(self, dataset_prefix, export_url, category): + self.dataset_prefix = dataset_prefix + self.export_url = export_url + self.category = category + + # def + + +class HDX: + def __init__(self, ISO3): + self.iso3 = ISO3.lower() + dbdict = get_db_connection_params() + d_b = Database(dbdict) + con, cur = d_b.connect() + cur.execute(get_country_from_iso(self.iso3)) + result = cur.fetchall()[0] + if not result: + raise HTTPException(status_code=404, detail="Invalid iso3 code") + + ( + self.cid, + self.dataset_name, + self.dataset_prefix, + self.dataset_locations, + ) = result + + self.uuid = str(uuid.uuid4()) + self.default_export_path = os.path.join( + export_path, self.uuid, "HDX", self.iso3.upper() + ) + if os.path.exists(self.default_export_path): + shutil.rmtree(self.default_export_path) + os.makedirs(self.default_export_path) + self.duck_db_instance = DuckDB( + os.path.join(self.default_export_path, f"{self.iso3}.db") + ) + + def types_to_tables(self, type_list: list): + mapping = { + "points": ["nodes"], + "lines": ["ways_line", "relations"], + "polygons": ["ways_poly", "relations"], + } + + table_set = set() + + for t in type_list: + if t in mapping: + table_set.update(mapping[t]) + + return list(table_set) + + def format_where_clause(self, where_clause): + pattern = r"tags\['([^']+)'\]\[1\]" + match = re.search(pattern, where_clause) + + if match: + key = match.group(1) + return where_clause.replace(match.group(0), key) + else: + return where_clause + + # def s3url_to_hdx(self , url, category): + + def zip_to_s3(self, zip_path): + s3_upload_name = os.path.relpath(zip_path, os.path.join(export_path, self.uuid)) + + if not USE_S3_TO_UPLOAD: + raise HTTPException( + status_code=404, detail="S3 Export service is disabled on server" + ) + file_transfer_obj = S3FileTransfer() + download_url = file_transfer_obj.upload( + zip_path, + str(s3_upload_name), + ) + return download_url + # if ENABLE_POLYGON_STATISTICS_ENDPOINTS: + # polygon_stats = 
PolygonStats(iso3=self.iso3).get_summary_stats() + # readme_content += f'{polygon_stats["summary"]["building"]}\n' + # readme_content += f'{polygon_stats["summary"]["road"]}\n' + # readme_content += "Read about what this summary means: indicators: https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/indicators.md,metrics: https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/metrics.md" + + def file_to_zip(self, working_dir, zip_path): + zf = zipfile.ZipFile( + zip_path, + "w", + compression=zipfile.ZIP_DEFLATED, + chunk_size=zipfile.SOZIP_DEFAULT_CHUNK_SIZE, + ) + for file_path in pathlib.Path(working_dir).iterdir(): + zf.write(file_path, arcname=file_path.name) + utc_now = dt.now(timezone.utc) + utc_offset = utc_now.strftime("%z") + # Adding metadata readme.txt + readme_content = f"Exported Timestamp (UTC{utc_offset}): {utc_now.strftime('%Y-%m-%d %H:%M:%S')}\n" + readme_content += "Exported through Raw-data-api (https://github.com/hotosm/raw-data-api) using OpenStreetMap data.\n" + readme_content += "Learn more about OpenStreetMap and its data usage policy : https://www.openstreetmap.org/about \n" + zf.writestr("Readme.txt", readme_content) + zf.close() + shutil.rmtree(working_dir) + return zip_path + + def query_to_file(self, query, category_name, feature_type, export_formats): + category_name = category_name.lower().replace(" ", "_") + file_export_path = os.path.join( + self.default_export_path, category_name, feature_type + ) + for export_format in export_formats: + export_format_path = os.path.join(file_export_path, export_format.suffix) + os.makedirs(export_format_path, exist_ok=True) + + export_filename = f"""{self.dataset_prefix}_{category_name}_{feature_type}_{export_format.suffix}""" + export_file_path = os.path.join( + export_format_path, f"{export_filename}.{export_format.suffix}" + ) + + if os.path.exists(export_file_path): + os.remove(export_file_path) + + layer_creation_options_str = ( + " ".join( + [f"'{option}'" for option in export_format.layer_creation_options] + ) + if export_format.layer_creation_options + else "" + ) + executable_query = f"""COPY ({query.strip()}) TO '{export_file_path}' WITH (FORMAT {export_format.format_option}, DRIVER '{export_format.driver_name}'{f', LAYER_CREATION_OPTIONS {layer_creation_options_str}' if layer_creation_options_str else ''})""" + self.duck_db_instance.run_query(executable_query.strip(), load_spatial=True) + zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip") + zip_path = self.file_to_zip(export_format_path, zip_file_path) + return zip_path + + def process_category(self, category): + category_name, category_data = list(category.items())[0] + for feature_type in category_data.types: + extract_query = extract_features_duckdb( + self.iso3, category_data.select, feature_type, category_data.where + ) + zip_path = self.query_to_file( + extract_query, category_name, feature_type, category_data.formats + ) + s3_download_url = self.zip_to_s3(zip_path) + return s3_download_url + + def process_hdx_tags(self, params): + table_type = [ + cat_type + for category in params.categories + for cat_type in list(category.values())[0].types + ] + table_names = self.types_to_tables(list(set(table_type))) + + for table in table_names: + create_table = postgres2duckdb_query(self.iso3, self.cid, table) + self.duck_db_instance.run_query(create_table.strip(), attach_pgsql=True) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=os.cpu_count() * 2 + ) as executor: + futures = { + 
executor.submit(self.process_category, category): category + for category in params.categories + } + + for future in concurrent.futures.as_completed(futures): + category = futures[future] + try: + result = future.result() + print(result, category) + except Exception as e: + logging.error(f"An error occurred for category {category}: {e}") diff --git a/src/config.py b/src/config.py index e0c9872f..a7b4c34f 100644 --- a/src/config.py +++ b/src/config.py @@ -175,6 +175,26 @@ "POLYGON_STATISTICS_API_RATE_LIMIT" ) or config.get("API_CONFIG", "POLYGON_STATISTICS_API_RATE_LIMIT", fallback=5) +ENABLE_HDX_EXPORTS = os.environ.get("ENABLE_HDX_EXPORTS") or config.getboolean( + "HDX", "ENABLE_HDX_EXPORTS", fallback=False +) + +HDX_SITE = os.environ.get("HDX_SITE") or config.getboolean( + "HDX", "HDX_SITE", fallback="demo" +) +HDX_API_KEY = os.environ.get("HDX_API_KEY") or config.getboolean( + "HDX", "HDX_API_KEY", fallback=None +) + +if ENABLE_HDX_EXPORTS: + from hdx.api.configuration import Configuration + + HDX_URL_PREFIX = Configuration.create( + hdx_site=HDX_SITE, + hdx_key=HDX_API_KEY, + user_agent="HDXPythonLibrary/6.2.0-HOTOSM OSM Exports", + ) + def get_db_connection_params() -> dict: """Return a python dict that can be passed to psycopg2 connections diff --git a/src/query_builder/builder.py b/src/query_builder/builder.py index b2355a57..eefabb5c 100644 --- a/src/query_builder/builder.py +++ b/src/query_builder/builder.py @@ -806,3 +806,71 @@ def generate_polygon_stats_graphql_query(geojson_feature): query = query % dumps(geojson_feature) return query + + +def get_country_from_iso(iso3): + query = f"""SELECT + b.cid::int as fid, b.description as name, b.dataset_name as dataset_prefix, b.locations as locations + FROM + countries b + WHERE + LOWER(iso_3) = '{iso3}' + """ + return query + + +def postgres2duckdb_query(iso3, cid, table, enable_users_detail=False): + select_query = ( + """osm_id, version, changeset, timestamp, tags, ST_AsBinary(geom) as geometry""" + ) + create_select_duck_db = """osm_id,version, changeset, timestamp, cast(tags::json AS map(varchar, varchar)) AS tags, cast(ST_GeomFromWKB(geometry) as GEOMETRY) AS geometry""" + + if enable_users_detail: + select_query = """osm_id, uid, user, version, changeset, timestamp, tags, ST_AsBinary(geom) as geometry""" + create_select_duck_db = """osm_id, uid, user, version, changeset, timestamp, cast(tags::json AS map(varchar, varchar)) AS tags, cast(ST_GeomFromWKB(geometry) as GEOMETRY) AS geometry""" + + duck_db_create = f"""CREATE TABLE {iso3}_{table} AS SELECT {create_select_duck_db} FROM postgres_query("postgres_db", "SELECT {select_query} FROM {table} WHERE country <@ ARRAY [{cid}]") """ + + return duck_db_create + + +def extract_features_duckdb(iso3, select, feature_type, where): + map_tables = { + "points": {"table": ["nodes"], "where": {"nodes": where}}, + "lines": { + "table": ["ways_line", "relations"], + "where": { + "ways_line": where, + "relations": f"{where} and ST_GeometryType(geometry)='MULTILINESTRING'", + }, + }, + "polygons": { + "table": ["ways_poly", "relations"], + "where": { + "ways_poly": where, + "relations": f"{where} and (ST_GeometryType(geometry)='MULTIPOLYGON' or ST_GeometryType(geometry)='POLYGON')", + }, + }, + } + + select = [f"tags['{item}'][1] as '{item}'" for item in select] + select += ["osm_id", "geometry"] + select_query = ", ".join(select) + + from_query = map_tables[feature_type]["table"] + base_query = [] + for table in from_query: + query = f"""select {select_query} from {f"{iso3}_{table}"} where 
{map_tables[feature_type]['where'][table]}""" + base_query.append(query) + return " UNION ALL ".join(base_query) + + +def get_country_geom_from_iso(iso3): + query = f"""SELECT + ST_AsGeoJSON(geometry) as geom + FROM + countries b + WHERE + LOWER(iso_3) = '{iso3}' + """ + return query diff --git a/src/validation/models.py b/src/validation/models.py index c8cf2fae..a0f857ea 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -252,6 +252,7 @@ class Config: class StatsRequestParams(BaseModel): geometry: Union[Polygon, MultiPolygon] = Field( + default=None, example={ "type": "Polygon", "coordinates": [ @@ -265,13 +266,30 @@ class StatsRequestParams(BaseModel): ], }, ) + so3: str = Field( + default=None, + description="ISO3 Country Code.", + min_length=3, + max_length=3, + example="NPL", + ) - @validator("geometry", allow_reuse=True) - def get_value_as_feature(cls, value): - """Converts geometry to geojson feature""" - feature = { - "type": "Feature", - "geometry": json.loads(value.json()), - "properties": {}, - } - return feature + @validator("geometry", pre=True, always=True) + def set_geometry_or_iso3(cls, value, values): + """Either geometry or iso3 should be supplied.""" + if value is not None and values.get("iso3") is not None: + raise ValueError("Only one of geometry or iso3 should be supplied.") + if value is None and values.get("iso3") is None: + raise ValueError("Either geometry or iso3 should be supplied.") + return value + + @validator("geometry", pre=True, always=True) + def validate_geometry(cls, value): + """Converts geometry to geojson feature.""" + if value is not None: + feature = { + "type": "Feature", + "geometry": json.loads(value.json()), + "properties": {}, + } + return feature From d8acc5e5279d927ce182ac9ad4af91c71742ee4a Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Tue, 19 Dec 2023 16:41:10 +0545 Subject: [PATCH 02/20] added hdx python api in requirements --- requirements.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 75a706ce..3162cf60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,7 @@ pdocs==1.0.1 sozipfile==0.3.2 ##duckdb -duckdb==0.9.2 \ No newline at end of file +duckdb==0.9.2 + +##hdx +hdx-python-api==6.2.0 \ No newline at end of file From 8d6f61e028a9f0b1976430f81c6d60e17495e468 Mon Sep 17 00:00:00 2001 From: kshtiijrajsharma Date: Tue, 19 Dec 2023 20:30:51 +0545 Subject: [PATCH 03/20] Upgrade python and added first version of hdx export --- API/hdx.py | 3 +- API/raw_data.py | 8 +-- requirements.txt | 43 ++++++----- src/app.py | 152 ++++++++++++++++++++++++++++++--------- src/config.py | 3 +- src/validation/models.py | 8 +-- 6 files changed, 151 insertions(+), 66 deletions(-) diff --git a/API/hdx.py b/API/hdx.py index 3d8703b8..e2853935 100644 --- a/API/hdx.py +++ b/API/hdx.py @@ -95,7 +95,6 @@ class DynamicCategoriesModel(BaseModel): max_length=3, example="USA", ) - categories: List[Dict[str, CategoryModel]] = Field( ..., description="List of dynamic categories.", @@ -124,7 +123,7 @@ async def process_data( params: DynamicCategoriesModel = Body( ..., description="Input parameters including ISO3 country code and dynamic categories.", - examples={ + openapi_examples={ "normal": { "summary": "Example: Road extraction set", "description": "Query to extract road in Nepal", diff --git a/API/raw_data.py b/API/raw_data.py index 82669feb..830ccfa0 100644 --- a/API/raw_data.py +++ b/API/raw_data.py @@ -64,7 +64,7 @@ def get_osm_current_snapshot_as_file( 
request: Request, params: RawDataCurrentParams = Body( default={}, - examples={ + openapi_examples={ "normal": { "summary": "Example : Extract Evertyhing in the area", "description": "**Query** to Extract everything in the area , You can pass your geometry only and you will get everything on that area", @@ -450,7 +450,7 @@ def get_osm_current_snapshot_as_file( return JSONResponse({"task_id": task.id, "track_link": f"/tasks/status/{task.id}/"}) -@router.post("/snapshot/plain/", response_model=FeatureCollection) +@router.post("/snapshot/plain/") @version(1) def get_osm_current_snapshot_as_plain_geojson( request: Request, @@ -482,14 +482,14 @@ def get_osm_current_snapshot_as_plain_geojson( return result -@router.get("/countries/", response_model=FeatureCollection) +@router.get("/countries/") @version(1) def get_countries(q: str = ""): result = RawData().get_countries_list(q) return result -@router.get("/osm_id/", response_model=FeatureCollection) +@router.get("/osm_id/") @version(1) def get_osm_feature(osm_id: int): return RawData().get_osm_feature(osm_id) diff --git a/requirements.txt b/requirements.txt index 3162cf60..04d7fb81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,37 +1,34 @@ -aiofiles==0.7.0 -asgiref==3.3.4 -click==8.0.3 -fastapi==0.65.2 -h11==0.12.0 -importlib-metadata==4.5.0 -psycopg2==2.9.1 -pydantic==1.10.2 -starlette==0.14.2 -typing-extensions==4.1.0 -uvicorn==0.14.0 -zipp==3.4.1 -geojson-pydantic==0.3.0 -pytest == 7.3.1 -geojson == 2.5.0 +fastapi==0.105.0 +uvicorn==0.24.0 +psycopg2==2.9.9 +geojson-pydantic==1.0.1 +pytest == 7.4.3 +geojson == 3.1.0 + # Used for new relic monitoring newrelic == 7.2.4.171 sentry-sdk == 1.5.12 +## Third party area==1.1.1 orjson==3.9.10 boto3==1.24.38 fastapi-versioning==0.10.0 redis==4.3.4 celery==5.2.7 -flower==1.2.0 slowapi==0.1.6 -osm-login-python==0.0.2 +osm-login-python==1.0.2 +humanize==4.9.0 +python-slugify==8.0.1 #''' required for generating documentations ''' -mkdocs-material==8.5.11 -mkdocs-jupyter==0.22.0 -neoteroi-mkdocs==0.1.2 -pdocs==1.0.1 +# mkdocs-material==8.5.11 +# mkdocs-jupyter==0.22.0 +# neoteroi-mkdocs==0.1.2 +# pdocs==1.0.1 + +## flower +# flower==1.2.0 ##sozipfile sozipfile==0.3.2 @@ -40,4 +37,6 @@ sozipfile==0.3.2 duckdb==0.9.2 ##hdx -hdx-python-api==6.2.0 \ No newline at end of file +hdx-python-api==6.2.0 + + diff --git a/src/app.py b/src/app.py index 59698508..1b4c5150 100644 --- a/src/app.py +++ b/src/app.py @@ -33,7 +33,6 @@ from json import loads as json_loads import boto3 -import duckdb import humanize import orjson import requests @@ -43,11 +42,13 @@ from geojson import FeatureCollection from psycopg2 import OperationalError, connect from psycopg2.extras import DictCursor +from slugify import slugify from src.config import ( AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, BUCKET_NAME, + ENABLE_HDX_EXPORTS, ENABLE_POLYGON_STATISTICS_ENDPOINTS, ENABLE_TILES, EXPORT_MAX_AREA_SQKM, @@ -83,8 +84,10 @@ database_instance = None import logging as log -# assigning global variable of pooling so that it -# will be accessible from any function within this script +if ENABLE_HDX_EXPORTS: + import duckdb + from hdx.data.dataset import Dataset + global LOCAL_CON_POOL # getting the pool instance which was fireup when API is started @@ -658,7 +661,7 @@ def get_grid_id(geom, cur): @staticmethod def geojson2tiles(geojson_path, tile_path, tile_layer_name): """Responsible for geojson to tiles""" - cmd = """tippecanoe -zg --projection=EPSG:4326 -o {tile_output_path} -l {tile_layer_name} {geojson_input_path} --force""".format( + 
cmd = """tippecanoe -zg --projection=EPSG:4326 -o {tile_output_path} -l {tile_layer_name} --force {geojson_input_path}""".format( tile_output_path=tile_path, tile_layer_name=tile_layer_name, geojson_input_path=geojson_path, @@ -1126,15 +1129,6 @@ def run_query(self, query, attach_pgsql=False, load_spatial=False): con.execute(query) -class HDXUploader: - def __init__(self, dataset_prefix, export_url, category): - self.dataset_prefix = dataset_prefix - self.export_url = export_url - self.category = category - - # def - - class HDX: def __init__(self, ISO3): self.iso3 = ISO3.lower() @@ -1189,21 +1183,24 @@ def format_where_clause(self, where_clause): else: return where_clause - # def s3url_to_hdx(self , url, category): - - def zip_to_s3(self, zip_path): - s3_upload_name = os.path.relpath(zip_path, os.path.join(export_path, self.uuid)) + def zip_to_s3(self, resources): + for resource in resources: + s3_upload_name = os.path.relpath( + resource["zip_path"], os.path.join(export_path, self.uuid) + ) - if not USE_S3_TO_UPLOAD: - raise HTTPException( - status_code=404, detail="S3 Export service is disabled on server" + if not USE_S3_TO_UPLOAD: + raise HTTPException( + status_code=404, detail="S3 Export service is disabled on server" + ) + file_transfer_obj = S3FileTransfer() + download_url = file_transfer_obj.upload( + resource["zip_path"], + str(s3_upload_name), ) - file_transfer_obj = S3FileTransfer() - download_url = file_transfer_obj.upload( - zip_path, - str(s3_upload_name), - ) - return download_url + resource["download_url"] = download_url + + return resources # if ENABLE_POLYGON_STATISTICS_ENDPOINTS: # polygon_stats = PolygonStats(iso3=self.iso3).get_summary_stats() # readme_content += f'{polygon_stats["summary"]["building"]}\n' @@ -1235,6 +1232,7 @@ def query_to_file(self, query, category_name, feature_type, export_formats): file_export_path = os.path.join( self.default_export_path, category_name, feature_type ) + resources = [] for export_format in export_formats: export_format_path = os.path.join(file_export_path, export_format.suffix) os.makedirs(export_format_path, exist_ok=True) @@ -1258,7 +1256,14 @@ def query_to_file(self, query, category_name, feature_type, export_formats): self.duck_db_instance.run_query(executable_query.strip(), load_spatial=True) zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip") zip_path = self.file_to_zip(export_format_path, zip_file_path) - return zip_path + resource = {} + resource["filename"] = f"{export_filename}.zip" + resource["zip_path"] = zip_path + resource["format_suffix"] = export_format.suffix + resource["format_description"] = export_format.driver_name + + resources.append(resource) + return resources def process_category(self, category): category_name, category_data = list(category.items())[0] @@ -1266,11 +1271,11 @@ def process_category(self, category): extract_query = extract_features_duckdb( self.iso3, category_data.select, feature_type, category_data.where ) - zip_path = self.query_to_file( + resources = self.query_to_file( extract_query, category_name, feature_type, category_data.formats ) - s3_download_url = self.zip_to_s3(zip_path) - return s3_download_url + uploaded_resources = self.zip_to_s3(resources) + return uploaded_resources def process_hdx_tags(self, params): table_type = [ @@ -1295,7 +1300,88 @@ def process_hdx_tags(self, params): for future in concurrent.futures.as_completed(futures): category = futures[future] try: - result = future.result() - print(result, category) + uploaded_resources = future.result() + 
print(uploaded_resources, category) + self.resource_to_hdx(uploaded_resources, category) + except Exception as e: - logging.error(f"An error occurred for category {category}: {e}") + raise e + # logging.error(f"An error occurred for category {category}: {e}") + + def resource_to_hdx(self, uploaded_resources, category): + uploader = HDXUploader(category) + uploader.init_dataset( + self.dataset_prefix, self.dataset_name, self.dataset_locations + ) + for resource in uploaded_resources: + uploader.add_resource( + resource["filename"], + resource["format_suffix"], + resource["format_description"], + resource["download_url"], + ) + uploader.upload_dataset() + + +class HDXUploader: + def __init__(self, category): + self.category_name, self.category_data = list(category.items())[0] + self.dataset = None + + def slugify(self, name): + return slugify(name).replace("-", "_") + + def add_resource( + self, resource_name, resource_format, resource_description, export_url + ): + if self.dataset: + resource = { + "name": resource_name, + "format": resource_format, + "description": resource_description, + "url": export_url, + "last_modified": datetime.now().isoformat(), + } + print(resource) + self.dataset.add_update_resource(resource) + + def upload_dataset(self): + if self.dataset: + exists = Dataset.read_from_hdx(self.dataset["name"]) + if exists: + # self.dataset.set_date_of_dataset(datetime.now()) + self.dataset.update_in_hdx() + else: + # self.dataset.set_date_of_dataset(datetime.now()) + self.dataset.create_in_hdx(allow_no_resources=True) + + def init_dataset( + self, + dataset_prefix, + dataset_name, + dataset_locations, + ): + self.dataset = Dataset( + { + "name": "{0}_{1}".format( + dataset_prefix, self.slugify(self.category_name) + ), + "title": "{0} {1} (OpenStreetMap Export)".format( + dataset_name, self.category_name + ), + "owner_org": "225b9f7d-e7cb-4156-96a6-44c9c58d31e3", + "maintainer": "6a0688ce-8521-46e2-8edd-8e26c0851ebd", + "dataset_source": "OpenStreetMap contributors", + "methodology": "Other", + "methodology_other": "Volunteered geographic information", + "license_id": "hdx-odc-odbl", + "updated_by_script": f'Hotosm OSM Exports ({datetime.now().strftime("%Y-%m-%dT%H:%M:%S")}', + "data_update_frequency": -2, + "caveats": self.category_data.hdx.caveats, + ## notes , private and subnational option + } + ) + for location in dataset_locations: + self.dataset.add_country_location(location) + for tag in self.category_data.hdx.tags: + self.dataset.add_tag(tag) diff --git a/src/config.py b/src/config.py index a7b4c34f..ad7efaca 100644 --- a/src/config.py +++ b/src/config.py @@ -182,7 +182,7 @@ HDX_SITE = os.environ.get("HDX_SITE") or config.getboolean( "HDX", "HDX_SITE", fallback="demo" ) -HDX_API_KEY = os.environ.get("HDX_API_KEY") or config.getboolean( +HDX_API_KEY = os.environ.get("HDX_API_KEY") or config.get( "HDX", "HDX_API_KEY", fallback=None ) @@ -194,6 +194,7 @@ hdx_key=HDX_API_KEY, user_agent="HDXPythonLibrary/6.2.0-HOTOSM OSM Exports", ) + print(HDX_URL_PREFIX) def get_db_connection_params() -> dict: diff --git a/src/validation/models.py b/src/validation/models.py index a0f857ea..3c793241 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -44,7 +44,7 @@ def to_camel(string: str) -> str: class BaseModel(PydanticModel): class Config: alias_generator = to_camel - allow_population_by_field_name = True + populate_by_name = True use_enum_values = True # extra = "forbid" @@ -204,7 +204,7 @@ class SnapshotResponse(BaseModel): track_link: str class Config: - 
schema_extra = { + json_schema_extra = { "example": { "task_id": "aa539af6-83d4-4aa3-879e-abf14fffa03f", "track_link": "/tasks/status/aa539af6-83d4-4aa3-879e-abf14fffa03f/", @@ -227,7 +227,7 @@ class SnapshotTaskResponse(BaseModel): result: SnapshotTaskResult class Config: - schema_extra = { + json_schema_extra = { "example": { "id": "3fded368-456f-4ef4-a1b8-c099a7f77ca4", "status": "SUCCESS", @@ -247,7 +247,7 @@ class StatusResponse(BaseModel): last_updated: str class Config: - schema_extra = {"example": {"lastUpdated": "2022-06-27 19:59:24+05:45"}} + json_schema_extra = {"example": {"lastUpdated": "2022-06-27 19:59:24+05:45"}} class StatsRequestParams(BaseModel): From 79ffe0779e037231d7dc6303f085c942d2cd0964 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Wed, 20 Dec 2023 21:30:02 +0545 Subject: [PATCH 04/20] Adds custom polygon support along with all other fields required for hdx upload --- API/api_worker.py | 4 +- API/hdx.py | 153 +++++++++++++++++++++++++++-- API/test.py | 19 ++++ requirements.txt | 3 +- setup.py | 10 +- src/app.py | 180 +++++++++++++++++++++++------------ src/config.py | 55 ++++++++--- src/query_builder/builder.py | 38 +++++++- src/validation/models.py | 4 +- 9 files changed, 370 insertions(+), 96 deletions(-) create mode 100644 API/test.py diff --git a/API/api_worker.py b/API/api_worker.py index 9184ce7c..257a973f 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -105,8 +105,8 @@ def process_raw_data(self, params): readme_content += "Exported through Raw-data-api (https://github.com/hotosm/raw-data-api) using OpenStreetMap data.\n" readme_content += "Learn more about OpenStreetMap and its data usage policy : https://www.openstreetmap.org/about \n" if polygon_stats: - readme_content += f'{polygon_stats["summary"]["building"]}\n' - readme_content += f'{polygon_stats["summary"]["road"]}\n' + readme_content += f'{polygon_stats["summary"]["buildings"]}\n' + readme_content += f'{polygon_stats["summary"]["roads"]}\n' readme_content += "Read about what this summary means: indicators: https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/indicators.md,metrics: https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/metrics.md" zf.writestr("Readme.txt", readme_content) diff --git a/API/hdx.py b/API/hdx.py index e2853935..3fe7e664 100644 --- a/API/hdx.py +++ b/API/hdx.py @@ -1,11 +1,13 @@ from enum import Enum -from typing import Dict, List +from typing import Dict, List, Optional, Union from fastapi import APIRouter, Body, Query, Request from fastapi_versioning import version +from geojson_pydantic import MultiPolygon, Polygon from pydantic import BaseModel, Field, validator from src.app import HDX +from src.config import ALLOWED_HDX_TAGS, ALLOWED_HDX_UPDATE_FREQUENCIES from src.config import LIMITER as limiter from src.config import RATE_LIMIT_PER_MIN @@ -19,10 +21,24 @@ class HDXModel(BaseModel): example=["roads", "transportation", "geodata"], ) caveats: str = Field( - ..., - description="Caveats for the HDX model.", + default="OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + description="Caveats/Warning for the Datasets.", example="OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", ) + notes: str = Field( + default="", + description="Extra notes to append in notes section of hdx datasets", + example="Sample notes to append", + ) + + @validator("tags") + def validate_tags(cls, value): + for item in value: + if item.strip() not in ALLOWED_HDX_TAGS: + raise ValueError( + 
f"Invalid tag {item.strip()} , Should be within {ALLOWED_HDX_TAGS}" + ) + return value class CategoryModel(BaseModel): @@ -87,14 +103,59 @@ def __init__(self, suffix, driver_name, layer_creation_options, format_option): } +class DatasetConfig(BaseModel): + private: bool = Field( + default=False, + description="Make dataset private , By default False , Public is recommended", + example="False", + ) + subnational: bool = Field( + default=False, + description="Make it true if dataset doesn't cover nation/country", + example="False", + ) + update_frequency: str = Field( + default="as needed", + description="Update frequncy to be added on uploads", + example="daily", + ) + dataset_title: str = Field( + default=None, + description="Dataset title which appears at top of the page", + example="Nepal", + ) + dataset_prefix: str = Field( + default=None, + description="Dataset prefix to be appended before category name, Will be ignored if iso3 is supplied", + example="hotosm_npl", + ) + dataset_locations: List[str] = Field( + default=None, + description="Valid dataset locations iso3", + example="['npl']", + ) + + @validator("update_frequency") + def validate_frequency(cls, value): + if value.strip() not in ALLOWED_HDX_UPDATE_FREQUENCIES: + raise ValueError( + f"Invalid update frequency , Should be within {ALLOWED_HDX_UPDATE_FREQUENCIES}" + ) + return value.strip() + + class DynamicCategoriesModel(BaseModel): - iso3: str = Field( - ..., - description="ISO3 Country Code.", + iso3: Optional[str] = Field( + default=None, + description="ISO3 Country Code", min_length=3, max_length=3, example="USA", ) + dataset: Optional[DatasetConfig] = Field( + description="Dataset Configurations for HDX Upload" + ) + categories: List[Dict[str, CategoryModel]] = Field( ..., description="List of dynamic categories.", @@ -113,6 +174,38 @@ class DynamicCategoriesModel(BaseModel): } ], ) + geometry: Optional[Union[Polygon, MultiPolygon]] = Field( + default=None, + example={ + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, + ) + + @validator("geometry", pre=True, always=True) + def set_geometry_or_iso3(cls, value, values): + """Either geometry or iso3 should be supplied.""" + if value is not None and values.get("iso3") is not None: + raise ValueError("Only one of geometry or iso3 should be supplied.") + if value is None and values.get("iso3") is None: + raise ValueError("Either geometry or iso3 should be supplied.") + if value is not None: + dataset = values.get("dataset").dict() + if dataset is None: + raise ValueError("Dataset config should be supplied for custom polygon") + + for item in dataset.keys(): + if dataset.get(item) is None: + raise ValueError(f"Missing, Dataset config : {item}") + return value @router.post("/submit/") @@ -124,8 +217,8 @@ async def process_data( ..., description="Input parameters including ISO3 country code and dynamic categories.", openapi_examples={ - "normal": { - "summary": "Example: Road extraction set", + "normal_iso": { + "summary": "Example: Road extraction using iso3", "description": "Query to extract road in Nepal", "value": { "iso3": "NPL", @@ -145,9 +238,47 @@ async def process_data( ], }, }, + "normal_polygon": { + "summary": "Example: Road extraction set using custom polygon", + "description": "Query to extract road in Pokhara, Nepal", + "value": { + "geometry": { 
+ "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, + "dataset": { + "subnational": True, + "dataset_title": "Pokhara", + "dataset_prefix": "hotosm_pkr", + "dataset_locations": ["npl"], + }, + "categories": [ + { + "Roads": { + "hdx": { + "tags": ["roads", "transportation", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines"], + "select": ["name", "highway"], + "where": "tags['highway'][1] IS NOT NULL", + "formats": ["fgb"], + } + } + ], + }, + }, "fullset": { "summary": "Full HDX Dataset default", - "description": "Full yaml conversion for dataset", + "description": "Full yaml conversion for dataset with iso3 example", "value": { "iso3": "NPL", "categories": [ @@ -435,5 +566,7 @@ async def process_data( Returns: dict: Result message. """ - hdx_set = HDX(params.iso3).process_hdx_tags(params) + if not params.dataset: + params.dataset = DatasetConfig() + hdx_set = HDX(params).process_hdx_tags() return {"message": "Data processed successfully"} diff --git a/API/test.py b/API/test.py new file mode 100644 index 00000000..57d01d72 --- /dev/null +++ b/API/test.py @@ -0,0 +1,19 @@ +import re + + +def replace_key(input_str): + pattern = r"tags\['([^']+)'\]\[1\]" + match = re.search(pattern, input_str) + + if match: + key = match.group(1) + return input_str.replace(match.group(0), key) + else: + return input_str + + +# Example usage: +input_str = "tags['railway'][1] IN ('rail','station')" +result = replace_key(input_str) + +print(result) diff --git a/requirements.txt b/requirements.txt index 04d7fb81..59c11f54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,10 +16,11 @@ boto3==1.24.38 fastapi-versioning==0.10.0 redis==4.3.4 celery==5.2.7 -slowapi==0.1.6 +slowapi==0.1.8 osm-login-python==1.0.2 humanize==4.9.0 python-slugify==8.0.1 +geomet==1.1.0 #''' required for generating documentations ''' # mkdocs-material==8.5.11 diff --git a/setup.py b/setup.py index 4f2cdfe2..cb5655b0 100644 --- a/setup.py +++ b/setup.py @@ -9,14 +9,14 @@ description="The Raw Data API module makes it simple for you to get osm data stats provided by api in your own project", packages=setuptools.find_packages(), install_requires=[ - "pytest == 7.3.1", + "pytest == 7.4.3", "psycopg2", "boto3==1.24.38", - "fastapi==0.65.2", - "geojson == 2.5.0", + "fastapi==0.105.0", + "geojson == 7.4.3", "area==1.1.1", "orjson==3.9.10", - "slowapi==0.1.6", + "slowapi==0.1.8", ], classifiers=[ "Programming Language :: Python :: 3", @@ -24,7 +24,7 @@ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - python_requires=">=3.6", + python_requires=">=3.8", long_description=long_description, long_description_content_type="text/markdown", author="Hot Tech Team", diff --git a/src/app.py b/src/app.py index 1b4c5150..f9ed2646 100644 --- a/src/app.py +++ b/src/app.py @@ -54,12 +54,15 @@ EXPORT_MAX_AREA_SQKM, ) from src.config import EXPORT_PATH as export_path +from src.config import HDX_MAINTAINER, HDX_OWNER_ORG from src.config import INDEX_THRESHOLD as index_threshold from src.config import POLYGON_STATISTICS_API_URL from src.config import USE_CONNECTION_POOLING as use_connection_pooling from src.config import USE_S3_TO_UPLOAD, get_db_connection_params, level from src.config import logger as logging from 
src.query_builder.builder import ( + HDX_FILTER_CRITERIA, + HDX_MARKDOWN, check_exisiting_country, check_last_updated_rawdata, extract_features_duckdb, @@ -67,13 +70,10 @@ generate_polygon_stats_graphql_query, get_countries_query, get_country_from_iso, - get_country_geojson, get_country_geom_from_iso, - get_country_id_query, get_osm_feature_query, postgres2duckdb_query, raw_currentdata_extraction_query, - raw_extract_plain_geojson, ) from src.validation.models import RawDataOutputType @@ -939,7 +939,7 @@ def __init__(self, geojson=None, iso3=None): raise HTTPException(status_code=404, detail="Invalid iso3 code") self.INPUT_GEOM = result[0] else: - self.INPUT_GEOM = dumps(geojson) + self.INPUT_GEOM = dumps(json_loads(geojson.json())) @staticmethod def get_building_pattern_statement( @@ -1073,7 +1073,7 @@ def get_summary_stats(self): ) return_stats = { - "summary": {"building": building_summary, "road": road_summary}, + "summary": {"buildings": building_summary, "roads": road_summary}, "raw": { "population": combined_data["population"], "populatedAreaKm2": combined_data["populatedAreaKm2"], @@ -1130,32 +1130,50 @@ def run_query(self, query, attach_pgsql=False, load_spatial=False): class HDX: - def __init__(self, ISO3): - self.iso3 = ISO3.lower() - dbdict = get_db_connection_params() - d_b = Database(dbdict) - con, cur = d_b.connect() - cur.execute(get_country_from_iso(self.iso3)) - result = cur.fetchall()[0] - if not result: - raise HTTPException(status_code=404, detail="Invalid iso3 code") + def __init__(self, params): + self.params = params + self.iso3 = self.params.iso3 + if self.iso3: + self.iso3 = self.iso3.lower() + self.cid = None + if self.iso3: + dbdict = get_db_connection_params() + d_b = Database(dbdict) + con, cur = d_b.connect() + cur.execute(get_country_from_iso(self.iso3)) + result = cur.fetchall()[0] + if not result: + raise HTTPException(status_code=404, detail="Invalid iso3 code") - ( - self.cid, - self.dataset_name, - self.dataset_prefix, - self.dataset_locations, - ) = result + ( + self.cid, + dataset_title, + dataset_prefix, + dataset_locations, + ) = result + + if not self.params.dataset.dataset_title: + self.params.dataset.dataset_title = dataset_title + if not self.params.dataset.dataset_prefix: + self.params.dataset.dataset_prefix = dataset_prefix + if not self.params.dataset.dataset_locations: + self.params.dataset.dataset_locations = dataset_locations self.uuid = str(uuid.uuid4()) self.default_export_path = os.path.join( - export_path, self.uuid, "HDX", self.iso3.upper() + export_path, + self.uuid, + "HDX", + self.iso3.upper() if self.iso3 else self.params.dataset.dataset_prefix, ) if os.path.exists(self.default_export_path): shutil.rmtree(self.default_export_path) os.makedirs(self.default_export_path) self.duck_db_instance = DuckDB( - os.path.join(self.default_export_path, f"{self.iso3}.db") + os.path.join( + self.default_export_path, + f"{self.iso3 if self.iso3 else self.params.dataset.dataset_prefix}.db", + ) ) def types_to_tables(self, type_list: list): @@ -1199,13 +1217,8 @@ def zip_to_s3(self, resources): str(s3_upload_name), ) resource["download_url"] = download_url - + os.remove(resource["zip_path"]) return resources - # if ENABLE_POLYGON_STATISTICS_ENDPOINTS: - # polygon_stats = PolygonStats(iso3=self.iso3).get_summary_stats() - # readme_content += f'{polygon_stats["summary"]["building"]}\n' - # readme_content += f'{polygon_stats["summary"]["road"]}\n' - # readme_content += "Read about what this summary means: indicators: 
https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/indicators.md,metrics: https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/metrics.md" def file_to_zip(self, working_dir, zip_path): zf = zipfile.ZipFile( @@ -1237,7 +1250,7 @@ def query_to_file(self, query, category_name, feature_type, export_formats): export_format_path = os.path.join(file_export_path, export_format.suffix) os.makedirs(export_format_path, exist_ok=True) - export_filename = f"""{self.dataset_prefix}_{category_name}_{feature_type}_{export_format.suffix}""" + export_filename = f"""{self.params.dataset.dataset_prefix}_{category_name}_{feature_type}_{export_format.suffix}""" export_file_path = os.path.join( export_format_path, f"{export_filename}.{export_format.suffix}" ) @@ -1269,7 +1282,10 @@ def process_category(self, category): category_name, category_data = list(category.items())[0] for feature_type in category_data.types: extract_query = extract_features_duckdb( - self.iso3, category_data.select, feature_type, category_data.where + self.iso3 if self.iso3 else self.params.dataset.dataset_prefix, + category_data.select, + feature_type, + category_data.where, ) resources = self.query_to_file( extract_query, category_name, feature_type, category_data.formats @@ -1277,16 +1293,21 @@ def process_category(self, category): uploaded_resources = self.zip_to_s3(resources) return uploaded_resources - def process_hdx_tags(self, params): + def process_hdx_tags(self): table_type = [ cat_type - for category in params.categories + for category in self.params.categories for cat_type in list(category.values())[0].types ] table_names = self.types_to_tables(list(set(table_type))) for table in table_names: - create_table = postgres2duckdb_query(self.iso3, self.cid, table) + create_table = postgres2duckdb_query( + self.iso3 if self.iso3 else self.params.dataset.dataset_prefix, + table, + self.cid, + self.params.geometry, + ) self.duck_db_instance.run_query(create_table.strip(), attach_pgsql=True) with concurrent.futures.ThreadPoolExecutor( @@ -1294,25 +1315,31 @@ def process_hdx_tags(self, params): ) as executor: futures = { executor.submit(self.process_category, category): category - for category in params.categories + for category in self.params.categories } for future in concurrent.futures.as_completed(futures): category = futures[future] try: uploaded_resources = future.result() - print(uploaded_resources, category) - self.resource_to_hdx(uploaded_resources, category) + self.resource_to_hdx( + uploaded_resources, self.params.dataset, category + ) except Exception as e: raise e # logging.error(f"An error occurred for category {category}: {e}") - def resource_to_hdx(self, uploaded_resources, category): - uploader = HDXUploader(category) - uploader.init_dataset( - self.dataset_prefix, self.dataset_name, self.dataset_locations + def resource_to_hdx(self, uploaded_resources, dataset_config, category): + uploader = HDXUploader( + hdx=dataset_config, + category=category, + completeness_metadata={ + "iso3": self.iso3, + "geometry": self.params.geometry, + }, ) + uploader.init_dataset() for resource in uploaded_resources: uploader.add_resource( resource["filename"], @@ -1324,13 +1351,51 @@ def resource_to_hdx(self, uploaded_resources, category): class HDXUploader: - def __init__(self, category): + def __init__(self, category, hdx, completeness_metadata=None): + self.hdx = hdx self.category_name, self.category_data = list(category.items())[0] self.dataset = None + self.completeness_metadata = completeness_metadata + 
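        # Filled lazily in add_notes(): PolygonStats summaries (built from the iso3
        # or geometry in completeness_metadata) are attached only for the
        # buildings and roads categories.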
self.data_completeness_stats = None def slugify(self, name): return slugify(name).replace("-", "_") + def filter_formatter(self, where_str): + pattern = r"tags\['([^']+)'\]\[1\]" + match = re.search(pattern, where_str) + + if match: + key = match.group(1) + return where_str.replace(match.group(0), key) + else: + return where_str + + def add_notes(self): + columns = [] + for key in self.category_data.select: + columns.append( + "- [{0}](http://wiki.openstreetmap.org/wiki/Key:{0})".format(key) + ) + columns = "\n".join(columns) + filter_str = HDX_FILTER_CRITERIA.format( + criteria=self.filter_formatter(self.category_data.where) + ) + if self.category_name.lower() in ["roads", "buildings"]: + if self.data_completeness_stats is None: + if self.completeness_metadata: + self.data_completeness_stats = PolygonStats( + iso3=self.completeness_metadata["iso3"], + geojson=self.completeness_metadata["geometry"], + ).get_summary_stats() + if self.data_completeness_stats: + self.category_data.hdx.notes += f'{self.data_completeness_stats["summary"][self.category_name.lower()]}\n' + self.category_data.hdx.notes += "Read about what this summary means, [indicators](https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/indicators.md) , [metrics](https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/metrics.md)" + + return self.category_data.hdx.notes + HDX_MARKDOWN.format( + columns=columns, filter_str=filter_str + ) + def add_resource( self, resource_name, resource_format, resource_description, export_url ): @@ -1347,40 +1412,35 @@ def add_resource( def upload_dataset(self): if self.dataset: - exists = Dataset.read_from_hdx(self.dataset["name"]) - if exists: - # self.dataset.set_date_of_dataset(datetime.now()) - self.dataset.update_in_hdx() - else: - # self.dataset.set_date_of_dataset(datetime.now()) - self.dataset.create_in_hdx(allow_no_resources=True) - - def init_dataset( - self, - dataset_prefix, - dataset_name, - dataset_locations, - ): + self.dataset.set_reference_period(datetime.now()) + self.dataset.create_in_hdx(allow_no_resources=True) + + def init_dataset(self): + dataset_prefix = self.hdx.dataset_prefix + dataset_title = self.hdx.dataset_title + dataset_locations = self.hdx.dataset_locations self.dataset = Dataset( { "name": "{0}_{1}".format( dataset_prefix, self.slugify(self.category_name) ), "title": "{0} {1} (OpenStreetMap Export)".format( - dataset_name, self.category_name + dataset_title, self.category_name ), - "owner_org": "225b9f7d-e7cb-4156-96a6-44c9c58d31e3", - "maintainer": "6a0688ce-8521-46e2-8edd-8e26c0851ebd", + "owner_org": HDX_OWNER_ORG, + "maintainer": HDX_MAINTAINER, "dataset_source": "OpenStreetMap contributors", "methodology": "Other", "methodology_other": "Volunteered geographic information", "license_id": "hdx-odc-odbl", "updated_by_script": f'Hotosm OSM Exports ({datetime.now().strftime("%Y-%m-%dT%H:%M:%S")}', - "data_update_frequency": -2, "caveats": self.category_data.hdx.caveats, - ## notes , private and subnational option + "private": self.hdx.private, + "notes": self.add_notes(), + "subnational": 1 if self.hdx.subnational else 0, } ) + self.dataset.set_expected_update_frequency(self.hdx.update_frequency) for location in dataset_locations: self.dataset.add_country_location(location) for tag in self.category_data.hdx.tags: diff --git a/src/config.py b/src/config.py index ad7efaca..6cdc6c4c 100644 --- a/src/config.py +++ b/src/config.py @@ -179,22 +179,55 @@ "HDX", "ENABLE_HDX_EXPORTS", fallback=False ) -HDX_SITE = os.environ.get("HDX_SITE") or 
config.getboolean( - "HDX", "HDX_SITE", fallback="demo" -) -HDX_API_KEY = os.environ.get("HDX_API_KEY") or config.get( - "HDX", "HDX_API_KEY", fallback=None -) if ENABLE_HDX_EXPORTS: + HDX_SITE = os.environ.get("HDX_SITE") or config.getboolean( + "HDX", "HDX_SITE", fallback="demo" + ) + HDX_API_KEY = os.environ.get("HDX_API_KEY") or config.get( + "HDX", "HDX_API_KEY", fallback=None + ) + HDX_OWNER_ORG = os.environ.get("HDX_OWNER_ORG") or config.get( + "HDX", "HDX_OWNER_ORG", fallback="225b9f7d-e7cb-4156-96a6-44c9c58d31e3" + ) + HDX_MAINTAINER = os.environ.get("HDX_MAINTAINER") or config.get( + "HDX", "HDX_MAINTAINER", fallback="6a0688ce-8521-46e2-8edd-8e26c0851ebd" + ) from hdx.api.configuration import Configuration - HDX_URL_PREFIX = Configuration.create( - hdx_site=HDX_SITE, - hdx_key=HDX_API_KEY, - user_agent="HDXPythonLibrary/6.2.0-HOTOSM OSM Exports", + try: + HDX_URL_PREFIX = Configuration.create( + hdx_site=HDX_SITE, + hdx_key=HDX_API_KEY, + user_agent="HDXPythonLibrary/6.2.0-HOTOSM OSM Exports", + ) + logging.debug(HDX_URL_PREFIX) + except Exception as e: + logging.error( + f"Error creating HDX configuration: {e}, Disabling the hdx exports feature" + ) + ENABLE_HDX_EXPORTS = False + +if ENABLE_HDX_EXPORTS: + from hdx.data.dataset import Dataset + from hdx.data.vocabulary import Vocabulary + + parse_list = ( + lambda value, delimiter=",": value.split(delimiter) + if isinstance(value, str) + else value or [] + ) + + ALLOWED_HDX_TAGS = parse_list( + os.environ.get("ENABLE_HDX_EXPORTS") + or config.get("HDX", "ALLOWED_HDX_TAGS", fallback=None) + or Vocabulary.approved_tags() + ) + ALLOWED_HDX_UPDATE_FREQUENCIES = parse_list( + os.environ.get("ALLOWED_HDX_UPDATE_FREQUENCIES") + or config.get("HDX", "ALLOWED_HDX_UPDATE_FREQUENCIES", fallback=None) + or Dataset.list_valid_update_frequencies() ) - print(HDX_URL_PREFIX) def get_db_connection_params() -> dict: diff --git a/src/query_builder/builder.py b/src/query_builder/builder.py index eefabb5c..18d9642e 100644 --- a/src/query_builder/builder.py +++ b/src/query_builder/builder.py @@ -18,11 +18,31 @@ # """Page Contains Query logic required for application""" import re -from json import dumps +from json import dumps, loads + +from geomet import wkt from src.config import logger as logging from src.validation.models import SupportedFilters, SupportedGeometryFilters +HDX_FILTER_CRITERIA = """ +This theme includes all OpenStreetMap features in this area matching: + +{criteria} +""" +HDX_MARKDOWN = """ +OpenStreetMap exports for use in GIS applications. +{filter_str} +Features may have these attributes: + +{columns} + +This dataset is one of many [OpenStreetMap exports on +HDX](https://data.humdata.org/organization/hot). +See the [Humanitarian OpenStreetMap Team](http://hotosm.org/) website for more +information. 
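For reference, these two templates are what `HDXUploader.add_notes` later interpolates into the dataset notes. A minimal illustrative sketch, assuming the import path introduced in this patch and a made-up Roads-style category:

```python
# Illustrative only: render the notes text that would appear on an HDX dataset
# page. The selected keys and the filter criteria below are made-up examples.
from src.query_builder.builder import HDX_FILTER_CRITERIA, HDX_MARKDOWN

columns = "\n".join(
    "- [{0}](http://wiki.openstreetmap.org/wiki/Key:{0})".format(key)
    for key in ["name", "highway"]
)
filter_str = HDX_FILTER_CRITERIA.format(criteria="highway IS NOT NULL")
notes = HDX_MARKDOWN.format(columns=columns, filter_str=filter_str)
print(notes)
```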
+""" + def get_grid_id_query(geometry_dump): base_query = f"""select @@ -819,7 +839,9 @@ def get_country_from_iso(iso3): return query -def postgres2duckdb_query(iso3, cid, table, enable_users_detail=False): +def postgres2duckdb_query( + base_table_name, table, cid=None, geometry=None, enable_users_detail=False +): select_query = ( """osm_id, version, changeset, timestamp, tags, ST_AsBinary(geom) as geometry""" ) @@ -829,12 +851,18 @@ def postgres2duckdb_query(iso3, cid, table, enable_users_detail=False): select_query = """osm_id, uid, user, version, changeset, timestamp, tags, ST_AsBinary(geom) as geometry""" create_select_duck_db = """osm_id, uid, user, version, changeset, timestamp, cast(tags::json AS map(varchar, varchar)) AS tags, cast(ST_GeomFromWKB(geometry) as GEOMETRY) AS geometry""" - duck_db_create = f"""CREATE TABLE {iso3}_{table} AS SELECT {create_select_duck_db} FROM postgres_query("postgres_db", "SELECT {select_query} FROM {table} WHERE country <@ ARRAY [{cid}]") """ + row_filter_condition = ( + f"""country <@ ARRAY [{cid}]""" + if cid + else f"""ST_within(geom,ST_GeomFromText('{wkt.dumps(loads(geometry.json()))}',4326))""" + ) + + duck_db_create = f"""CREATE TABLE {base_table_name}_{table} AS SELECT {create_select_duck_db} FROM postgres_query("postgres_db", "SELECT {select_query} FROM {table} WHERE {row_filter_condition}") """ return duck_db_create -def extract_features_duckdb(iso3, select, feature_type, where): +def extract_features_duckdb(base_table_name, select, feature_type, where): map_tables = { "points": {"table": ["nodes"], "where": {"nodes": where}}, "lines": { @@ -860,7 +888,7 @@ def extract_features_duckdb(iso3, select, feature_type, where): from_query = map_tables[feature_type]["table"] base_query = [] for table in from_query: - query = f"""select {select_query} from {f"{iso3}_{table}"} where {map_tables[feature_type]['where'][table]}""" + query = f"""select {select_query} from {f"{base_table_name}_{table}"} where {map_tables[feature_type]['where'][table]}""" base_query.append(query) return " UNION ALL ".join(base_query) diff --git a/src/validation/models.py b/src/validation/models.py index 3c793241..b7ce0a19 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -251,7 +251,7 @@ class Config: class StatsRequestParams(BaseModel): - geometry: Union[Polygon, MultiPolygon] = Field( + geometry: Optional[Union[Polygon, MultiPolygon]] = Field( default=None, example={ "type": "Polygon", @@ -266,7 +266,7 @@ class StatsRequestParams(BaseModel): ], }, ) - so3: str = Field( + iso3: Optional[str] = Field( default=None, description="ISO3 Country Code.", min_length=3, From 2f5b15a1a9bae001b874fd6df12097201918174c Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 00:38:13 +0545 Subject: [PATCH 05/20] Remove test py and added hdx supported file formats --- API/hdx.py | 83 ++++++++++++++++++++++-------------- API/test.py | 19 --------- src/app.py | 81 ++++++++++++++++++----------------- src/query_builder/builder.py | 2 +- 4 files changed, 95 insertions(+), 90 deletions(-) delete mode 100644 API/test.py diff --git a/API/hdx.py b/API/hdx.py index 3fe7e664..0eb9491a 100644 --- a/API/hdx.py +++ b/API/hdx.py @@ -61,7 +61,7 @@ class CategoryModel(BaseModel): formats: List[str] = Field( ..., description="List of Export Formats (suffixes).", - example=["gpkg", "fgb"], + example=["gpkg", "geojson"], ) @validator("types") @@ -95,9 +95,9 @@ def __init__(self, suffix, driver_name, layer_creation_options, format_option): "shp": ExportTypeInfo("shp", "ESRI 
Shapefile", [], "GDAL"), "gpkg": ExportTypeInfo("gpkg", "GeoPackage", [], "GDAL"), "sqlite": ExportTypeInfo("sqlite", "SQLite", [], "GDAL"), - "fgb": ExportTypeInfo("fgb", "FlatGeobuf", ["VERIFY_BUFFERS=NO"], "GDAL"), + "geojson": ExportTypeInfo("geojson", "FlatGeobuf", ["VERIFY_BUFFERS=NO"], "GDAL"), "mvt": ExportTypeInfo("mvt", "MVT", [], "GDAL"), - "kl": ExportTypeInfo("kml", "KML", [], "GDAL"), + "kml": ExportTypeInfo("kml", "KML", [], "GDAL"), "gpx": ExportTypeInfo("gpx", "GPX", [], "GDAL"), "parquet": ExportTypeInfo("parquet", "PARQUET", [], "PARQUET"), } @@ -153,7 +153,7 @@ class DynamicCategoriesModel(BaseModel): example="USA", ) dataset: Optional[DatasetConfig] = Field( - description="Dataset Configurations for HDX Upload" + default=None, description="Dataset Configurations for HDX Upload" ) categories: List[Dict[str, CategoryModel]] = Field( @@ -169,7 +169,7 @@ class DynamicCategoriesModel(BaseModel): "types": ["lines", "polygons"], "select": ["name", "highway"], "where": "highway IS NOT NULL", - "formats": ["fgb"], + "formats": ["geojson"], } } ], @@ -231,8 +231,29 @@ async def process_data( }, "types": ["lines"], "select": ["name", "highway"], - "where": "tags['highway'][1] IS NOT NULL", - "formats": ["fgb"], + "where": "tags['highway'] IS NOT NULL", + "formats": ["geojson"], + } + } + ], + }, + }, + "normal_iso_multiple_format": { + "summary": "Example: Road extraction using iso3 Multiple format", + "description": "Query to extract road in Nepal Multiple format", + "value": { + "iso3": "NPL", + "categories": [ + { + "Roads": { + "hdx": { + "tags": ["roads", "transportation", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines"], + "select": ["name", "highway"], + "where": "tags['highway'] IS NOT NULL", + "formats": ["geojson", "gpkg", "kml", "shp"], } } ], @@ -269,8 +290,8 @@ async def process_data( }, "types": ["lines"], "select": ["name", "highway"], - "where": "tags['highway'][1] IS NOT NULL", - "formats": ["fgb"], + "where": "tags['highway'] IS NOT NULL", + "formats": ["geojson"], } } ], @@ -304,8 +325,8 @@ async def process_data( "office", "source", ], - "where": "tags['building'][1] IS NOT NULL", - "formats": ["fgb"], + "where": "tags['building'] IS NOT NULL", + "formats": ["geojson"], } }, { @@ -327,8 +348,8 @@ async def process_data( "layer", "source", ], - "where": "tags['highway'][1] IS NOT NULL", - "formats": ["fgb"], + "where": "tags['highway'] IS NOT NULL", + "formats": ["geojson"], } }, { @@ -351,8 +372,8 @@ async def process_data( "water", "source", ], - "where": "tags['waterway'][1] IS NOT NULL OR tags['water'][1] IS NOT NULL OR tags['natural'][1] IN ('water','wetland','bay')", - "formats": ["fgb"], + "where": "tags['waterway'] IS NOT NULL OR tags['water'] IS NOT NULL OR tags['natural'] IN ('water','wetland','bay')", + "formats": ["geojson"], } }, { @@ -381,8 +402,8 @@ async def process_data( "addr:city", "source", ], - "where": "tags['amenity'][1] IS NOT NULL OR tags['man_made'][1] IS NOT NULL OR tags['shop'][1] IS NOT NULL OR tags['tourism'][1] IS NOT NULL", - "formats": ["fgb"], + "where": "tags['amenity'] IS NOT NULL OR tags['man_made'] IS NOT NULL OR tags['shop'] IS NOT NULL OR tags['tourism'] IS NOT NULL", + "formats": ["geojson"], } }, { @@ -408,8 +429,8 @@ async def process_data( "addr:city", "source", ], - "where": "tags['aeroway'][1] IS NOT NULL OR tags['building'][1] = 'aerodrome' OR tags['emergency:helipad'][1] IS NOT NULL OR tags['emergency'][1] = 'landing_site'", - 
"formats": ["fgb"], + "where": "tags['aeroway'] IS NOT NULL OR tags['building'] = 'aerodrome' OR tags['emergency:helipad'] IS NOT NULL OR tags['emergency'] = 'landing_site'", + "formats": ["geojson"], } }, { @@ -432,8 +453,8 @@ async def process_data( "addr:city", "source", ], - "where": "tags['amenity'][1] = 'ferry_terminal' OR tags['building'][1] = 'ferry_terminal' OR tags['port'][1] IS NOT NULL", - "formats": ["fgb"], + "where": "tags['amenity'] = 'ferry_terminal' OR tags['building'] = 'ferry_terminal' OR tags['port'] IS NOT NULL", + "formats": ["geojson"], } }, { @@ -456,8 +477,8 @@ async def process_data( "addr:city", "source", ], - "where": "tags['amenity'][1] IN ('kindergarten', 'school', 'college', 'university') OR building IN ('kindergarten', 'school', 'college', 'university')", - "formats": ["fgb"], + "where": "tags['amenity'] IN ('kindergarten', 'school', 'college', 'university') OR building IN ('kindergarten', 'school', 'college', 'university')", + "formats": ["geojson"], } }, { @@ -479,8 +500,8 @@ async def process_data( "addr:city", "source", ], - "where": "tags['healthcare'][1] IS NOT NULL OR tags['amenity'][1] IN ('doctors', 'dentist', 'clinic', 'hospital', 'pharmacy')", - "formats": ["fgb"], + "where": "tags['healthcare'] IS NOT NULL OR tags['amenity'] IN ('doctors', 'dentist', 'clinic', 'hospital', 'pharmacy')", + "formats": ["geojson"], } }, { @@ -500,8 +521,8 @@ async def process_data( "is_in", "source", ], - "where": "tags['place'][1] IN ('isolated_dwelling', 'town', 'village', 'hamlet', 'city')", - "formats": ["fgb"], + "where": "tags['place'] IN ('isolated_dwelling', 'town', 'village', 'hamlet', 'city')", + "formats": ["geojson"], } }, { @@ -520,8 +541,8 @@ async def process_data( "addr:city", "source", ], - "where": "tags['amenity'][1] IN ('mobile_money_agent','bureau_de_change','bank','microfinance','atm','sacco','money_transfer','post_office')", - "formats": ["fgb"], + "where": "tags['amenity'] IN ('mobile_money_agent','bureau_de_change','bank','microfinance','atm','sacco','money_transfer','post_office')", + "formats": ["geojson"], } }, { @@ -546,8 +567,8 @@ async def process_data( "addr:city", "source", ], - "where": "tags['railway'][1] IN ('rail','station')", - "formats": ["fgb"], + "where": "tags['railway'] IN ('rail','station')", + "formats": ["geojson"], } }, ], diff --git a/API/test.py b/API/test.py deleted file mode 100644 index 57d01d72..00000000 --- a/API/test.py +++ /dev/null @@ -1,19 +0,0 @@ -import re - - -def replace_key(input_str): - pattern = r"tags\['([^']+)'\]\[1\]" - match = re.search(pattern, input_str) - - if match: - key = match.group(1) - return input_str.replace(match.group(0), key) - else: - return input_str - - -# Example usage: -input_str = "tags['railway'][1] IN ('rail','station')" -result = replace_key(input_str) - -print(result) diff --git a/src/app.py b/src/app.py index f9ed2646..95988899 100644 --- a/src/app.py +++ b/src/app.py @@ -1133,6 +1133,7 @@ class HDX: def __init__(self, params): self.params = params self.iso3 = self.params.iso3 + self.HDX_SUPPORTED_FORMATS = ["geojson", "gpkg", "kml", "shp"] if self.iso3: self.iso3 = self.iso3.lower() self.cid = None @@ -1192,12 +1193,13 @@ def types_to_tables(self, type_list: list): return list(table_set) def format_where_clause(self, where_clause): - pattern = r"tags\['([^']+)'\]\[1\]" + pattern = r"tags\['([^']+)'\]" match = re.search(pattern, where_clause) if match: key = match.group(1) - return where_clause.replace(match.group(0), key) + replacement = f"tags['{key}'][1]" + return 
re.sub(pattern, replacement, where_clause) else: return where_clause @@ -1285,7 +1287,7 @@ def process_category(self, category): self.iso3 if self.iso3 else self.params.dataset.dataset_prefix, category_data.select, feature_type, - category_data.where, + self.format_where_clause(category_data.where), ) resources = self.query_to_file( extract_query, category_name, feature_type, category_data.formats @@ -1293,6 +1295,33 @@ def process_category(self, category): uploaded_resources = self.zip_to_s3(resources) return uploaded_resources + def resource_to_hdx(self, uploaded_resources, dataset_config, category): + if any( + map( + lambda v: v["format_suffix"] in uploaded_resources, + self.HDX_SUPPORTED_FORMATS, + ) + ): + uploader = HDXUploader( + hdx=dataset_config, + category=category, + default_category_path=self.default_export_path, + completeness_metadata={ + "iso3": self.iso3, + "geometry": self.params.geometry, + }, + ) + uploader.init_dataset() + for resource in uploaded_resources: + if resource["format_suffix"] in self.HDX_SUPPORTED_FORMATS: + uploader.add_resource( + resource["filename"], + resource["format_suffix"], + resource["format_description"], + resource["download_url"], + ) + uploader.upload_dataset() + def process_hdx_tags(self): table_type = [ cat_type @@ -1327,33 +1356,17 @@ def process_hdx_tags(self): ) except Exception as e: - raise e - # logging.error(f"An error occurred for category {category}: {e}") - - def resource_to_hdx(self, uploaded_resources, dataset_config, category): - uploader = HDXUploader( - hdx=dataset_config, - category=category, - completeness_metadata={ - "iso3": self.iso3, - "geometry": self.params.geometry, - }, - ) - uploader.init_dataset() - for resource in uploaded_resources: - uploader.add_resource( - resource["filename"], - resource["format_suffix"], - resource["format_description"], - resource["download_url"], - ) - uploader.upload_dataset() + # raise e + logging.error(f"An error occurred for category {category}: {e}") class HDXUploader: - def __init__(self, category, hdx, completeness_metadata=None): + def __init__( + self, category, hdx, default_category_path, completeness_metadata=None + ): self.hdx = hdx self.category_name, self.category_data = list(category.items())[0] + self.category_path = os.path.join(default_category_path, self.category_name) self.dataset = None self.completeness_metadata = completeness_metadata self.data_completeness_stats = None @@ -1361,16 +1374,6 @@ def __init__(self, category, hdx, completeness_metadata=None): def slugify(self, name): return slugify(name).replace("-", "_") - def filter_formatter(self, where_str): - pattern = r"tags\['([^']+)'\]\[1\]" - match = re.search(pattern, where_str) - - if match: - key = match.group(1) - return where_str.replace(match.group(0), key) - else: - return where_str - def add_notes(self): columns = [] for key in self.category_data.select: @@ -1378,9 +1381,7 @@ def add_notes(self): "- [{0}](http://wiki.openstreetmap.org/wiki/Key:{0})".format(key) ) columns = "\n".join(columns) - filter_str = HDX_FILTER_CRITERIA.format( - criteria=self.filter_formatter(self.category_data.where) - ) + filter_str = HDX_FILTER_CRITERIA.format(criteria=self.category_data.where) if self.category_name.lower() in ["roads", "buildings"]: if self.data_completeness_stats is None: if self.completeness_metadata: @@ -1407,11 +1408,13 @@ def add_resource( "url": export_url, "last_modified": datetime.now().isoformat(), } - print(resource) self.dataset.add_update_resource(resource) def upload_dataset(self): if 
self.dataset: + self.dataset.save_to_json( + os.path.join(self.category_path, f"{self.dataset['name']}.json") + ) self.dataset.set_reference_period(datetime.now()) self.dataset.create_in_hdx(allow_no_resources=True) diff --git a/src/query_builder/builder.py b/src/query_builder/builder.py index 18d9642e..e8b3971f 100644 --- a/src/query_builder/builder.py +++ b/src/query_builder/builder.py @@ -26,7 +26,7 @@ from src.validation.models import SupportedFilters, SupportedGeometryFilters HDX_FILTER_CRITERIA = """ -This theme includes all OpenStreetMap features in this area matching: +This theme includes all OpenStreetMap features in this area matching (learn what tags means [here](https://wiki.openstreetmap.org/wiki/Tags)): {criteria} """ From 9fa50193ae3d8cd79e35a6fb135f2477901dee82 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 00:47:15 +0545 Subject: [PATCH 06/20] Added multiple format support for hdx --- API/hdx.py | 2 +- src/app.py | 8 +++----- src/query_builder/builder.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/API/hdx.py b/API/hdx.py index 0eb9491a..aa55761a 100644 --- a/API/hdx.py +++ b/API/hdx.py @@ -93,7 +93,7 @@ def __init__(self, suffix, driver_name, layer_creation_options, format_option): EXPORT_TYPE_MAPPING = { "geojson": ExportTypeInfo("geojson", "GeoJSON", [], "GDAL"), "shp": ExportTypeInfo("shp", "ESRI Shapefile", [], "GDAL"), - "gpkg": ExportTypeInfo("gpkg", "GeoPackage", [], "GDAL"), + "gpkg": ExportTypeInfo("gpkg", "GPKG", [], "GDAL"), "sqlite": ExportTypeInfo("sqlite", "SQLite", [], "GDAL"), "geojson": ExportTypeInfo("geojson", "FlatGeobuf", ["VERIFY_BUFFERS=NO"], "GDAL"), "mvt": ExportTypeInfo("mvt", "MVT", [], "GDAL"), diff --git a/src/app.py b/src/app.py index 95988899..d11a8609 100644 --- a/src/app.py +++ b/src/app.py @@ -1297,10 +1297,8 @@ def process_category(self, category): def resource_to_hdx(self, uploaded_resources, dataset_config, category): if any( - map( - lambda v: v["format_suffix"] in uploaded_resources, - self.HDX_SUPPORTED_FORMATS, - ) + item["format_suffix"] in self.HDX_SUPPORTED_FORMATS + for item in uploaded_resources ): uploader = HDXUploader( hdx=dataset_config, @@ -1356,7 +1354,7 @@ def process_hdx_tags(self): ) except Exception as e: - # raise e + raise e logging.error(f"An error occurred for category {category}: {e}") diff --git a/src/query_builder/builder.py b/src/query_builder/builder.py index e8b3971f..db0c18b9 100644 --- a/src/query_builder/builder.py +++ b/src/query_builder/builder.py @@ -26,7 +26,7 @@ from src.validation.models import SupportedFilters, SupportedGeometryFilters HDX_FILTER_CRITERIA = """ -This theme includes all OpenStreetMap features in this area matching (learn what tags means [here](https://wiki.openstreetmap.org/wiki/Tags)): +This theme includes all OpenStreetMap features in this area matching ( Learn what tags means [here](https://wiki.openstreetmap.org/wiki/Tags) ) : {criteria} """ From 6c6f2be099dc810903a9bc56a071d108a3d78559 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 00:49:29 +0545 Subject: [PATCH 07/20] updated typo --- API/hdx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/API/hdx.py b/API/hdx.py index aa55761a..3d9e7bb1 100644 --- a/API/hdx.py +++ b/API/hdx.py @@ -95,7 +95,7 @@ def __init__(self, suffix, driver_name, layer_creation_options, format_option): "shp": ExportTypeInfo("shp", "ESRI Shapefile", [], "GDAL"), "gpkg": ExportTypeInfo("gpkg", "GPKG", [], "GDAL"), "sqlite": ExportTypeInfo("sqlite", "SQLite", [], 
"GDAL"), - "geojson": ExportTypeInfo("geojson", "FlatGeobuf", ["VERIFY_BUFFERS=NO"], "GDAL"), + "fgb": ExportTypeInfo("fgb", "FlatGeobuf", ["VERIFY_BUFFERS=NO"], "GDAL"), "mvt": ExportTypeInfo("mvt", "MVT", [], "GDAL"), "kml": ExportTypeInfo("kml", "KML", [], "GDAL"), "gpx": ExportTypeInfo("gpx", "GPX", [], "GDAL"), From 60b66beb0802cbef763036bafc0e15417aefc073 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 19:41:33 +0545 Subject: [PATCH 08/20] Added configurations , documentation and moved api logic to queue --- API/api_worker.py | 16 +- API/hdx.py | 220 ++------------------- API/main.py | 5 +- API/raw_data.py | 3 +- API/stats.py | 48 ++++- README.md | 6 +- docs/src/installation/configurations.md | 15 ++ src/app.py | 247 ++++++++++++++++++------ src/config.py | 2 +- src/validation/models.py | 221 +++++++++++++++++++-- 10 files changed, 491 insertions(+), 292 deletions(-) diff --git a/API/api_worker.py b/API/api_worker.py index 257a973f..5dbf4d40 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -11,7 +11,7 @@ import sozipfile.sozipfile as zipfile from celery import Celery -from src.app import PolygonStats, RawData, S3FileTransfer +from src.app import HDX, PolygonStats, RawData, S3FileTransfer from src.config import ALLOW_BIND_ZIP_FILTER from src.config import CELERY_BROKER_URL as celery_broker_uri from src.config import CELERY_RESULT_BACKEND as celery_backend @@ -19,7 +19,7 @@ from src.config import USE_S3_TO_UPLOAD as use_s3_to_upload from src.config import logger as logging from src.query_builder.builder import format_file_name_str -from src.validation.models import RawDataOutputType +from src.validation.models import DatasetConfig, RawDataOutputType celery = Celery("Raw Data API") celery.conf.broker_url = celery_broker_uri @@ -186,6 +186,18 @@ def process_raw_data(self, params): raise ex +@celery.task(bind=True, name="process_hdx_request") +def process_hdx_request(self, params): + if not params.dataset: + params.dataset = DatasetConfig() + hdx_object = HDX(params) + try: + return hdx_object.process_hdx_tags() + except Exception as ex: + hdx_object.clean_resources() + raise ex + + def remove_file(path: str) -> None: """Used for removing temp file dir and its all content after zip file is delivered to user""" try: diff --git a/API/hdx.py b/API/hdx.py index 3d9e7bb1..4618fbe3 100644 --- a/API/hdx.py +++ b/API/hdx.py @@ -1,217 +1,20 @@ -from enum import Enum -from typing import Dict, List, Optional, Union - -from fastapi import APIRouter, Body, Query, Request +from fastapi import APIRouter, Body, Request +from fastapi.responses import JSONResponse from fastapi_versioning import version -from geojson_pydantic import MultiPolygon, Polygon -from pydantic import BaseModel, Field, validator -from src.app import HDX -from src.config import ALLOWED_HDX_TAGS, ALLOWED_HDX_UPDATE_FREQUENCIES from src.config import LIMITER as limiter from src.config import RATE_LIMIT_PER_MIN +from src.validation.models import DynamicCategoriesModel -router = APIRouter(prefix="/hdx", tags=["HDX"]) - - -class HDXModel(BaseModel): - tags: List[str] = Field( - ..., - description="List of tags for the HDX model.", - example=["roads", "transportation", "geodata"], - ) - caveats: str = Field( - default="OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", - description="Caveats/Warning for the Datasets.", - example="OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", - ) - notes: str = Field( - default="", - description="Extra notes to 
append in notes section of hdx datasets", - example="Sample notes to append", - ) - - @validator("tags") - def validate_tags(cls, value): - for item in value: - if item.strip() not in ALLOWED_HDX_TAGS: - raise ValueError( - f"Invalid tag {item.strip()} , Should be within {ALLOWED_HDX_TAGS}" - ) - return value - - -class CategoryModel(BaseModel): - hdx: HDXModel - types: List[str] = Field( - ..., - description="List of feature types (points, lines, polygons).", - example=["lines"], - ) - select: List[str] = Field( - ..., - description="List of selected fields.", - example=["name", "highway"], - ) - where: str = Field( - ..., - description="SQL-like condition to filter features.", - example="highway IS NOT NULL", - ) - formats: List[str] = Field( - ..., - description="List of Export Formats (suffixes).", - example=["gpkg", "geojson"], - ) - - @validator("types") - def validate_types(cls, value): - allowed_types = {"points", "lines", "polygons"} - for item in value: - if item not in allowed_types: - raise ValueError( - f"Invalid type: {item}. Allowed types are {', '.join(allowed_types)}" - ) - return value - - @validator("formats") - def validate_export_types(cls, value): - for export_type in value: - if export_type not in EXPORT_TYPE_MAPPING: - raise ValueError(f"Unsupported export type: {export_type}") - return [EXPORT_TYPE_MAPPING[export_type] for export_type in value] - - -class ExportTypeInfo: - def __init__(self, suffix, driver_name, layer_creation_options, format_option): - self.suffix = suffix - self.driver_name = driver_name - self.layer_creation_options = layer_creation_options - self.format_option = format_option - +from .api_worker import process_hdx_request -EXPORT_TYPE_MAPPING = { - "geojson": ExportTypeInfo("geojson", "GeoJSON", [], "GDAL"), - "shp": ExportTypeInfo("shp", "ESRI Shapefile", [], "GDAL"), - "gpkg": ExportTypeInfo("gpkg", "GPKG", [], "GDAL"), - "sqlite": ExportTypeInfo("sqlite", "SQLite", [], "GDAL"), - "fgb": ExportTypeInfo("fgb", "FlatGeobuf", ["VERIFY_BUFFERS=NO"], "GDAL"), - "mvt": ExportTypeInfo("mvt", "MVT", [], "GDAL"), - "kml": ExportTypeInfo("kml", "KML", [], "GDAL"), - "gpx": ExportTypeInfo("gpx", "GPX", [], "GDAL"), - "parquet": ExportTypeInfo("parquet", "PARQUET", [], "PARQUET"), -} - - -class DatasetConfig(BaseModel): - private: bool = Field( - default=False, - description="Make dataset private , By default False , Public is recommended", - example="False", - ) - subnational: bool = Field( - default=False, - description="Make it true if dataset doesn't cover nation/country", - example="False", - ) - update_frequency: str = Field( - default="as needed", - description="Update frequncy to be added on uploads", - example="daily", - ) - dataset_title: str = Field( - default=None, - description="Dataset title which appears at top of the page", - example="Nepal", - ) - dataset_prefix: str = Field( - default=None, - description="Dataset prefix to be appended before category name, Will be ignored if iso3 is supplied", - example="hotosm_npl", - ) - dataset_locations: List[str] = Field( - default=None, - description="Valid dataset locations iso3", - example="['npl']", - ) - - @validator("update_frequency") - def validate_frequency(cls, value): - if value.strip() not in ALLOWED_HDX_UPDATE_FREQUENCIES: - raise ValueError( - f"Invalid update frequency , Should be within {ALLOWED_HDX_UPDATE_FREQUENCIES}" - ) - return value.strip() - - -class DynamicCategoriesModel(BaseModel): - iso3: Optional[str] = Field( - default=None, - description="ISO3 Country Code", - 
min_length=3, - max_length=3, - example="USA", - ) - dataset: Optional[DatasetConfig] = Field( - default=None, description="Dataset Configurations for HDX Upload" - ) - - categories: List[Dict[str, CategoryModel]] = Field( - ..., - description="List of dynamic categories.", - example=[ - { - "Roads": { - "hdx": { - "tags": ["roads", "transportation", "geodata"], - "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", - }, - "types": ["lines", "polygons"], - "select": ["name", "highway"], - "where": "highway IS NOT NULL", - "formats": ["geojson"], - } - } - ], - ) - geometry: Optional[Union[Polygon, MultiPolygon]] = Field( - default=None, - example={ - "type": "Polygon", - "coordinates": [ - [ - [83.96919250488281, 28.194446860487773], - [83.99751663208006, 28.194446860487773], - [83.99751663208006, 28.214869548073377], - [83.96919250488281, 28.214869548073377], - [83.96919250488281, 28.194446860487773], - ] - ], - }, - ) - - @validator("geometry", pre=True, always=True) - def set_geometry_or_iso3(cls, value, values): - """Either geometry or iso3 should be supplied.""" - if value is not None and values.get("iso3") is not None: - raise ValueError("Only one of geometry or iso3 should be supplied.") - if value is None and values.get("iso3") is None: - raise ValueError("Either geometry or iso3 should be supplied.") - if value is not None: - dataset = values.get("dataset").dict() - if dataset is None: - raise ValueError("Dataset config should be supplied for custom polygon") - - for item in dataset.keys(): - if dataset.get(item) is None: - raise ValueError(f"Missing, Dataset config : {item}") - return value +router = APIRouter(prefix="/hdx", tags=["HDX"]) @router.post("/submit/") @limiter.limit(f"{RATE_LIMIT_PER_MIN}/minute") @version(1) -async def process_data( +async def process_hdx_requests( request: Request, params: DynamicCategoriesModel = Body( ..., @@ -578,7 +381,7 @@ async def process_data( ), ): """ - Process data based on dynamic categories. + Process data based on dynamic categories, Fully flexible on filtering and select Args: request: FastAPI Request object. @@ -587,7 +390,8 @@ async def process_data( Returns: dict: Result message. 
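A client-side sketch of this queued flow, under a few assumptions: the service runs locally, the versioned mount point is `/v1`, and the task-status endpoint reports a Celery-style `status` field. The payload mirrors the `normal_iso` example above.

```python
# Hypothetical client for the queued HDX export flow. BASE_URL, the /v1 prefix
# and the status-field name are assumptions, not guaranteed by this patch.
import time

import requests

BASE_URL = "http://localhost:8000/v1"

payload = {
    "iso3": "NPL",
    "categories": [
        {
            "Roads": {
                "hdx": {
                    "tags": ["roads", "transportation", "geodata"],
                    "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive",
                },
                "types": ["lines"],
                "select": ["name", "highway"],
                "where": "tags['highway'] IS NOT NULL",
                "formats": ["geojson"],
            }
        }
    ],
}

resp = requests.post(f"{BASE_URL}/hdx/submit/", json=payload, timeout=30)
resp.raise_for_status()
task = resp.json()  # {"task_id": "...", "track_link": "/tasks/status/<id>/"}

while True:
    status = requests.get(f"{BASE_URL}{task['track_link']}", timeout=30).json()
    # Celery-backed status endpoints typically report PENDING/STARTED/SUCCESS/FAILURE.
    if status.get("status") in ("SUCCESS", "FAILURE"):
        break
    time.sleep(10)
print(status)
```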
""" - if not params.dataset: - params.dataset = DatasetConfig() - hdx_set = HDX(params).process_hdx_tags() - return {"message": "Data processed successfully"} + queue_name = "raw_special" + task = process_hdx_request.apply_async( + args=(params,), queue=queue_name, track_started=True + ) + return JSONResponse({"task_id": task.id, "track_link": f"/tasks/status/{task.id}/"}) diff --git a/API/main.py b/API/main.py index c72ec28b..87edae7c 100644 --- a/API/main.py +++ b/API/main.py @@ -27,6 +27,7 @@ from slowapi.errors import RateLimitExceeded from src.config import ( + ENABLE_HDX_EXPORTS, ENABLE_POLYGON_STATISTICS_ENDPOINTS, EXPORT_PATH, LIMITER, @@ -67,7 +68,9 @@ app.include_router(auth_router) app.include_router(raw_data_router) app.include_router(tasks_router) -app.include_router(hdx_router) + +if ENABLE_HDX_EXPORTS: + app.include_router(hdx_router) if ENABLE_POLYGON_STATISTICS_ENDPOINTS: app.include_router(stats_router) diff --git a/API/raw_data.py b/API/raw_data.py index 830ccfa0..b9f010e0 100644 --- a/API/raw_data.py +++ b/API/raw_data.py @@ -443,7 +443,8 @@ def get_osm_current_snapshot_as_file( ], ) - queue_name = "recurring_queue" if not params.uuid else "raw_default" + # queue_name = "raw_special" if not params.uuid else "raw_default" + queue_name = "raw_default" # Everything directs to default now task = process_raw_data.apply_async( args=(params,), queue=queue_name, track_started=True ) diff --git a/API/stats.py b/API/stats.py index 302bc163..85c3d045 100644 --- a/API/stats.py +++ b/API/stats.py @@ -1,4 +1,6 @@ -from fastapi import APIRouter, Request +import json + +from fastapi import APIRouter, Body, Request from fastapi_versioning import version from src.app import PolygonStats @@ -12,7 +14,38 @@ @router.post("/polygon/") @limiter.limit(f"{POLYGON_STATISTICS_API_RATE_LIMIT}/minute") @version(1) -async def get_polygon_stats(request: Request, params: StatsRequestParams): +async def get_polygon_stats( + request: Request, + params: StatsRequestParams = Body( + ..., + description="Get Summary and raw stats related to polygon", + openapi_examples={ + "normal_polygon": { + "summary": "Normal Example of requesting stats", + "description": "Query to extract stats using Custom Polygon", + "value": { + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + } + }, + }, + "normal_iso": { + "summary": "Query to extract stats using iso", + "description": "Extract stats using iso3 only, For eg : for Nepal", + "value": {"iso3": "npl"}, + }, + }, + ), +): """Get statistics for the specified polygon. Args: @@ -22,6 +55,15 @@ async def get_polygon_stats(request: Request, params: StatsRequestParams): Returns: dict: A dictionary containing statistics for the specified polygon. 
""" - generator = PolygonStats(params.geometry, params.iso3) + feature = None + if params.geometry: + feature = { + "type": "Feature", + "geometry": json.loads(params.geometry.json()), + "properties": {}, + } + if params.iso3: + params.iso3 = params.iso3.lower() + generator = PolygonStats(feature, params.iso3) return generator.get_summary_stats() diff --git a/README.md b/README.md index 1cf78f3c..f3179f43 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ uvicorn API.main:app --reload ### Queues Currently there are two type of queue implemented : -- "recurring_queue" : Queue for recurring exports which will replace the previous exports if present on the system , can be enabled through uuid:false API Param +- "raw_special" : Queue for recurring exports which will replace the previous exports if present on the system , can be enabled through uuid:false API Param - "raw_default" : Queue for default exports which will create each unique id for exports ### Start Celery Worker @@ -119,7 +119,7 @@ You should be able to start [celery](https://docs.celeryq.dev/en/stable/getting- ``` - Start for recurring queue ``` - celery --app API.api_worker worker --loglevel=INFO --queues="recurring_queue" -n 'recurring_worker' + celery --app API.api_worker worker --loglevel=INFO --queues="raw_special" -n 'recurring_worker' ``` Set no of request that a worker can take at a time by using --concurrency @@ -129,7 +129,7 @@ Set no of request that a worker can take at a time by using --concurrency Raw Data API uses flower for monitoring the Celery distributed queue. Run this command on a different shell , if you are running redis on same machine your broker could be `redis://localhost:6379//`. ``` -celery --broker=redis://redis:6379// --app API.api_worker flower --port=5000 --queues="recurring_queue,raw_default" +celery --broker=redis://redis:6379// --app API.api_worker flower --port=5000 --queues="raw_special,raw_default" ``` ### Navigate to the docs to view Raw Data API endpoints diff --git a/docs/src/installation/configurations.md b/docs/src/installation/configurations.md index 4f5baeaa..0620839c 100644 --- a/docs/src/installation/configurations.md +++ b/docs/src/installation/configurations.md @@ -31,6 +31,7 @@ The following sections are recognised. - `[API_CONFIG]` - API service related configuration. Required. - `[EXPORT_UPLOAD]` - For external file hosts like S3. Optional. - `[SENTRY]` - Sentry monitoring configuration. Optional. +- `[HDX]` - HDX Exports related configuration. Optional. The following are the different configuration options that are accepted. @@ -66,6 +67,12 @@ The following are the different configuration options that are accepted. 
| `AWS_SECRET_ACCESS_KEY` | `AWS_SECRET_ACCESS_KEY` | `[EXPORT_UPLOAD]` | _none_ | AWS Secret Access Key for S3 access | CONDITIONAL | | `SENTRY_DSN` | `SENTRY_DSN` | `[SENTRY]` | _none_ | Sentry Data Source Name | OPTIONAL | | `SENTRY_RATE` | `SENTRY_RATE` | `[SENTRY]` | `1.0` | Sample rate percentage for shipping errors to sentry; Allowed values between 0 (0%) to 1 (100%)| OPTIONAL | +| `ENABLE_HDX_EXPORTS` | `ENABLE_HDX_EXPORTS` | `[HDX]` | False | Enables hdx related endpoints and imports | OPTIONAL | +| `HDX_SITE` | `HDX_SITE` | `[HDX]` | 'demo' | HDX site to point , By default demo site , use prod for production | CONDITIONAL | +| `HDX_API_KEY` | `HDX_API_KEY` | `[HDX]` | None | Your API Secret key for hdx upload , should have write access and it is compulsory if ENABLE_HDX_EXPORTS is True | CONDITIONAL | +| `HDX_OWNER_ORG` | `HDX_OWNER_ORG` | `[HDX]` | None | Your HDX organization ID| CONDITIONAL | +| `HDX_MAINTAINER` | `HDX_MAINTAINER` | `[HDX]` | None | Your HDX Maintainer ID | CONDITIONAL | + ## Which Service uses which settings? @@ -102,6 +109,14 @@ The following are the different configuration options that are accepted. | `AWS_SECRET_ACCESS_KEY` | TBD | No | Yes | | `SENTRY_DSN` | TBD | Yes | No | | `SENTRY_RATE` | TBD | Yes | No | +| `ENABLE_HDX_EXPORTS` | `[HDX]` | Yes | Yes | +| `HDX_SITE` | `[HDX]` | Yes | Yes | +| `HDX_API_KEY` | `[HDX]` | Yes | Yes | +| `HDX_OWNER_ORG` | `[HDX]` | Yes | Yes | +| `HDX_MAINTAINER` | `[HDX]` | Yes | Yes | + + + ## Compulsory Configuration diff --git a/src/app.py b/src/app.py index d11a8609..1a119ecd 100644 --- a/src/app.py +++ b/src/app.py @@ -26,6 +26,7 @@ import sys import time import uuid +from collections import namedtuple from datetime import datetime from datetime import datetime as dt from datetime import timezone @@ -54,7 +55,7 @@ EXPORT_MAX_AREA_SQKM, ) from src.config import EXPORT_PATH as export_path -from src.config import HDX_MAINTAINER, HDX_OWNER_ORG +from src.config import HDX_MAINTAINER, HDX_OWNER_ORG, HDX_URL_PREFIX from src.config import INDEX_THRESHOLD as index_threshold from src.config import POLYGON_STATISTICS_API_URL from src.config import USE_CONNECTION_POOLING as use_connection_pooling @@ -939,7 +940,7 @@ def __init__(self, geojson=None, iso3=None): raise HTTPException(status_code=404, detail="Invalid iso3 code") self.INPUT_GEOM = result[0] else: - self.INPUT_GEOM = dumps(json_loads(geojson.json())) + self.INPUT_GEOM = dumps(geojson) @staticmethod def get_building_pattern_statement( @@ -1160,7 +1161,9 @@ def __init__(self, params): if not self.params.dataset.dataset_locations: self.params.dataset.dataset_locations = dataset_locations - self.uuid = str(uuid.uuid4()) + self.uuid = str(uuid.uuid4().hex) + self.parallel_process_state = False + self.default_export_path = os.path.join( export_path, self.uuid, @@ -1170,12 +1173,11 @@ def __init__(self, params): if os.path.exists(self.default_export_path): shutil.rmtree(self.default_export_path) os.makedirs(self.default_export_path) - self.duck_db_instance = DuckDB( - os.path.join( - self.default_export_path, - f"{self.iso3 if self.iso3 else self.params.dataset.dataset_prefix}.db", - ) + self.duck_db_db_path = os.path.join( + self.default_export_path, + f"{self.iso3 if self.iso3 else self.params.dataset.dataset_prefix}.db", ) + self.duck_db_instance = DuckDB(self.duck_db_db_path) def types_to_tables(self, type_list: list): mapping = { @@ -1203,22 +1205,26 @@ def format_where_clause(self, where_clause): else: return where_clause - def zip_to_s3(self, resources): - for resource 
in resources: - s3_upload_name = os.path.relpath( - resource["zip_path"], os.path.join(export_path, self.uuid) + def upload_to_s3(self, resource_path): + if not USE_S3_TO_UPLOAD: + raise HTTPException( + status_code=404, detail="S3 Export service is disabled on server" ) + s3_upload_name = os.path.relpath( + resource_path, os.path.join(export_path, self.uuid) + ) + file_transfer_obj = S3FileTransfer() + download_url = file_transfer_obj.upload( + resource_path, + str(s3_upload_name), + ) + return download_url - if not USE_S3_TO_UPLOAD: - raise HTTPException( - status_code=404, detail="S3 Export service is disabled on server" - ) - file_transfer_obj = S3FileTransfer() - download_url = file_transfer_obj.upload( - resource["zip_path"], - str(s3_upload_name), + def zip_to_s3(self, resources): + for resource in resources: + resource["download_url"] = self.upload_to_s3( + resource_path=resource["zip_path"] ) - resource["download_url"] = download_url os.remove(resource["zip_path"]) return resources @@ -1243,12 +1249,13 @@ def file_to_zip(self, working_dir, zip_path): return zip_path def query_to_file(self, query, category_name, feature_type, export_formats): - category_name = category_name.lower().replace(" ", "_") + category_name = slugify(category_name.lower()).replace("-", "_") file_export_path = os.path.join( self.default_export_path, category_name, feature_type ) resources = [] - for export_format in export_formats: + + def process_export_format(export_format): export_format_path = os.path.join(file_export_path, export_format.suffix) os.makedirs(export_format_path, exist_ok=True) @@ -1277,11 +1284,41 @@ def query_to_file(self, query, category_name, feature_type, export_formats): resource["format_suffix"] = export_format.suffix resource["format_description"] = export_format.driver_name + return resource + + if self.parallel_process_state is False and len(export_formats) > 1: + with concurrent.futures.ThreadPoolExecutor( + max_workers=os.cpu_count() + ) as executor: + futures = [ + executor.submit(process_export_format, export_format) + for export_format in export_formats + ] + resources = [ + future.result() + for future in concurrent.futures.as_completed(futures) + ] + else: + resource = process_export_format(export_formats[0]) resources.append(resource) + return resources + def process_category_result(self, category_result): + if self.params.hdx_upload: + return self.resource_to_hdx( + uploaded_resources=category_result.uploaded_resources, + dataset_config=self.params.dataset, + category=category_result.category, + ) + + return self.resource_to_response( + category_result.uploaded_resources, category_result.category + ) + def process_category(self, category): category_name, category_data = list(category.items())[0] + all_uploaded_resources = [] for feature_type in category_data.types: extract_query = extract_features_duckdb( self.iso3 if self.iso3 else self.params.dataset.dataset_prefix, @@ -1293,7 +1330,26 @@ def process_category(self, category): extract_query, category_name, feature_type, category_data.formats ) uploaded_resources = self.zip_to_s3(resources) - return uploaded_resources + all_uploaded_resources.extend(uploaded_resources) + return all_uploaded_resources + + def resource_to_response(self, uploaded_resources, category): + category_name, category_data = list(category.items())[0] + + dataset_info = {} + resources = [] + for resource in uploaded_resources: + resource_meta = { + "name": resource["filename"], + "format": resource["format_suffix"], + "description": 
resource["format_description"], + "url": resource["download_url"], + "last_modifed": datetime.now().isoformat(), + } + resource_meta["uploaded_to_hdx"]: False + resources.append(resource_meta) + dataset_info["resources"] = resources + return {category_name: dataset_info} def resource_to_hdx(self, uploaded_resources, dataset_config, category): if any( @@ -1304,21 +1360,35 @@ def resource_to_hdx(self, uploaded_resources, dataset_config, category): hdx=dataset_config, category=category, default_category_path=self.default_export_path, + uuid=self.uuid, completeness_metadata={ "iso3": self.iso3, "geometry": self.params.geometry, }, ) uploader.init_dataset() + non_hdx_resources = [] for resource in uploaded_resources: + resource_meta = { + "name": resource["filename"], + "format": resource["format_suffix"], + "description": resource["format_description"], + "url": resource["download_url"], + "last_modifed": datetime.now().isoformat(), + } if resource["format_suffix"] in self.HDX_SUPPORTED_FORMATS: - uploader.add_resource( - resource["filename"], - resource["format_suffix"], - resource["format_description"], - resource["download_url"], - ) - uploader.upload_dataset() + uploader.add_resource(resource_meta) + else: + resource_meta["uploaded_to_hdx"]: False + non_hdx_resources.append(resource_meta) + category_name, hdx_dataset_info = uploader.upload_dataset(self.params.meta) + hdx_dataset_info["resources"].extend(non_hdx_resources) + return {category_name: hdx_dataset_info} + + def clean_resources(self): + temp_dir = os.path.join(export_path, self.uuid) + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) def process_hdx_tags(self): table_type = [ @@ -1327,47 +1397,92 @@ def process_hdx_tags(self): for cat_type in list(category.values())[0].types ] table_names = self.types_to_tables(list(set(table_type))) - + base_table_name = self.iso3 if self.iso3 else self.params.dataset.dataset_prefix for table in table_names: create_table = postgres2duckdb_query( - self.iso3 if self.iso3 else self.params.dataset.dataset_prefix, + base_table_name, table, self.cid, self.params.geometry, ) self.duck_db_instance.run_query(create_table.strip(), attach_pgsql=True) + CategoryResult = namedtuple( + "CategoryResult", ["category", "uploaded_resources"] + ) + + tag_process_results = [] + dataset_results = [] + if len(self.params.categories) > 1: + self.parallel_process_state = True + with concurrent.futures.ThreadPoolExecutor( + max_workers=os.cpu_count() * 2 + ) as executor: + futures = { + executor.submit(self.process_category, category): category + for category in self.params.categories + } + + for future in concurrent.futures.as_completed(futures): + category = futures[future] + uploaded_resources = future.result() + category_result = CategoryResult( + category=category, uploaded_resources=uploaded_resources + ) + tag_process_results.append(category_result) + else: + resources = self.process_category(self.params.categories[0]) + category_result = CategoryResult( + category=self.params.categories[0], uploaded_resources=resources + ) + tag_process_results.append(category_result) with concurrent.futures.ThreadPoolExecutor( max_workers=os.cpu_count() * 2 ) as executor: futures = { - executor.submit(self.process_category, category): category - for category in self.params.categories + executor.submit(self.process_category_result, result): result + for result in tag_process_results } for future in concurrent.futures.as_completed(futures): - category = futures[future] - try: - uploaded_resources = future.result() - 
self.resource_to_hdx( - uploaded_resources, self.params.dataset, category - ) + result = futures[future] + result_data = future.result() + dataset_results.append(result_data) - except Exception as e: - raise e - logging.error(f"An error occurred for category {category}: {e}") + result = {"datasets": dataset_results} + if self.params.meta: + db_dump_path = os.path.join( + self.default_export_path, + "DB_DUMP", + ) + os.makedirs(db_dump_path, exist_ok=True) + export_db = f"""EXPORT DATABASE '{db_dump_path}' (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 100000);""" + self.duck_db_instance.run_query(export_db, load_spatial=True) + db_zip_download_url = self.upload_to_s3( + self.file_to_zip( + working_dir=db_dump_path, + zip_path=os.path.join(self.default_export_path, "dbdump.zip"), + ) + ) + result["db_dump"] = db_zip_download_url + self.clean_resources() + return result class HDXUploader: def __init__( - self, category, hdx, default_category_path, completeness_metadata=None + self, category, hdx, uuid, default_category_path, completeness_metadata=None ): self.hdx = hdx self.category_name, self.category_data = list(category.items())[0] - self.category_path = os.path.join(default_category_path, self.category_name) + self.category_path = os.path.join( + default_category_path, slugify(self.category_name.lower()).replace("-", "_") + ) self.dataset = None + self.uuid = uuid self.completeness_metadata = completeness_metadata self.data_completeness_stats = None + self.resources = [] def slugify(self, name): return slugify(name).replace("-", "_") @@ -1385,36 +1500,48 @@ def add_notes(self): if self.completeness_metadata: self.data_completeness_stats = PolygonStats( iso3=self.completeness_metadata["iso3"], - geojson=self.completeness_metadata["geometry"], + geojson=self.completeness_metadata["geometry"].json() + if self.completeness_metadata["geometry"] + else None, ).get_summary_stats() if self.data_completeness_stats: self.category_data.hdx.notes += f'{self.data_completeness_stats["summary"][self.category_name.lower()]}\n' - self.category_data.hdx.notes += "Read about what this summary means, [indicators](https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/indicators.md) , [metrics](https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/metrics.md)" + self.category_data.hdx.notes += "Read about what this summary means, [indicators](https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/indicators.md) , [metrics](https://github.com/hotosm/raw-data-api/tree/develop/docs/src/stats/metrics.md)\n" return self.category_data.hdx.notes + HDX_MARKDOWN.format( columns=columns, filter_str=filter_str ) - def add_resource( - self, resource_name, resource_format, resource_description, export_url - ): + def add_resource(self, resource_meta): if self.dataset: - resource = { - "name": resource_name, - "format": resource_format, - "description": resource_description, - "url": export_url, - "last_modified": datetime.now().isoformat(), - } - self.dataset.add_update_resource(resource) + self.resources.append(resource_meta) + self.dataset.add_update_resource(resource_meta) - def upload_dataset(self): + def upload_dataset(self, dump_config_to_s3=False): if self.dataset: + dataset_info = {} + dt_config_path = os.path.join( + self.category_path, f"{self.dataset['name']}.json" + ) self.dataset.save_to_json( os.path.join(self.category_path, f"{self.dataset['name']}.json") ) + if dump_config_to_s3: + s3_upload_name = os.path.relpath( + dt_config_path, os.path.join(export_path, 
self.uuid) + ) + file_transfer_obj = S3FileTransfer() + dataset_info["config"] = file_transfer_obj.upload( + dt_config_path, + str(s3_upload_name), + ) + self.dataset.set_reference_period(datetime.now()) self.dataset.create_in_hdx(allow_no_resources=True) + dataset_info["name"] = self.dataset["name"] + dataset_info["hdx_url"] = f"{HDX_URL_PREFIX}/dataset/{self.dataset['name']}" + dataset_info["resources"] = self.resources + return self.category_name, dataset_info def init_dataset(self): dataset_prefix = self.hdx.dataset_prefix diff --git a/src/config.py b/src/config.py index 6cdc6c4c..caf0229a 100644 --- a/src/config.py +++ b/src/config.py @@ -191,7 +191,7 @@ "HDX", "HDX_OWNER_ORG", fallback="225b9f7d-e7cb-4156-96a6-44c9c58d31e3" ) HDX_MAINTAINER = os.environ.get("HDX_MAINTAINER") or config.get( - "HDX", "HDX_MAINTAINER", fallback="6a0688ce-8521-46e2-8edd-8e26c0851ebd" + "HDX", "HDX_MAINTAINER", fallback=None ) from hdx.api.configuration import Configuration diff --git a/src/validation/models.py b/src/validation/models.py index b7ce0a19..b0185e23 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -29,6 +29,8 @@ from src.config import ( ALLOW_BIND_ZIP_FILTER, + ALLOWED_HDX_TAGS, + ALLOWED_HDX_UPDATE_FREQUENCIES, ENABLE_POLYGON_STATISTICS_ENDPOINTS, ENABLE_TILES, EXPORT_MAX_AREA_SQKM, @@ -251,6 +253,13 @@ class Config: class StatsRequestParams(BaseModel): + iso3: Optional[str] = Field( + default=None, + description="ISO3 Country Code.", + min_length=3, + max_length=3, + example="NPL", + ) geometry: Optional[Union[Polygon, MultiPolygon]] = Field( default=None, example={ @@ -266,12 +275,201 @@ class StatsRequestParams(BaseModel): ], }, ) + + @validator("geometry", pre=True, always=True) + def set_geometry_or_iso3(cls, value, values): + """Either geometry or iso3 should be supplied.""" + if value is not None and values.get("iso3") is not None: + raise ValueError("Only one of geometry or iso3 should be supplied.") + if value is None and values.get("iso3") is None: + raise ValueError("Either geometry or iso3 should be supplied.") + return value + + +### HDX BLock + + +class HDXModel(BaseModel): + tags: List[str] = Field( + ..., + description="List of tags for the HDX model.", + example=["roads", "transportation", "geodata"], + ) + caveats: str = Field( + default="OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + description="Caveats/Warning for the Datasets.", + example="OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + ) + notes: str = Field( + default="", + description="Extra notes to append in notes section of hdx datasets", + example="Sample notes to append", + ) + + @validator("tags") + def validate_tags(cls, value): + for item in value: + if item.strip() not in ALLOWED_HDX_TAGS: + raise ValueError( + f"Invalid tag {item.strip()} , Should be within {ALLOWED_HDX_TAGS}" + ) + return value + + +class CategoryModel(BaseModel): + hdx: HDXModel + types: List[str] = Field( + ..., + description="List of feature types (points, lines, polygons).", + example=["lines"], + ) + select: List[str] = Field( + ..., + description="List of selected fields.", + example=["name", "highway"], + ) + where: str = Field( + ..., + description="SQL-like condition to filter features.", + example="highway IS NOT NULL", + ) + formats: List[str] = Field( + ..., + description="List of Export Formats (suffixes).", + example=["gpkg", "geojson"], + ) + + @validator("types") + def validate_types(cls, value): + allowed_types = {"points", "lines", 
"polygons"} + for item in value: + if item not in allowed_types: + raise ValueError( + f"Invalid type: {item}. Allowed types are {', '.join(allowed_types)}" + ) + return value + + @validator("formats") + def validate_export_types(cls, value): + for export_type in value: + if export_type not in EXPORT_TYPE_MAPPING: + raise ValueError(f"Unsupported export type: {export_type}") + return [EXPORT_TYPE_MAPPING[export_type] for export_type in value] + + +class ExportTypeInfo: + def __init__(self, suffix, driver_name, layer_creation_options, format_option): + self.suffix = suffix + self.driver_name = driver_name + self.layer_creation_options = layer_creation_options + self.format_option = format_option + + +EXPORT_TYPE_MAPPING = { + "geojson": ExportTypeInfo("geojson", "GeoJSON", [], "GDAL"), + "shp": ExportTypeInfo("shp", "ESRI Shapefile", [], "GDAL"), + "gpkg": ExportTypeInfo("gpkg", "GPKG", [], "GDAL"), + "sqlite": ExportTypeInfo("sqlite", "SQLite", [], "GDAL"), + "fgb": ExportTypeInfo("fgb", "FlatGeobuf", ["VERIFY_BUFFERS=NO"], "GDAL"), + "mvt": ExportTypeInfo("mvt", "MVT", [], "GDAL"), + "kml": ExportTypeInfo("kml", "KML", [], "GDAL"), + "gpx": ExportTypeInfo("gpx", "GPX", [], "GDAL"), + "parquet": ExportTypeInfo("parquet", "PARQUET", [], "PARQUET"), +} + + +class DatasetConfig(BaseModel): + private: bool = Field( + default=False, + description="Make dataset private , By default False , Public is recommended", + example="False", + ) + subnational: bool = Field( + default=False, + description="Make it true if dataset doesn't cover nation/country", + example="False", + ) + update_frequency: str = Field( + default="as needed", + description="Update frequncy to be added on uploads", + example="daily", + ) + dataset_title: str = Field( + default=None, + description="Dataset title which appears at top of the page", + example="Nepal", + ) + dataset_prefix: str = Field( + default=None, + description="Dataset prefix to be appended before category name, Will be ignored if iso3 is supplied", + example="hotosm_npl", + ) + dataset_locations: List[str] = Field( + default=None, + description="Valid dataset locations iso3", + example="['npl']", + ) + + @validator("update_frequency") + def validate_frequency(cls, value): + if value.strip() not in ALLOWED_HDX_UPDATE_FREQUENCIES: + raise ValueError( + f"Invalid update frequency , Should be within {ALLOWED_HDX_UPDATE_FREQUENCIES}" + ) + return value.strip() + + +class DynamicCategoriesModel(BaseModel): iso3: Optional[str] = Field( default=None, - description="ISO3 Country Code.", + description="ISO3 Country Code", min_length=3, max_length=3, - example="NPL", + example="USA", + ) + dataset: Optional[DatasetConfig] = Field( + default=None, description="Dataset Configurations for HDX Upload" + ) + meta: bool = Field( + default=False, + description="Dumps Meta db in parquet format & hdx config json to s3", + ) + hdx_upload: bool = Field( + default=True, description="Enable/Disable uploading dataset to hdx" + ) + + categories: List[Dict[str, CategoryModel]] = Field( + ..., + description="List of dynamic categories.", + example=[ + { + "Roads": { + "hdx": { + "tags": ["roads", "transportation", "geodata"], + "caveats": "OpenStreetMap data is crowd sourced and cannot be considered to be exhaustive", + }, + "types": ["lines", "polygons"], + "select": ["name", "highway"], + "where": "highway IS NOT NULL", + "formats": ["geojson"], + } + } + ], + ) + geometry: Optional[Union[Polygon, MultiPolygon]] = Field( + default=None, + example={ + "type": "Polygon", + "coordinates": 
[ + [ + [83.96919250488281, 28.194446860487773], + [83.99751663208006, 28.194446860487773], + [83.99751663208006, 28.214869548073377], + [83.96919250488281, 28.214869548073377], + [83.96919250488281, 28.194446860487773], + ] + ], + }, ) @validator("geometry", pre=True, always=True) @@ -281,15 +479,12 @@ def set_geometry_or_iso3(cls, value, values): raise ValueError("Only one of geometry or iso3 should be supplied.") if value is None and values.get("iso3") is None: raise ValueError("Either geometry or iso3 should be supplied.") - return value - - @validator("geometry", pre=True, always=True) - def validate_geometry(cls, value): - """Converts geometry to geojson feature.""" if value is not None: - feature = { - "type": "Feature", - "geometry": json.loads(value.json()), - "properties": {}, - } - return feature + dataset = values.get("dataset").dict() + if dataset is None: + raise ValueError("Dataset config should be supplied for custom polygon") + + for item in dataset.keys(): + if dataset.get(item) is None: + raise ValueError(f"Missing, Dataset config : {item}") + return value From 2376428c7a77bb0c7037bd48a138c8229db6bbbd Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:06:41 +0545 Subject: [PATCH 09/20] upgrade python to 3.10 for workflows --- .github/workflows/Unit-Test.yml | 4 ++-- .github/workflows/build.yml | 4 ++-- .github/workflows/code-check.yml | 6 +++--- .github/workflows/publish_mkdocs.yml | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/Unit-Test.yml b/.github/workflows/Unit-Test.yml index e021283e..123e1960 100644 --- a/.github/workflows/Unit-Test.yml +++ b/.github/workflows/Unit-Test.yml @@ -31,10 +31,10 @@ jobs: options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 2 steps: - uses: actions/checkout@v2 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v1 with: - python-version: 3.8 + python-version: 3.10 - name: Insert sample db data run: | diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b689ac9d..fe9dc0f1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,10 +33,10 @@ jobs: options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 2 steps: - uses: actions/checkout@v2 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v1 with: - python-version: 3.8 + python-version: 3.10 - name: Install necessary dependecies for rawdata loading run: sudo apt-get update && sudo apt-get install osm2pgsql - name: check version diff --git a/.github/workflows/code-check.yml b/.github/workflows/code-check.yml index 35a56d58..1b19d083 100644 --- a/.github/workflows/code-check.yml +++ b/.github/workflows/code-check.yml @@ -2,7 +2,7 @@ name: Code Check - Linting using flake8 on: push: paths-ignore: - - 'infra/**' + - "infra/**" branches: - master - develop @@ -16,10 +16,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.10 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/publish_mkdocs.yml b/.github/workflows/publish_mkdocs.yml index 5f511b5a..ed95c3b6 100644 --- a/.github/workflows/publish_mkdocs.yml +++ b/.github/workflows/publish_mkdocs.yml @@ -6,8 +6,8 @@ on: - master paths: # Only rebuild documentation when docs have changed - - 
'docs/**' - - '.github/workflows/publish_mkdocs.yml' + - "docs/**" + - ".github/workflows/publish_mkdocs.yml" permissions: contents: write jobs: @@ -15,10 +15,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: 3.10 publish_branch: gh-pages - name: Install Dependencies run: | From 91b06b5b77c0dcd2d12cda3e8175d39080a2d4db Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:08:52 +0545 Subject: [PATCH 10/20] Replace python version to specific version of 3.10 --- .github/workflows/Unit-Test.yml | 2 +- .github/workflows/build.yml | 2 +- .github/workflows/code-check.yml | 2 +- .github/workflows/publish_mkdocs.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/Unit-Test.yml b/.github/workflows/Unit-Test.yml index 123e1960..7ba8a5a9 100644 --- a/.github/workflows/Unit-Test.yml +++ b/.github/workflows/Unit-Test.yml @@ -34,7 +34,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v1 with: - python-version: 3.10 + python-version: 3.10.13 - name: Insert sample db data run: | diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fe9dc0f1..2018770f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -36,7 +36,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v1 with: - python-version: 3.10 + python-version: 3.10.13 - name: Install necessary dependecies for rawdata loading run: sudo apt-get update && sudo apt-get install osm2pgsql - name: check version diff --git a/.github/workflows/code-check.yml b/.github/workflows/code-check.yml index 1b19d083..c06208c4 100644 --- a/.github/workflows/code-check.yml +++ b/.github/workflows/code-check.yml @@ -19,7 +19,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: 3.10 + python-version: 3.10.13 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/publish_mkdocs.yml b/.github/workflows/publish_mkdocs.yml index ed95c3b6..79fff6f8 100644 --- a/.github/workflows/publish_mkdocs.yml +++ b/.github/workflows/publish_mkdocs.yml @@ -18,7 +18,7 @@ jobs: - name: Set up Python 3.10 uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: 3.10.13 publish_branch: gh-pages - name: Install Dependencies run: | From fc6c4ad65af1fdb0a5dc05bc60b8fbdc3876dd98 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:12:41 +0545 Subject: [PATCH 11/20] Updated requirements , fixed typo error --- requirements.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 59c11f54..494e1cae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,12 +2,13 @@ fastapi==0.105.0 uvicorn==0.24.0 psycopg2==2.9.9 geojson-pydantic==1.0.1 -pytest == 7.4.3 -geojson == 3.1.0 +pytest==7.4.3 -# Used for new relic monitoring -newrelic == 7.2.4.171 -sentry-sdk == 1.5.12 +geojson==3.1.0 + +# # Used for new relic monitoring +# newrelic==7.2.4.171 +# sentry-sdk==1.5.12 ## Third party area==1.1.1 From 4129e75de64b1dc5fdf138a3479863578eaf93ce Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:17:12 +0545 Subject: [PATCH 12/20] Split requirement and fix typo in setup.py --- requirements.txt | 6 +++++- setup.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt 
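One caveat worth flagging on the workflow edits above: in YAML an unquoted python-version: 3.10 is read as the float 3.1, which is presumably why the value is either quoted or pinned to a full patch release such as 3.10.13 in the follow-up commit. A quick illustration (assumes PyYAML is installed, purely to show the parsing):

import yaml

print(yaml.safe_load("python-version: 3.10"))     # {'python-version': 3.1}, trailing zero is lost
print(yaml.safe_load("python-version: '3.10'"))   # {'python-version': '3.10'}, kept as a string
print(yaml.safe_load("python-version: 3.10.13"))  # {'python-version': '3.10.13'}, not a valid float, stays a string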
index 494e1cae..938d294b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,10 +2,14 @@ fastapi==0.105.0 uvicorn==0.24.0 psycopg2==2.9.9 geojson-pydantic==1.0.1 -pytest==7.4.3 + geojson==3.1.0 +# Testing +pytest==7.4.3 + + # # Used for new relic monitoring # newrelic==7.2.4.171 # sentry-sdk==1.5.12 diff --git a/setup.py b/setup.py index cb5655b0..76b48c2d 100644 --- a/setup.py +++ b/setup.py @@ -9,11 +9,11 @@ description="The Raw Data API module makes it simple for you to get osm data stats provided by api in your own project", packages=setuptools.find_packages(), install_requires=[ - "pytest == 7.4.3", + "pytest==7.4.3", "psycopg2", "boto3==1.24.38", "fastapi==0.105.0", - "geojson == 7.4.3", + "geojson==3.1.0", "area==1.1.1", "orjson==3.9.10", "slowapi==0.1.8", From 55b67507fa7541b835baf76cd176defb0a7be5c9 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:22:10 +0545 Subject: [PATCH 13/20] Only import hdx config if it is enabled --- src/validation/models.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/validation/models.py b/src/validation/models.py index b0185e23..e586af2f 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -29,13 +29,14 @@ from src.config import ( ALLOW_BIND_ZIP_FILTER, - ALLOWED_HDX_TAGS, - ALLOWED_HDX_UPDATE_FREQUENCIES, ENABLE_POLYGON_STATISTICS_ENDPOINTS, ENABLE_TILES, - EXPORT_MAX_AREA_SQKM, + ENABLE_HDX_EXPORTS ) +if ENABLE_HDX_EXPORTS: + from src.config import ALLOWED_HDX_TAGS, ALLOWED_HDX_UPDATE_FREQUENCIES, + def to_camel(string: str) -> str: split_string = string.split("_") From 55aacbea44bc9fb53853661039b7afa008bd20b7 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:23:37 +0545 Subject: [PATCH 14/20] Remove training comma in import --- src/validation/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/validation/models.py b/src/validation/models.py index e586af2f..febcc1ac 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -29,13 +29,13 @@ from src.config import ( ALLOW_BIND_ZIP_FILTER, + ENABLE_HDX_EXPORTS, ENABLE_POLYGON_STATISTICS_ENDPOINTS, ENABLE_TILES, - ENABLE_HDX_EXPORTS ) if ENABLE_HDX_EXPORTS: - from src.config import ALLOWED_HDX_TAGS, ALLOWED_HDX_UPDATE_FREQUENCIES, + from src.config import ALLOWED_HDX_TAGS, ALLOWED_HDX_UPDATE_FREQUENCIES def to_camel(string: str) -> str: From a72cee418ef0bc9049705b294ac0449b6b7a3431 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:32:09 +0545 Subject: [PATCH 15/20] Install missing lib for unit test --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 938d294b..5e67808a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ geojson==3.1.0 # Testing pytest==7.4.3 - +httpx==0.26.0 # # Used for new relic monitoring # newrelic==7.2.4.171 From b3f0004eaf538d3ce85426eb62cfcd1ef07da260 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:37:17 +0545 Subject: [PATCH 16/20] Only import sentry if config is supplied , Also adds documentation to builder --- API/main.py | 4 +++- src/query_builder/builder.py | 43 ++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/API/main.py b/API/main.py index 87edae7c..b1323244 100644 --- a/API/main.py +++ b/API/main.py @@ -18,7 +18,6 @@ # import time -import sentry_sdk from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from 
fastapi.staticfiles import StaticFiles @@ -48,6 +47,9 @@ if ENABLE_POLYGON_STATISTICS_ENDPOINTS: from .stats import router as stats_router +if SENTRY_DSN: + import sentry_sdk + # only use sentry if it is specified in config blocks if SENTRY_DSN: sentry_sdk.init( diff --git a/src/query_builder/builder.py b/src/query_builder/builder.py index db0c18b9..448cfe2e 100644 --- a/src/query_builder/builder.py +++ b/src/query_builder/builder.py @@ -829,6 +829,15 @@ def generate_polygon_stats_graphql_query(geojson_feature): def get_country_from_iso(iso3): + """ + Generate a SQL query to retrieve country information based on ISO3 code. + + Args: + - iso3 (str): ISO3 Country Code. + + Returns: + str: SQL query to fetch country information. + """ query = f"""SELECT b.cid::int as fid, b.description as name, b.dataset_name as dataset_prefix, b.locations as locations FROM @@ -842,6 +851,19 @@ def get_country_from_iso(iso3): def postgres2duckdb_query( base_table_name, table, cid=None, geometry=None, enable_users_detail=False ): + """ + Generate a DuckDB query to create a table from a PostgreSQL query. + + Args: + - base_table_name (str): Base table name. + - table (str): PostgreSQL table name. + - cid (int, optional): Country ID for filtering. Defaults to None. + - geometry (Polygon, optional): Custom polygon geometry. Defaults to None. + - enable_users_detail (bool, optional): Enable user details. Defaults to False. + + Returns: + str: DuckDB query for creating a table. + """ select_query = ( """osm_id, version, changeset, timestamp, tags, ST_AsBinary(geom) as geometry""" ) @@ -863,6 +885,18 @@ def postgres2duckdb_query( def extract_features_duckdb(base_table_name, select, feature_type, where): + """ + Generate a DuckDB query to extract features based on given parameters. + + Args: + - base_table_name (str): Base table name. + - select (List[str]): List of selected fields. + - feature_type (str): Type of feature (points, lines, polygons). + - where (str): SQL-like condition to filter features. + + Returns: + str: DuckDB query to extract features. + """ map_tables = { "points": {"table": ["nodes"], "where": {"nodes": where}}, "lines": { @@ -894,6 +928,15 @@ def extract_features_duckdb(base_table_name, select, feature_type, where): def get_country_geom_from_iso(iso3): + """ + Generate a SQL query to retrieve country geometry based on ISO3 code. + + Args: + - iso3 (str): ISO3 Country Code. + + Returns: + str: SQL query to fetch country geometry. 
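Illustrative usage, an editor's sketch rather than part of the patch: because these builder helpers only return SQL strings, they can be inspected or unit-tested without a live database (assumes the package is importable on PYTHONPATH):

from src.query_builder.builder import get_country_from_iso, get_country_geom_from_iso

# Both helpers build plain SQL text; execution happens elsewhere (psycopg2 / DuckDB).
country_sql = get_country_from_iso("NPL")
geom_sql = get_country_geom_from_iso("NPL")
print(country_sql)  # fetches fid, name, dataset_prefix and locations for the ISO3 code
print(geom_sql)     # fetches the country geometry as GeoJSON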
+ """ query = f"""SELECT ST_AsGeoJSON(geometry) as geom FROM From 82bb633abf1aa37ac87578a94e682ff89dd51a54 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 20:41:56 +0545 Subject: [PATCH 17/20] Only import hdx related config if its in config enabled , added docs for ducdb class --- src/app.py | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/src/app.py b/src/app.py index 1a119ecd..b44caddd 100644 --- a/src/app.py +++ b/src/app.py @@ -55,7 +55,6 @@ EXPORT_MAX_AREA_SQKM, ) from src.config import EXPORT_PATH as export_path -from src.config import HDX_MAINTAINER, HDX_OWNER_ORG, HDX_URL_PREFIX from src.config import INDEX_THRESHOLD as index_threshold from src.config import POLYGON_STATISTICS_API_URL from src.config import USE_CONNECTION_POOLING as use_connection_pooling @@ -89,6 +88,9 @@ import duckdb from hdx.data.dataset import Dataset + from src.config import HDX_MAINTAINER, HDX_OWNER_ORG, HDX_URL_PREFIX + + global LOCAL_CON_POOL # getting the pool instance which was fireup when API is started @@ -1104,6 +1106,13 @@ def get_summary_stats(self): class DuckDB: + """ + Constructor for the DuckDB class. + + Parameters: + - db_path (str): The path to the DuckDB database file. + """ + def __init__(self, db_path): dbdict = get_db_connection_params() self.db_con_str = convert_dict_to_conn_str(db_dict=dbdict) @@ -1118,6 +1127,14 @@ def __init__(self, db_path): con.load_extension("json") def run_query(self, query, attach_pgsql=False, load_spatial=False): + """ + Executes a query on the DuckDB database. + + Parameters: + - query (str): The SQL query to execute. + - attach_pgsql (bool): Flag to indicate whether to attach a PostgreSQL database. + - load_spatial (bool): Flag to indicate whether to load the spatial extension. + """ with duckdb.connect(self.db_path) as con: if attach_pgsql: con.execute( @@ -1131,6 +1148,13 @@ def run_query(self, query, attach_pgsql=False, load_spatial=False): class HDX: + """ + Constructor for the HDX class. + + Parameters: + - params (DynamicCategoriesModel): An instance of DynamicCategoriesModel containing configuration settings. + """ + def __init__(self, params): self.params = params self.iso3 = self.params.iso3 @@ -1180,6 +1204,15 @@ def __init__(self, params): self.duck_db_instance = DuckDB(self.duck_db_db_path) def types_to_tables(self, type_list: list): + """ + Maps feature types to corresponding database tables. + + Parameters: + - type_list (List[str]): List of feature types. + + Returns: + - List of database tables associated with the given feature types. + """ mapping = { "points": ["nodes"], "lines": ["ways_line", "relations"], @@ -1195,6 +1228,15 @@ def types_to_tables(self, type_list: list): return list(table_set) def format_where_clause(self, where_clause): + """ + Formats the where_clause by replacing certain patterns. + + Parameters: + - where_clause (str): SQL-like condition to filter features. + + Returns: + - Formatted where_clause. 
+ """ pattern = r"tags\['([^']+)'\]" match = re.search(pattern, where_clause) From f33a0799b56f4d7e7e178eb28fc5c50c00224a2b Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 21:02:45 +0545 Subject: [PATCH 18/20] Added missing docstrings and also added default none to optional as pydantic is upgraded --- src/app.py | 132 +++++++++++++++++++++++++++++++++++++++ src/validation/models.py | 78 +++++++++++++++++++---- 2 files changed, 198 insertions(+), 12 deletions(-) diff --git a/src/app.py b/src/app.py index b44caddd..89d8f9aa 100644 --- a/src/app.py +++ b/src/app.py @@ -1248,6 +1248,15 @@ def format_where_clause(self, where_clause): return where_clause def upload_to_s3(self, resource_path): + """ + Uploads a resource file to Amazon S3. + + Parameters: + - resource_path (str): Path to the resource file on the local filesystem. + + Returns: + - Download URL for the uploaded resource. + """ if not USE_S3_TO_UPLOAD: raise HTTPException( status_code=404, detail="S3 Export service is disabled on server" @@ -1263,6 +1272,15 @@ def upload_to_s3(self, resource_path): return download_url def zip_to_s3(self, resources): + """ + Zips and uploads a list of resources to Amazon S3. + + Parameters: + - resources (List[Dict[str, Any]]): List of resource dictionaries. + + Returns: + - List of resource dictionaries with added download URLs. + """ for resource in resources: resource["download_url"] = self.upload_to_s3( resource_path=resource["zip_path"] @@ -1271,6 +1289,16 @@ def zip_to_s3(self, resources): return resources def file_to_zip(self, working_dir, zip_path): + """ + Creates a ZIP file from files in a directory. + + Parameters: + - working_dir (str): Path to the directory containing files to be zipped. + - zip_path (str): Path to the resulting ZIP file. + + Returns: + - Path to the created ZIP file. + """ zf = zipfile.ZipFile( zip_path, "w", @@ -1291,6 +1319,18 @@ def file_to_zip(self, working_dir, zip_path): return zip_path def query_to_file(self, query, category_name, feature_type, export_formats): + """ + Executes a query and exports the result to file(s). + + Parameters: + - query (str): SQL query to execute. + - category_name (str): Name of the category. + - feature_type (str): Feature type. + - export_formats (List[ExportTypeInfo]): List of export formats. + + Returns: + - List of resource dictionaries containing export information. + """ category_name = slugify(category_name.lower()).replace("-", "_") file_export_path = os.path.join( self.default_export_path, category_name, feature_type @@ -1347,6 +1387,15 @@ def process_export_format(export_format): return resources def process_category_result(self, category_result): + """ + Processes the result of a category and prepares the response. + + Parameters: + - category_result (CategoryResult): Instance of CategoryResult. + + Returns: + - Dictionary containing processed category result. + """ if self.params.hdx_upload: return self.resource_to_hdx( uploaded_resources=category_result.uploaded_resources, @@ -1359,6 +1408,15 @@ def process_category_result(self, category_result): ) def process_category(self, category): + """ + Processes a category by executing queries and handling exports. + + Parameters: + - category (Dict[str, CategoryModel]): Dictionary representing a category. + + Returns: + - List of resource dictionaries containing export information. 
+ """ category_name, category_data = list(category.items())[0] all_uploaded_resources = [] for feature_type in category_data.types: @@ -1376,6 +1434,16 @@ def process_category(self, category): return all_uploaded_resources def resource_to_response(self, uploaded_resources, category): + """ + Converts uploaded resources to a response format. + + Parameters: + - uploaded_resources (List[Dict[str, Any]]): List of resource dictionaries. + - category (Dict[str, CategoryModel]): Dictionary representing a category. + + Returns: + - Dictionary containing the response information. + """ category_name, category_data = list(category.items())[0] dataset_info = {} @@ -1394,6 +1462,17 @@ def resource_to_response(self, uploaded_resources, category): return {category_name: dataset_info} def resource_to_hdx(self, uploaded_resources, dataset_config, category): + """ + Converts uploaded resources to an HDX dataset and uploads to HDX. + + Parameters: + - uploaded_resources (List[Dict[str, Any]]): List of resource dictionaries. + - dataset_config (DatasetConfig): Instance of DatasetConfig. + - category (Dict[str, CategoryModel]): Dictionary representing a category. + + Returns: + - Dictionary containing the HDX upload information. + """ if any( item["format_suffix"] in self.HDX_SUPPORTED_FORMATS for item in uploaded_resources @@ -1428,11 +1507,20 @@ def resource_to_hdx(self, uploaded_resources, dataset_config, category): return {category_name: hdx_dataset_info} def clean_resources(self): + """ + Cleans up temporary resources. + """ temp_dir = os.path.join(export_path, self.uuid) if os.path.exists(temp_dir): shutil.rmtree(temp_dir) def process_hdx_tags(self): + """ + Processes HDX tags and executes category processing in parallel. + + Returns: + - Dictionary containing the processed dataset information. + """ table_type = [ cat_type for category in self.params.categories @@ -1512,6 +1600,17 @@ def process_hdx_tags(self): class HDXUploader: + """ + Constructor for the HDXUploader class. + + Parameters: + - category (Dict[str, CategoryModel]): Dictionary representing a category. + - hdx (HDX): Instance of the HDX class. + - uuid (str): Universally unique identifier. + - default_category_path (str): Default path for the category. + - completeness_metadata (Optional[Dict[str, Any]]): Metadata for completeness. + """ + def __init__( self, category, hdx, uuid, default_category_path, completeness_metadata=None ): @@ -1527,9 +1626,24 @@ def __init__( self.resources = [] def slugify(self, name): + """ + Converts a string to a valid slug format. + + Parameters: + - name (str): Input string. + + Returns: + - Slugified string. + """ return slugify(name).replace("-", "_") def add_notes(self): + """ + Adds notes based on category data. + + Returns: + - Notes string. + """ columns = [] for key in self.category_data.select: columns.append( @@ -1555,11 +1669,26 @@ def add_notes(self): ) def add_resource(self, resource_meta): + """ + Adds a resource to the list of resources. + + Parameters: + - resource_meta (Dict[str, Any]): Metadata for the resource. + """ if self.dataset: self.resources.append(resource_meta) self.dataset.add_update_resource(resource_meta) def upload_dataset(self, dump_config_to_s3=False): + """ + Uploads the dataset to HDX. + + Parameters: + - dump_config_to_s3 (bool): Flag to indicate whether to dump configuration to S3. + + Returns: + - Tuple containing category name and dataset information. 
+ """ if self.dataset: dataset_info = {} dt_config_path = os.path.join( @@ -1586,6 +1715,9 @@ def upload_dataset(self, dump_config_to_s3=False): return self.category_name, dataset_info def init_dataset(self): + """ + Initializes the HDX dataset. + """ dataset_prefix = self.hdx.dataset_prefix dataset_title = self.hdx.dataset_title dataset_locations = self.hdx.dataset_locations diff --git a/src/validation/models.py b/src/validation/models.py index febcc1ac..15895455 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -94,27 +94,27 @@ class JoinFilterType(Enum): class SQLFilter(BaseModel): - join_or: Optional[Dict[str, List[str]]] - join_and: Optional[Dict[str, List[str]]] + join_or: Optional[Dict[str, List[str]]] = Field(default=None) + join_and: Optional[Dict[str, List[str]]] = Field(default=None) class TagsFilter(BaseModel): - point: Optional[SQLFilter] - line: Optional[SQLFilter] - polygon: Optional[SQLFilter] - all_geometry: Optional[SQLFilter] + point: Optional[SQLFilter] = Field(default=None) + line: Optional[SQLFilter] = Field(default=None) + polygon: Optional[SQLFilter] = Field(default=None) + all_geometry: Optional[SQLFilter] = Field(default=None) class AttributeFilter(BaseModel): - point: Optional[List[str]] - line: Optional[List[str]] - polygon: Optional[List[str]] - all_geometry: Optional[List[str]] + point: Optional[List[str]] = Field(default=None) + line: Optional[List[str]] = Field(default=None) + polygon: Optional[List[str]] = Field(default=None) + all_geometry: Optional[List[str]] = Field(default=None) class Filters(BaseModel): - tags: Optional[TagsFilter] - attributes: Optional[AttributeFilter] + tags: Optional[TagsFilter] = Field(default=None) + attributes: Optional[AttributeFilter] = Field(default=None) class RawDataCurrentParamsBase(BaseModel): @@ -291,6 +291,15 @@ def set_geometry_or_iso3(cls, value, values): class HDXModel(BaseModel): + """ + Model for HDX configuration settings. + + Fields: + - tags (List[str]): List of tags for the HDX model. + - caveats (str): Caveats/Warning for the Datasets. + - notes (str): Extra notes to append in the notes section of HDX datasets. + """ + tags: List[str] = Field( ..., description="List of tags for the HDX model.", @@ -318,6 +327,17 @@ def validate_tags(cls, value): class CategoryModel(BaseModel): + """ + Model for category configuration settings. + + Fields: + - hdx (HDXModel): HDX configuration model. + - types (List[str]): List of feature types (points, lines, polygons). + - select (List[str]): List of selected fields. + - where (str): SQL-like condition to filter features. + - formats (List[str]): List of Export Formats (suffixes). + """ + hdx: HDXModel types: List[str] = Field( ..., @@ -359,6 +379,16 @@ def validate_export_types(cls, value): class ExportTypeInfo: + """ + Class representing export type information. + + Fields: + - suffix (str): File suffix for the export type. + - driver_name (str): GDAL driver name. + - layer_creation_options (List[str]): Layer creation options. + - format_option (str): Format option for GDAL. + """ + def __init__(self, suffix, driver_name, layer_creation_options, format_option): self.suffix = suffix self.driver_name = driver_name @@ -380,6 +410,18 @@ def __init__(self, suffix, driver_name, layer_creation_options, format_option): class DatasetConfig(BaseModel): + """ + Model for dataset configuration settings. + + Fields: + - private (bool): Make dataset private. By default False, public is recommended. 
+ - subnational (bool): Make it true if the dataset doesn't cover the nation/country. + - update_frequency (str): Update frequency to be added on uploads. + - dataset_title (str): Dataset title that appears at the top of the page. + - dataset_prefix (str): Dataset prefix to be appended before the category name. Ignored if iso3 is supplied. + - dataset_locations (List[str]): Valid dataset locations iso3. + """ + private: bool = Field( default=False, description="Make dataset private , By default False , Public is recommended", @@ -421,6 +463,18 @@ def validate_frequency(cls, value): class DynamicCategoriesModel(BaseModel): + """ + Model for dynamic categories. + + Fields: + - iso3 (Optional[str]): ISO3 Country Code. + - dataset (Optional[DatasetConfig]): Dataset Configurations for HDX Upload. + - meta (bool): Dumps Meta db in parquet format & HDX config JSON to S3. + - hdx_upload (bool): Enable/Disable uploading the dataset to HDX. + - categories (List[Dict[str, CategoryModel]]): List of dynamic categories. + - geometry (Optional[Union[Polygon, MultiPolygon]]): Custom polygon geometry. + """ + iso3: Optional[str] = Field( default=None, description="ISO3 Country Code", From eaacec5dd7e8dd76b2c46ca1496b8afa2c83f9b1 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 21:08:29 +0545 Subject: [PATCH 19/20] Adds missing docstring in models --- src/validation/models.py | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/validation/models.py b/src/validation/models.py index 15895455..6ec2c92d 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -318,6 +318,17 @@ class HDXModel(BaseModel): @validator("tags") def validate_tags(cls, value): + """Validates tags if they are allowed from hdx allowed approved tags + + Args: + value (_type_): _description_ + + Raises: + ValueError: _description_ + + Returns: + _type_: _description_ + """ for item in value: if item.strip() not in ALLOWED_HDX_TAGS: raise ValueError( @@ -362,6 +373,17 @@ class CategoryModel(BaseModel): @validator("types") def validate_types(cls, value): + """validates geom types + + Args: + value (_type_): _description_ + + Raises: + ValueError: _description_ + + Returns: + _type_: _description_ + """ allowed_types = {"points", "lines", "polygons"} for item in value: if item not in allowed_types: @@ -372,6 +394,17 @@ def validate_types(cls, value): @validator("formats") def validate_export_types(cls, value): + """Validates export types if they are supported + + Args: + value (_type_): _description_ + + Raises: + ValueError: _description_ + + Returns: + _type_: _description_ + """ for export_type in value: if export_type not in EXPORT_TYPE_MAPPING: raise ValueError(f"Unsupported export type: {export_type}") @@ -455,6 +488,17 @@ class DatasetConfig(BaseModel): @validator("update_frequency") def validate_frequency(cls, value): + """Validates frequency + + Args: + value (_type_): _description_ + + Raises: + ValueError: _description_ + + Returns: + _type_: _description_ + """ if value.strip() not in ALLOWED_HDX_UPDATE_FREQUENCIES: raise ValueError( f"Invalid update frequency , Should be within {ALLOWED_HDX_UPDATE_FREQUENCIES}" From cceba46cdf088f96e469b2a991e02e0c91e1fcc2 Mon Sep 17 00:00:00 2001 From: kshitijrajsharma Date: Thu, 21 Dec 2023 21:32:17 +0545 Subject: [PATCH 20/20] Adds authentication on hdx endpoints and modifies tasks endpoint to get status of whats going on --- API/hdx.py | 4 +++- API/tasks.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 4 
deletions(-) diff --git a/API/hdx.py b/API/hdx.py index 4618fbe3..1421fe6a 100644 --- a/API/hdx.py +++ b/API/hdx.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter, Body, Request +from fastapi import APIRouter, Body, Depends, Request from fastapi.responses import JSONResponse from fastapi_versioning import version @@ -7,6 +7,7 @@ from src.validation.models import DynamicCategoriesModel from .api_worker import process_hdx_request +from .auth import AuthUser, staff_required router = APIRouter(prefix="/hdx", tags=["HDX"]) @@ -16,6 +17,7 @@ @version(1) async def process_hdx_requests( request: Request, + user: AuthUser = Depends(staff_required), params: DynamicCategoriesModel = Body( ..., description="Input parameters including ISO3 country code and dynamic categories.", diff --git a/API/tasks.py b/API/tasks.py index 03a7c903..a93b0dc6 100644 --- a/API/tasks.py +++ b/API/tasks.py @@ -68,9 +68,16 @@ def inspect_workers(): inspected = celery.control.inspect() def extract_file_name(args: str) -> str: - """Extract file_name using a pattern match.""" - match = re.search(r"file_name\s*=\s*['\"]([^'\"]+)['\"]", args) - return match.group(1) if match else None + """Extract value prioritizing file_name, then iso3, and finally dataset_title.""" + keys = ["file_name", "iso3", "dataset_title"] + + for key in keys: + pattern = re.compile(rf"{key}\s*=\s*['\"]([^'\"]+)['\"]") + match = pattern.search(args) + if match: + return match.group(1) + + return None def filter_task_details(tasks: List[dict]) -> List[dict]: """Filter task details to include only id and file_name."""
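As a quick sanity check of the prioritised extraction above (an editor's example, not part of the patch), the first matching key in file_name, then iso3, then dataset_title order wins:

import re

def extract_file_name(args: str):
    # Mirrors the patched helper: try keys in priority order, return the first hit.
    for key in ("file_name", "iso3", "dataset_title"):
        match = re.compile(rf"{key}\s*=\s*['\"]([^'\"]+)['\"]").search(args)
        if match:
            return match.group(1)
    return None

assert extract_file_name("file_name='waterways', iso3='NPL'") == "waterways"
assert extract_file_name("iso3='NPL', dataset_title='Nepal'") == "NPL"
assert extract_file_name("dataset_title='Nepal'") == "Nepal"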