diff --git a/API/api_worker.py b/API/api_worker.py index 2f5374f2..005f7778 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -33,6 +33,7 @@ celery.conf.result_serializer = "pickle" celery.conf.accept_content = ["application/json", "application/x-python-serialize"] celery.conf.task_track_started = True +celery.conf.update(result_extended=True) @celery.task(bind=True, name="process_raw_data") @@ -194,7 +195,8 @@ def process_raw_data(self, params): @celery.task(bind=True, name="process_hdx_request") def process_hdx_request(self, params): - # params = DynamicCategoriesModel(**params) + params = DynamicCategoriesModel(**params) + if not params.dataset: params.dataset = DatasetConfig() hdx_object = HDX(params) diff --git a/API/hdx.py b/API/hdx.py index 05fa3748..7169fe78 100644 --- a/API/hdx.py +++ b/API/hdx.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter, Body, Depends, Request +from fastapi import APIRouter, Body, Depends, HTTPException, Request from fastapi.responses import JSONResponse from fastapi_versioning import version @@ -7,7 +7,7 @@ from src.validation.models import DynamicCategoriesModel from .api_worker import process_hdx_request -from .auth import AuthUser, staff_required +from .auth import AuthUser, UserRole, staff_required router = APIRouter(prefix="/hdx", tags=["HDX"]) @@ -691,8 +691,13 @@ async def process_hdx_requests( Returns: dict: Result message. """ - queue_name = "raw_special" + queue_name = params.queue + if params.queue != "raw_special" and user.role != UserRole.ADMIN.value: + raise HTTPException( + status_code=403, + detail=[{"msg": "Insufficient Permission to choose queue"}], + ) task = process_hdx_request.apply_async( - args=(params,), queue=queue_name, track_started=True + args=(params.model_dump(),), queue=queue_name, track_started=True ) return JSONResponse({"task_id": task.id, "track_link": f"/tasks/status/{task.id}/"}) diff --git a/API/s3.py b/API/s3.py index 1c4b8834..bb537b87 100644 --- a/API/s3.py +++ b/API/s3.py @@ -1,3 +1,4 @@ +import json from urllib.parse import quote import boto3 @@ -10,7 +11,7 @@ from src.config import AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, BUCKET_NAME from src.config import LIMITER as limiter -from src.config import POLYGON_STATISTICS_API_RATE_LIMIT +from src.config import RATE_LIMIT_PER_MIN router = APIRouter(prefix="/s3", tags=["S3"]) @@ -27,7 +28,7 @@ @router.get("/files/") -@limiter.limit(f"{POLYGON_STATISTICS_API_RATE_LIMIT}/minute") +@limiter.limit(f"{RATE_LIMIT_PER_MIN}/minute") @version(1) def list_s3_files( request: Request, @@ -89,7 +90,7 @@ def list_s3_files( @router.get("/get/{file_path:path}") -@limiter.limit(f"{POLYGON_STATISTICS_API_RATE_LIMIT}/minute") +@limiter.limit(f"{RATE_LIMIT_PER_MIN}/minute") @version(1) def get_s3_file( request: Request, @@ -100,6 +101,10 @@ def get_s3_file( gt=60 * 10, le=3600 * 12 * 7, ), + read_meta: bool = Query( + default=True, + description="Whether to read and deliver the content of .json file", + ), ): bucket_name = BUCKET_NAME file_path = file_path.strip("/") @@ -116,6 +121,18 @@ def get_s3_file( status_code=404, detail=f"File or folder not found: {file_path}" ) + if read_meta and file_path.lower().endswith(".json"): + # Read and deliver the content of meta.json + try: + response = s3.get_object(Bucket=bucket_name, Key=file_path) + content = json.loads(response["Body"].read()) + return JSONResponse(content=jsonable_encoder(content)) + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Error reading meta.json: {str(e)}" + ) + + # If not reading meta.json, generate a presigned URL presigned_url = s3.generate_presigned_url( "get_object", Params={"Bucket": bucket_name, "Key": file_path}, diff --git a/API/tasks.py b/API/tasks.py index f66f6f3c..bc9de65a 100644 --- a/API/tasks.py +++ b/API/tasks.py @@ -1,7 +1,5 @@ +import html import json -import re -from datetime import datetime -from typing import List import redis from celery.result import AsyncResult @@ -24,7 +22,13 @@ @router.get("/status/{task_id}/", response_model=SnapshotTaskResponse) @version(1) -def get_task_status(task_id): +def get_task_status( + task_id, + args: bool = Query( + default=False, + description="Displays argument of task as well", + ), +): """Tracks the request from the task id provided by Raw Data API for the request Args: @@ -41,11 +45,19 @@ def get_task_status(task_id): """ task_result = AsyncResult(task_id, app=celery) + task_response_result = None + if task_result.status == "SUCCESS": + task_response_result = task_result.result + if task_result.status == "FAILED": + task_response_result = html.escape(task_result.traceback) + result = { "id": task_id, "status": task_result.state, - "result": task_result.result if task_result.status == "SUCCESS" else None, + "result": task_response_result, } + if args: + result["args"] = task_result.args return JSONResponse(result) @@ -66,7 +78,7 @@ def revoke_task(task_id, user: AuthUser = Depends(staff_required)): @router.get("/inspect/") @version(1) -def inspect_workers( +async def inspect_workers( request: Request, summary: bool = Query( default=True, @@ -83,19 +95,20 @@ def inspect_workers( active_tasks_summary = [] if summary: - for worker, tasks in active_tasks.items(): - for task in tasks: - temp_task = {} - temp_task["id"] = task["id"] - temp_task["name"] = task["name"] - temp_task["time_start"] = ( - datetime.fromtimestamp(task["time_start"]).strftime( - "%Y-%m-%d %H:%M:%S" + if active_tasks: + for worker, tasks in active_tasks.items(): + for task in tasks: + temp_task = {} + temp_task["id"] = task["id"] + temp_task["name"] = task["name"] + temp_task["time_start"] = ( + datetime.fromtimestamp(task["time_start"]).strftime( + "%Y-%m-%d %H:%M:%S" + ) + if task["time_start"] + else None ) - if task["time_start"] - else None - ) - active_tasks_summary.append(temp_task) + active_tasks_summary.append(temp_task) response_data = { "active": active_tasks_summary if summary else active_tasks, @@ -146,7 +159,7 @@ def get_queue_info(): @router.get("/queue/details/{queue_name}/") @version(1) -def get_list_details(queue_name: str): +async def get_list_details(queue_name: str): if queue_name not in queues: raise HTTPException(status_code=404, detail=f"Queue '{queue_name}' not found") @@ -155,7 +168,6 @@ def get_list_details(queue_name: str): # Convert bytes to strings list_items = [item.decode("utf-8") for item in list_items] - # Create a list of dictionaries with item details items_details = [ { "index": index, diff --git a/src/app.py b/src/app.py index 5ea4e72b..efd8f5ee 100644 --- a/src/app.py +++ b/src/app.py @@ -74,7 +74,7 @@ postgres2duckdb_query, raw_currentdata_extraction_query, ) -from src.validation.models import RawDataOutputType +from src.validation.models import EXPORT_TYPE_MAPPING, RawDataOutputType # import instance for pooling if use_connection_pooling: @@ -1202,7 +1202,7 @@ def __init__(self, params): self.default_export_path = os.path.join( export_path, self.uuid, - "HDX", + self.params.dataset.dataset_folder, self.iso3.upper() if self.iso3 else self.params.dataset.dataset_prefix, ) if os.path.exists(self.default_export_path): @@ -1350,6 +1350,7 @@ def query_to_file(self, query, category_name, feature_type, export_formats): start_export_formats_time = time.time() def process_export_format(export_format): + export_format = EXPORT_TYPE_MAPPING.get(export_format) export_format_path = os.path.join(file_export_path, export_format.suffix) os.makedirs(export_format_path, exist_ok=True) logging.info( @@ -1578,7 +1579,7 @@ def process_hdx_tags(self): geometry=self.params.geometry, single_category_where=where_0_category, ) - print(create_table) + logging.debug(create_table) start = time.time() logging.info("Transfer-> Postgres Data to DuckDB Started : %s", table) self.duck_db_instance.run_query(create_table.strip(), attach_pgsql=True) @@ -1653,9 +1654,7 @@ def process_hdx_tags(self): ) result["started_at"] = started_at - meta_last_run_dump_path = os.path.join( - self.default_export_path, "meta_last_run.json" - ) + meta_last_run_dump_path = os.path.join(self.default_export_path, "meta.json") with open(meta_last_run_dump_path, "w") as json_file: json.dump(result, json_file, indent=4) self.upload_to_s3(resource_path=meta_last_run_dump_path) @@ -1756,11 +1755,9 @@ def upload_dataset(self, dump_config_to_s3=False): if self.dataset: dataset_info = {} dt_config_path = os.path.join( - self.category_path, f"{self.dataset['name']}.json" - ) - self.dataset.save_to_json( - os.path.join(self.category_path, f"{self.dataset['name']}.json") + self.category_path, f"{self.dataset['name']}_config.json" ) + self.dataset.save_to_json(dt_config_path) if dump_config_to_s3: s3_upload_name = os.path.relpath( dt_config_path, os.path.join(export_path, self.uuid) @@ -1799,7 +1796,7 @@ def init_dataset(self): "methodology": "Other", "methodology_other": "Volunteered geographic information", "license_id": "hdx-odc-odbl", - "updated_by_script": f'Hotosm OSM Exports ({datetime.now().strftime("%Y-%m-%dT%H:%M:%S")}', + "updated_by_script": f'Hotosm OSM Exports ({datetime.now().strftime("%Y-%m-%dT%H:%M:%S")})', "caveats": self.category_data.hdx.caveats, "private": self.hdx.private, "notes": self.add_notes(), diff --git a/src/validation/models.py b/src/validation/models.py index 0ea0219b..c4df904e 100644 --- a/src/validation/models.py +++ b/src/validation/models.py @@ -410,7 +410,7 @@ def validate_export_types(cls, value): for export_type in value: if export_type not in EXPORT_TYPE_MAPPING: raise ValueError(f"Unsupported export type: {export_type}") - return [EXPORT_TYPE_MAPPING[export_type] for export_type in value] + return value class ExportTypeInfo: @@ -487,6 +487,9 @@ class DatasetConfig(BaseModel): description="Valid dataset locations iso3", example="['npl']", ) + dataset_folder: str = Field( + default="HDX", description="Default base folder for the exports", example="HDX" + ) @validator("update_frequency") def validate_frequency(cls, value): @@ -531,12 +534,17 @@ class DynamicCategoriesModel(BaseModel): dataset: Optional[DatasetConfig] = Field( default=None, description="Dataset Configurations for HDX Upload" ) + queue: Optional[str] = Field( + default="raw_special", + description="Lets you decide which queue you wanna place your task, Requires admin access", + ) meta: bool = Field( default=False, description="Dumps Meta db in parquet format & hdx config json to s3", ) hdx_upload: bool = Field( - default=True, description="Enable/Disable uploading dataset to hdx" + default=False, + description="Enable/Disable uploading dataset to hdx, False by default", ) categories: List[Dict[str, CategoryModel]] = Field(