From cf06114feb2767bc711ce39cd548ddcfb51cbf41 Mon Sep 17 00:00:00 2001 From: Kshitij Raj Sharma <36752999+kshitijrajsharma@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:27:29 +0100 Subject: [PATCH] Revert "Feature/post processing" --- .gitignore | 2 - API/api_worker.py | 85 +--------- backend/field_update | 1 + backend/raw_backend | 2 +- requirements.txt | 7 - src/app.py | 33 +--- src/config.py | 1 - src/post_processing/__init__.py | 0 src/post_processing/geojson_stats.py | 61 -------- src/post_processing/processor.py | 120 -------------- src/post_processing/stats_building_tpl.html | 165 -------------------- src/post_processing/stats_highway_tpl.html | 164 ------------------- src/post_processing/stats_tpl.html | 161 ------------------- src/post_processing/stats_waterway_tpl.html | 165 -------------------- src/post_processing/transliterator.py | 34 ---- src/validation/models.py | 23 --- tests/test_API.py | 60 ++++--- 17 files changed, 39 insertions(+), 1045 deletions(-) delete mode 100644 src/post_processing/__init__.py delete mode 100644 src/post_processing/geojson_stats.py delete mode 100644 src/post_processing/processor.py delete mode 100644 src/post_processing/stats_building_tpl.html delete mode 100644 src/post_processing/stats_highway_tpl.html delete mode 100644 src/post_processing/stats_tpl.html delete mode 100644 src/post_processing/stats_waterway_tpl.html delete mode 100644 src/post_processing/transliterator.py diff --git a/.gitignore b/.gitignore index 143eb173..c74b2f26 100644 --- a/.gitignore +++ b/.gitignore @@ -24,5 +24,3 @@ Pipfile.lock #backend backend/data backend/.env - -.DS_Store diff --git a/API/api_worker.py b/API/api_worker.py index c538b42c..096f2f0f 100644 --- a/API/api_worker.py +++ b/API/api_worker.py @@ -16,7 +16,6 @@ # Reader imports from src.app import CustomExport, PolygonStats, RawData, S3FileTransfer -from src.post_processing.processor import PostProcessor from src.config import ALLOW_BIND_ZIP_FILTER from src.config import CELERY_BROKER_URL as celery_broker_uri from src.config import CELERY_RESULT_BACKEND as celery_backend @@ -40,7 +39,6 @@ RawDataCurrentParams, RawDataOutputType, ) -from src.post_processing.processor import PostProcessor if ENABLE_SOZIP: # Third party imports @@ -77,12 +75,7 @@ def create_readme_content(default_readme, polygon_stats): def zip_binding( - working_dir, - exportname_parts, - geom_dump, - polygon_stats, - geojson_stats, - default_readme, + working_dir, exportname_parts, geom_dump, polygon_stats, default_readme ): logging.debug("Zip Binding Started!") upload_file_path = os.path.join( @@ -95,9 +88,6 @@ def zip_binding( ), } - if geojson_stats: - additional_files["stats.json"] = geojson_stats - for name, content in additional_files.items(): temp_path = os.path.join(working_dir, name) with open(temp_path, "w") as f: @@ -219,60 +209,11 @@ def process_raw_data(self, params, user=None): file_parts, ) - # Post-proccessing: Generate GeoJSON/HTML stats and transliterations - polygon_stats = None - geojson_stats_html = None - geojson_stats_json = None - download_html_url = None - if "include_stats" or "include_translit" in params.dict(): - post_processor = PostProcessor( - { - "include_stats": params.include_stats, - "include_translit": params.include_translit, - } - ) - - if params.include_stats: - post_processor.filters = params.filters - - post_processor.init() - - geom_area, geom_dump, working_dir = RawData( - params, str(self.request.id) - ).extract_current_data(file_parts, post_processor.post_process_line) - - if params.include_stats: - geojson_stats_json = json.dumps(post_processor.geoJSONStats.dict()) - - # Create a HTML summary of stats - if params.include_stats_html: - tpl = "stats" - if "waterway" in post_processor.geoJSONStats.config.keys: - tpl = "stats_waterway" - elif "highway" in post_processor.geoJSONStats.config.keys: - tpl = "stats_highway" - elif "building" in post_processor.geoJSONStats.config.keys: - tpl = "stats_building" - project_root = pathlib.Path(__file__).resolve().parent - tpl_path = os.path.join( - project_root, - "../src/post_processing/{tpl}_tpl.html".format(tpl=tpl), - ) - geojson_stats_html = post_processor.geoJSONStats.html( - tpl_path - ).build() - upload_html_path = os.path.join( - working_dir, os.pardir, f"{exportname_parts[-1]}.html" - ) - with open(upload_html_path, "w") as f: - f.write(geojson_stats_html) - - else: - geom_area, geom_dump, working_dir = RawData( - params, str(self.request.id) - ).extract_current_data(file_parts) - + geom_area, geom_dump, working_dir = RawData( + params, str(self.request.id) + ).extract_current_data(file_parts) inside_file_size = 0 + polygon_stats = None if "include_stats" in params.dict(): if params.include_stats: feature = { @@ -281,14 +222,12 @@ def process_raw_data(self, params, user=None): "properties": {}, } polygon_stats = PolygonStats(feature).get_summary_stats() - if bind_zip: upload_file_path, inside_file_size = zip_binding( working_dir=working_dir, exportname_parts=exportname_parts, geom_dump=geom_dump, polygon_stats=polygon_stats, - geojson_stats=geojson_stats_json, default_readme=DEFAULT_README_TEXT, ) @@ -301,7 +240,6 @@ def process_raw_data(self, params, user=None): upload_file_path = file_path inside_file_size += os.path.getsize(file_path) break # only take one file inside dir , if contains many it should be inside zip - # check if download url will be generated from s3 or not from config if use_s3_to_upload: file_transfer_obj = S3FileTransfer() @@ -315,6 +253,7 @@ def process_raw_data(self, params, user=None): pattern = r"(hotosm_project_)(\d+)" match = re.match(pattern, exportname) if match: + prefix = match.group(1) project_number = match.group(2) if project_number: upload_name = f"TM/{project_number}/{exportname}" @@ -333,15 +272,6 @@ def process_raw_data(self, params, user=None): upload_name, file_suffix="zip" if bind_zip else params.output_type.lower(), ) - - # If there's an HTML file, upload it too - if geojson_stats_html: - download_html_url = file_transfer_obj.upload( - upload_html_path, - upload_name, - file_suffix="html", - ) - else: # give the static file download url back to user served from fastapi static export path download_url = str(upload_file_path) @@ -367,9 +297,6 @@ def process_raw_data(self, params, user=None): } if polygon_stats: final_response["stats"] = polygon_stats - if download_html_url: - final_response["download_html_url"] = download_html_url - return final_response except Exception as ex: diff --git a/backend/field_update b/backend/field_update index 0c55c83b..b663498b 100644 --- a/backend/field_update +++ b/backend/field_update @@ -114,6 +114,7 @@ class Database: try: self.cursor.execute(query) self.conn.commit() + # print(query) try: result = self.cursor.fetchall() diff --git a/backend/raw_backend b/backend/raw_backend index 4a852eb3..5ddd55c8 100644 --- a/backend/raw_backend +++ b/backend/raw_backend @@ -256,7 +256,7 @@ if __name__ == "__main__": if not args.replication: osm2pgsql.append("--drop") - + print(osm2pgsql) run_subprocess_cmd(osm2pgsql) basic_index_cmd = [ diff --git a/requirements.txt b/requirements.txt index 46eb24da..c295c164 100644 --- a/requirements.txt +++ b/requirements.txt @@ -54,10 +54,3 @@ psutil==5.9.8 ## logging tqdm==4.66.2 - -# stats for geojson data -geojson-stats==0.2.4 - -# transliterations -transliterate==1.10.2 - diff --git a/src/app.py b/src/app.py index 3ca2ad6c..fb5e82d5 100644 --- a/src/app.py +++ b/src/app.py @@ -47,7 +47,6 @@ from psycopg2.extras import DictCursor from slugify import slugify from tqdm import tqdm -from .post_processing.processor import PostProcessor # Reader imports from src.config import ( @@ -641,7 +640,7 @@ def ogr_export(query, outputtype, working_dir, dump_temp_path, params): os.remove(query_path) @staticmethod - def query2geojson(con, extraction_query, dump_temp_file_path, plugin_fn=None): + def query2geojson(con, extraction_query, dump_temp_file_path): """Function written from scratch without being dependent on any library, Provides better performance for geojson binding""" # creating geojson file pre_geojson = """{"type": "FeatureCollection","features": [""" @@ -661,12 +660,10 @@ def query2geojson(con, extraction_query, dump_temp_file_path, plugin_fn=None): for row in cursor: if first: first = False + f.write(row[0]) else: f.write(",") - if plugin_fn: - f.write(plugin_fn(row[0])) - else: - f.write((row[0])) + f.write(row[0]) cursor.close() # closing connection to avoid memory issues # close the writing geojson with last part f.write(post_geojson) @@ -714,7 +711,7 @@ def get_grid_id(geom, cur): country_export, ) - def extract_current_data(self, exportname, plugin_fn=None): + def extract_current_data(self, exportname): """Responsible for Extracting rawdata current snapshot, Initially it creates a geojson file , Generates query , run it with 1000 chunk size and writes it directly to the geojson file and closes the file after dump Args: exportname: takes filename as argument to create geojson file passed from routers @@ -780,7 +777,6 @@ def extract_current_data(self, exportname, plugin_fn=None): country_export=country_export, ), dump_temp_file_path, - plugin_fn, ) # uses own conversion class if output_type == RawDataOutputType.SHAPEFILE.value: ( @@ -1492,29 +1488,8 @@ def process_export_format(export_format): layer_creation_options=layer_creation_options_str, query_dump_path=export_format_path, ) - run_ogr2ogr_cmd(ogr2ogr_cmd) - # Post-processing GeoJSON files - # Adds: stats, HTML stats summary and transliterations - if export_format.driver_name == "GeoJSON" and ( - self.params.include_stats or self.params.include_translit - ): - post_processor = PostProcessor( - { - "include_stats": self.params.include_stats, - "include_translit": self.params.include_translit, - "include_stats_html": self.params.include_stats_html, - } - ) - post_processor.init() - post_processor.custom( - categories=self.params.categories, - export_format_path=export_format_path, - export_filename=export_filename, - file_export_path=file_export_path, - ) - zip_file_path = os.path.join(file_export_path, f"{export_filename}.zip") zip_path = self.file_to_zip(export_format_path, zip_file_path) diff --git a/src/config.py b/src/config.py index cc889318..3f6ab649 100644 --- a/src/config.py +++ b/src/config.py @@ -334,7 +334,6 @@ def not_raises(func, *args, **kwargs): logging.error( "Error creating HDX configuration: %s, Disabling the hdx exports feature", e ) - ENABLE_HDX_EXPORTS = False if ENABLE_HDX_EXPORTS: diff --git a/src/post_processing/__init__.py b/src/post_processing/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/post_processing/geojson_stats.py b/src/post_processing/geojson_stats.py deleted file mode 100644 index 9ca74d68..00000000 --- a/src/post_processing/geojson_stats.py +++ /dev/null @@ -1,61 +0,0 @@ -from geojson_stats.stats import Stats -from geojson_stats.html import Html - -CONFIG_AREA = ["building"] -CONFIG_LENGTH = ["highway", "waterway"] - - -class GeoJSONStats(Stats): - """Used for collecting stats while processing GeoJSON files line by line""" - - def __init__(self, filters, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.config.clean = True - self.config.properties_prop = "properties.tags" - - if filters and filters.tags: - for tag in CONFIG_AREA: - if self.check_filter(filters.tags, tag): - self.config.keys.append(tag) - self.config.value_keys.append(tag) - self.config.area = True - - for tag in CONFIG_LENGTH: - if self.check_filter(filters.tags, tag): - self.config.keys.append(tag) - self.config.value_keys.append(tag) - self.config.length = True - - def check_filter(self, tags, tag): - """ - Check if a tag is present in tag filters - """ - - if tags.all_geometry: - if tags.all_geometry.join_or and tag in tags.all_geometry.join_or: - return True - if tags.all_geometry.join_and and tag in tags.all_geometry.join_and: - return True - if tags.polygon: - if tags.polygon.join_or and tag in tags.polygon.join_or: - return True - if tags.polygon.join_and and tag in tags.polygon.join_and: - return True - if tags.line: - if tags.line.join_or and tag in tags.line.join_or: - return True - if tags.line.join_and and tag in tags.line.join_and: - return True - - def raw_data_line_stats(self, json_object: dict): - """ - Process a GeoJSON line (for getting stats) and return that line - """ - self.get_object_stats(json_object) - - def html(self, tpl): - """ - Returns stats Html object, generated from stats data using a template - """ - return Html(tpl, self) diff --git a/src/post_processing/processor.py b/src/post_processing/processor.py deleted file mode 100644 index 44feccff..00000000 --- a/src/post_processing/processor.py +++ /dev/null @@ -1,120 +0,0 @@ -import json -from .transliterator import Transliterator -from .geojson_stats import GeoJSONStats -import os -import pathlib - - -class PostProcessor: - """Used for posst-process data while processing GeoJSON files line by line""" - - options = {} - filters = {} - functions = [] - - def __init__(self, options, *args, **kwargs): - self.options = options - - def post_process_line(self, line: str): - """ - Parses line, run functions over it and returns it - """ - - line_object = json.loads(line) - - for fn in self.functions: - fn(line_object) - - return json.dumps(line_object) - - def custom(self, categories, export_format_path, export_filename, file_export_path): - """ - Post-process custom exports - """ - self.geoJSONStats.config.properties_prop = "properties" - - category_tag = "" - if any("Roads" in element for element in categories): - category_tag = "highway" - self.geoJSONStats.config.length = True - elif any("Buildings" in element for element in categories): - category_tag = "building" - self.geoJSONStats.config.area = True - elif any("Waterways" in element for element in categories): - category_tag = "Waterway" - self.geoJSONStats.config.length = True - - if self.options["include_stats"]: - if category_tag: - self.geoJSONStats.config.keys.append(category_tag) - self.geoJSONStats.config.value_keys.append(category_tag) - - path_input = os.path.join( - export_format_path, f"{export_filename}.geojson" - ) - path_output = os.path.join( - export_format_path, f"{export_filename}-post.geojson" - ) - - with open(path_input, "r") as input_file, open( - path_output, "w" - ) as output_file: - for line in input_file: - comma = False - if line.startswith('{ "type": "Feature"'): - json_string = "" - if line[-2:-1] == ",": - json_string = line[:-2] - comma = True - else: - json_string = line - line = self.post_process_line(json_string) - if self.options["include_translit"]: - if comma: - output_file.write(line + ",") - else: - output_file.write(line) - - if self.options["include_translit"]: - os.remove(path_input) - os.rename(path_output, path_input) - else: - os.remove(path_output) - - geojson_stats_json = json.dumps(self.geoJSONStats.dict()) - with open( - os.path.join(file_export_path, "stats.json"), - "w", - ) as f: - f.write(geojson_stats_json) - - if self.options["include_stats_html"]: - tpl = ( - "stats_{category_tag}".format(category_tag=category_tag) - if category_tag - else "stats" - ) - project_root = pathlib.Path(__file__).resolve().parent - tpl_path = os.path.join( - project_root, - "{tpl}_tpl.html".format(tpl=tpl), - ) - geojson_stats_html = self.geoJSONStats.html(tpl_path).build() - upload_html_path = os.path.join( - file_export_path, "stats-summary.html" - ) - with open(upload_html_path, "w") as f: - f.write(geojson_stats_html) - - def init(self): - """ - Initialize post-processor - """ - - if "include_stats" in self.options and self.options["include_stats"]: - self.geoJSONStats = GeoJSONStats(self.filters) - self.functions.append(self.geoJSONStats.raw_data_line_stats) - - if "include_translit" in self.options and self.options["include_translit"]: - self.transliterator = Transliterator() - self.functions.append(self.transliterator.translit) diff --git a/src/post_processing/stats_building_tpl.html b/src/post_processing/stats_building_tpl.html deleted file mode 100644 index 97c3f659..00000000 --- a/src/post_processing/stats_building_tpl.html +++ /dev/null @@ -1,165 +0,0 @@ - - - -
- - - - - - -Elements identified as distinct
-Including local language and english
-Key | -Count | -% | -
---|---|---|
Total features | -${count} | -100% | -
${key_0} | -${key_0_count} | -${key_0_percent} | -
${key_1} | -${key_1_count} | -${key_1_percent} | -
${key_2} | -${key_2_count} | -${key_2_percent} | -
${key_3} | -${key_3_count} | -${key_3_percent} | -
${key_4} | -${key_4_count} | -${key_4_percent} | -
${key_5} | -${key_5_count} | -${key_5_percent} | -
Elements identified as distinct
-Including local language and english
-Key | -Count | -% | -
---|---|---|
Total features | -${count} | -100% | -
${key_0} | -${key_0_count} | -${key_0_percent} | -
${key_1} | -${key_1_count} | -${key_1_percent} | -
${key_2} | -${key_2_count} | -${key_2_percent} | -
${key_3} | -${key_3_count} | -${key_3_percent} | -
${key_4} | -${key_4_count} | -${key_4_percent} | -
${key_5} | -${key_5_count} | -${key_5_percent} | -
Elements identified as distinct
-Including local language and english
-Key | -Count | -% | -
---|---|---|
Total features | -${count} | -100% | -
${key_0} | -${key_0_count} | -${key_0_percent} | -
${key_1} | -${key_1_count} | -${key_1_percent} | -
${key_2} | -${key_2_count} | -${key_2_percent} | -
${key_3} | -${key_3_count} | -${key_3_percent} | -
${key_4} | -${key_4_count} | -${key_4_percent} | -
${key_5} | -${key_5_count} | -${key_5_percent} | -
Elements identified as distinct
-Including local language and english
-Key | -Count | -% | -
---|---|---|
Total features | -${count} | -100% | -
${key_0} | -${key_0_count} | -${key_0_percent} | -
${key_1} | -${key_1_count} | -${key_1_percent} | -
${key_2} | -${key_2_count} | -${key_2_percent} | -
${key_3} | -${key_3_count} | -${key_3_percent} | -
${key_4} | -${key_4_count} | -${key_4_percent} | -
${key_5} | -${key_5_count} | -${key_5_percent} | -