BigQuery scripts to import/export GeoParquet files #113

Closed · wants to merge 4 commits
30 changes: 30 additions & 0 deletions scripts/README.md
@@ -44,3 +44,33 @@ virtualenv:
```
poetry run pip install -U --force-reinstall pygeos --no-binary pygeos
```

### BigQuery

Convert a SQL query to Parquet (written as a partitioned folder by default):

```bash
poetry run python bigquery_to_parquet.py \
--input-query "SELECT * FROM carto-do-public-data.carto.geography_usa_blockgroup_2019" \
--primary-column geom \
--output geography_usa_blockgroup_2019
```

Upload a Parquet file or folder to BigQuery:

```bash
poetry run python parquet_to_bigquery.py \
--input geography_usa_blockgroup_2019 \
--output "cartodb-gcp-backend-data-team.alasarr.geography_usa_blockgroup_2019"
```

Instead of a folder, you can also work with a single file, but it might hit [BigQuery limits](https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet) when you upload it: for large Parquet files you might get `UDF out of memory` errors.

Convert a SQL query to a single Parquet file:

```bash
poetry run python bigquery_to_parquet.py \
--input-query "SELECT * FROM carto-do-public-data.carto.geography_usa_blockgroup_2019" \
--primary-column geom \
--mode file \
--output geography_usa_blockgroup_2019.parquet
```
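
To sanity-check an export, you can inspect the GeoParquet `geo` metadata the script writes (a minimal sketch using pyarrow, not part of the scripts; the file name is the example output above):

```python
import json

import pyarrow.parquet as pq

# Read only the schema; the "geo" key carries the GeoParquet metadata added by encoder.py
schema = pq.read_schema("geography_usa_blockgroup_2019.parquet")
geo = json.loads(schema.metadata[b"geo"])

print(geo["primary_column"])            # "geom" in the examples above
print(geo["columns"]["geom"]["edges"])  # "spherical" for BigQuery exports
```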
88 changes: 88 additions & 0 deletions scripts/bigquery_to_parquet.py
@@ -0,0 +1,88 @@
import click
import sys
import pyarrow.parquet as pq
import geopandas as gpd

from encoder import AVAILABLE_COMPRESSIONS, Edges, PathType, geopandas_to_arrow
from pathlib import Path
from google.cloud import bigquery

MODES = ["FILE", "FOLDER"]

def read_gdf(input_query: str, primary_column: str):
    client = bigquery.Client()
    df = client.query(input_query).to_dataframe()
    df[primary_column] = gpd.GeoSeries.from_wkt(df[primary_column])
    return gpd.GeoDataFrame(df, geometry=primary_column, crs="EPSG:4326")

@click.command()
@click.option(
    "-q",
    "--input-query",
    type=str,
    help="SQL query of the data to export",
    required=True,
)
@click.option(
    "--primary-column",
    type=str,
    help="The primary column name with geometry data",
    required=True,
)
@click.option(
    "-o",
    "--output",
    type=PathType(file_okay=True, dir_okay=True, writable=True),
    help="Path to output",
    required=True,
)
@click.option(
    "-m",
    "--mode",
    type=click.Choice(MODES, case_sensitive=False),
    help="Export mode: FILE or FOLDER",
    default="FOLDER",
    show_default=True,
)
@click.option(
    "--compression",
    type=click.Choice(AVAILABLE_COMPRESSIONS, case_sensitive=False),
    default="SNAPPY",
    help="Compression codec to use when writing to Parquet.",
    show_default=True,
)
@click.option(
    "--partition-size",
    type=int,
    default=5000,
    help="Number of records per partition. Ignored when --mode FILE is used.",
    show_default=True,
)
def main(input_query: str, primary_column: str, output: Path, mode: str, compression: str, partition_size: int):
    print("Reading data from BigQuery", file=sys.stderr)

    if mode.upper() == 'FOLDER':
        gdf = (
            read_gdf(input_query, primary_column)
            .assign(__partition__=lambda x: x.index // partition_size)
        )
    else:
        gdf = read_gdf(input_query, primary_column)

    print("Finished reading", file=sys.stderr)
    print("Starting conversion to Arrow", file=sys.stderr)
    arrow_table = geopandas_to_arrow(gdf, Edges.SPHERICAL)
    print("Finished conversion to Arrow", file=sys.stderr)

    print("Starting write to Parquet", file=sys.stderr)

    if mode.upper() == 'FOLDER':
        # Export to multiple files: a single file might hit BigQuery limits (UDF out of memory).
        # https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet
        pq.write_to_dataset(arrow_table, root_path=output, partition_cols=['__partition__'], compression=compression)
**Collaborator:** We should still have a discussion on best practices for partitioned datasets (#79), specifically around whether the `_metadata` file is required/suggested/etc.

**Collaborator Author:** Good point, will follow the conversation there.
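For reference, pyarrow's documented pattern for writing a `_metadata` summary file next to a partitioned dataset looks roughly like this (a sketch only, reusing the `arrow_table`, `output` and `compression` variables from the code above; whether GeoParquet should require or suggest it is what #79 discusses):

```python
import pyarrow.parquet as pq

# Collect per-file row-group metadata while writing the partitioned dataset
metadata_collector = []
pq.write_to_dataset(
    arrow_table,
    root_path=output,
    partition_cols=['__partition__'],
    compression=compression,
    metadata_collector=metadata_collector,
)

# Aggregate the collected row-group statistics into a single _metadata file
pq.write_metadata(
    arrow_table.schema,
    output / "_metadata",
    metadata_collector=metadata_collector,
)
```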
    else:
        pq.write_table(arrow_table, output, compression=compression)

    print("Finished write to Parquet", file=sys.stderr)

if __name__ == "__main__":
    main()
140 changes: 140 additions & 0 deletions scripts/encoder.py
@@ -0,0 +1,140 @@
import json
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List

import click
import geopandas as gpd
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pygeos
from numpy.typing import NDArray

GEOPARQUET_VERSION = "0.4.0"
AVAILABLE_COMPRESSIONS = ["NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"]

PygeosGeometryArray = NDArray[pygeos.Geometry]

class Edges(Enum):
    PLANAR = 'planar'
    SPHERICAL = 'spherical'

class PathType(click.Path):
    """A Click path argument that returns a pathlib Path, not a string"""

    def convert(self, value, param, ctx):
        return Path(super().convert(value, param, ctx))

class GeometryType(int, Enum):
    """Pygeos (GEOS) geometry type mapping
    From https://pygeos.readthedocs.io/en/latest/geometry.html?highlight=type#pygeos.geometry.get_type_id
    """

    Missing = -1
    Point = 0
    LineString = 1
    LinearRing = 2
    Polygon = 3
    MultiPoint = 4
    MultiLineString = 5
    MultiPolygon = 6
    GeometryCollection = 7


def _parse_to_pygeos(df: gpd.GeoDataFrame) -> Dict[str, PygeosGeometryArray]:
    """Parse to pygeos geometry array

    This is split out from _create_metadata so that we don't have to create the pygeos
    array twice: once for converting to wkb and another time for metadata handling.
    """
    geometry_columns: Dict[str, PygeosGeometryArray] = {}
    for col in df.columns[df.dtypes == "geometry"]:
        geometry_columns[col] = df[col].array.data

    return geometry_columns


def _create_metadata(
    df: gpd.GeoDataFrame, geometry_columns: Dict[str, PygeosGeometryArray], edges: Edges
) -> Dict[str, Any]:
    """Create and encode geo metadata dict.

    Parameters
    ----------
    df : GeoDataFrame

    Returns
    -------
    dict
    """

    # Construct metadata for each geometry
    column_metadata = {}
    for col, geometry_array in geometry_columns.items():
        geometry_type = _get_geometry_type(geometry_array)
        bbox = list(pygeos.total_bounds(geometry_array))

        series = df[col]
        column_metadata[col] = {
            "encoding": "WKB",
            "geometry_type": geometry_type,
            "crs": series.crs.to_json_dict() if series.crs else None,
            # We don't specify orientation for now
            # "orientation"
            "edges": edges.value,
            "bbox": bbox,
            # I don't know how to get the epoch from a pyproj CRS, and if it's relevant
            # here
            # "epoch":
        }

    return {
        "version": GEOPARQUET_VERSION,
        "primary_column": df._geometry_column_name,
        "columns": column_metadata,
        # "creator": {"library": "geopandas", "version": geopandas.__version__},
    }


def _get_geometry_type(pygeos_geoms: PygeosGeometryArray) -> List[str]:
    type_ids = pygeos.get_type_id(pygeos_geoms)
    unique_type_ids = set(type_ids)

    geom_type_names: List[str] = []
    for type_id in unique_type_ids:
        geom_type_names.append(GeometryType(type_id).name)

    return geom_type_names


def _encode_metadata(metadata: Dict) -> bytes:
    """Encode metadata dict to UTF-8 JSON string

    Parameters
    ----------
    metadata : dict

    Returns
    -------
    UTF-8 encoded JSON string
    """
    # Remove unnecessary whitespace in JSON metadata
    # https://stackoverflow.com/a/33233406
    return json.dumps(metadata, separators=(',', ':')).encode("utf-8")


def geopandas_to_arrow(df: gpd.GeoDataFrame, edges: Edges = Edges.PLANAR) -> pa.Table:
    geometry_columns = _parse_to_pygeos(df)
    geo_metadata = _create_metadata(df, geometry_columns, edges)

    df = pd.DataFrame(df)
    for col, geometry_array in geometry_columns.items():
        df[col] = pygeos.to_wkb(geometry_array)

    table = pa.Table.from_pandas(df, preserve_index=False)

    metadata = table.schema.metadata
    metadata.update({b"geo": _encode_metadata(geo_metadata)})
    return table.replace_schema_metadata(metadata)
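
To illustrate how the `geo` metadata and the WKB-encoded columns produced by `geopandas_to_arrow` are meant to be consumed, here is a minimal read-side sketch (not part of this PR; the helper name is hypothetical and it assumes geopandas >= 0.9 for `GeoSeries.from_wkb`):

```python
import json

import geopandas as gpd
import pyarrow.parquet as pq


def read_geoparquet(path) -> gpd.GeoDataFrame:
    """Read a file written by geopandas_to_arrow + pq.write_table back into GeoPandas."""
    table = pq.read_table(path)
    geo = json.loads(table.schema.metadata[b"geo"])

    df = table.to_pandas()
    # Each geometry column was serialized to WKB, so decode it back to geometries
    for col in geo["columns"]:
        df[col] = gpd.GeoSeries.from_wkb(df[col])  # CRS handling omitted for brevity

    return gpd.GeoDataFrame(df, geometry=geo["primary_column"])
```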
109 changes: 109 additions & 0 deletions scripts/parquet_to_bigquery.py
@@ -0,0 +1,109 @@
import click
import json
import glob
import pyarrow.parquet as pq

from encoder import Edges, PathType
from pathlib import Path
from google.cloud import bigquery

def upload_parquet_file(client: bigquery.Client, file: Path, write_disposition: str, dst: str):
    """Upload a parquet file to BigQuery"""
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,
        write_disposition=write_disposition,
    )

    with open(file, "rb") as source_file:
        print(f"Uploading file {file}")
        job = client.load_table_from_file(source_file, dst, job_config=job_config)

    job.result()  # Waits for the job to complete.

def validate_metadata(metadata):
    """Validate the GeoParquet 'geo' metadata of a file before uploading it"""
    if metadata is None or b"geo" not in metadata:
        raise ValueError("Missing geo metadata")

    geo = json.loads(metadata[b"geo"])

    if geo["primary_column"] not in geo["columns"]:
        raise ValueError("Primary column not found")

    for column_name, column_meta in geo["columns"].items():
        encoding = column_meta["encoding"]
        edges = column_meta["edges"]
        if encoding != 'WKB':
            raise ValueError(f"Unsupported encoding {encoding} for column {column_name}")
        if edges != Edges.SPHERICAL.value:
            raise ValueError("Only spherical edges are supported")

@click.command()
@click.option(
    "-i",
    "--input",
    type=PathType(exists=True, readable=True),
    help="Path to a Parquet file, or to a folder containing multiple Parquet files (matched by the *.parquet extension).",
    required=True,
)
@click.option(
    "-o",
    "--output",
    type=str,
    help="FQN of the destination table (project.dataset.table).",
    required=True,
)
def main(input: Path, output: str):
    primary_column = None
    tmp_output = f"{output}_tmp"
    metadata = None
    client = bigquery.Client()

    if input.is_dir():
        # A folder was provided
        first_file = True

        for file in glob.glob(f"{input}/**/*.parquet", recursive=True):

            if first_file:
                # The first file determines the schema and truncates the table
                metadata = pq.read_schema(file).metadata
                validate_metadata(metadata)
                write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
                first_file = False
            else:
                # Subsequent files are appended
                write_disposition = bigquery.WriteDisposition.WRITE_APPEND

            upload_parquet_file(client, file, write_disposition, tmp_output)
    else:
        # Single file mode
        metadata = pq.read_schema(input).metadata
        validate_metadata(metadata)
        write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
        upload_parquet_file(client, input, write_disposition, tmp_output)

    metadata_geo = json.loads(metadata[b"geo"])
    primary_column = metadata_geo["primary_column"]
    geo_columns = list(metadata_geo["columns"].keys())
    wkb_columns_expression = map(lambda c: f"ST_GEOGFROMWKB({c}) as {c}", geo_columns)

    # Convert the imported WKB column(s) to GEOGRAPHY in the final table
    sql = f"""
        DROP TABLE IF EXISTS {output};
        CREATE TABLE {output} CLUSTER BY {primary_column}
        AS SELECT * EXCEPT({", ".join(geo_columns)}),
            {", ".join(wkb_columns_expression)}
        FROM {tmp_output};
        DROP TABLE IF EXISTS {tmp_output};
    """

    query_job = client.query(sql)
    query_job.result()  # Waits for the job to complete.

    table = client.get_table(output)
    print(f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {output}")

if __name__ == "__main__":
    main()
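
As a usage note, `validate_metadata` can also be run on its own before starting an upload (a hypothetical pre-flight check, assuming the script is importable as `parquet_to_bigquery` and reusing the example file from the README):

```python
import pyarrow.parquet as pq

from parquet_to_bigquery import validate_metadata

# Raises ValueError unless the file carries WKB-encoded, spherical-edge geo metadata
metadata = pq.read_schema("geography_usa_blockgroup_2019.parquet").metadata
validate_metadata(metadata)
```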