From 87c02ee620029553c5ca41cc276388257c640641 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 27 Dec 2023 13:33:34 -0500 Subject: [PATCH] chore(build): update base-requirements + add script for regeneration --- .../base-requirements.txt | 317 +++++++++--------- .../regenerate-base-requirements.sh | 37 ++ 2 files changed, 195 insertions(+), 159 deletions(-) create mode 100755 docker/datahub-ingestion-base/regenerate-base-requirements.sh diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt index 141382466ab9f6..90928759027942 100644 --- a/docker/datahub-ingestion-base/base-requirements.txt +++ b/docker/datahub-ingestion-base/base-requirements.txt @@ -1,149 +1,147 @@ -# Excluded for slim -# pyspark==3.0.3 -# pydeequ==1.0.1 - +# Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate. acryl-datahub-classify==0.0.8 -acryl-PyHive==0.6.14 -acryl-sqlglot==18.5.2.dev45 +acryl-PyHive==0.6.16 +acryl-sqlglot==20.4.1.dev14 aenum==3.1.15 -aiohttp==3.8.6 +aiohttp==3.9.1 aiosignal==1.3.1 -alembic==1.12.0 +alembic==1.13.1 altair==4.2.0 +annotated-types==0.6.0 anyio==3.7.1 -apache-airflow==2.7.2 -apache-airflow-providers-common-sql==1.7.2 -apache-airflow-providers-ftp==3.5.2 -apache-airflow-providers-http==4.5.2 -apache-airflow-providers-imap==3.3.2 -apache-airflow-providers-sqlite==3.4.3 -apispec==6.3.0 +apache-airflow==2.7.3 +apache-airflow-providers-common-sql==1.9.0 +apache-airflow-providers-ftp==3.7.0 +apache-airflow-providers-http==4.8.0 +apache-airflow-providers-imap==3.5.0 +apache-airflow-providers-sqlite==3.6.0 +apispec==6.3.1 appdirs==1.4.4 appnope==0.1.3 -argcomplete==3.1.2 +argcomplete==3.2.1 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 asgiref==3.7.2 asn1crypto==1.5.1 -asttokens==2.4.0 +asttokens==2.4.1 async-timeout==4.0.3 -asynch==0.2.2 +asynch==0.2.3 attrs==23.1.0 -avro==1.10.2 +avro==1.11.3 avro-gen3==0.7.11 -Babel==2.13.0 -backcall==0.2.0 +Babel==2.14.0 
backoff==2.2.1 beautifulsoup4==4.12.2 bleach==6.1.0 -blinker==1.6.3 +blinker==1.7.0 blis==0.7.11 -boto3==1.28.62 -botocore==1.31.62 +boto3==1.34.8 +botocore==1.34.8 bowler==0.9.0 bracex==2.4 cached-property==1.5.2 cachelib==0.9.0 -cachetools==5.3.1 +cachetools==5.3.2 catalogue==2.0.10 -cattrs==23.1.2 -certifi==2023.7.22 +cattrs==23.2.3 +certifi==2023.11.17 cffi==1.16.0 chardet==5.2.0 -charset-normalizer==3.3.0 -ciso8601==2.3.0 +charset-normalizer==3.3.2 +ciso8601==2.3.1 click==8.1.7 click-default-group==1.2.4 click-spinner==0.1.10 clickclick==20.10.2 -clickhouse-cityhash==1.0.2.4 clickhouse-driver==0.2.6 clickhouse-sqlalchemy==0.2.4 -cloudpickle==2.2.1 +cloudpickle==3.0.0 colorama==0.4.6 colorlog==4.8.0 -comm==0.1.4 -confection==0.1.3 -ConfigUpdater==3.1.1 +comm==0.2.0 +confection==0.1.4 +ConfigUpdater==3.2 confluent-kafka==2.3.0 connexion==2.14.2 cron-descriptor==1.4.0 croniter==2.0.1 -cryptography==41.0.4 +cryptography==41.0.7 cx-Oracle==8.3.0 cymem==2.0.8 -dask==2023.9.3 +dask==2023.12.1 databricks-cli==0.18.0 databricks-dbapi==0.6.0 -databricks-sdk==0.10.0 +databricks-sdk==0.15.0 +databricks-sql-connector==2.9.3 debugpy==1.8.0 decorator==5.1.1 defusedxml==0.7.1 -deltalake==0.11.0 +deltalake==0.14.0 Deprecated==1.2.14 dill==0.3.7 dnspython==2.4.2 -docker==6.1.3 +docker==7.0.0 docutils==0.20.1 ecdsa==0.18.0 elasticsearch==7.13.4 email-validator==1.3.1 entrypoints==0.4 et-xmlfile==1.1.0 -exceptiongroup==1.1.3 -executing==2.0.0 -expandvars==0.11.0 -fastapi==0.103.2 -fastavro==1.8.4 -fastjsonschema==2.18.1 +exceptiongroup==1.2.0 +executing==2.0.1 +expandvars==0.12.0 +fastapi==0.108.0 +fastavro==1.9.2 +fastjsonschema==2.19.0 feast==0.31.1 -filelock==3.12.4 +filelock==3.13.1 fissix==21.11.13 Flask==2.2.5 flatdict==4.0.1 -frozenlist==1.4.0 -fsspec==2023.9.2 +frozenlist==1.4.1 +fsspec==2023.12.2 future==0.18.3 -GeoAlchemy2==0.14.1 -gitdb==4.0.10 -GitPython==3.1.37 -google-api-core==2.12.0 -google-auth==2.23.3 -google-cloud-appengine-logging==1.3.2 +GeoAlchemy2==0.14.3 
+gitdb==4.0.11 +GitPython==3.1.40 +google-api-core==2.15.0 +google-auth==2.25.2 +google-cloud-appengine-logging==1.4.0 google-cloud-audit-log==0.2.5 -google-cloud-bigquery==3.12.0 -google-cloud-core==2.3.3 +google-cloud-bigquery==3.14.1 +google-cloud-core==2.4.1 google-cloud-datacatalog-lineage==0.2.2 google-cloud-logging==3.5.0 google-crc32c==1.5.0 google-re2==1.1 -google-resumable-media==2.6.0 -googleapis-common-protos==1.60.0 +google-resumable-media==2.7.0 +googleapis-common-protos==1.62.0 gql==3.4.1 graphql-core==3.2.3 graphviz==0.20.1 great-expectations==0.15.50 -greenlet==3.0.0 -grpc-google-iam-v1==0.12.6 -grpcio==1.59.0 -grpcio-reflection==1.59.0 -grpcio-status==1.59.0 -grpcio-tools==1.59.0 +greenlet==3.0.3 +grpc-google-iam-v1==0.13.0 +grpcio==1.60.0 +grpcio-reflection==1.60.0 +grpcio-status==1.60.0 +grpcio-tools==1.60.0 gssapi==1.8.3 gunicorn==21.2.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 +hdbcli==2.19.20 +httpcore==1.0.2 +httptools==0.6.1 +httpx==0.26.0 humanfriendly==10.0 -idna==3.4 +idna==3.6 ijson==3.2.3 -importlib-metadata==6.8.0 -importlib-resources==6.1.0 +importlib-metadata==6.11.0 +importlib-resources==6.1.1 inflection==0.5.1 ipaddress==1.0.23 ipykernel==6.17.1 -ipython==8.16.1 +ipython==8.19.0 ipython-genutils==0.2.0 ipywidgets==8.1.1 iso3166==2.1.1 @@ -152,34 +150,34 @@ itsdangerous==2.1.2 jedi==0.19.1 Jinja2==3.1.2 jmespath==1.0.1 -JPype1==1.4.1 +JPype1==1.5.0 jsonlines==4.0.0 jsonpatch==1.33 jsonpointer==2.4 jsonref==1.1.0 -jsonschema==4.19.1 -jsonschema-specifications==2023.7.1 +jsonschema==4.20.0 +jsonschema-specifications==2023.12.1 jupyter-server==1.24.0 jupyter_client==7.4.9 jupyter_core==4.12.0 -jupyterlab-pygments==0.2.2 jupyterlab-widgets==3.0.9 +jupyterlab_pygments==0.3.0 langcodes==3.3.0 lark==1.1.4 -lazy-object-proxy==1.9.0 +lazy-object-proxy==1.10.0 leb128==1.0.5 -limits==3.6.0 +limits==3.7.0 linear-tsv==1.1.0 linkify-it-py==2.0.2 -lkml==1.3.1 +lkml==1.3.3 locket==1.0.0 lockfile==0.12.2 looker-sdk==23.0.0 
-lxml==4.9.3 +lxml==4.9.4 lz4==4.3.2 -makefun==1.15.1 -Mako==1.2.4 -Markdown==3.5 +makefun==1.15.2 +Mako==1.3.0 +Markdown==3.5.1 markdown-it-py==3.0.0 MarkupSafe==2.1.3 marshmallow==3.20.1 @@ -190,26 +188,26 @@ mdit-py-plugins==0.4.0 mdurl==0.1.2 mistune==3.0.2 mixpanel==4.10.0 -mlflow-skinny==2.7.1 +mlflow-skinny==2.9.2 mmh3==4.0.1 mmhash3==3.0.1 more-itertools==10.1.0 moreorless==0.4.0 -moto==4.2.5 +moto==4.2.12 msal==1.22.0 multidict==6.0.4 murmurhash==1.0.10 -mypy==1.6.0 +mypy==1.8.0 mypy-extensions==1.0.0 nbclassic==1.0.0 nbclient==0.6.3 -nbconvert==7.9.2 +nbconvert==7.13.1 nbformat==5.9.1 nest-asyncio==1.5.8 -networkx==3.1 +networkx==3.2.1 notebook==6.5.6 notebook_shim==0.2.3 -numpy==1.26.0 +numpy==1.26.2 oauthlib==3.2.2 okta==1.7.0 openlineage-airflow==1.2.0 @@ -217,110 +215,107 @@ openlineage-integration-common==1.2.0 openlineage-python==1.2.0 openlineage_sql==1.2.0 openpyxl==3.1.2 -opentelemetry-api==1.20.0 -opentelemetry-exporter-otlp==1.20.0 -opentelemetry-exporter-otlp-proto-common==1.20.0 -opentelemetry-exporter-otlp-proto-grpc==1.20.0 -opentelemetry-exporter-otlp-proto-http==1.20.0 -opentelemetry-proto==1.20.0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 +opentelemetry-api==1.22.0 +opentelemetry-exporter-otlp==1.22.0 +opentelemetry-exporter-otlp-proto-common==1.22.0 +opentelemetry-exporter-otlp-proto-grpc==1.22.0 +opentelemetry-exporter-otlp-proto-http==1.22.0 +opentelemetry-proto==1.22.0 +opentelemetry-sdk==1.22.0 +opentelemetry-semantic-conventions==0.43b0 ordered-set==4.1.0 -oscrypto==1.3.0 packaging==23.2 pandas==1.5.3 pandavro==1.5.2 pandocfilters==1.5.0 -parse==1.19.1 +parse==1.20.0 parso==0.8.3 partd==1.4.1 -pathspec==0.11.2 -pathy==0.10.2 +pathspec==0.12.1 +pathy==0.10.3 pendulum==2.1.2 -pexpect==4.8.0 +pexpect==4.9.0 phonenumbers==8.13.0 -pickleshare==0.7.5 platformdirs==3.11.0 pluggy==1.3.0 preshed==3.0.9 prison==0.2.1 -progressbar2==4.2.0 -prometheus-client==0.17.1 -prompt-toolkit==3.0.39 -proto-plus==1.22.3 
-protobuf==4.24.4 -psutil==5.9.5 +progressbar2==4.3.2 +prometheus-client==0.19.0 +prompt-toolkit==3.0.43 +proto-plus==1.23.0 +protobuf==4.25.1 +psutil==5.9.7 psycopg2-binary==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 pure-sasl==0.6.2 -py-partiql-parser==0.3.7 +py-partiql-parser==0.5.0 pyarrow==11.0.0 -pyasn1==0.5.0 +pyasn1==0.5.1 pyasn1-modules==0.3.0 -pyathena==2.4.1 -pycountry==22.3.5 +pyathena==2.25.2 +pycountry==23.12.11 pycparser==2.21 pycryptodome==3.19.0 -pycryptodomex==3.19.0 pydantic==1.10.13 +pydantic_core==2.14.6 pydash==7.0.6 -pydruid==0.6.5 -Pygments==2.16.1 +pydruid==0.6.6 +Pygments==2.17.2 pyiceberg==0.4.0 -pymongo==4.5.0 +pymongo==4.6.1 PyMySQL==1.1.0 -pyOpenSSL==23.2.0 +pyOpenSSL==23.3.0 pyparsing==3.0.9 pyspnego==0.10.2 python-daemon==3.0.1 python-dateutil==2.8.2 python-dotenv==1.0.0 python-jose==3.3.0 -python-ldap==3.4.3 +python-ldap==3.4.4 python-nvd3==0.15.0 python-slugify==8.0.1 python-stdnum==1.19 -python-tds==1.13.0 +python-tds==1.14.0 python-utils==3.8.1 python3-openid==3.2.0 pytz==2023.3.post1 pytzdata==2020.1 PyYAML==6.0.1 pyzmq==24.0.1 -ratelimiter==1.2.0.post0 redash-toolbelt==0.1.9 -redshift-connector==2.0.914 -referencing==0.30.2 -regex==2023.10.3 +redshift-connector==2.0.918 +referencing==0.32.0 +regex==2023.12.25 requests==2.31.0 requests-file==1.5.1 requests-gssapi==1.2.3 requests-ntlm==1.2.0 requests-toolbelt==0.10.1 -responses==0.23.3 +responses==0.24.1 rfc3339-validator==0.1.4 rfc3986==2.0.0 -rich==13.6.0 -rich-argparse==1.3.0 -rpds-py==0.10.6 +rich==13.7.0 +rich-argparse==1.4.0 +rpds-py==0.15.2 rsa==4.9 ruamel.yaml==0.17.17 ruamel.yaml.clib==0.2.8 -s3transfer==0.7.0 -schwifty==2023.9.0 -scipy==1.11.3 +s3transfer==0.10.0 +schwifty==2023.11.2 +scipy==1.11.4 scramp==1.4.4 Send2Trash==1.8.2 -sentry-sdk==1.32.0 +sentry-sdk==1.39.1 setproctitle==1.3.3 simple-salesforce==1.12.5 six==1.16.0 smart-open==6.4.0 smmap==5.0.1 sniffio==1.3.0 -snowflake-connector-python==3.2.1 -snowflake-sqlalchemy==1.5.0 +snowflake-connector-python==3.6.0 
+snowflake-sqlalchemy==1.5.1 sortedcontainers==2.4.0 soupsieve==2.5 spacy==3.4.3 @@ -328,67 +323,71 @@ spacy-legacy==3.0.12 spacy-loggers==1.0.5 sql-metadata==2.2.2 SQLAlchemy==1.4.44 -sqlalchemy-bigquery==1.8.0 -SQLAlchemy-JSONField==1.0.1.post0 +sqlalchemy-bigquery==1.9.0 +sqlalchemy-hana==1.1.1 +SQLAlchemy-JSONField==1.0.2 sqlalchemy-pytds==0.3.5 sqlalchemy-redshift==0.8.14 SQLAlchemy-Utils==0.41.1 -sqlalchemy2-stubs==0.0.2a35 +sqlalchemy2-stubs==0.0.2a37 sqllineage==1.3.8 sqlparse==0.4.4 srsly==2.4.8 stack-data==0.6.3 -starlette==0.27.0 +starlette==0.32.0.post1 strictyaml==1.7.3 tableauserverclient==0.25 tableschema==1.20.2 tabulate==0.9.0 tabulator==1.53.5 tenacity==8.2.3 -termcolor==2.3.0 -terminado==0.17.1 +teradatasql==20.0.0.2 +teradatasqlalchemy==17.20.0.0 +termcolor==2.4.0 +terminado==0.18.0 text-unidecode==1.3 thinc==8.1.12 -thrift==0.13.0 +thrift==0.16.0 thrift-sasl==0.4.3 tinycss2==1.2.1 toml==0.10.2 tomli==2.0.1 -tomlkit==0.12.1 +tomlkit==0.12.3 toolz==0.12.0 -tornado==6.3.3 +tornado==6.4 tqdm==4.66.1 traitlets==5.2.1.post0 trino==0.327.0 typeguard==2.13.3 typer==0.7.0 -types-PyYAML==6.0.12.12 typing-inspect==0.9.0 -typing_extensions==4.8.0 -tzlocal==5.1 +typing_extensions==4.9.0 +tzlocal==5.2 uc-micro-py==1.0.2 -ujson==5.8.0 +ujson==5.9.0 unicodecsv==0.14.1 -urllib3==1.26.17 -uvicorn==0.23.2 -uvloop==0.17.0 -vertica-python==1.3.5 -vertica-sqlalchemy-dialect==0.0.8 +universal-pathlib==0.1.4 +urllib3==1.26.18 +uvicorn==0.25.0 +uvloop==0.19.0 +vertica-python==1.3.8 +vertica-sqlalchemy-dialect==0.0.8.1 vininfo==1.7.0 volatile==2.1.0 wasabi==0.10.1 -watchfiles==0.20.0 +watchfiles==0.21.0 wcmatch==8.5 -wcwidth==0.2.8 +wcwidth==0.2.12 webencodings==0.5.1 -websocket-client==1.6.4 -websockets==11.0.3 +websocket-client==1.7.0 +websockets==12.0 Werkzeug==2.2.3 widgetsnbextension==4.0.9 -wrapt==1.15.0 -WTForms==3.1.0 +wrapt==1.16.0 +WTForms==3.0.1 xlrd==2.0.1 xmltodict==0.13.0 -yarl==1.9.2 +yarl==1.9.4 zeep==4.2.1 -zstd==1.5.5.1 \ No newline at end of file 
+zipp==3.17.0
+zstd==1.5.5.1
diff --git a/docker/datahub-ingestion-base/regenerate-base-requirements.sh b/docker/datahub-ingestion-base/regenerate-base-requirements.sh
new file mode 100755
index 00000000000000..6fb331afa484a3
--- /dev/null
+++ b/docker/datahub-ingestion-base/regenerate-base-requirements.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Regenerates base-requirements.txt, which lives next to this script, from the output of pip freeze.
+# Usage: ./regenerate-base-requirements.sh. Needs python 3.9 or newer available as python.
+
+set -euxo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+SCRIPT_NAME=${BASH_SOURCE[0]##*/}  # Use BASH_SOURCE, not $0, so the name agrees with the cd above.
+DATAHUB_DIR="$(pwd)/../.."
+
+# Create a throwaway virtualenv. The EXIT trap deletes it on success and on failure, so runs do not leak temp dirs.
+VENV_DIR=$(mktemp -d)
+trap 'rm -rf -- "${VENV_DIR:?}"' EXIT
+python -c "import sys; assert sys.version_info >= (3, 9), 'Python 3.9 or higher is required.'"
+python -m venv "$VENV_DIR"
+source "$VENV_DIR/bin/activate"
+pip install --upgrade pip setuptools wheel
+echo "Using virtualenv at $VENV_DIR"
+
+# Install metadata-ingestion, the airflow plugin with its plugin-v2 extra, and the all extra, in editable mode.
+pushd "$DATAHUB_DIR/metadata-ingestion"
+pip install -e .
+pip install -e '../metadata-ingestion-modules/airflow-plugin/[plugin-v2]'
+pip install -e '.[all]'
+popd
+
+# Generate the requirements file.
+# Removing -e lines, which pip freeze emits for the editable installs above instead of name==version pins.
+# Removing Flask-* deps as per https://github.com/datahub-project/datahub/pull/6867/files
+# Removing py4j and PyJWT due to https://github.com/datahub-project/datahub/pull/6868/files
+# Removing pyspark and pydeequ because we do not want them in the slim image, so they can be added separately.
+# TODO: It is unclear if these removals are still actually needed.
+echo "# Generated requirements file. Run ./$SCRIPT_NAME to regenerate." > base-requirements.txt
+pip freeze \
+  | grep -v -E -e '^-e' -e 'Flask-' -e '(py4j|PyJWT)==' -e '(pyspark|pydeequ)==' \
+  >> base-requirements.txt