From 08605a95a78df3f2a47c42a1e595b01f52dcc5e5 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 19 Dec 2024 11:02:37 -0500 Subject: [PATCH] feat(python): split out temp wheel builds (#12157) --- .github/workflows/airflow-plugin.yml | 5 +- .github/workflows/dagster-plugin.yml | 8 +- .github/workflows/gx-plugin.yml | 8 +- .github/workflows/metadata-ingestion.yml | 9 +- .github/workflows/prefect-plugin.yml | 17 +-- .github/workflows/python-build-pages.yml | 64 ++++++++++ docs-website/build.gradle | 6 +- docs-website/generateDocsDir.ts | 24 ++-- metadata-ingestion/build.gradle | 4 +- python-build/.gitignore | 3 + python-build/build.gradle | 27 ++++ python-build/build_site.py | 150 +++++++++++++++++++++++ python-build/copy_wheels.py | 27 ++++ settings.gradle | 1 + 14 files changed, 304 insertions(+), 49 deletions(-) create mode 100644 .github/workflows/python-build-pages.yml create mode 100644 python-build/.gitignore create mode 100644 python-build/build.gradle create mode 100644 python-build/build_site.py create mode 100644 python-build/copy_wheels.py diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index eefa02be4f1af8..26fcceb8aeab70 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -27,7 +27,6 @@ jobs: airflow-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -69,7 +68,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }} with: name: Test Results (Airflow Plugin ${{ matrix.python-version}}) @@ -93,7 +92,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: 
name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index bee1ec95e77747..d8a9cd7bfd6a35 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -27,7 +27,6 @@ jobs: dagster-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -44,7 +43,8 @@ jobs: with: distribution: "zulu" java-version: 17 - - uses: actions/checkout@v4 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -56,7 +56,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} with: name: Test Results (dagster Plugin ${{ matrix.python-version}}) @@ -79,7 +79,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml index 595438bd6e4a90..2fd814a0764858 100644 --- a/.github/workflows/gx-plugin.yml +++ b/.github/workflows/gx-plugin.yml @@ -27,7 +27,6 @@ jobs: gx-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -48,7 +47,8 @@ jobs: with: distribution: "zulu" java-version: 17 - - uses: actions/checkout@v4 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -60,7 +60,7 @@ jobs: - name: pip freeze show list installed if: always() run: source 
metadata-ingestion-modules/gx-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.11' && matrix.extraPythonRequirement == 'great-expectations~=0.17.0' }} with: name: Test Results (GX Plugin ${{ matrix.python-version}}) @@ -83,7 +83,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 49def2a863c565..ad00c6d1551d1d 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -28,7 +28,6 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 40 env: - SPARK_VERSION: 3.3.2 DATAHUB_TELEMETRY_ENABLED: false # TODO: Enable this once the test is fixed. # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} @@ -84,9 +83,9 @@ jobs: df -hl docker image ls docker system df - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: Test Results (metadata ingestion ${{ matrix.python-version }}) + name: Test Results (metadata ingestion ${{ matrix.python-version }} ${{ matrix.command }}) path: | **/build/reports/tests/test/** **/build/test-results/test/** @@ -100,14 +99,14 @@ jobs: directory: ./build/coverage-reports/ fail_ci_if_error: false flags: pytest-${{ matrix.command }} - name: pytest-${{ matrix.command }} + name: pytest-${{ matrix.python-version }}-${{ matrix.command }} verbose: true event-file: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index 3c75e8fe9a62ff..e4a70426f3a618 100644 --- a/.github/workflows/prefect-plugin.yml +++ 
b/.github/workflows/prefect-plugin.yml @@ -27,25 +27,20 @@ jobs: prefect-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: python-version: ["3.8", "3.9", "3.10"] - include: - - python-version: "3.8" - - python-version: "3.9" - - python-version: "3.10" fail-fast: false steps: - name: Set up JDK 17 - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: "zulu" java-version: 17 - uses: gradle/actions/setup-gradle@v3 - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: "pip" @@ -56,7 +51,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/prefect-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10'}} with: name: Test Results (Prefect Plugin ${{ matrix.python-version}}) @@ -72,7 +67,7 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} directory: ./build/coverage-reports/ fail_ci_if_error: false - flags: prefect,prefect-${{ matrix.extra_pip_extras }} + flags: prefect,prefect-${{ matrix.python-version }} name: pytest-prefect-${{ matrix.python-version }} verbose: true @@ -80,7 +75,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/python-build-pages.yml b/.github/workflows/python-build-pages.yml new file mode 100644 index 00000000000000..8971722c374fb7 --- /dev/null +++ b/.github/workflows/python-build-pages.yml @@ -0,0 +1,64 @@ +name: Python Build +on: + push: + branches: + - master + paths: + - ".github/workflows/python-build-pages.yml" + - "metadata-ingestion/**" + - "metadata-ingestion-modules/**" + - "metadata-models/**" + 
pull_request: + branches: + - "**" + paths: + - ".github/workflows/python-build-pages.yml" + - "metadata-ingestion/**" + - "metadata-ingestion-modules/**" + - "metadata-models/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + deploy-pages: + runs-on: ubuntu-latest + if: ${{ vars.CLOUDFLARE_WHEELS_PROJECT_NAME != '' }} + + name: Python Wheels + permissions: + contents: read + pull-requests: read + deployments: write + steps: + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + distribution: "zulu" + java-version: 17 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: "pip" + - uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }} + - name: Build Python wheel site + run: | + ./gradlew :python-build:buildSite + env: + GITHUB_TOKEN: ${{ github.token }} + - name: Publish + uses: cloudflare/pages-action@v1 + with: + apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} + accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + projectName: ${{ vars.CLOUDFLARE_WHEELS_PROJECT_NAME }} + workingDirectory: python-build + directory: site + gitHubToken: ${{ github.token }} diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 1860b4a49ae23a..797863d2019fbd 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -83,11 +83,7 @@ task yarnInstall(type: YarnTask) { task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLSchema, generateJsonSchema, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen', - ':metadata-ingestion:buildWheel', - ':metadata-ingestion-modules:airflow-plugin:buildWheel', - ':metadata-ingestion-modules:dagster-plugin:buildWheel', - ':metadata-ingestion-modules:prefect-plugin:buildWheel', - 
':metadata-ingestion-modules:gx-plugin:buildWheel', + ':python-build:buildWheels', ]) { inputs.files(projectMdFiles) outputs.cacheIf { true } diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 0f7e347da64eba..ad82a85f9e5672 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -573,26 +573,20 @@ function write_markdown_file( function copy_python_wheels(): void { // Copy the built wheel files to the static directory. - const wheel_dirs = [ - "../metadata-ingestion/dist", - "../metadata-ingestion-modules/airflow-plugin/dist", - "../metadata-ingestion-modules/dagster-plugin/dist", - "../metadata-ingestion-modules/prefect-plugin/dist", - "../metadata-ingestion-modules/gx-plugin/dist", - ]; + // Everything is copied to the python-build directory first, so + // we just need to copy from there. + const wheel_dir = "../python-build/wheels"; const wheel_output_directory = path.join(STATIC_DIRECTORY, "wheels"); fs.mkdirSync(wheel_output_directory, { recursive: true }); - for (const wheel_dir of wheel_dirs) { - const wheel_files = fs.readdirSync(wheel_dir); - for (const wheel_file of wheel_files) { - const src = path.join(wheel_dir, wheel_file); - const dest = path.join(wheel_output_directory, wheel_file); + const wheel_files = fs.readdirSync(wheel_dir); + for (const wheel_file of wheel_files) { + const src = path.join(wheel_dir, wheel_file); + const dest = path.join(wheel_output_directory, wheel_file); - // console.log(`Copying artifact ${src} to ${dest}...`); - fs.copyFileSync(src, dest); - } + // console.log(`Copying artifact ${src} to ${dest}...`); + fs.copyFileSync(src, dest); } } diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 2c5d8e6c9646a8..fc1409fbed74e4 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -23,8 +23,8 @@ task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { inputs.file file('setup.py') 
outputs.file(sentinel_file)
   commandLine 'bash', '-c',
-    "${python_executable} -m venv ${venv_name} && " +
-    "${venv_name}/bin/python -m pip install --upgrade pip uv wheel 'setuptools>=63.0.0' && " +
+    "${python_executable} -m venv ${venv_name} && set -x && " +
+    "${venv_name}/bin/python -m pip install --upgrade uv && " +
     "touch ${sentinel_file}"
 }
 
diff --git a/python-build/.gitignore b/python-build/.gitignore
new file mode 100644
index 00000000000000..d2de6dec258091
--- /dev/null
+++ b/python-build/.gitignore
@@ -0,0 +1,3 @@
+
+/wheels
+/site
diff --git a/python-build/build.gradle b/python-build/build.gradle
new file mode 100644
index 00000000000000..e90bffd46828ce
--- /dev/null
+++ b/python-build/build.gradle
@@ -0,0 +1,27 @@
+plugins {
+  id 'base'
+}
+
+ext {
+  python_executable = 'python3'
+}
+
+task checkPythonVersion(type: Exec) {
+  commandLine python_executable, '-c',
+    'import sys; assert sys.version_info >= (3, 8), f"Python version {sys.version_info} is too old"'
+}
+
+task buildWheels(type: Exec, dependsOn: [
+  checkPythonVersion,
+  ':metadata-ingestion:buildWheel',
+  ':metadata-ingestion-modules:airflow-plugin:buildWheel',
+  ':metadata-ingestion-modules:dagster-plugin:buildWheel',
+  ':metadata-ingestion-modules:prefect-plugin:buildWheel',
+  ':metadata-ingestion-modules:gx-plugin:buildWheel',
+]) {
+  commandLine python_executable, "copy_wheels.py"
+}
+
+task buildSite(type: Exec, dependsOn: [buildWheels]) {
+  commandLine python_executable, "build_site.py"
+}
diff --git a/python-build/build_site.py b/python-build/build_site.py
new file mode 100644
index 00000000000000..73941eca9968ca
--- /dev/null
+++ b/python-build/build_site.py
@@ -0,0 +1,150 @@
+import contextlib
+import json
+import os
+import pathlib
+import shutil
+import subprocess
+from datetime import datetime, timezone
+
+PYTHON_BUILD_DIR = pathlib.Path(__file__).parent
+WHEEL_DIR = PYTHON_BUILD_DIR / "wheels"
+SITE_OUTPUT_DIR = PYTHON_BUILD_DIR / "site"
+
+shutil.rmtree(SITE_OUTPUT_DIR,
ignore_errors=True) +SITE_OUTPUT_DIR.mkdir(parents=True) + +SITE_ARTIFACT_WHEEL_DIR = SITE_OUTPUT_DIR / "artifacts" / "wheels" +SITE_ARTIFACT_WHEEL_DIR.mkdir(parents=True) +for wheel_file in WHEEL_DIR.glob("*"): + shutil.copy(wheel_file, SITE_ARTIFACT_WHEEL_DIR) + + +def package_name(wheel_file: pathlib.Path) -> str: + return wheel_file.name.split("-")[0].replace("_", "-") + + +# Get some extra context about the build +ts = datetime.now(timezone.utc).isoformat() +context_info: dict = { + "timestamp": ts, +} + +# Get branch info. +with contextlib.suppress(Exception): + if branch_info := os.getenv("GITHUB_HEAD_REF"): + pass + else: + branch_info = subprocess.check_output( + ["git", "branch", "--show-current"], text=True + ) + context_info["branch"] = branch_info.strip() + +# Get commit info. +with contextlib.suppress(Exception): + commit_info = subprocess.check_output( + ["git", "log", "-1", "--pretty=%H%n%B"], text=True + ) + commit_hash, commit_msg = commit_info.strip().split("\n", 1) + context_info["commit"] = { + "hash": commit_hash, + "message": commit_msg.strip(), + } + +# Get PR info. +with contextlib.suppress(Exception): + pr_info = "unknown" + if github_ref := os.getenv("GITHUB_REF"): + # e.g. GITHUB_REF=refs/pull/12157/merge + parts = github_ref.split("/") + if parts[1] == "pull": + pull_number = parts[2] + pr_info = json.loads( + subprocess.check_output( + ["gh", "pr", "view", pull_number, "--json", "title,number,url"], + text=True, + ) + ) + else: + # The `gh` CLI might be able to figure it out. + pr_info = json.loads( + subprocess.check_output( + ["gh", "pr", "view", "--json", "title,number,url"], text=True + ) + ) + context_info["pr"] = pr_info + + +newline = "\n" +(SITE_OUTPUT_DIR / "index.html").write_text( + f""" + + + DataHub Python Builds + + + + + + + + + + + +
+

DataHub Python Builds

+

+ These prebuilt wheel files can be used to install our Python packages as of a specific commit. +

+ +

Build context

+

+ Built at {ts}. +

+
{json.dumps(context_info, indent=2)}
+ +

Usage

+

+ Current base URL: unknown +

+ + + + + + + + + + + { + newline.join( + f''' + + + + + + ''' + for wheel_file in sorted(WHEEL_DIR.glob("*.whl")) + ) + } + +
PackageSizeInstall command
{package_name(wheel_file)}{wheel_file.stat().st_size / 1024 / 1024:.3f} MBuv pip install '{package_name(wheel_file)} @ <base-url>/artifacts/wheels/{wheel_file.name}'
+
+ + + +""" +) + +print("DataHub Python wheel site built in", SITE_OUTPUT_DIR) diff --git a/python-build/copy_wheels.py b/python-build/copy_wheels.py new file mode 100644 index 00000000000000..b66662cbfe9914 --- /dev/null +++ b/python-build/copy_wheels.py @@ -0,0 +1,27 @@ +import pathlib +import shutil + +PYTHON_BUILD_DIR = pathlib.Path(__file__).parent +ROOT_DIR = PYTHON_BUILD_DIR.parent +WHEEL_OUTPUT_DIR = PYTHON_BUILD_DIR / "wheels" + +# These should line up with the build.gradle file. +wheel_dirs = [ + ROOT_DIR / "metadata-ingestion/dist", + ROOT_DIR / "metadata-ingestion-modules/airflow-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/dagster-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/prefect-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/gx-plugin/dist", +] + +# Delete and recreate the output directory. +if WHEEL_OUTPUT_DIR.exists(): + shutil.rmtree(WHEEL_OUTPUT_DIR) +WHEEL_OUTPUT_DIR.mkdir(parents=True) + +# Copy things over. +for wheel_dir in wheel_dirs: + for wheel_file in wheel_dir.glob("*"): + shutil.copy(wheel_file, WHEEL_OUTPUT_DIR) + +print("Copied wheels to", WHEEL_OUTPUT_DIR) diff --git a/settings.gradle b/settings.gradle index 8756df31c1ac6f..b0c2c707d566c0 100644 --- a/settings.gradle +++ b/settings.gradle @@ -64,6 +64,7 @@ include 'metadata-ingestion-modules:airflow-plugin' include 'metadata-ingestion-modules:gx-plugin' include 'metadata-ingestion-modules:dagster-plugin' include 'metadata-ingestion-modules:prefect-plugin' +include 'python-build' include 'smoke-test' include 'metadata-auth:auth-api' include 'metadata-service:schema-registry-api'