Skip to content

Build data

Build data #847

Workflow file for this run

# On every push this script is executed
on:
workflow_dispatch:
push:
branches:
- main
schedule:
- cron: "0 3 * * *"
concurrency: data
name: Build data
jobs:
build-data:
runs-on: ubuntu-latest
permissions:
contents: write
packages: write
steps:
- name: Max build space
run: |
rm -rf /usr/share/dotnet/ &
rm -rf /usr/local/lib/android/ &
rm -rf /opt/ghc/ &
rm -rf /opt/hostedtoolcache/CodeQL/ &
sudo docker image prune --all --force &
df -h
- name: checkout
uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
- uses: actions/setup-python@v5
with:
python-version-file: "pyproject.toml"
- name: Install deps
run: uv sync --frozen
- name: "Set current date as env variable"
run: |
echo "tag_name=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_OUTPUT
id: version
- name: Create Release
id: create-release
uses: shogo82148/actions-create-release@v1
with:
overwrite: true
draft: true
release_name: ${{ steps.version.outputs.tag_name }}
tag_name: ${{ steps.version.outputs.tag_name }}
commitish: ${{ github.sha }}
- name: Generate token
id: generate_token
uses: pypi-data/github-app-token@v2
with:
app_id: ${{ secrets.APP_ID }}
private_key: ${{ secrets.APP_PRIVATE_KEY }}
revoke: false
- name: Download hishel cache
uses: actions/cache/restore@v4
id: hisel-cache
with:
path: .hishel.sqlite
key: ${{ runner.os }}-hishel-${{ hashFiles('links/*.json') }}
- name: Generate Repo Metadata
env:
GITHUB_TOKEN: ${{ steps.generate_token.outputs.token }}
run: uv run pypi-data load-repos repos-with-packages.jsonl.gz links/
- name: Upload hishel cache
uses: actions/cache/save@v4
if: steps.hisel-cache.outputs.cache-hit != 'true'
with:
path: .hishel.sqlite
key: ${{ steps.hisel-cache.outputs.cache-primary-key }}
- name: Upload Repos with packages
uses: shogo82148/actions-upload-release-asset@v1
with:
upload_url: ${{ steps.create-release.outputs.upload_url }}
asset_path: ${{ github.workspace }}/repos-with-packages.jsonl.gz
- name: Remove repos-with-packages.jsonl.gz
run: rm repos-with-packages.jsonl.gz
- name: Create dataset
run: uv run pypi-data merge-datasets links/repositories.json dataset/
- name: Debug
run: ls -la dataset/
- name: Upload Dataset
uses: shogo82148/actions-upload-release-asset@v1
id: upload-dataset
with:
upload_url: ${{ steps.create-release.outputs.upload_url }}
asset_path: ${{ github.workspace }}/dataset/dataset-*.parquet
asset_content_type: application/vnd.apache.parquet
- name: Publish Draft Release
uses: actions/github-script@v7
with:
script: |
const releaseId = "${{ steps.create-release.outputs.id }}";
const response = await github.rest.repos.updateRelease({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: releaseId,
draft: false
});
if (response.status === 200) {
core.info(`Release ${releaseId} successfully published.`);
} else {
core.setFailed(`Failed to publish release ${releaseId}.`);
}
- name: Get download links
id: get_download_links
uses: actions/github-script@v7
with:
script: |
const response = await github.rest.repos.listReleaseAssets({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: ${{ steps.create-release.outputs.id }},
});
const assets = response.data;
const urls = assets.filter(asset => asset.name.endsWith(".parquet")).map(asset => asset.browser_download_url)
// Save the download links to a file
require('fs').writeFileSync('links/dataset.txt', urls.join('\n'));
const jsonUrls = assets.filter(asset => asset.name == "repos-with-packages.jsonl.gz").map(asset => asset.browser_download_url)
require('fs').writeFileSync('links/repositories-with-packages.txt', jsonUrls.join('\n'));
- uses: EndBug/add-and-commit@v9
with:
add: links/*
message: "Add repository URLs"
push: true
fetch: true
pull: '--rebase --autostash'