Skip to content

Commit

Permalink
update query and drive link (#13)
Browse files Browse the repository at this point in the history
  • Loading branch information
fpgmaas authored Jul 3, 2024
1 parent 40ad8ee commit 4ec88d3
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 17 deletions.
38 changes: 22 additions & 16 deletions pypi_bigquery.sql
Original file line number Diff line number Diff line change
@@ -1,36 +1,42 @@
-- NOTE(review): this span reads as a rendered diff hunk of pypi_bigquery.sql. Superseded and
-- replacement lines sit next to each other without +/- markers, so the text below is not a
-- single runnable query as it stands. Comments mark the pairs that appear to be before/after
-- versions of the same clause.
--
-- recent_downloads: counts the rows of file_downloads per project over a recent date window
-- and keeps only projects whose count is at least 100.
WITH recent_downloads AS (
SELECT
-- Lower-cased project name, used as the join key against the metadata names further down.
LOWER(project) AS project_lower,
project,
COUNT(*) AS download_count
FROM
`bigquery-public-data.pypi.file_downloads`
WHERE
-- BETWEEN is inclusive at both ends, so the window runs from the date 7 days ago through the
-- current date (eight calendar dates, the last one possibly still incomplete).
DATE(timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY) AND CURRENT_DATE()
GROUP BY
-- NOTE(review): the next two lines look like the before/after forms of the grouping key.
-- The second form groups by both the lower-cased and the original spelling, so spellings
-- that differ only by case would get separate rows and separate counts. TODO confirm
-- whether project values are already normalized in this table.
project
LOWER(project), project
HAVING
-- NOTE(review): the next two lines look like the before/after forms of the threshold:
-- one refers to the SELECT alias, the other repeats the aggregate expression.
download_count >= 100
COUNT(*) >= 100
),
-- latest_metadata: every row of distribution_metadata, with rn numbering the rows inside each
-- lower-cased name from the newest upload_time to the oldest, so rn = 1 marks the most recent
-- upload for that name. Rows sharing the same upload_time get an arbitrary relative order.
latest_metadata AS (
SELECT
LOWER(name) AS name_lower,
name,
description,
summary,
version,
upload_time,
ROW_NUMBER() OVER (PARTITION BY LOWER(name) ORDER BY upload_time DESC) AS rn
FROM
`bigquery-public-data.pypi.distribution_metadata`
)
-- Final result: one output row per matched project with its metadata and download count,
-- sorted from the highest download count to the lowest.
SELECT
-- NOTE(review): the rd/dm column list (first four lines) and the lm column list (next four)
-- look like the before/after forms of the same projection. Both produce the output columns
-- name, description, summary and latest_version.
rd.project AS name,
dm.description AS description,
dm.summary AS summary,
dm.version AS latest_version,
lm.name AS name,
lm.description AS description,
lm.summary AS summary,
lm.version AS latest_version,
rd.download_count AS number_of_downloads
FROM
recent_downloads rd
JOIN
-- NOTE(review): two join targets appear here: the raw metadata table aliased dm, and the
-- latest_metadata CTE aliased lm. They pair with the two ON conditions below.
`bigquery-public-data.pypi.distribution_metadata` dm
latest_metadata lm
ON
-- The dm form compares names exactly; the lm form compares the lower-cased keys, which makes
-- the match case-insensitive.
rd.project = dm.name
rd.project_lower = lm.name_lower
WHERE
-- The dm form keeps rows whose upload_time equals the maximum upload_time for the same name,
-- found with a correlated subquery. Every row tied at that maximum passes the filter.
dm.upload_time = (
SELECT
MAX(upload_time)
FROM
`bigquery-public-data.pypi.distribution_metadata` sub_dm
WHERE
sub_dm.name = dm.name
)
-- The lm form keeps only the row numbered 1 by the window function in latest_metadata, so at
-- most one metadata row per lower-cased name survives.
lm.rn = 1
ORDER BY
rd.download_count DESC;
2 changes: 1 addition & 1 deletion pypi_scout/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class Config:
EMBEDDINGS_PARQUET_NAME = "embeddings.parquet"

# Google Drive file ID for downloading the raw dataset.
GOOGLE_FILE_ID = "1IDJvCsq1gz0yUSXgff13pMl3nUk7zJzb"
GOOGLE_FILE_ID = "12AH8PwKvZqRhXBf9uS1qRZq1-k3gIhhG"

# Fraction of the dataset to include in the vector database. This value determines the portion of top packages
# (sorted by weekly downloads) to include. Increase this value to include a larger portion of the dataset, up to 1.0 (100%).
Expand Down

0 comments on commit 4ec88d3

Please sign in to comment.