diff --git a/.github/workflows/repo_scraper.yml b/.github/workflows/repo_scraper.yml index 34b896735..e1b12aac3 100644 --- a/.github/workflows/repo_scraper.yml +++ b/.github/workflows/repo_scraper.yml @@ -1,35 +1,36 @@ name: Run repo scraper on: - #push: - # branches: [main] #schedule: # - cron: "0 0 * * *" # Run at the end of every day workflow_dispatch: {} # manual executions jobs: scrape: - name: Run repo scraper runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Python uses: actions/setup-python@v5 with: python-version: 3.11 + cache: 'pip' + cache-dependency-path: setup.py + + - name: Auth with GCP + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -r tools/repo_scraper/requirements.txt + pip install -r tools/repo_stats/requirements.txt pip list - name: Run repo scraper env: - DROPBOX_TOKEN: ${{ secrets.DROPBOX_TOKEN }} - DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} - DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} - DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} - GH_TOKEN: ${{ secrets.GH_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - python tools/repo_scraper/repo_scraper.py + python tools/repo_stats/scraper.py diff --git a/.gitignore b/.gitignore index 2552e7057..dd8897e76 100644 --- a/.gitignore +++ b/.gitignore @@ -156,10 +156,13 @@ node_modules/ latest_info.csv latest_repo_data.csv repo_data.csv +group_info.csv +repo_stats*.json student_repos/ reviews.csv bert_sentiment_model.pt prediction*.json +**/service_account_key.json # vscode .vscode/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0705e7efe..636cf94c0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: hooks: # try to fix what is possible - id: ruff - args: ["--fix"] + args: ["--fix", "--unsafe-fixes"] # perform formatting updates - id: ruff-format # validate if all is fine with preview mode diff --git a/reports/report.py b/reports/report.py index f8ef337f4..2c8d8bee6 100644 --- a/reports/report.py +++ b/reports/report.py @@ -19,7 +19,7 @@ def no_constraints(answer, index) -> None: def length_constraints(answer, index, min_length, max_length) -> None: - """Either min or maximum length contrains for question.""" + """Either min or maximum length constrains for question.""" answer = answer.split() if not (min_length <= len(answer) <= max_length): warnings.warn( diff --git a/s3_reproducibility/docker.md b/s3_reproducibility/docker.md index 2fcd489d9..cd3615c6f 100644 --- a/s3_reproducibility/docker.md +++ b/s3_reproducibility/docker.md @@ -158,6 +158,14 @@ beneficial for you to download. docker rm ``` + In general we recommend to use the `--rm` flag when running a container e.g. + + ```bash + docker run --rm + ``` + + which will automatically remove the container after it has finished running. + 9. Let's now move on to trying to construct a Dockerfile ourselves for our MNIST project. Create a file called `trainer.dockerfile`. The intention is that we want to develop one Dockerfile for running our training script and one for doing predictions. @@ -303,7 +311,7 @@ beneficial for you to download. to start the image in interactive mode: ```bash - docker run -it --entrypoint sh {image_name}:{image_name} + docker run --rm -it --entrypoint sh {image_name}:{image_tag} ``` 16. 
When your training has completed you will notice that any files that are created when running your training script diff --git a/tools/leaderboard_app/leaderboard_app.py b/tools/leaderboard_app/leaderboard_app.py deleted file mode 100644 index bdee74854..000000000 --- a/tools/leaderboard_app/leaderboard_app.py +++ /dev/null @@ -1,151 +0,0 @@ -r"""Basic streamlit leaderboard app for showing data from scraped GitHub repos. - -Run with: - streamlit run tools\leaderboard_app\leaderboard_app.py -""" - -import ast -import os -import sys -from datetime import datetime - -import dropbox -import pandas as pd -import streamlit as st -from dotenv import load_dotenv -from dropbox.exceptions import AuthError - -st.set_page_config(layout="wide") - -if st.secrets.load_if_toml_exists(): - DROPBOX_TOKEN = st.secrets["DROPBOX_TOKEN"] - DROPBOX_APP_KEY = st.secrets["DROPBOX_APP_KEY"] - DROPBOX_APP_SECRET = st.secrets["DROPBOX_APP_SECRET"] - DROPBOX_REFRESH_TOKEN = st.secrets["DROPBOX_REFRESH_TOKEN"] -else: # load credentials from .env file - load_dotenv() - DROPBOX_TOKEN = os.getenv("DROPBOX_TOKEN") - DROPBOX_APP_KEY = os.getenv("DROPBOX_APP_KEY") - DROPBOX_APP_SECRET = os.getenv("DROPBOX_APP_SECRET") - DROPBOX_REFRESH_TOKEN = os.getenv("DROPBOX_REFRESH_TOKEN") - - -def download_data(filename: str) -> None: - """Download data from dropbox.""" - with dropbox.Dropbox( - oauth2_access_token=DROPBOX_TOKEN, - app_key=DROPBOX_APP_KEY, - app_secret=DROPBOX_APP_SECRET, - oauth2_refresh_token=DROPBOX_REFRESH_TOKEN, - ) as dbx: - try: - dbx.users_get_current_account() - except AuthError: - sys.exit("ERROR: Invalid access token; try re-generating an access token from the app console on the web.") - - dbx.files_download_to_file(filename, f"/{filename}") - - -def main() -> None: - """Streamlit application for showing group GitHub stats.""" - download_data("latest_repo_data.csv") - - df = pd.read_csv("latest_repo_data.csv") - - # convert to column - df["contributions_per_contributor"] = df["contributions_per_contributor"].apply( - lambda x: ast.literal_eval(x) if pd.notnull(x) else x, - ) - df["warnings_raised"] = df["warnings_raised"].apply(lambda x: 27 - x if pd.notnull(x) else x) - f = "%Y-%m-%dT%H:%M:%SZ" - df["latest_commit"] = df["latest_commit"].apply( - lambda x: datetime.strptime(x, f).astimezone(tz=datetime.UTC) if pd.notnull(x) else x, - ) - - # remove columns that are not needed - df1 = df[ - [ - "group_nb", - "num_students", - "num_contributors", - "total_commits", - "num_commits_to_main", - "contributions_per_contributor", - "num_prs", - "average_commit_message_length_to_main", - "average_commit_message_length", - "latest_commit", - ] - ] - - df2 = df[ - [ - "group_nb", - "num_docker_files", - "num_workflow_files", - "has_requirement_file", - "has_makefile", - "has_cloudbuild", - "repo_size", - "readme_size", - "using_dvc", - "warnings_raised", - ] - ] - - st.title("Group Github Stats") - st.text( - """ - Below is shown automatic scraped data for all groups in the course. None of these stats directly contribute - towards you passing the course or not. Instead they can inform how you are doing in comparison to other groups, - and it can indirectly inform the us about how well you are using version control for collaborating on your - project. 
- """, - ) - - st.header("Base statistics") - st.dataframe( - df1, - column_config={ - "group_nb": "Group Number", - "num_students": "Students", - "num_contributors": "Contributors", - "total_commits": "Total Commits", - "num_commits_to_main": "Commits to main", - "contributions_per_contributor": st.column_config.BarChartColumn("Contributions distribution"), - "num_prs": "Number of Pull Requests", - "average_commit_message_length_to_main": "ACML* (main)", - "average_commit_message_length": "ACML* (all)", - "latest_commit": st.column_config.DatetimeColumn("Latest commit"), - }, - hide_index=True, - ) - st.text("*ACML = Average Commit Message Length") - - st.header("Content statistics") - st.dataframe( - df2, - column_config={ - "group_nb": "Group Number", - "num_docker_files": "Docker files", - "num_workflow_files": "Workflow files", - "has_requirement_file": "Requirement file", - "has_makefile": "Makefile", - "has_cloudbuild": "Cloudbuild", - "repo_size": "Repository size", - "readme_size": "Readme size", - "using_dvc": "Using dvc", - "warnings_raised": st.column_config.ProgressColumn( - "Report completion", - help="Number of questions answered in exam report", - format="%d", - min_value=0, - max_value=27, - ), - }, - hide_index=True, - ) - - -if __name__ == "__main__": - main() diff --git a/tools/leaderboard_app/requirements.txt b/tools/leaderboard_app/requirements.txt deleted file mode 100644 index 141ddcee2..000000000 --- a/tools/leaderboard_app/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -streamlit >= 1.26.0 -dropbox >= 11.36.2 -python-dotenv >= 1.0.0 -pandas >= 2.0.3 diff --git a/tools/repo_scraper/repo_scraper.py b/tools/repo_scraper/repo_scraper.py deleted file mode 100644 index 3af0103e2..000000000 --- a/tools/repo_scraper/repo_scraper.py +++ /dev/null @@ -1,375 +0,0 @@ -"""Tool for scraping student repos for information. 
- -Run locally with (from root folder) - python tools/repo_scraper/repo_scraper.py -""" - -from __future__ import annotations - -import csv -import datetime -import os -import shutil -import sys -from pathlib import Path -from subprocess import PIPE, Popen - -import dropbox -import requests -from dotenv import load_dotenv -from dropbox.exceptions import AuthError - -load_dotenv() -DROPBOX_TOKEN = os.getenv("DROPBOX_TOKEN") -DROPBOX_APP_KEY = os.getenv("DROPBOX_APP_KEY") -DROPBOX_APP_SECRET = os.getenv("DROPBOX_APP_SECRET") -DROPBOX_REFRESH_TOKEN = os.getenv("DROPBOX_REFRESH_TOKEN") -GH_TOKEN = os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN") -headers = {"Authorization": f"Bearer {GH_TOKEN}"} - - -def process_data(data: list[list[str]]): - """Process the data from the csv file.""" - # remove empty emails - new_data = [] - for group in data: - group[0] = int(group[0]) # convert group number to int - new_data.append([group[0], len([g for g in group[1:-1] if g != ""]), group[-1]]) - return new_data - - -def load_data(filename: str) -> list[list[str]]: - """Load the data from the csv file.""" - with open("latest_info.csv") as f: - csv_reader = csv.reader(f, delimiter=",") - content = [] - for row in csv_reader: - content.append(row) - - header = content.pop(0) - return process_data(content) - - -def download_data(filename: str) -> None: - """Download specific file from dropbox.""" - with dropbox.Dropbox( - oauth2_access_token=DROPBOX_TOKEN, - app_key=DROPBOX_APP_KEY, - app_secret=DROPBOX_APP_SECRET, - oauth2_refresh_token=DROPBOX_REFRESH_TOKEN, - ) as dbx: - try: - dbx.users_get_current_account() - except AuthError: - sys.exit("ERROR: Invalid access token; try re-generating an access token from the app console on the web.") - - dbx.files_download_to_file(filename, f"/{filename}") - - -def upload_data(filename: str) -> None: - """Upload specific file to dropbox.""" - with dropbox.Dropbox( - oauth2_access_token=DROPBOX_TOKEN, - app_key=DROPBOX_APP_KEY, - app_secret=DROPBOX_APP_SECRET, - oauth2_refresh_token=DROPBOX_REFRESH_TOKEN, - ) as dbx: - try: - dbx.users_get_current_account() - except AuthError: - sys.exit("ERROR: Invalid access token; try re-generating an access token from the app console on the web.") - - now = datetime.datetime.now(tz=datetime.UTC).strftime("%Y_%m_%d_%H_%M_%S") - with open(filename, "rb") as f: - dbx.files_upload(f.read(), f"/{now}_{filename}") - with open(filename, "rb") as f: - dbx.files_upload(f.read(), f"/latest_{filename}", mode=dropbox.files.WriteMode.overwrite) - - -def reformat_repo(repo: str) -> str: - """Extract from the url the user id and repository name only.""" - split = repo.split("/") - return f"{split[-2]}/{split[-1]}" - - -def get_default_branch_name(repo: str) -> str: - """Get the default branch name of a GitHub repo.""" - response = requests.get(f"https://api.github.com/repos/{repo}", headers=headers, timeout=100) - return response.json()["default_branch"] - - -def get_content(branch: str, url: str, repo: str, current_path: str) -> None: - """Recursively download content from a GitHub repo.""" - response = requests.get(url, headers=headers, timeout=100) - for file in response.json(): - if file["type"] == "dir": - folder = file["name"] - os.system(f"cd {current_path} & mkdir {folder}") - get_content(branch, f"{url}/{folder}", repo, f"{current_path}/{folder}") - else: - path = file["path"] - os.system(f"cd {current_path} & curl -s -OL https://raw.githubusercontent.com/{repo}/{branch}/{path}") - - -def get_content_recursive(url): - """Extract all 
content from a GitHub repo recursively.""" - all_content = [] - content = requests.get(url, headers=headers, timeout=10).json() - for c in content: - if "type" not in c: - continue - if c["type"] == "dir": - all_content += get_content_recursive(f"{url}/{c['name']}") - else: - all_content.append(c) - return all_content - - -def write_to_file(filename, row, mode="a") -> None: - """Write to a local csv file.""" - with open(filename, mode=mode, newline="") as f: - writer = csv.writer(f, delimiter=",") - writer.writerow(row) - - -def clone_repos(formatted_data, out_folder, timeout_clone) -> None: - """Clone the repos of the students.""" - print("====== Cloning repos ======") - for index, data in enumerate(formatted_data): - group_nb, _, repo = data - print(f"Cloning group {group_nb}, {index}/{len(formatted_data)}") - out = os.system(f"cd {out_folder} && timeout -v {timeout_clone} git clone -q {repo}") - clone_success = out == 0 - folder_name = repo.split("/")[-1] - if clone_success: - os.system(f"cd {out_folder} && cp -r {folder_name} group_{group_nb} && rm -rf {folder_name}") - else: - if folder_name in os.listdir(out_folder): - shutil.rmtree(f"{out_folder}/{folder_name}") - data.append(clone_success) - - -def extract_prs(repo: str) -> list[dict]: - """Extract all PRs from a GitHub repo.""" - prs = [] - page_counter = 1 - while True: - prs_page = requests.get( - f"https://api.github.com/repos/{repo}/pulls", - headers=headers, - params={"state": "all", "page": page_counter, "per_page": 100}, - timeout=100, - ).json() - if len(prs_page) == 0: - break - page_counter += 1 - prs += prs_page - return prs - - -def extract_commits(repo: str) -> list[dict]: - """Extract all commits from a GitHub repo.""" - commits = [] - page_counter = 1 - while True: - commits_page = requests.get( - f"https://api.github.com/repos/{repo}/commits", - headers=headers, - params={"state": "all", "page": page_counter, "per_page": 100}, - timeout=100, - ).json() - if len(commits_page) == 0: - break - page_counter += 1 - commits += commits_page - return commits - - -def get_stats_from_content(repo: str): - """Extract stats from the content of a GitHub repo.""" - content = get_content_recursive(f"https://api.github.com/repos/{repo}/contents") - docker_files = [c for c in content if c["name"] == "Dockerfile" or ".dockerfile" in c["name"]] - num_docker_files = len(docker_files) - workflow_files = [c for c in content if c["path"].startswith(".github/workflows")] - num_workflow_files = len(workflow_files) - has_requirement_file = len([c for c in content if c["name"] == "requirements.txt"]) > 0 - has_makefile = len([c for c in content if c["name"] == "Makefile"]) > 0 - has_cloudbuild = len([c for c in content if "cloudbuild.yaml" in c["name"]]) > 0 - return num_docker_files, num_workflow_files, has_requirement_file, has_makefile, has_cloudbuild - - -def check_report(out_folder: str, group_nb: int) -> None | int: - """Check how many questions are answered in the report.""" - warnings_raised = None - if "reports" in os.listdir(f"{out_folder}/group_{group_nb}"): - report_dir = os.listdir(f"{out_folder}/group_{group_nb}/reports") - if "README.md" in report_dir and "report.py" in report_dir: - p = Popen( - ["python", "report.py", "check"], - cwd=f"{out_folder}/group_{group_nb}/reports", - stdout=PIPE, - stderr=PIPE, - stdin=PIPE, - ) - output = p.stderr.read() - warnings_raised = len(output.decode("utf-8").split("\n")[:-1:2]) - - -def main( - out_folder: str = "student_repos", - timeout_clone: str = "2m", -) -> None: - """Extract group 
statistics from github.""" - print("Getting the repository information") - if "latest_info.csv" not in os.listdir(): - download_data("latest_info.csv") - formatted_data = load_data("latest_info.csv") - - # loop for scraping the repository of each group - print("Cleaning out old data if needed") - if os.path.isdir(out_folder): # non-empty folder, delete content - shutil.rmtree(out_folder) - os.makedirs(out_folder) - - # clone repos - clone_repos(formatted_data, out_folder, timeout_clone) - - # create file for data - write_to_file( - "repo_data.csv", - [ - "group_nb", - "num_students", - "num_contributors", - "num_prs", - "num_commits_to_main", - "average_commit_message_length_to_main", - "latest_commit", - "average_commit_message_length", - "contributions_per_contributor", - "total_commits", - "num_docker_files", - "num_workflow_files", - "has_requirement_file", - "has_makefile", - "has_cloudbuild", - "repo_size", - "readme_size", - "using_dvc", - "warnings_raised", - ], - mode="w", - ) - - # extract info through API - print("====== Extracting info through API ======") - for index, (group_nb, num_students, repo, clone_success) in enumerate(formatted_data): - print(f"Processing group {group_nb}, {index}/{len(formatted_data)}") - repo = reformat_repo(repo) - exists = requests.get(f"https://api.github.com/repos/{repo}", headers=headers, timeout=100) - if exists.status_code == 200: - contributors = requests.get( - f"https://api.github.com/repos/{repo}/contributors", - headers=headers, - timeout=100, - ).json() - contributors = {c["login"]: {"contributions": c["contributions"], "commits_pr": 0} for c in contributors} - num_contributors = len(contributors) - - prs = extract_prs(repo) - num_prs = len(prs) - - commits = extract_commits(repo) - num_commits_to_main = len(commits) - commit_messages = [c["commit"]["message"] for c in commits] - average_commit_message_length_to_main = sum([len(c) for c in commit_messages]) / len(commit_messages) - latest_commit = commits[0]["commit"]["author"]["date"] - - merged_prs = [p["number"] for p in prs if p["merged_at"] is not None] - for pr_num in merged_prs: - pr_commits = requests.get( - f"https://api.github.com/repos/{repo}/pulls/{pr_num}/commits", - headers=headers, - params={"state": "all", "per_page": 100}, - timeout=100, - ).json() - commit_messages += [c["commit"]["message"] for c in pr_commits] - for comm in pr_commits: - if ( - comm["committer"] is not None - and "login" in comm["committer"] - and comm["committer"]["login"] in contributors - ): - contributors[comm["committer"]["login"]]["commits_pr"] += 1 - average_commit_message_length = sum([len(c) for c in commit_messages]) / len(commit_messages) - - contributions_per_contributor = [c["contributions"] + c["commits_pr"] for c in contributors.values()] - total_commits = sum(contributions_per_contributor) - - stats = get_stats_from_content(repo) - num_docker_files, num_workflow_files, has_requirement_file, has_makefile, has_cloudbuild = stats - else: - num_contributors = None - num_prs = None - num_commits_to_main = None - average_commit_message_length_to_main = None - latest_commit = None - average_commit_message_length = None - contributions_per_contributor = None - total_commits = None - num_docker_files = None - num_workflow_files = None - has_requirement_file = None - has_makefile = None - has_cloudbuild = None - - if clone_success: - path = Path(f"{out_folder}/group_{group_nb}") - repo_size = sum([f.stat().st_size for f in path.glob("**/*") if f.is_file()]) / 1_048_576 # in MB - - if "README.md" 
in os.listdir(f"{out_folder}/group_{group_nb}"): - with open(f"{out_folder}/group_{group_nb}/README.md") as f: - content = f.read() - readme_size = len(content.split(" ")) - else: - readme_size = None - - using_dvc = ".dvc" in os.listdir(f"{out_folder}/group_{group_nb}") - warnings_raised = check_report(out_folder, group_nb) - - else: - repo_size = None - readme_size = None - using_dvc = None - warnings_raised = None - - write_to_file( - "repo_data.csv", - [ - group_nb, - num_students, - num_contributors, - num_prs, - num_commits_to_main, - average_commit_message_length_to_main, - latest_commit, - average_commit_message_length, - contributions_per_contributor, - total_commits, - num_docker_files, - num_workflow_files, - has_requirement_file, - has_makefile, - has_cloudbuild, - repo_size, - readme_size, - using_dvc, - warnings_raised, - ], - mode="a", - ) - upload_data("repo_data.csv") - - -if __name__ == "__main__": - main() diff --git a/tools/repo_scraper/requirements.txt b/tools/repo_scraper/requirements.txt deleted file mode 100644 index dc75d742b..000000000 --- a/tools/repo_scraper/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -dropbox >= 11.36.2 -python-dotenv >= 1.0.0 -markdown >= 3.5.1 -click >= 8.1.7 diff --git a/tools/repo_stats/Dockerfile b/tools/repo_stats/Dockerfile new file mode 100644 index 000000000..a81c28cd2 --- /dev/null +++ b/tools/repo_stats/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.11-slim + +EXPOSE $PORT + +WORKDIR /app + +COPY requirements.txt . +COPY leaderboard.py . +COPY models.py . + +RUN pip install --no-cache-dir -r requirements.txt + +# Command to run the Streamlit application +ENTRYPOINT ["sh", "-c", "streamlit run leaderboard.py --server.port=$PORT --server.address=0.0.0.0"] diff --git a/tools/repo_stats/Makefile b/tools/repo_stats/Makefile new file mode 100644 index 000000000..71551eb41 --- /dev/null +++ b/tools/repo_stats/Makefile @@ -0,0 +1,36 @@ +## This defines all targets as phony targets, i.e. targets that are always out of date +## This is done to ensure that the commands are always executed, even if a file with the same name exists +## See https://www.gnu.org/software/make/manual/html_node/Phony-Targets.html +## Remove this if you want to use this Makefile for real targets +.PHONY: * + +GCP_PROJECT_NAME = $(shell gcloud config get-value project) + +# construct service account key file name +# upload as github secret and use in github actions +service_account: + gcloud iam service-accounts create repo-stats-account \ + --description="Service account for repo stats application" --display-name="repo-stats-account" + gcloud projects add-iam-policy-binding $(GCP_PROJECT_NAME) \ + --member="serviceAccount:repo-stats-account@$(GCP_PROJECT_NAME).iam.gserviceaccount.com" \ + --role="roles/storage.objectUser" + --project + gcloud iam service-accounts keys create service_account_key.json \ + --iam-account=repo-stats-account@$(GCP_PROJECT_NAME).iam.gserviceaccount.com + echo service_account_key.json >> .gitignore + +# deploy the leaderboard service +deploy_leaderboard: + gcloud run deploy repo-stats-leaderboard \ + --source . 
\ + --platform managed \ + --region europe-west1 \ + --allow-unauthenticated + + gcloud run services add-iam-policy-binding \ + --region=europe-west1 \ + --member=allUsers \ + --role=roles/run.invoker \ + repo-stats-leaderboard + + gcloud run services describe repo-stats-leaderboard --region=europe-west1 --format="value(status.url)" diff --git a/tools/repo_stats/leaderboard.py b/tools/repo_stats/leaderboard.py new file mode 100644 index 000000000..f1313c595 --- /dev/null +++ b/tools/repo_stats/leaderboard.py @@ -0,0 +1,174 @@ +import base64 +import json +from io import BytesIO +from pathlib import Path + +import numpy as np +import pandas as pd +import streamlit as st +from dotenv import load_dotenv +from google.cloud.storage import Client +from models import RepoStats +from PIL import Image + +load_dotenv() + + +def download_data(file_name: str) -> None: + """Downloads the group-repository data from GCS.""" + storage_client = Client() + bucket = storage_client.bucket("mlops_group_repository") + blob = bucket.blob(file_name) + blob.download_to_filename(file_name) + + +def load_data(file_name: str) -> pd.DataFrame: + """Loads the group-repository data into a DataFrame.""" + with Path(file_name).open() as f: + content = json.load(f) + repo_content = [RepoStats(**group) for group in content] + repo_content_dicts = [repo.model_dump() for repo in repo_content] + return pd.DataFrame(repo_content_dicts) + + +def activity_to_image(activity_matrix: list[list[int]], scale_factor: int = 10) -> str: + """ + Convert an activity matrix (N, 24) into an RGB image scaled up by a given factor. + + Args: + activity_matrix (list[list[int]]): A 2D list of activity values. + scale_factor (int): Factor by which to scale up the image size. + + Returns: + str: Base64-encoded PNG image string in "data:image/png;base64," format. + """ + # Normalize the activity matrix to the range [0, 255]. + array = np.array(activity_matrix, dtype=np.float32) + max_value = np.max(array) + if max_value > 0: + array = array / max_value * 255 + + # Create an RGB image: Green for activity, Black for no activity. + height, width = array.shape + rgb_array = np.zeros((height, width, 3), dtype=np.uint8) + rgb_array[:, :, 1] = array.astype(np.uint8) # Green channel + + # Scale up the image by the scale factor. + scaled_height, scaled_width = height * scale_factor, width * scale_factor + image = Image.fromarray(rgb_array, mode="RGB") + image = image.resize((scaled_width, scaled_height), Image.NEAREST) + + # Convert the image to a Base64 string. + buffer = BytesIO() + image.save(buffer, format="PNG") + buffer.seek(0) + image_base64 = base64.b64encode(buffer.read()).decode("utf-8") + + return f"data:image/png;base64,{image_base64}" + + +def main() -> None: + """Main function for the leaderboard.""" + download_data("repo_stats.json") + dataframe = load_data("repo_stats.json") + dataframe["num_warnings"] = dataframe["num_warnings"].apply(lambda x: 27 - x if pd.notnull(x) else x) + dataframe["activity_matrix"] = dataframe["activity_matrix"].apply( + lambda x: activity_to_image(x) if x is not None else x + ) + st.set_page_config(layout="wide") + st.title("Group Github Stats") + st.text( + """ + Below is shown automatic scraped data for all groups in the course. None of these stats directly contribute + towards you passing the course or not. Instead they can inform how you are doing in comparison to other groups, + and it can indirectly inform the us about how well you are using version control for collaborating on your + project. 
+ """, + ) + + df_base = dataframe[ + [ + "group_number", + "group_size", + "num_contributors", + "total_commits", + "num_commits_to_main", + "contributions_per_contributor", + "num_prs", + "average_commit_length_to_main", + "average_commit_length", + "latest_commit", + "activity_matrix", + ] + ] + + df_content = dataframe[ + [ + "group_number", + "num_python_files", + "num_docker_files", + "num_workflow_files", + "has_requirements_file", + "has_cloudbuild", + "using_dvc", + "repo_size", + "actions_passing", + "readme_length", + "num_warnings", + ] + ] + + st.header("Base statistics") + st.dataframe( + df_base, + column_config={ + "group_number": "Group Number", + "group_size": "Group Size", + "num_contributors": "Contributors", + "total_commits": "Total Commits", + "num_commits_to_main": "Commits to main", + "contributions_per_contributor": st.column_config.BarChartColumn("Contributions distribution"), + "num_prs": "PRs", + "average_commit_length_to_main": "ACML* (main)", + "average_commit_length": "ACML* (all)", + "latest_commit": st.column_config.DatetimeColumn("Latest commit"), + "activity_matrix": st.column_config.ImageColumn( + "Commit activity**", + width="medium", + ), + }, + hide_index=True, + ) + st.write("*ACML = Average commit message length") + st.write( + "**Activity matrix is a (N, 24) matrix where N is the number of days since the first commit." + " Each row represents the number of commits per hour for that day." + ) + st.header("Content statistics") + st.dataframe( + df_content, + column_config={ + "group_number": "Group Number", + "num_python_files": "Python files", + "num_docker_files": "Docker files", + "num_workflow_files": "Workflow files", + "has_requirements_file": "Requirement file", + "has_cloudbuild": "Cloudbuild", + "using_dvc": "Using dvc", + "repo_size": "Repository size", + "actions_passing": "Actions passing", + "readme_length": "Readme size", + "num_warnings": st.column_config.ProgressColumn( + "Report completion", + help="Number of questions answered in exam report", + format="%d", + min_value=0, + max_value=27, + ), + }, + hide_index=True, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/repo_stats/models.py b/tools/repo_stats/models.py new file mode 100644 index 000000000..2245b17a8 --- /dev/null +++ b/tools/repo_stats/models.py @@ -0,0 +1,282 @@ +import base64 +import os +from pathlib import Path +from subprocess import PIPE, Popen + +import markdown2 +import requests +from dotenv import load_dotenv +from pydantic import BaseModel + +load_dotenv() +GH_TOKEN = os.getenv("GH_TOKEN") or os.getenv("GITHUB_TOKEN") +headers = {"Authorization": f"Bearer {GH_TOKEN}"} + + +class RepoStats(BaseModel): + """Model for repository statistics.""" + + group_number: int + group_size: int + num_contributors: int | None + num_prs: int | None + num_commits_to_main: int | None + average_commit_length_to_main: float | None + latest_commit: str | None + average_commit_length: float | None + contributions_per_contributor: list[int] | None + total_commits: int | None + activity_matrix: list[list[int]] | None + + num_docker_files: int | None + num_python_files: int | None + num_workflow_files: int | None + has_requirements_file: bool | None + has_cloudbuild: bool | None + using_dvc: bool | None + repo_size: float | None + readme_length: int | None + actions_passing: bool | None + + num_warnings: int | None + + +class Contributor(BaseModel): + """Model for contributors.""" + + login: str + contributions: int + commits_pr: int + + @property + def 
total_commits(self) -> int: + """Returns the total number of commits by the contributor.""" + return self.contributions + self.commits_pr + + +class Report(BaseModel): + """Model for the report.""" + + group_number: int + repo_api: str + default_branch: str + file_written: bool = False + + def download_checker(self) -> None: + """Downloads the checker script from the repository.""" + if not Path("report.py").exists(): + url = "https://api.github.com/repos/SkafteNicki/dtu_mlops/contents/reports/report.py" + response = requests.get(url, headers=headers, timeout=100) + if response.status_code == 200: + content_base64 = response.json()["content"] + content_decoded = base64.b64decode(content_base64).decode("utf-8") + with open("report.py", "w", encoding="utf-8") as file: + file.write(content_decoded) + + def download_report(self) -> None: + """Downloads the report from the repository.""" + if self.file_written: + return + url = f"{self.repo_api}/contents/reports/README.md" + response = requests.get(url, headers=headers, timeout=100).json() + if response.get("message") != "Not Found": + content_base64 = response["content"] + content_decoded = base64.b64decode(content_base64).decode("utf-8") + with open("README.md", "w", encoding="utf-8") as file: + file.write(content_decoded) + self.file_written = True + else: + self.file_written = False + + @property + def check_answers(self) -> int | None: + """Returns the number of warnings in the report.""" + self.download_checker() + self.download_report() + if self.file_written: + p = Popen( + ["python", "report.py", "check"], + cwd=".", + stdout=PIPE, + stderr=PIPE, + stdin=PIPE, + ) + output = p.stderr.read() + return len(output.decode("utf-8").split("\n")[:-1:2]) + return None + + +class RepoContent(BaseModel): + """Model for repository content.""" + + group_number: int + repo_api: str + default_branch: str + + @property + def file_tree(self): + """Returns the file tree of the repository.""" + if hasattr(self, "_file_tree"): + return self._file_tree + branch_url = f"{self.repo_api}/git/refs/heads/{self.default_branch}" + branch_response = requests.get(branch_url, headers=headers, timeout=100).json() + tree_sha = branch_response["object"]["sha"] + tree_url = f"{self.repo_api}/git/trees/{tree_sha}?recursive=1" + tree_response = requests.get(tree_url, headers=headers, timeout=100).json() + self._file_tree = tree_response["tree"] + return self._file_tree + + @property + def num_docker_files(self) -> int: + """Returns the number of Dockerfiles in the repository.""" + return len([f for f in self.file_tree if "Dockerfile" in f["path"] or ".dockerfile" in f["path"]]) + + @property + def num_python_files(self) -> int: + """Returns the number of Python files in the repository.""" + return len([f for f in self.file_tree if ".py" in f["path"]]) + + @property + def num_workflow_files(self) -> int: + """Returns the number of workflow files in the repository.""" + return len([f for f in self.file_tree if ".yml" in f["path"]]) + + @property + def has_requirements_file(self) -> bool: + """Returns True if the repository has a requirements.txt file.""" + return any("requirements.txt" in f["path"] for f in self.file_tree) + + @property + def has_cloudbuild(self) -> bool: + """Returns True if the repository uses Google Cloud Build.""" + return any("cloudbuild.yaml" in f["path"] for f in self.file_tree) + + @property + def using_dvc(self) -> bool: + """Returns True if the repository uses DVC.""" + return any(".dvc" in f["path"] for f in self.file_tree) + + @property + def 
repo_size(self) -> float: + """Returns the size of the repository in MB.""" + total_size_bytes = sum([f["size"] for f in self.file_tree if "size" in f]) + return total_size_bytes / (1024**2) + + @property + def readme_length(self) -> int: + """Returns the number of words in the README file.""" + readme_url = f"{self.repo_api}/readme" + readme_response = requests.get(readme_url, headers=headers, timeout=100).json() + if "content" in readme_response: + content_base64 = readme_response["content"] + content_decoded = base64.b64decode(content_base64).decode("utf-8") + plain_text = markdown2.markdown(content_decoded, extras=["strip"]) + return len(plain_text.split()) + return 0 + + @property + def actions_passing(self) -> bool: + """Returns True if the GitHub Actions are passing.""" + commit_url = f"{self.repo_api}/commits/{self.default_branch}" + commit_response = requests.get(commit_url, headers=headers, timeout=100).json() + latest_commit = commit_response["sha"] + + workflow_url = f"{self.repo_api}/actions/runs?branch={self.default_branch}&event=push" + workflow_response = requests.get(workflow_url, headers=headers, timeout=100).json() + workflow_runs = workflow_response["workflow_runs"] + + all_passing = True + for w_run in workflow_runs: + if w_run["head_sha"] == latest_commit and ( + w_run["status"] != "completed" or w_run["conclusion"] != "success" + ): + all_passing = False + break + return all_passing + + +class GroupInfo(BaseModel): + """Model for group information.""" + + group_number: int + student_1: str | None + student_2: str | None + student_3: str | None + student_4: str | None + student_5: str | None + repo_url: str + + @property + def group_size(self) -> int: + """Returns the number of students in the group.""" + return len(list(filter(None, [self.student_1, self.student_2, self.student_3, self.student_4, self.student_5]))) + + @property + def repo_api(self) -> str: + """Returns the API URL of the repository.""" + split = self.repo_url.split("/") + return f"https://api.github.com/repos/{split[-2]}/{split[-1]}" + + @property + def default_branch(self) -> str: + """Returns the default branch of the repository.""" + if hasattr(self, "_default_branch"): + return self._default_branch + self._default_branch = requests.get(self.repo_api, headers=headers, timeout=100).json()["default_branch"] + return self._default_branch + + @property + def repo_accessible(self) -> bool: + """Returns True if the repository is accessible.""" + if hasattr(self, "_repo_accessible"): + return self._repo_accessible + self._repo_accessible = requests.head(self.repo_url, headers=headers, timeout=100).status_code == 200 + return self._repo_accessible + + @property + def contributors(self) -> list[Contributor]: + """Returns all contributors to the repository.""" + if self.repo_accessible: + request = requests.get(f"{self.repo_api}/contributors", headers=headers, timeout=100).json() + return [Contributor(login=c["login"], contributions=c["contributions"], commits_pr=0) for c in request] + return None + + @property + def prs(self): + """Returns all pull requests to the repository.""" + if self.repo_accessible: + prs = [] + page_counter = 1 + while True: + request = requests.get( + f"{self.repo_api}/pulls", + headers=headers, + timeout=100, + params={"state": "all", "page": page_counter, "per_page": 100}, + ).json() + if len(request) == 0: + break + page_counter += 1 + prs.extend(request) + return prs + return None + + @property + def commits(self) -> list: + """Returns all commits to the default branch.""" + 
if self.repo_accessible: + commits = [] + page_counter = 1 + while True: + request = requests.get( + f"{self.repo_api}/commits", + headers=headers, + timeout=100, + params={"page": page_counter, "per_page": 100}, + ).json() + if len(request) == 0: + break + page_counter += 1 + commits.extend(request) + return commits + return None diff --git a/tools/repo_stats/requirements.txt b/tools/repo_stats/requirements.txt new file mode 100644 index 000000000..45f322845 --- /dev/null +++ b/tools/repo_stats/requirements.txt @@ -0,0 +1,10 @@ +typer==0.13.1 +python-dotenv==1.0.1 +markdown2==2.5.1 +google-cloud-storage==2.18.2 +pandas==2.2.3 +streamlit==1.40.1 +pydantic==2.9.2 +requests==2.32.3 +numpy==2.1.3 +pillow==11.0.0 diff --git a/tools/repo_stats/scraper.py b/tools/repo_stats/scraper.py new file mode 100644 index 000000000..034e3a058 --- /dev/null +++ b/tools/repo_stats/scraper.py @@ -0,0 +1,220 @@ +import csv +import datetime +import json +import logging +import os +from pathlib import Path + +import numpy as np +import requests +from dotenv import load_dotenv +from google.cloud.storage import Client +from models import GroupInfo, RepoContent, Report, RepoStats +from typer import Typer + +load_dotenv() + +GH_TOKEN = os.getenv("GH_TOKEN") or os.getenv("GITHUB_TOKEN") +headers = {"Authorization": f"Bearer {GH_TOKEN}"} + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def upload_data(file_name: str) -> None: + """Uploads the repo stats data to GCS.""" + storage_client = Client() + bucket = storage_client.bucket("mlops_group_repository") + blob = bucket.blob(file_name) + blob.upload_from_filename(file_name) + + +def download_data(file_name: str) -> None: + """Downloads the group-repository data from GCS.""" + storage_client = Client() + bucket = storage_client.bucket("mlops_group_repository") + blob = bucket.blob(file_name) + blob.download_to_filename(file_name) + + +def load_data(file_name: str) -> list[GroupInfo]: + """Loads the group-repository data into a DataFrame.""" + with Path(file_name).open() as f: + csv_reader = csv.reader(f, delimiter=",") + content = [] + for i, row in enumerate(csv_reader): + if i == 0: # Skip the header + continue + group = GroupInfo( + group_number=int(row[0]), + student_1=row[1] if row[1] != "" else None, + student_2=row[2] if row[2] != "" else None, + student_3=row[3] if row[3] != "" else None, + student_4=row[4] if row[4] != "" else None, + student_5=row[5] if row[5] != "" else None, + repo_url=row[6], + ) + content.append(group) + return content + + +def create_activity_matrix(commits: list, max_delta: int = 5, normalize: bool = True) -> list[list[int]]: + """Creates an activity matrix from the commits.""" + commit_times = [datetime.datetime.fromisoformat(commit["commit"]["committer"]["date"][:-1]) for commit in commits] + commit_times.sort() + + start_time = commit_times[0] + end_time = min(start_time + datetime.timedelta(weeks=max_delta), commit_times[-1]) + + num_days = (end_time - start_time).days + 1 # include last day + + commit_matrix = np.zeros((num_days, 24), dtype=int) + + for commit_time in commit_times: + if start_time <= commit_time <= end_time: + day_index = (commit_time - start_time).days + hour_index = commit_time.hour + commit_matrix[day_index, hour_index] += 1 + + return commit_matrix.tolist() + + +app = Typer() + + +@app.command() +def main(): + """Main function to scrape the group-repository data.""" + logger.info("Getting group-repository information") + if "group_info.csv" not in os.listdir(): + 
download_data("group_info.csv") + group_data = load_data("group_info.csv") + logger.info("Group-repository information loaded successfully") + + repo_stats: list[RepoContent] = [] + for index, group in enumerate(group_data): + logger.info(f"Processing group {group.group_number}, {index+1}/{len(group_data)}") + + if group.repo_accessible: + contributors = group.contributors + num_contributors = len(contributors) + + prs = group.prs + num_prs = len(prs) + + commits = group.commits + num_commits_to_main = len(commits) + commit_messages = [c["commit"]["message"] for c in commits] + average_commit_length_to_main = sum([len(c) for c in commit_messages]) / len(commit_messages) + latest_commit = commits[0]["commit"]["author"]["date"] + + merged_prs = [p["number"] for p in prs if p["merged_at"] is not None] + for pr_num in merged_prs: + pr_commits = requests.get( + f"{group.repo_api}/pulls/{pr_num}/commits", headers=headers, timeout=100 + ).json() + commit_messages += [c["commit"]["message"] for c in pr_commits] + for commit in pr_commits: + for contributor in contributors: + if ( + commit["committer"] is not None + and "login" in commit["committer"] + and contributor.login == commit["author"]["login"] + ): + contributor.commits_pr += 1 + commits += pr_commits + + activity_matrix = create_activity_matrix(commits) + + average_commit_length = sum([len(c) for c in commit_messages]) / len(commit_messages) + + contributions_per_contributor = [c.total_commits for c in contributors] + total_commits = sum(contributions_per_contributor) + + repo_content = RepoContent( + group_number=group.group_number, repo_api=group.repo_api, default_branch=group.default_branch + ) + num_docker_files = repo_content.num_docker_files + num_python_files = repo_content.num_python_files + num_workflow_files = repo_content.num_workflow_files + has_requirements_file = repo_content.has_requirements_file + has_cloudbuild = repo_content.has_cloudbuild + using_dvc = repo_content.using_dvc + repo_size = repo_content.repo_size + readme_length = repo_content.readme_length + actions_passing = repo_content.actions_passing + + report = Report( + group_number=group.group_number, repo_api=group.repo_api, default_branch=group.default_branch + ) + num_warnings = report.check_answers + + else: + num_contributors = None + num_prs = None + num_commits_to_main = None + average_commit_length_to_main = None + latest_commit = None + average_commit_length = None + total_commits = None + contributions_per_contributor = None + total_commits = None + activity_matrix = None + + num_docker_files = None + num_python_files = None + num_workflow_files = None + has_requirements_file = None + has_cloudbuild = None + using_dvc = None + repo_size = None + readme_length = None + actions_passing = None + + num_warnings = None + + repo_stat = RepoStats( + group_number=group.group_number, + group_size=group.group_size, + num_contributors=num_contributors, + num_prs=num_prs, + num_commits_to_main=num_commits_to_main, + average_commit_length_to_main=average_commit_length_to_main, + latest_commit=latest_commit, + average_commit_length=average_commit_length, + contributions_per_contributor=contributions_per_contributor, + total_commits=total_commits, + activity_matrix=activity_matrix, + num_docker_files=num_docker_files, + num_python_files=num_python_files, + num_workflow_files=num_workflow_files, + has_requirements_file=has_requirements_file, + has_cloudbuild=has_cloudbuild, + using_dvc=using_dvc, + repo_size=repo_size, + readme_length=readme_length, + 
actions_passing=actions_passing,
+            num_warnings=num_warnings,
+        )
+        repo_stats.append(repo_stat)
+
+    logger.info("Writing repo stats to file")
+    now = datetime.datetime.now(tz=datetime.UTC).strftime("%Y_%m_%d_%H_%M_%S")
+    filename = f"repo_stats_{now}.json"
+    with open("repo_stats.json", "w") as f:
+        json.dump([r.model_dump() for r in repo_stats], f)
+    with open(filename, "w") as f:
+        json.dump([r.model_dump() for r in repo_stats], f)
+
+    logger.info("Uploading repo stats to GCS")
+    upload_data("repo_stats.json")
+    upload_data(filename)
+
+    logger.info("Cleaning up local temp files")
+    # The report files only exist if at least one repository was accessible,
+    # so tolerate their absence instead of raising FileNotFoundError.
+    Path("README.md").unlink(missing_ok=True)
+    Path("report.py").unlink(missing_ok=True)
+    Path(filename).unlink(missing_ok=True)
+
+
+if __name__ == "__main__":
+    app()
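
For local testing of the new tooling (outside the GitHub Actions workflow and Cloud Run), a minimal sketch of the equivalent commands is shown below. It assumes the service-account key produced by the `service_account` Make target sits in the working directory as `service_account_key.json` and that a GitHub token is exported as `GH_TOKEN`; the key location and the chosen Streamlit port are assumptions, while the requirements path, scraper entry point and leaderboard command mirror the workflow and Dockerfile above.

```bash
# Sketch: run the scraper and leaderboard locally (assumed credential locations).
export GOOGLE_APPLICATION_CREDENTIALS=service_account_key.json  # key from `make service_account` (assumed path)
export GH_TOKEN=<your-github-token>                             # same secret the workflow injects

# Install the dependencies the workflow installs
pip install -r tools/repo_stats/requirements.txt

# Scrape the group repositories and upload repo_stats.json to the GCS bucket
python tools/repo_stats/scraper.py

# Preview the leaderboard; Cloud Run provides $PORT, locally any free port works
cd tools/repo_stats && streamlit run leaderboard.py --server.port 8501
```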