From f9db176e7805c27e78da3f74155f52ce9caf6708 Mon Sep 17 00:00:00 2001 From: Luis Montero Date: Mon, 15 Jan 2024 14:58:39 +0100 Subject: [PATCH] chore: working on CIFAR upload to the DB --- .github/workflows/cifar_benchmark.yaml | 29 +++++---- .github/workflows/single_benchmark.yaml | 2 +- benchmarks/convert_cifar.py | 80 +++++++++++++++++++++++-- 3 files changed, 94 insertions(+), 17 deletions(-) diff --git a/.github/workflows/cifar_benchmark.yaml b/.github/workflows/cifar_benchmark.yaml index 3398c9d63f..730e4620a1 100644 --- a/.github/workflows/cifar_benchmark.yaml +++ b/.github/workflows/cifar_benchmark.yaml @@ -131,16 +131,6 @@ jobs: NUM_SAMPLES=${{ github.event.inputs.num_samples }} python3 ./use_case_examples/cifar/cifar_brevitas_training/evaluate_one_example_fhe.py python3 ./benchmarks/convert_cifar.py --model-name "16-bits-trained-v0" - - name: Upload results - if: ${{ github.repository == 'zama-ai/concrete-ml-internal' }} - id: upload-results - run: | - curl \ - -H "Authorization: Bearer ${{ secrets.NEW_ML_PROGRESS_TRACKER_TOKEN }}" \ - -H "Content-Type: application/json" \ - -d @to_upload.json \ - -X POST "${{ secrets.NEW_ML_PROGRESS_TRACKER_URL }}experiment" - - name: Archive raw predictions uses: actions/upload-artifact@v4 with: @@ -177,6 +167,25 @@ jobs: name: server.zip path: client_server/server.zip + # We need to keep this as the last step to avoid not uploading the artifacts + # if the step crashes + - name: Upload results + id: upload-results + run: | + # Log the json + cat to_upload.json | jq + + # We need to sleep to avoid log issues + sleep 1. 
+ + # Upload the json to the benchmark database + curl --fail-with-body \ + -H "Authorization: Bearer ${{ secrets.NEW_ML_PROGRESS_TRACKER_TOKEN }}" \ + -H "Content-Type: application/json; charset=UTF-8" \ + --json @to_upload.json \ + -X POST "${{ secrets.NEW_ML_PROGRESS_TRACKER_URL }}experiment" + + stop-runner: name: Stop EC2 runner needs: [run-cifar-10, start-cifar-runner] diff --git a/.github/workflows/single_benchmark.yaml b/.github/workflows/single_benchmark.yaml index 4a74c8d420..6b1313f6d5 100644 --- a/.github/workflows/single_benchmark.yaml +++ b/.github/workflows/single_benchmark.yaml @@ -182,7 +182,7 @@ jobs: - name: Upload results id: upload-results run: | - curl \ + curl --fail-with-body \ -H "Authorization: Bearer ${{ secrets.NEW_ML_PROGRESS_TRACKER_TOKEN }}" \ -H "Content-Type: application/json" \ -d @converted.json \ diff --git a/benchmarks/convert_cifar.py b/benchmarks/convert_cifar.py index 8f60ccdda0..d3a67d13ea 100644 --- a/benchmarks/convert_cifar.py +++ b/benchmarks/convert_cifar.py @@ -3,12 +3,20 @@ import argparse import datetime import json +import logging +import platform +import re +import socket +import subprocess +import uuid from importlib.metadata import version from pathlib import Path from typing import Any, Dict, List, Union +import cpuinfo import numpy as np import pandas as pd +import psutil from convert import get_git_hash, get_git_hash_date, git_iso_to_python_iso, is_git_diff @@ -28,6 +36,67 @@ def minimum_bribes(q): return bribes +def get_size(bytes_count: float, suffix="B"): + """ + Scale a byte count to a human-readable format + e.g: + 1253656 => '1.20 MB' + 1253656678 => '1.17 GB' + """ + factor = 1024 + for unit in ["", "K", "M", "G", "T", "P"]: + if bytes_count < factor: + return f"{bytes_count:.2f} {unit}{suffix}" + bytes_count /= factor + + +def get_system_information(): + # From https://stackoverflow.com/questions/3103178/how-to-get-the-system-info-with-python + info = {} + # What is naturally dumped by python-progress-tracker + 
info["ram"] = get_size(psutil.virtual_memory().total) + info["cpu"] = cpuinfo.get_cpu_info()["brand_raw"] + info["os"] = f"{platform.system()} {platform.release()}" + + # Added metadata about the system + info["platform"] = platform.system() + info["platform-release"] = platform.release() + info["platform-version"] = platform.version() + info["architecture"] = platform.machine() + info["hostname"] = socket.gethostname() + info["processor"] = platform.processor() + info["physical_cores"] = psutil.cpu_count(logical=False) + info["total_cores"] = psutil.cpu_count(logical=True) + uname = platform.uname() + info["machine"] = uname.machine + info["processor"] = uname.processor + info["system"] = uname.system + info["node_name"] = uname.node + info["release"] = uname.release + info["version"] = uname.version + info["swap"] = get_size(psutil.swap_memory().total) + + return info + + +def get_ec2_metadata(): + res = {} + try: + output = subprocess.check_output("ec2metadata", shell=True, encoding="utf-8") + for line in output.split("\n"): + if line: + splitted = line.split(": ") + if len(splitted) == 2: + key, value = splitted + res[key] = value + else: + print(line) + return res + except Exception as exception: + print(exception) + return res + + def main(model_name): # Get metrics results = pd.read_csv("./inference_results.csv") @@ -73,14 +142,13 @@ def main(model_name): # Collect everything session_data: Dict[str, Union[Dict, List]] = {} + ec2_metadata = get_ec2_metadata() + # Create machine + # We should probably add the platform to the DB too session_data["machine"] = { - "machine_name": None, - "machine_specs": { - "cpu": None, - "ram": None, - "os": None, - }, + "machine_name": ec2_metadata.get("instance-type", socket.gethostname()), + "machine_specs": get_system_information(), } # Create experiments