Merge pull request #29 from TheCacophonyProject/fix-object-backup
Fix object backup
CameronRP authored Sep 9, 2024
2 parents 57b46e3 + 8208cb7 commit 9d271de
Showing 10 changed files with 235 additions and 105 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/black.yml
@@ -0,0 +1,32 @@
name: Check Python code format

on:
  push:
    branches:
      - '**'
    tags:
      - '*'
  pull_request:
    branches:
      - '**'

jobs:
  black:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          pip install black
      - name: Check formatting with black
        run: |
          black --check .
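The workflow's check can be reproduced locally with the same `black --check .` command, or programmatically. A minimal sketch using black's Python API, with a hypothetical snippet standing in for the repository's files (assumes `pip install black`):

import black

# Hypothetical snippet to verify; the workflow checks every file in the repo instead.
source = 'with open("config.yaml", "r") as f:\n    data = f.read()\n'

# format_str returns the black-formatted version of the string.
formatted = black.format_str(source, mode=black.Mode())
print("already formatted" if formatted == source else "needs reformatting")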
46 changes: 27 additions & 19 deletions backups/grafana/grafana-backup.py
Original file line number Diff line number Diff line change
@@ -15,25 +15,25 @@
print(f"failed to find config file '{CONFIG_FILE}'")
sys.exit()

-with open(CONFIG_FILE, 'r') as f:
+with open(CONFIG_FILE, "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

print("Running grafana backup")

# File paths for Grafana configuration, database files, and plugins folder
-GRAFANA_INI_FILE = '/etc/grafana/grafana.ini'
-GRAFANA_DB_FILE = '/var/lib/grafana/grafana.db'
-GRAFANA_PLUGINS_FOLDER = '/var/lib/grafana/plugins/'
+GRAFANA_INI_FILE = "/etc/grafana/grafana.ini"
+GRAFANA_DB_FILE = "/var/lib/grafana/grafana.db"
+GRAFANA_PLUGINS_FOLDER = "/var/lib/grafana/plugins/"

# Temporary backup file names
-TMP_BACKUP_INI_FILE = '/tmp/grafana_backup_ini.ini'
-TMP_BACKUP_DB_FILE = '/tmp/grafana_backup_db.db'
-TMP_BACKUP_PLUGINS_ZIP = '/tmp/grafana_backup_plugins.zip'
+TMP_BACKUP_INI_FILE = "/tmp/grafana_backup_ini.ini"
+TMP_BACKUP_DB_FILE = "/tmp/grafana_backup_db.db"
+TMP_BACKUP_PLUGINS_ZIP = "/tmp/grafana_backup_plugins.zip"

# Backup path target file names
-BACKUP_INI_FILE = 'grafana.ini'
-BACKUP_DB_FILE = 'grafana.db'
-BACKUP_PLUGINS_ZIP = 'grafana_plugins.zip'
+BACKUP_INI_FILE = "grafana.ini"
+BACKUP_DB_FILE = "grafana.db"
+BACKUP_PLUGINS_ZIP = "grafana_plugins.zip"

# Copy files to temporary backup files
print("Copying files to temporary backup files")
@@ -42,20 +42,25 @@

# Zip plugins folder
print("Zipping plugins folder")
-archive_path, _ = os.path.splitext(TMP_BACKUP_PLUGINS_ZIP)  # Remove extension as make_archive will add it
-shutil.make_archive(archive_path, 'zip', GRAFANA_PLUGINS_FOLDER)
+archive_path, _ = os.path.splitext(
+    TMP_BACKUP_PLUGINS_ZIP
+)  # Remove extension as make_archive will add it
+shutil.make_archive(archive_path, "zip", GRAFANA_PLUGINS_FOLDER)

# Initialize Backblaze B2 API
print("Initializing Backblaze B2 API")
info = InMemoryAccountInfo()
b2_api = B2Api(info)
-b2_api.authorize_account("production", config["b2"]["app_key_id"], config["b2"]["app_key"])
+b2_api.authorize_account(
+    "production", config["b2"]["app_key_id"], config["b2"]["app_key"]
+)
bucket = b2_api.get_bucket_by_name(config["b2"]["bucket"])


# Upload file to Backblaze B2 and delete original file
def upload_to_b2(file_name, object_name):
    try:
-        with open(file_name, 'rb') as f:
+        with open(file_name, "rb") as f:
            data = f.read()
        source = UploadSourceBytes(data)
        bucket.upload(source, object_name)
@@ -65,7 +70,8 @@ def upload_to_b2(file_name, object_name):
    except Exception as e:
        print(f"File {file_name} could not be uploaded to Backblaze B2. Error: {e}")
        return False



print("Uploading files to Backblaze B2")
success = True
success &= upload_to_b2(TMP_BACKUP_INI_FILE, BACKUP_INI_FILE)
@@ -78,16 +84,18 @@ def upload_to_b2(file_name, object_name):
print("Grafana backup failed")

print("Logging to influx")
-json_body = [{
+json_body = [
+    {
"measurement": "backup",
"tags": {
"host": HOST_NAME,
},
"fields": {
"success": 1.0 if success else 0.0,
-        }
-}]
-client = InfluxDBClient(**config['influx'])
+        },
+    }
+]
+client = InfluxDBClient(**config["influx"])
print(json_body)
client.write_points(json_body)
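The os.path.splitext change above matters because shutil.make_archive appends the format's extension itself; passing the full .zip name would produce grafana_backup_plugins.zip.zip. A small sketch of the behavior, using a temporary directory as a stand-in for the plugins folder:

import os
import shutil
import tempfile

src_dir = tempfile.mkdtemp()  # stand-in for GRAFANA_PLUGINS_FOLDER

# make_archive adds ".zip" itself, so strip the extension from the target name first.
base, _ = os.path.splitext("/tmp/grafana_backup_plugins.zip")
archive = shutil.make_archive(base, "zip", src_dir)
print(archive)  # /tmp/grafana_backup_plugins.zip, not ...zip.zip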

33 changes: 25 additions & 8 deletions backups/object-backup/object-backup.py
@@ -74,22 +74,32 @@ def get_archive_key(key):
    return os.path.join(config["archive"]["prefix"], key)


+def dont_backup_key(key):
+    if key.endswith("-thumb"):
+        return True


# It is very easy to configure this script to upload to the wrong bucket, so this checks that at
# least 60 out of a random 100 recordings are already on the target bucket, meaning it's probably
# the correct bucket.
# TODO: Make an API request to the server to get a random sample of keys from the target bucket.
print(
    "Check that some files already match, as a way of checking that the correct bucket/prefix is being used."
)
keys = []
i = 0
-for obj in local_bucket.objects.page_size(10000):
+keys_sample_size = 10000
+for obj in local_bucket.objects.page_size(1000):
+    if dont_backup_key(obj.key):
+        continue
    keys.append(obj.key)
    i += 1
-    if i >= 10000:
+    if i >= keys_sample_size:
        break

-random_keys = []
-for i in range(100):
-    random_keys.append(random.choice(keys))
+random.shuffle(keys)
+
+# Select the first 100 random keys after shuffling
+random_keys = keys[:100]

matching = 0
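Shuffling and slicing, as above, samples 100 keys without replacement, whereas the old random.choice loop could pick the same key more than once. random.sample expresses the same intent in one call; a quick sketch with made-up keys:

import random

keys = [f"recording-{i}" for i in range(1000)]  # made-up keys for illustration

random.shuffle(keys)
sample_a = keys[:100]  # the approach used above: no duplicates

sample_b = random.sample(keys, 100)  # equivalent, and leaves keys unshuffled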

@@ -106,9 +116,16 @@ def check_matching_key(key):
for key in random_keys:
    executor.submit(check_matching_key, key)

-if matching < 50:
+minimum_matching = 60
+if matching < minimum_matching:
    print(
-        f"{matching} out of 100 objects are already on the target bucket. Canceling backup."
+        textwrap.dedent(
+            f"""
+            Only {matching} out of 100 objects are already on the target bucket, which is below the
+            required minimum of {minimum_matching}. Canceling backup.
+            This can be caused by a bucket misconfiguration, or by not having enough keys to sample
+            from (current sample size: {keys_sample_size}).
+            """
+        )
    )
    time.sleep(2)
    sys.exit(0)
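textwrap.dedent strips the indentation that the triple-quoted string inherits from the surrounding code, so the message prints flush-left; a short demonstration with example values in place of the f-string fields:

import textwrap

matching, minimum_matching = 42, 60  # example values
message = textwrap.dedent(
    f"""
    Only {matching} out of 100 objects are already on the target bucket, which is below the
    required minimum of {minimum_matching}. Canceling backup.
    """
)
print(message)  # the common leading whitespace has been removed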
@@ -138,7 +155,7 @@ def handle_file(obj):
    global file_changed_count
    global matching_count
    try:
-        if obj.key.endswith("-thumb"):
+        if dont_backup_key(obj.key):
            return
        archive_key = os.path.join(config["archive"]["prefix"], obj.key)
        archive_obj = archive_bucket.Object(archive_key)
21 changes: 13 additions & 8 deletions backups/object-recover/object-recover.py
@@ -8,6 +8,7 @@
import os
from minio import Minio


def check_file_exists(minio_client, bucket_name, object_name):
    try:
        minio_client.stat_object(bucket_name, object_name)
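The rest of check_file_exists is collapsed in this view. A typical shape for such a check with minio-py, assuming (hypothetically) that the hidden branch simply returns False on a missing object:

from minio.error import S3Error

def check_file_exists(minio_client, bucket_name, object_name):
    # stat_object raises S3Error (e.g. NoSuchKey) when the object is missing.
    # Hypothetical completion; the actual except branch is collapsed in the diff.
    try:
        minio_client.stat_object(bucket_name, object_name)
        return True
    except S3Error:
        return False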
@@ -50,11 +51,11 @@ def check_file_exists(minio_client, bucket_name, object_name):
object_keys = file_object_keys + raw_file_object_keys

minio_client = Minio(
minio["endpoint"],
access_key=minio["access_key"],
secret_key=minio["secret_key"],
secure=minio["http"],
)
minio["endpoint"],
access_key=minio["access_key"],
secret_key=minio["secret_key"],
secure=minio["http"],
)

print("Finding keys that are not in local object store")
transfers = []
@@ -69,24 +70,28 @@ def check_file_exists(minio_client, bucket_name, object_name):
completed_transfers = 0
lock = Lock()


def transfer_file(source, destination):
    try:
        subprocess.run(
            ["mc", "cp", "--quiet", source, destination],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
-            check=True)
+            check=True,
+        )
        with lock:
            global completed_transfers
            completed_transfers += 1
-            print(f"{completed_transfers}/{len(transfers)} Transferred '{source}' to '{destination}'")
+            print(
+                f"{completed_transfers}/{len(transfers)} Transferred '{source}' to '{destination}'"
+            )
    except subprocess.CalledProcessError as e:
        print(f"Failed to transfer '{source}': {e}")


size = len(transfers)
print(f"Objects to recover: {size}")


with ThreadPoolExecutor(max_workers=20) as executor:
    results = list(executor.map(lambda args: transfer_file(*args), transfers))
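executor.map passes one element of the iterable per call, so the lambda unpacks each (source, destination) tuple. A minimal sketch of the same pattern, with a stand-in for transfer_file and made-up paths:

from concurrent.futures import ThreadPoolExecutor

def transfer_file(source, destination):
    print(f"copy {source} -> {destination}")  # stand-in for the mc cp subprocess call

transfers = [("src/a", "dst/a"), ("src/b", "dst/b")]  # made-up paths

with ThreadPoolExecutor(max_workers=20) as executor:
    # map yields results lazily; list() forces all transfers to complete.
    results = list(executor.map(lambda args: transfer_file(*args), transfers))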
