Skip to content

Commit

Permalink
Handle rare case where GCP does not return size, updated and md5Hash …
Browse files Browse the repository at this point in the history
…fields
  • Loading branch information
giacomo-alzetta-aiven committed Nov 20, 2023
1 parent 638cebe commit ee636db
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 11 deletions.
22 changes: 12 additions & 10 deletions rohmu/object_storage/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,16 +380,18 @@ def initial_op(domain: Any) -> HttpRequest:
self.log.warning("list_iter: directory entry %r", item)
continue # skip directory level objects

yield IterKeyItem(
type=KEY_TYPE_OBJECT,
value={
"name": self.format_key_from_backend(item["name"]),
"size": int(item["size"]),
"last_modified": parse_timestamp(item["updated"]),
"metadata": item.get("metadata", {}),
"md5": base64_to_hex(item["md5Hash"]),
},
)
value = {
"name": self.format_key_from_backend(item["name"]),
"metadata": item.get("metadata", {}),
}
# in very rare circumstances size, updated and md5Hash can be missing. Omit the keys if that happens
if (size := item.get("size")) is not None:
value["size"] = int(size)
if (updated := item.get("updated")) is not None:
value["last_modified"] = parse_timestamp(updated)
if (md5 := item.get("md5Hash")) is not None:
value["md5"] = base64_to_hex(md5)
yield IterKeyItem(type=KEY_TYPE_OBJECT, value=value)
elif property_name == "prefixes":
for prefix in items:
yield IterKeyItem(type=KEY_TYPE_PREFIX, value=self.format_key_from_backend(prefix).rstrip("/"))
Expand Down
79 changes: 78 additions & 1 deletion test/object_storage/test_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@
from __future__ import annotations

from contextlib import ExitStack
from datetime import datetime
from datetime import datetime, UTC
from googleapiclient.http import MediaUploadProgress
from io import BytesIO
from rohmu.common.models import StorageOperation
from rohmu.errors import InvalidByteRangeError
from rohmu.object_storage.base import IterKeyItem
from rohmu.object_storage.google import GoogleTransfer, MediaIoBaseDownloadWithByteRange, Reporter
from tempfile import NamedTemporaryFile
from unittest.mock import ANY, call, MagicMock, Mock, patch

import base64
import pytest


Expand Down Expand Up @@ -211,3 +213,78 @@ def test_media_io_download_with_byte_range_and_very_small_object() -> None:
assert status.progress() == 1.0
assert result.getvalue() == b"lo, World!"
mock_request.http.request.assert_called_once_with(ANY, ANY, headers={"range": "bytes=3-100"})


def test_object_listed_when_missing_md5hash_size_and_updated() -> None:
notifier = MagicMock()
with ExitStack() as stack:
stack.enter_context(patch("rohmu.object_storage.google.get_credentials"))
stack.enter_context(patch("rohmu.object_storage.google.GoogleTransfer.get_or_create_bucket"))
mock_operation = stack.enter_context(patch("rohmu.common.statsd.StatsClient.operation"))
transfer = GoogleTransfer(
project_id="test-project-id",
bucket_name="test-bucket",
notifier=notifier,
)

# mock instance because there is decorator and context managers in the way
mock_client = stack.enter_context(patch.object(transfer, "_object_client"))
mock_client.return_value.__enter__.return_value.list_next.return_value = None
object_name = (
"aiventest/111aa1aa-1aaa-1111-11a1-11111aaaaa11/a1111111-aaa1-1aaa-aa1a-1a11aaaa11a1"
"/tiered_storage/ccs/aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
)
escaped_name = object_name.replace("/", "%2F")

# API response missing size, updated & md5Hash fields
sample_item = {
"bucket": "test-bucket",
"contentType": "binary/octet-stream",
"generation": "1111111111111111",
"id": f"test-bucket/{object_name}/1111111111111111",
"kind": "storage#object",
"mediaLink": f"https://storage.googleapis.com/download/storage/v1/b/test-bucket/o/"
f"{escaped_name}?generation=1111111111111111&alt=media",
"metageneration": "1",
"name": object_name,
"selfLink": f"https://www.googleapis.com/storage/v1/b/"
f"p812de5da-0bab-4990-90e8-57303eebfd30-99012089cf1d961516b8b3ff6/o/"
f"{escaped_name}?generation=1111111111111111",
"storageClass": "REGIONAL",
}
mock_client.return_value.__enter__.return_value.list.return_value.execute.return_value = {
"items": [
sample_item,
{"size": 100, **sample_item},
{"md5Hash": base64.encodebytes(b"Missing md5Hash!"), **sample_item},
{"updated": "2023-11-20T16:18:00+00:00", **sample_item},
]
}

got = list(
transfer.iter_key(
key="testkey",
with_metadata=False,
deep=True,
include_key=False,
)
)
assert mock_operation.call_count == 1
mock_operation.assert_has_calls(
[
call(operation=StorageOperation.iter_key),
]
)
expected = [
IterKeyItem(type="object", value={"name": object_name, "metadata": {}}),
IterKeyItem(type="object", value={"name": object_name, "metadata": {}, "size": 100}),
IterKeyItem(
type="object", value={"name": object_name, "metadata": {}, "md5": "4d697373696e67206d64354861736821"}
),
IterKeyItem(
type="object",
value={"name": object_name, "metadata": {}, "last_modified": datetime(2023, 11, 20, 16, 18, tzinfo=UTC)},
),
]
assert len(got) == len(expected)
assert got == expected

0 comments on commit ee636db

Please sign in to comment.