Skip to content

Commit

Permalink
Moved constants within ObjectStoreLocationProvider
Browse files (browse the repository at this point in the history)
  • Loading branch information
Sreesh Maheshwar committed Jan 10, 2025
1 parent 3555932 commit 55d6c4f
Showing 1 changed file with 13 additions and 11 deletions.
24 changes: 13 additions & 11 deletions pyiceberg/table/locations.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -61,12 +61,11 @@ def new_data_location(self, data_file_name: str, partition_key: Optional[Partiti
return f"{prefix}/{partition_key.to_path()}/{data_file_name}" if partition_key else f"{prefix}/{data_file_name}"


HASH_BINARY_STRING_BITS = 20
ENTROPY_DIR_LENGTH = 4
ENTROPY_DIR_DEPTH = 3


class ObjectStoreLocationProvider(LocationProvider):
HASH_BINARY_STRING_BITS = 20
ENTROPY_DIR_LENGTH = 4
ENTROPY_DIR_DEPTH = 3

_include_partition_paths: bool

def __init__(self, table_location: str, table_properties: Properties):
Expand All @@ -93,18 +92,21 @@ def new_data_location(self, data_file_name: str, partition_key: Optional[Partiti
@staticmethod
def _compute_hash(data_file_name: str) -> str:
# Bitwise AND to combat sign-extension; bitwise OR to preserve leading zeroes that `bin` would otherwise strip.
hash_code = mmh3.hash(data_file_name) & ((1 << HASH_BINARY_STRING_BITS) - 1) | (1 << HASH_BINARY_STRING_BITS)
return ObjectStoreLocationProvider._dirs_from_hash(bin(hash_code)[-HASH_BINARY_STRING_BITS:])
top_mask = 1 << ObjectStoreLocationProvider.HASH_BINARY_STRING_BITS
hash_code = mmh3.hash(data_file_name) & (top_mask - 1) | top_mask
return ObjectStoreLocationProvider._dirs_from_hash(bin(hash_code)[-ObjectStoreLocationProvider.HASH_BINARY_STRING_BITS :])

@staticmethod
def _dirs_from_hash(file_hash: str) -> str:
"""Divides hash into directories for optimized orphan removal operation using ENTROPY_DIR_DEPTH and ENTROPY_DIR_LENGTH."""
total_entropy_length = ObjectStoreLocationProvider.ENTROPY_DIR_DEPTH * ObjectStoreLocationProvider.ENTROPY_DIR_LENGTH

hash_with_dirs = []
for i in range(0, ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH, ENTROPY_DIR_LENGTH):
hash_with_dirs.append(file_hash[i : i + ENTROPY_DIR_LENGTH])
for i in range(0, total_entropy_length, ObjectStoreLocationProvider.ENTROPY_DIR_LENGTH):
hash_with_dirs.append(file_hash[i : i + ObjectStoreLocationProvider.ENTROPY_DIR_LENGTH])

if len(file_hash) > ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH:
hash_with_dirs.append(file_hash[ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH :])
if len(file_hash) > total_entropy_length:
hash_with_dirs.append(file_hash[total_entropy_length:])

return "/".join(hash_with_dirs)

Expand Down

0 comments on commit 55d6c4f

Please sign in to comment.