diff --git a/tfx_bsl/coders/batch_util.py b/tfx_bsl/coders/batch_util.py index f6bfb82..08a0758 100644 --- a/tfx_bsl/coders/batch_util.py +++ b/tfx_bsl/coders/batch_util.py @@ -48,8 +48,8 @@ # this rule is a case when there are many empty features in the encoded # examples, but even then the difference is not significant and it is likely # that the actual size cap will be applied first. -_TARGET_BATCH_BYTES_SIZE = 104_857_600 # 100MB -_BATCH_SIZE_CAP_WITH_BYTE_TARGET = 10000 +_TARGET_BATCH_BYTES_SIZE = 20_971_520 # 20MiB +_BATCH_SIZE_CAP_WITH_BYTE_TARGET = 8192 def _UseByteSizeBatching() -> bool: diff --git a/tfx_bsl/coders/batch_util_test.py b/tfx_bsl/coders/batch_util_test.py index 784660b..a213062 100644 --- a/tfx_bsl/coders/batch_util_test.py +++ b/tfx_bsl/coders/batch_util_test.py @@ -52,12 +52,12 @@ batch_size=None, tfxio_use_byte_size_batching=True, expected_kwargs={ - "min_batch_size": 104_857_600, - "max_batch_size": 104_857_600, + "min_batch_size": batch_util._TARGET_BATCH_BYTES_SIZE, + "max_batch_size": batch_util._TARGET_BATCH_BYTES_SIZE, "element_size_fn": "dummy", }, expected_element_contributions={ - b"dummy": 10486, # Minimal contribution. + b"dummy": 2560, # Minimal contribution. b"dummy" * 10000: 50000, }, ), @@ -66,13 +66,13 @@ batch_size=None, tfxio_use_byte_size_batching=True, expected_kwargs={ - "min_batch_size": 104_857_600, - "max_batch_size": 104_857_600, + "min_batch_size": batch_util._TARGET_BATCH_BYTES_SIZE, + "max_batch_size": batch_util._TARGET_BATCH_BYTES_SIZE, "element_size_fn": "dummy", }, element_size_fn=lambda kv: len(kv[0] or b"") + len(kv[1]), expected_element_contributions={ - (None, b"dummy"): 10486, # Minimal contribution. + (None, b"dummy"): 2560, # Minimal contribution. (b"asd", b"dummy" * 10000): 50003, }, ),