OnDiskIndex: rename ds_buffer_size to max_indexing_size
mrjleo committed Oct 1, 2024
1 parent 4c5911d commit 33b35f8
Showing 2 changed files with 12 additions and 12 deletions.
18 changes: 9 additions & 9 deletions fast_forward/index/disk.py
@@ -19,7 +19,7 @@
class OnDiskIndex(Index):
"""Fast-Forward index that is read on-demand from disk.
- Uses HDF5 via h5py under the hood. The buffer (`ds_buffer_size`) works around a [h5py limitation](https://docs.h5py.org/en/latest/high/dataset.html#fancy-indexing).
+ Uses HDF5 via h5py under the hood. The `max_indexing_size` works around a [h5py limitation](https://docs.h5py.org/en/latest/high/dataset.html#fancy-indexing).
"""

def __init__(
@@ -34,7 +34,7 @@ def __init__(
hdf5_chunk_size: int = None,
max_id_length: int = 8,
overwrite: bool = False,
- ds_buffer_size: int = 2**10,
+ max_indexing_size: int = 2**10,
) -> None:
"""Create an index.
@@ -49,7 +49,7 @@
hdf5_chunk_size (int, optional): Override chunk size used by HDF5. Defaults to None.
max_id_length (int, optional): Maximum length of document and passage IDs (number of characters). Defaults to 8.
overwrite (bool, optional): Overwrite index file if it exists. Defaults to False.
- ds_buffer_size (int, optional): Maximum number of vectors to retrieve from the HDF5 dataset at once. Defaults to 2**10.
+ max_indexing_size (int, optional): Maximum number of vectors to retrieve from the HDF5 dataset at once. Defaults to 2**10.
Raises:
ValueError: When the file exists and `overwrite=False`.
@@ -65,7 +65,7 @@ def __init__(
self._resize_min_val = resize_min_val
self._hdf5_chunk_size = hdf5_chunk_size
self._max_id_length = max_id_length
- self._ds_buffer_size = ds_buffer_size
+ self._max_indexing_size = max_indexing_size

LOGGER.debug("creating file %s", self._index_file)
with h5py.File(self._index_file, "w") as fp:
@@ -269,8 +269,8 @@ def _get_vectors(self, ids: Iterable[str]) -> Tuple[np.ndarray, List[List[int]]]
# reading all vectors at once slows h5py down significantly, so we read them in chunks and concatenate
vectors = np.concatenate(
[
fp["vectors"][vec_idxs[i : i + self._ds_buffer_size]]
for i in range(0, len(vec_idxs), self._ds_buffer_size)
fp["vectors"][vec_idxs[i : i + self._max_indexing_size]]
for i in range(0, len(vec_idxs), self._max_indexing_size)
]
)
return vectors, [id_to_idxs[id] for id in ids]
@@ -300,7 +300,7 @@ def load(
mode: Mode = Mode.MAXP,
encoder_batch_size: int = 32,
resize_min_val: int = 2**10,
- ds_buffer_size: int = 2**10,
+ max_indexing_size: int = 2**10,
) -> "OnDiskIndex":
"""Open an existing index on disk.
@@ -310,7 +310,7 @@
mode (Mode, optional): Ranking mode. Defaults to Mode.MAXP.
encoder_batch_size (int, optional): Batch size for query encoder. Defaults to 32.
resize_min_val (int, optional): Minimum value to increase index size by. Defaults to 2**10.
- ds_buffer_size (int, optional): Maximum number of vectors to retrieve from the HDF5 dataset at once. Defaults to 2**10.
+ max_indexing_size (int, optional): Maximum number of vectors to retrieve from the HDF5 dataset at once. Defaults to 2**10.
Returns:
OnDiskIndex: The index.
@@ -326,7 +326,7 @@
)
index._index_file = index_file.absolute()
index._resize_min_val = resize_min_val
- index._ds_buffer_size = ds_buffer_size
+ index._max_indexing_size = max_indexing_size

# deserialize quantizer if any
with h5py.File(index_file, "r") as fp:
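For context, the chunked read in `_get_vectors` above is the workaround for the linked h5py fancy-indexing limitation: fetching all rows in one fancy-indexed access is slow, so rows are read at most `max_indexing_size` at a time and concatenated. Below is a minimal, self-contained sketch of the same pattern; the file name, dataset shape, and indices are illustrative only and not taken from the library.

```python
import h5py
import numpy as np

# Toy stand-in for an index file; names and sizes are illustrative only.
with h5py.File("example.h5", "w") as fp:
    fp.create_dataset("vectors", data=np.random.normal(size=(4096, 16)))

max_indexing_size = 2**10  # default used by OnDiskIndex

# Row indices to fetch; h5py fancy indexing expects them in increasing order.
vec_idxs = np.arange(0, 4096, 3)

with h5py.File("example.h5", "r") as fp:
    # One big fancy-indexed read is slow in h5py, so read at most
    # `max_indexing_size` rows per access and concatenate the chunks.
    vectors = np.concatenate(
        [
            fp["vectors"][vec_idxs[i : i + max_indexing_size]]
            for i in range(0, len(vec_idxs), max_indexing_size)
        ]
    )

print(vectors.shape)  # (1366, 16)
```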
6 changes: 3 additions & 3 deletions tests/test_index.py
@@ -571,11 +571,11 @@ def test_max_id_length(self):
self.assertEqual(index.psg_ids, set(psg_ids_ok))
self.assertEqual(16, len(index))

- def test_ds_buffer_size(self):
+ def test_max_indexing_size(self):
index = OnDiskIndex(
self.temp_dir / "ds_buffer_size_index.h5",
self.temp_dir / "max_indexing_size_index.h5",
mode=Mode.PASSAGE,
- ds_buffer_size=5,
+ max_indexing_size=5,
)
psg_reps = np.random.normal(size=(16, 16))
psg_ids = [f"p{i}" for i in range(16)]
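The renamed parameter is passed the same way when creating or re-opening an index, as the updated test shows. A usage sketch follows; the import paths are assumptions inferred from this diff's file layout (`fast_forward/index/disk.py`), and the `Mode` import location and `my_index.h5` path are likewise illustrative.

```python
from pathlib import Path

from fast_forward.index import Mode  # assumed import location for Mode
from fast_forward.index.disk import OnDiskIndex

# Create a new index; read at most 512 vectors from HDF5 per access.
index = OnDiskIndex(
    Path("my_index.h5"),
    mode=Mode.PASSAGE,
    max_indexing_size=512,
    overwrite=True,  # replace the file if it already exists
)

# Re-open an existing index on disk with the same setting.
index = OnDiskIndex.load(
    Path("my_index.h5"),
    mode=Mode.MAXP,
    max_indexing_size=512,
)
```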
