From c294570bcede61695f038122cc5b5c266da9b65f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 22 Jan 2024 21:22:54 +0000 Subject: [PATCH] Mas d31 nhskv16sst (#428) * Add performance/profiling test Add test to perf_SUITE to do performance tests and also profile different activities in leveled. This can then be used to highlight functions with unexpectedly high execution times, and prove the impact of changes. Switch between riak_ctperf and riak_fullperf to change from standard test (with profile option) to full-scale performance test * Change shape of default perfTest * Refactor SST Compare and contrast profile for guess, before and after refactor: pre ``` lists:map_1/2 313370 2.33 32379 [ 0.10] lists:foldl_1/3 956590 4.81 66992 [ 0.07] leveled_sst:'-expand_list_by_pointer/5-fun-0-'/4 925020 6.13 85318 [ 0.09] erlang:binary_to_term/1 3881 8.55 119012 [ 30.67] erlang:'++'/2 974322 11.55 160724 [ 0.16] lists:member/2 4000180 15.00 208697 [ 0.05] leveled_sst:find_pos/4 4029220 21.01 292347 [ 0.07] leveled_sst:member_check/2 4000000 21.17 294601 [ 0.07] -------------------------------------------------- -------- ------- ------- [----------] Total: 16894665 100.00% 1391759 [ 0.08] ``` post ``` lists:map_1/2 63800 0.79 6795 [ 0.11] erlang:term_to_binary/1 15726 0.81 6950 [ 0.44] lists:keyfind/3 180967 0.92 7884 [ 0.04] erlang:spawn_link/3 15717 1.08 9327 [ 0.59] leveled_sst:'-read_slots/5-fun-1-'/8 31270 1.15 9895 [ 0.32] gen:do_call/4 7881 1.31 11243 [ 1.43] leveled_penciller:find_nextkey/8 180936 2.01 17293 [ 0.10] prim_file:pread_nif/3 15717 3.89 33437 [ 2.13] leveled_sst:find_pos/4 4028940 17.85 153554 [ 0.04] erlang:binary_to_term/1 15717 51.97 447048 [ 28.44] -------------------------------------------------- ------- ------- ------ [----------] Total: 6704100 100.00% 860233 [ 0.13] ``` * Update leveled_penciller.erl * Mas d31 nhskv16sstpcl (#426) Performance updates to leveled: - Refactoring of pointer expansion when fetching from leveled_sst files to avoid 
expensive list concatenation. - Refactoring of leveled_ebloom to make more flexible, reduce code, and improve check time. - Refactoring of querying within leveled_sst to reduce the number of blocks that need to be de-serialised per query. - Refactoring of the leveled_penciller's query key comparator, to make use of maps and simplify the filtering. - General speed-up of frequently called functions. --- include/leveled.hrl | 2 +- src/leveled_codec.erl | 113 +- src/leveled_ebloom.erl | 657 ++------- src/leveled_pclerk.erl | 2 +- src/leveled_penciller.erl | 792 +++++----- src/leveled_pmanifest.erl | 56 +- src/leveled_pmem.erl | 10 +- src/leveled_runner.erl | 66 +- src/leveled_sst.erl | 2159 ++++++++++++++-------------- src/leveled_util.erl | 8 +- test/end_to_end/iterator_SUITE.erl | 62 +- test/end_to_end/testutil.erl | 11 + 12 files changed, 1821 insertions(+), 2117 deletions(-) diff --git a/include/leveled.hrl b/include/leveled.hrl index 8f79da6e..55b82816 100644 --- a/include/leveled.hrl +++ b/include/leveled.hrl @@ -84,7 +84,7 @@ end_key :: tuple() | undefined, owner :: pid()|list(), filename :: string() | undefined, - bloom :: binary() | none | undefined}). + bloom = none :: leveled_ebloom:bloom() | none}). -record(cdb_options, {max_size :: pos_integer() | undefined, diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 462a3956..1c92344b 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -18,7 +18,6 @@ strip_to_keyseqonly/1, strip_to_indexdetails/1, striphead_to_v1details/1, - is_active/3, endkey_passed/2, key_dominates/2, maybe_reap_expiredkey/2, @@ -48,7 +47,10 @@ to_lookup/1, next_key/1, return_proxy/4, - get_metadata/1]). + get_metadata/1, + maybe_accumulate/5, + accumulate_index/2, + count_tombs/2]). -define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w"). -define(NRT_IDX, "$aae."). @@ -251,22 +253,79 @@ striphead_to_v1details(V) -> get_metadata(LV) -> element(4, LV). 
--spec key_dominates(ledger_kv(), ledger_kv()) -> - left_hand_first|right_hand_first|left_hand_dominant|right_hand_dominant. +-spec maybe_accumulate( + list(leveled_codec:ledger_kv()), + term(), + non_neg_integer(), + {pos_integer(), {non_neg_integer(), non_neg_integer()|infinity}}, + leveled_penciller:pclacc_fun()) + -> {term(), non_neg_integer()}. +%% @doc +%% Make an accumulation decision based on the date range and also the expiry +%% status of the ledger key and value Needs to handle v1 and v2 values. When +%% folding over heads -> v2 values, index-keys -> v1 values. +maybe_accumulate([], Acc, Count, _Filter, _Fun) -> + {Acc, Count}; +maybe_accumulate( + [{K, {_SQN, {active, TS}, _SH, _MD, undefined}=V}|T], + Acc, Count, {Now, _ModRange}=Filter, AccFun) + when TS >= Now -> + maybe_accumulate(T, AccFun(K, V, Acc), Count + 1, Filter, AccFun); +maybe_accumulate( + [{K, {_SQN, {active, TS}, _SH, _MD}=V}|T], + Acc, Count, {Now, _ModRange}=Filter, AccFun) + when TS >= Now -> + maybe_accumulate(T, AccFun(K, V, Acc), Count + 1, Filter, AccFun); +maybe_accumulate( + [{_K, {_SQN, tomb, _SH, _MD, _LMD}}|T], + Acc, Count, Filter, AccFun) -> + maybe_accumulate(T, Acc, Count, Filter, AccFun); +maybe_accumulate( + [{_K, {_SQN, tomb, _SH, _MD}}|T], + Acc, Count, Filter, AccFun) -> + maybe_accumulate(T, Acc, Count, Filter, AccFun); +maybe_accumulate( + [{K, {_SQN, {active, TS}, _SH, _MD, LMD}=V}|T], + Acc, Count, {Now, {LowDate, HighDate}}=Filter, AccFun) + when TS >= Now, LMD >= LowDate, LMD =< HighDate -> + maybe_accumulate(T, AccFun(K, V, Acc), Count + 1, Filter, AccFun); +maybe_accumulate( + [_LV|T], + Acc, Count, Filter, AccFun) -> + maybe_accumulate(T, Acc, Count, Filter, AccFun). + +-spec accumulate_index( + {boolean(), undefined|leveled_runner:mp()}, leveled_runner:acc_fun()) + -> any(). 
+accumulate_index({false, undefined}, FoldKeysFun) -> + fun({?IDX_TAG, Bucket, _IndexInfo, ObjKey}, _Value, Acc) -> + FoldKeysFun(Bucket, ObjKey, Acc) + end; +accumulate_index({true, undefined}, FoldKeysFun) -> + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc) + end; +accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + case re:run(IdxValue, TermRegex) of + nomatch -> + Acc; + _ -> + case AddTerm of + true -> + FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc); + false -> + FoldKeysFun(Bucket, ObjKey, Acc) + end + end + end. + +-spec key_dominates(ledger_kv(), ledger_kv()) -> boolean(). %% @doc %% When comparing two keys in the ledger need to find if one key comes before %% the other, or if the match, which key is "better" and should be the winner -key_dominates({LK, _LVAL}, {RK, _RVAL}) when LK < RK -> - left_hand_first; -key_dominates({LK, _LVAL}, {RK, _RVAL}) when RK < LK -> - right_hand_first; key_dominates(LObj, RObj) -> - case strip_to_seqonly(LObj) >= strip_to_seqonly(RObj) of - true -> - left_hand_dominant; - false -> - right_hand_dominant - end. + strip_to_seqonly(LObj) >= strip_to_seqonly(RObj). -spec maybe_reap_expiredkey(ledger_kv(), {boolean(), integer()}) -> boolean(). %% @doc @@ -286,20 +345,18 @@ maybe_reap(tomb, {true, _CurrTS}) -> maybe_reap(_, _) -> false. --spec is_active(ledger_key(), ledger_value(), non_neg_integer()) -> boolean(). -%% @doc -%% Is this an active KV pair or has the timestamp expired -is_active(Key, Value, Now) -> - case strip_to_statusonly({Key, Value}) of - {active, infinity} -> - true; - tomb -> - false; - {active, TS} when TS >= Now -> - true; - {active, _TS} -> - false - end. +-spec count_tombs( + list(ledger_kv()), non_neg_integer()|not_counted) -> + non_neg_integer()|not_counted. 
+count_tombs(_List, not_counted) -> + not_counted; +count_tombs([], Count) -> + Count; +count_tombs([{_K, V}|T], Count) when element(2, V) == tomb -> + count_tombs(T, Count + 1); +count_tombs([_KV|T], Count) -> + count_tombs(T, Count). + -spec from_ledgerkey(atom(), tuple()) -> false|tuple(). %% @doc diff --git a/src/leveled_ebloom.erl b/src/leveled_ebloom.erl index f5050edc..17cb9384 100644 --- a/src/leveled_ebloom.erl +++ b/src/leveled_ebloom.erl @@ -1,23 +1,36 @@ %% -------- TinyBloom --------- %% -%% A fixed size bloom that supports 32K keys only, made to try and minimise -%% the cost of producing the bloom -%% - +%% A 1-byte per key bloom filter with a 5% fpr. Pre-prepared segment hashes +%% (a leveled codec type) are, used for building and checking - the filter +%% splits a single hash into a 1 byte slot identifier, and 2 x 12 bit hashes +%% (so k=2, although only a single hash is used). +%% +%% The filter is designed to support a maximum of 64K keys, larger numbers of +%% keys will see higher fprs - with a 40% fpr at 250K keys. +%% +%% The filter uses the second "Extra Hash" part of the segment-hash to ensure +%% no overlap of fpr with the leveled_sst find_pos function. +%% +%% The completed bloom is a binary - to minimise the cost of copying between +%% processes and holding in memory. -module(leveled_ebloom). --include("include/leveled.hrl"). - -export([ create_bloom/1, check_hash/2 ]). --define(BLOOM_SIZE_BYTES, 512). --define(INTEGER_SIZE, 4096). --define(BAND_MASK, ?INTEGER_SIZE - 1). - +-define(BLOOM_SLOTSIZE_BYTES, 512). +-define(INTEGER_SLICE_SIZE, 64). +-define(INTEGER_SLICES, 64). + % i.e. ?INTEGER_SLICES * ?INTEGER_SLICE_SIZE = ?BLOOM_SLOTSIZE_BYTES div 8 +-define(MASK_BSR, 6). + % i.e. 2 ^ (12 - 6) = ?INTEGER_SLICES +-define(MASK_BAND, 63). + % i.e. integer slize size - 1 +-define(SPLIT_BAND, 4095). + % i.e. (?BLOOM_SLOTSIZE_BYTES * 8) - 1 -type bloom() :: binary(). 
@@ -29,64 +42,39 @@ -spec create_bloom(list(leveled_codec:segment_hash())) -> bloom(). %% @doc -%% Create a binary bloom filter from a list of hashes +%% Create a binary bloom filter from a list of hashes. In the leveled +%% implementation the hashes are leveled_codec:segment_hash/0 type, but only +%% a single 32-bit hash (the second element of the tuple is actually used in +%% the building of the bloom filter create_bloom(HashList) -> - case length(HashList) of - 0 -> - <<>>; - L when L > 32768 -> - {HL0, HL1} = - lists:partition(fun({_, Hash}) -> Hash band 32 == 0 end, - HashList), - Bin1 = - add_hashlist(HL0, - 32, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0), - Bin2 = - add_hashlist(HL1, - 32, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0), - <>; - L when L > 16384 -> - add_hashlist(HashList, - 32, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0); - L when L > 4096 -> - add_hashlist(HashList, - 16, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0); - L when L > 2048 -> - add_hashlist(HashList, 4, 0, 0, 0, 0); - _ -> - add_hashlist(HashList, 2, 0, 0) - end. - + SlotCount = + case length(HashList) of + 0 -> + 0; + L -> + min(128, max(2, (L - 1) div 512)) + end, + SlotHashes = + map_hashes( + HashList, + list_to_tuple(lists:duplicate(SlotCount, [])), + SlotCount + ), + build_bloom(SlotHashes, SlotCount). -spec check_hash(leveled_codec:segment_hash(), bloom()) -> boolean(). %% @doc -%% Check for the presence of a given hash within a bloom +%% Check for the presence of a given hash within a bloom. Only the second +%% element of the leveled_codec:segment_hash/0 type is used - a 32-bit hash. 
check_hash(_Hash, <<>>) -> false; -check_hash({_SegHash, Hash}, BloomBin) -> - SlotSplit = byte_size(BloomBin) div ?BLOOM_SIZE_BYTES, - {Slot, Hashes} = split_hash(Hash, SlotSplit), - Mask = get_mask(Hashes), - Pos = Slot * ?BLOOM_SIZE_BYTES, - IntSize = ?INTEGER_SIZE, - <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, - case CheckInt band Mask of - Mask -> - true; +check_hash({_SegHash, Hash}, BloomBin) when is_binary(BloomBin)-> + SlotSplit = byte_size(BloomBin) div ?BLOOM_SLOTSIZE_BYTES, + {Slot, [H0, H1]} = split_hash(Hash, SlotSplit), + Pos = ((Slot + 1) * ?BLOOM_SLOTSIZE_BYTES) - 1, + case match_hash(BloomBin, Pos - (H0 div 8), H0 rem 8) of + true -> + match_hash(BloomBin, Pos - (H1 div 8), H1 rem 8); _ -> false end. @@ -95,408 +83,78 @@ check_hash({_SegHash, Hash}, BloomBin) -> %%% Internal Functions %%%============================================================================ +-type slot_count() :: 0|2..128. +-type bloom_hash() :: 0..16#FFF. +-type external_hash() :: 0..16#FFFFFFFF. + +-spec map_hashes( + list(leveled_codec:segment_hash()), tuple(), slot_count()) -> tuple(). +map_hashes([], HashListTuple, _SlotCount) -> + HashListTuple; +map_hashes([Hash|Rest], HashListTuple, SlotCount) -> + {Slot, Hashes} = split_hash(element(2, Hash), SlotCount), + SlotHL = element(Slot + 1, HashListTuple), + map_hashes( + Rest, + setelement(Slot + 1, HashListTuple, Hashes ++ SlotHL), + SlotCount). + +-spec split_hash(external_hash(), slot_count()) + -> {non_neg_integer(), [bloom_hash()]}. split_hash(Hash, SlotSplit) -> Slot = (Hash band 255) rem SlotSplit, - H0 = (Hash bsr 8) band (?BAND_MASK), - H1 = (Hash bsr 20) band (?BAND_MASK), + H0 = (Hash bsr 8) band ?SPLIT_BAND, + H1 = (Hash bsr 20) band ?SPLIT_BAND, {Slot, [H0, H1]}. -get_mask([H0, H1]) -> - (1 bsl H0) bor (1 bsl H1). 
- - -%% This looks ugly and clunky, but in tests it was quicker than modifying an -%% Erlang term like an array as it is passed around the loop - -add_hashlist([], _S, S0, S1) -> - IntSize = ?INTEGER_SIZE, - <>; -add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> - {Slot, Hashes} = split_hash(TopHash, SlotSplit), - Mask = get_mask(Hashes), - case Slot of - 0 -> - add_hashlist(T, SlotSplit, S0 bor Mask, S1); - 1 -> - add_hashlist(T, SlotSplit, S0, S1 bor Mask) - end. - -add_hashlist([], _S, S0, S1, S2, S3) -> - IntSize = ?INTEGER_SIZE, - <>; -add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> - {Slot, Hashes} = split_hash(TopHash, SlotSplit), - Mask = get_mask(Hashes), - case Slot of - 0 -> - add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); - 1 -> - add_hashlist(T, SlotSplit, S0, S1 bor Mask, S2, S3); - 2 -> - add_hashlist(T, SlotSplit, S0, S1, S2 bor Mask, S3); - 3 -> - add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask) - end. - -add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15) -> - IntSize = ?INTEGER_SIZE, - <>; -add_hashlist([{_SegHash, TopHash}|T], - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15) -> - {Slot, Hashes} = split_hash(TopHash, SlotSplit), - Mask = get_mask(Hashes), - case Slot of - 0 -> - add_hashlist(T, - SlotSplit, - S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 1 -> - add_hashlist(T, - SlotSplit, - S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 2 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 3 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 4 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 5 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5 
bor Mask, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 6 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 7 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, - S8, S9, S10, S11, S12, S13, S14, S15); - 8 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8 bor Mask, S9, S10, S11, S12, S13, S14, S15); - 9 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9 bor Mask, S10, S11, S12, S13, S14, S15); - 10 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10 bor Mask, S11, S12, S13, S14, S15); - 11 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11 bor Mask, S12, S13, S14, S15); - 12 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12 bor Mask, S13, S14, S15); - 13 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13 bor Mask, S14, S15); - 14 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14 bor Mask, S15); - 15 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15 bor Mask) - end. 
- - -add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31) -> - IntSize = ?INTEGER_SIZE, - <>; -add_hashlist([{_SegHash, TopHash}|T], - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31) -> - {Slot, Hashes} = split_hash(TopHash, SlotSplit), - Mask = get_mask(Hashes), - case Slot of - 0 -> - add_hashlist(T, - SlotSplit, - S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 1 -> - add_hashlist(T, - SlotSplit, - S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 2 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 3 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 4 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 5 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 6 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 7 -> - add_hashlist(T, - 
SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 8 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8 bor Mask, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 9 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9 bor Mask, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 10 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10 bor Mask, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 11 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11 bor Mask, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 12 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12 bor Mask, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 13 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13 bor Mask, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 14 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14 bor Mask, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 15 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15 bor Mask, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 16 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16 bor Mask, S17, S18, S19, S20, S21, S22, S23, - 
S24, S25, S26, S27, S28, S29, S30, S31); - 17 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17 bor Mask, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 18 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18 bor Mask, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 19 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19 bor Mask, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 20 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20 bor Mask, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 21 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21 bor Mask, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 22 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22 bor Mask, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 23 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23 bor Mask, - S24, S25, S26, S27, S28, S29, S30, S31); - 24 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24 bor Mask, S25, S26, S27, S28, S29, S30, S31); - 25 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25 bor Mask, S26, S27, S28, S29, S30, S31); - 26 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, 
S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26 bor Mask, S27, S28, S29, S30, S31); - 27 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27 bor Mask, S28, S29, S30, S31); - 28 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28 bor Mask, S29, S30, S31); - 29 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29 bor Mask, S30, S31); - 30 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30 bor Mask, S31); - 31 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31 bor Mask) - - end. - +-spec match_hash(bloom(), non_neg_integer(), 0..16#FF) -> boolean(). +match_hash(BloomBin, Pos, Hash) -> + <<_Pre:Pos/binary, CheckInt:8/integer, _Rest/binary>> = BloomBin, + (CheckInt bsr Hash) band 1 == 1. + +-spec build_bloom(tuple(), slot_count()) -> bloom(). +build_bloom(_SlotHashes, 0) -> + <<>>; +build_bloom(SlotHashes, SlotCount) when SlotCount > 0 -> + lists:foldr( + fun(I, AccBin) -> + HashList = element(I, SlotHashes), + SlotBin = + add_hashlist( + lists:usort(HashList), 0, 1, ?INTEGER_SLICES, <<>>), + <> + end, + <<>>, + lists:seq(1, SlotCount) + ). + +-spec add_hashlist( + list(bloom_hash()), + non_neg_integer(), + non_neg_integer(), + 0..?INTEGER_SLICES, + binary()) -> bloom(). 
+add_hashlist([], ThisSlice, SliceCount, SliceCount, AccBin) -> + <>; +add_hashlist([], ThisSlice, SliceNumber, SliceCount, AccBin) -> + add_hashlist( + [], + 0, + SliceNumber + 1, + SliceCount, + <>); +add_hashlist([H0|Rest], ThisSlice, SliceNumber, SliceCount, AccBin) + when ((H0 bsr ?MASK_BSR) + 1) == SliceNumber -> + Mask0 = 1 bsl (H0 band (?MASK_BAND)), + add_hashlist( + Rest, ThisSlice bor Mask0, SliceNumber, SliceCount, AccBin); +add_hashlist(Rest, ThisSlice, SliceNumber, SliceCount, AccBin) -> + add_hashlist( + Rest, + 0, + SliceNumber + 1, + SliceCount, + <>). %%%============================================================================ %%% Test @@ -507,11 +165,7 @@ add_hashlist([{_SegHash, TopHash}|T], -include_lib("eunit/include/eunit.hrl"). generate_orderedkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> - generate_orderedkeys(Seqn, - Count, - [], - BucketRangeLow, - BucketRangeHigh). + generate_orderedkeys(Seqn, Count, [], BucketRangeLow, BucketRangeHigh). generate_orderedkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) -> Acc; @@ -521,17 +175,12 @@ generate_orderedkeys(Seqn, Count, Acc, BucketLow, BucketHigh) -> io_lib:format("K~4..0B", [BucketLow + BNumber]), KeyExt = io_lib:format("K~8..0B", [Seqn * 100 + leveled_rand:uniform(100)]), - LK = leveled_codec:to_ledgerkey("Bucket" ++ BucketExt, "Key" ++ KeyExt, o), Chunk = leveled_rand:rand_bytes(16), {_B, _K, MV, _H, _LMs} = leveled_codec:generate_ledgerkv(LK, Seqn, Chunk, 64, infinity), - generate_orderedkeys(Seqn + 1, - Count - 1, - [{LK, MV}|Acc], - BucketLow, - BucketHigh). - + generate_orderedkeys( + Seqn + 1, Count - 1, [{LK, MV}|Acc], BucketLow, BucketHigh). get_hashlist(N) -> KVL = generate_orderedkeys(1, N, 1, 20), @@ -560,16 +209,16 @@ check_neg_hashes(BloomBin, HashList, Counters) -> end, lists:foldl(CheckFun, Counters, HashList). - empty_bloom_test() -> BloomBin0 = create_bloom([]), - ?assertMatch({0, 4}, - check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})). 
+ ?assertMatch( + {0, 4}, check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})). bloom_test_() -> {timeout, 120, fun bloom_test_ranges/0}. bloom_test_ranges() -> + test_bloom(250000, 2), test_bloom(80000, 4), test_bloom(60000, 4), test_bloom(40000, 4), @@ -577,7 +226,8 @@ bloom_test_ranges() -> test_bloom(20000, 4), test_bloom(10000, 4), test_bloom(5000, 4), - test_bloom(2000, 4). + test_bloom(2000, 4), + test_bloom(1000, 4). test_bloom(N, Runs) -> ListOfHashLists = @@ -599,35 +249,44 @@ test_bloom(N, Runs) -> SWa = os:timestamp(), ListOfBlooms = - lists:map(fun({HL, _ML}) -> create_bloom(HL) end, - SplitListOfHashLists), + lists:map( + fun({HL, _ML}) -> create_bloom(HL) end, SplitListOfHashLists), TSa = timer:now_diff(os:timestamp(), SWa)/Runs, SWb = os:timestamp(), - lists:foreach(fun(Nth) -> - {HL, _ML} = lists:nth(Nth, SplitListOfHashLists), - BB = lists:nth(Nth, ListOfBlooms), - check_all_hashes(BB, HL) - end, - lists:seq(1, Runs)), - TSb = timer:now_diff(os:timestamp(), SWb)/Runs, - + PosChecks = + lists:foldl( + fun(Nth, ChecksMade) -> + {HL, _ML} = lists:nth(Nth, SplitListOfHashLists), + BB = lists:nth(Nth, ListOfBlooms), + check_all_hashes(BB, HL), + ChecksMade + length(HL) + end, + 0, + lists:seq(1, Runs)), + TSb = timer:now_diff(os:timestamp(), SWb), + SWc = os:timestamp(), {Pos, Neg} = - lists:foldl(fun(Nth, Acc) -> - {_HL, ML} = lists:nth(Nth, SplitListOfHashLists), - BB = lists:nth(Nth, ListOfBlooms), - check_neg_hashes(BB, ML, Acc) - end, - {0, 0}, - lists:seq(1, Runs)), + lists:foldl( + fun(Nth, Acc) -> + {_HL, ML} = lists:nth(Nth, SplitListOfHashLists), + BB = lists:nth(Nth, ListOfBlooms), + check_neg_hashes(BB, ML, Acc) + end, + {0, 0}, + lists:seq(1, Runs)), FPR = Pos / (Pos + Neg), - TSc = timer:now_diff(os:timestamp(), SWc)/Runs, - - io:format(user, - "Test with size ~w has microsecond timings: -" - ++ " build ~w check ~w neg_check ~w and fpr ~w~n", - [N, TSa, TSb, TSc, FPR]). 
+ TSc = timer:now_diff(os:timestamp(), SWc), + + BytesPerKey = + (lists:sum(lists:map(fun byte_size/1, ListOfBlooms)) div 4) / N, + io:format( + user, + "Test with size ~w has microsecond timings: - " + "build in ~w then ~.3f per pos-check, ~.3f per neg-check, " + "fpr ~.3f with bytes-per-key ~.3f~n", + [N, round(TSa), TSb / PosChecks, TSc / (Pos + Neg), FPR, BytesPerKey]). -endif. diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 4e5fc38d..c242dbef 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -107,7 +107,7 @@ clerk_removelogs(Pid, ForcedLogs) -> -spec clerk_close(pid()) -> ok. clerk_close(Pid) -> - gen_server:call(Pid, close, 20000). + gen_server:call(Pid, close, 60000). %%%============================================================================ %%% gen_server callbacks diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 97d76188..eeb358fa 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -204,8 +204,7 @@ -export([pcl_getsstpids/1, pcl_getclerkpid/1]). -ifdef(TEST). --export([ - clean_testdir/1]). +-export([clean_testdir/1]). -endif. -define(MAX_WORK_WAIT, 300). @@ -220,7 +219,9 @@ -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). -define(COIN_SIDECOUNT, 4). -define(SLOW_FETCH, 500000). % Log a very slow fetch - longer than 500ms +-define(FOLD_SCANWIDTH, 32). -define(ITERATOR_SCANWIDTH, 4). +-define(ITERATOR_MINSCANWIDTH, 1). -define(TIMING_SAMPLECOUNTDOWN, 10000). -define(TIMING_SAMPLESIZE, 100). -define(SHUTDOWN_LOOPS, 10). @@ -287,10 +288,6 @@ -type pcl_state() :: #state{}. -type levelzero_cacheentry() :: {pos_integer(), leveled_tree:leveled_tree()}. -type levelzero_cache() :: list(levelzero_cacheentry()). --type iterator_entry() - :: {pos_integer(), - list(leveled_codec:ledger_kv()|leveled_sst:expandable_pointer())}. --type iterator() :: list(iterator_entry()). -type bad_ledgerkey() :: list(). -type sqn_check() :: current|replaced|missing. 
-type sst_fetchfun() :: @@ -303,10 +300,14 @@ -type pclacc_fun() :: fun((leveled_codec:ledger_key(), leveled_codec:ledger_value(), - any()) -> any()). + term()) -> term()). -type sst_options() :: #sst_options{}. --export_type([levelzero_cacheentry/0, levelzero_returnfun/0, sqn_check/0]). +-export_type( + [levelzero_cacheentry/0, + levelzero_returnfun/0, + sqn_check/0, + pclacc_fun/0]). %%%============================================================================ %%% API @@ -421,7 +422,7 @@ pcl_fetchkeys(Pid, StartKey, EndKey, AccFun, InitAcc, By) -> %% all keys in the range - so must only be run against snapshots of the %% penciller to avoid blocking behaviour. %% -%% This version allows an additional input of a SegmentList. This is a list +%% This version allows an additional input of a SegChecker. This is a list %% of 16-bit integers representing the segment IDs band ((2 ^ 16) -1) that %% are interesting to the fetch %% @@ -771,25 +772,32 @@ handle_call({fetch_keys, L0AsList = case State#state.levelzero_astree of undefined -> - leveled_pmem:merge_trees(StartKey, - EndKey, - State#state.levelzero_cache, - leveled_tree:empty(?CACHE_TYPE)); + leveled_pmem:merge_trees( + StartKey, + EndKey, + State#state.levelzero_cache, + leveled_tree:empty(?CACHE_TYPE)); List -> List end, + SegChecker = + leveled_sst:segment_checker(leveled_sst:tune_seglist(SegmentList)), FilteredL0 = - case SegmentList of + case SegChecker of false -> L0AsList; - _ -> - TunedList = leveled_sst:tune_seglist(SegmentList), + {Min, Max, CheckFun} -> FilterFun = fun(LKV) -> CheckSeg = leveled_sst:extract_hash( leveled_codec:strip_to_segmentonly(LKV)), - leveled_sst:member_check(CheckSeg, TunedList) + case CheckSeg of + CheckSeg when CheckSeg >= Min, CheckSeg =< Max -> + CheckFun(CheckSeg); + _ -> + false + end end, lists:filter(FilterFun, L0AsList) end, @@ -809,13 +817,14 @@ handle_call({fetch_keys, QueryManifest end, SnapshotTime = State#state.snapshot_time, - + PersistedIterator = 
maps:from_list(SSTiter), Folder = fun() -> - keyfolder({FilteredL0, SSTiter}, - {StartKey, EndKey}, - {AccFun, InitAcc, SnapshotTime}, - {SegmentList, LastModRange0, MaxKeys}) + keyfolder( + maps:put(-1, FilteredL0, PersistedIterator), + {StartKey, EndKey}, + {AccFun, InitAcc, SnapshotTime}, + {SegChecker, LastModRange0, MaxKeys}) end, case By of as_pcl -> @@ -1072,10 +1081,9 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) -> filename=FN, bloom=Bloom}, ManifestSQN = leveled_pmanifest:get_manifest_sqn(State#state.manifest) + 1, - UpdMan = leveled_pmanifest:insert_manifest_entry(State#state.manifest, - ManifestSQN, - 0, - ManEntry), + UpdMan = + leveled_pmanifest:insert_manifest_entry( + State#state.manifest, ManifestSQN, 0, ManEntry), % Prompt clerk to ask about work - do this for every L0 roll ok = leveled_pclerk:clerk_prompt(State#state.clerk), {noreply, State#state{levelzero_cache=[], @@ -1209,7 +1217,6 @@ handle_cast({complete_shutdown, ShutdownType, From}, State) -> end, {stop, normal, State}. - %% handle the bookie stopping and stop this snapshot handle_info({'DOWN', BookieMonRef, process, _BookiePid, _Info}, State=#state{bookie_monref = BookieMonRef}) -> @@ -1226,10 +1233,11 @@ terminate(Reason, _State) -> format_status(normal, [_PDict, State]) -> State; format_status(terminate, [_PDict, State]) -> - State#state{manifest = redacted, - levelzero_cache = redacted, - levelzero_index = redacted, - levelzero_astree = redacted}. + State#state{ + manifest = redacted, + levelzero_cache = redacted, + levelzero_index = redacted, + levelzero_astree = redacted}. code_change(_OldVsn, State, _Extra) -> @@ -1280,15 +1288,17 @@ start_from_file(PCLopts) -> % vnode syncronisation issues (e.g. 
stop them all by default merging to % level zero concurrently) - InitState = #state{clerk = MergeClerk, - root_path = RootPath, - levelzero_maxcachesize = MaxTableSize, - levelzero_cointoss = CoinToss, - levelzero_index = [], - snaptimeout_short = SnapTimeoutShort, - snaptimeout_long = SnapTimeoutLong, - sst_options = OptsSST, - monitor = Monitor}, + InitState = + #state{ + clerk = MergeClerk, + root_path = RootPath, + levelzero_maxcachesize = MaxTableSize, + levelzero_cointoss = CoinToss, + levelzero_index = [], + snaptimeout_short = SnapTimeoutShort, + snaptimeout_long = SnapTimeoutLong, + sst_options = OptsSST, + monitor = Monitor}, %% Open manifest Manifest0 = leveled_pmanifest:open_manifest(RootPath), @@ -1311,33 +1321,34 @@ start_from_file(PCLopts) -> case filelib:is_file(filename:join(sst_rootpath(RootPath), L0FN)) of true -> leveled_log:log(p0015, [L0FN]), - L0Open = leveled_sst:sst_open(sst_rootpath(RootPath), - L0FN, - OptsSST, - 0), + L0Open = + leveled_sst:sst_open( + sst_rootpath(RootPath), L0FN, OptsSST, 0), {ok, L0Pid, {L0StartKey, L0EndKey}, Bloom} = L0Open, L0SQN = leveled_sst:sst_getmaxsequencenumber(L0Pid), - L0Entry = #manifest_entry{start_key = L0StartKey, - end_key = L0EndKey, - filename = L0FN, - owner = L0Pid, - bloom = Bloom}, + L0Entry = + #manifest_entry{ + start_key = L0StartKey, + end_key = L0EndKey, + filename = L0FN, + owner = L0Pid, + bloom = Bloom}, Manifest2 = - leveled_pmanifest:insert_manifest_entry(Manifest1, - ManSQN + 1, - 0, - L0Entry), + leveled_pmanifest:insert_manifest_entry( + Manifest1, ManSQN + 1, 0, L0Entry), leveled_log:log(p0016, [L0SQN]), LedgerSQN = max(MaxSQN, L0SQN), - {InitState#state{manifest = Manifest2, - ledger_sqn = LedgerSQN, - persisted_sqn = LedgerSQN}, + {InitState#state{ + manifest = Manifest2, + ledger_sqn = LedgerSQN, + persisted_sqn = LedgerSQN}, [L0FN|FileList]}; false -> leveled_log:log(p0017, []), - {InitState#state{manifest = Manifest1, - ledger_sqn = MaxSQN, - persisted_sqn = MaxSQN}, + 
{InitState#state{ + manifest = Manifest1, + ledger_sqn = MaxSQN, + persisted_sqn = MaxSQN}, FileList} end, ok = archive_files(RootPath, FileList0), @@ -1373,7 +1384,6 @@ shutdown_manifest(Manifest, L0Constructor) -> leveled_pmanifest:close_manifest(Manifest, EntryCloseFun), EntryCloseFun(L0Constructor). - -spec check_alive(pid()|undefined) -> boolean(). %% @doc %% Double-check a processis active before attempting to terminate @@ -1382,7 +1392,6 @@ check_alive(Owner) when is_pid(Owner) -> check_alive(_Owner) -> false. - -spec archive_files(list(), list()) -> ok. %% @doc %% Archive any sst files in the folder that have not been used to build the @@ -1483,7 +1492,6 @@ roll_memory(NextManSQN, LedgerSQN, RootPath, L0Cache, CL, SSTOpts, true) -> L0Path, L0FN, 0, KVList, LedgerSQN, SSTOpts), {Constructor, Bloom}. - -spec timed_fetch_mem( tuple(), {integer(), integer()}, @@ -1507,7 +1515,6 @@ timed_fetch_mem(Key, Hash, Manifest, L0Cache, L0Index, Monitor) -> maybelog_fetch_timing(Monitor, Level, TS0, R == not_present), R. - -spec fetch_sqn( leveled_codec:ledger_key(), leveled_codec:segment_hash(), @@ -1587,7 +1594,6 @@ log_slowfetch(T0, R, PID, Level, FetchTolerance) -> R end. - -spec compare_to_sqn( leveled_codec:ledger_kv()|leveled_codec:sqn()|not_present, integer()) -> sqn_check(). @@ -1609,341 +1615,237 @@ compare_to_sqn(ObjSQN, _SQN) when is_integer(ObjSQN) -> compare_to_sqn(Obj, SQN) -> compare_to_sqn(leveled_codec:strip_to_seqonly(Obj), SQN). +-spec maybelog_fetch_timing( + leveled_monitor:monitor(), + memory|leveled_pmanifest:lsm_level(), + leveled_monitor:timing(), + boolean()) -> ok. +maybelog_fetch_timing(_Monitor, _Level, no_timing, _NF) -> + ok; +maybelog_fetch_timing({Pid, _StatsFreq}, _Level, FetchTime, true) -> + leveled_monitor:add_stat(Pid, {pcl_fetch_update, not_found, FetchTime}); +maybelog_fetch_timing({Pid, _StatsFreq}, Level, FetchTime, _NF) -> + leveled_monitor:add_stat(Pid, {pcl_fetch_update, Level, FetchTime}). 
%%%============================================================================ -%%% Iterator functions -%%% -%%% TODO - move to dedicated module with extended unit testing +%%% Key folder %%%============================================================================ +-type sst_iterator() + :: #{ + leveled_pmanifest:lsm_level() => + list(leveled_sst:expandable_pointer()|leveled_codec:ledger_kv()), + -1 => + list(leveled_codec:ledger_kv())}. +-type max_keys() :: unlimited|non_neg_integer(). +-type iterator_level() :: -1|leveled_pmanifest:lsm_level(). +-type search_info() :: + {{leveled_codec:ledger_key(), leveled_codec:ledger_key()}, + {non_neg_integer(), pos_integer()|infinity}, + leveled_sst:segment_check_fun()}. + +-define(NULL_KEY, {null, null}). -spec keyfolder( - {list(), list()}, + sst_iterator(), {leveled_codec:ledger_key(), leveled_codec:ledger_key()}, {pclacc_fun(), any(), pos_integer()}, - {boolean(), {non_neg_integer(), pos_integer()|infinity}, integer()}) - -> any(). + {leveled_sst:segment_check_fun(), + {non_neg_integer(), pos_integer()|infinity}, + -1|non_neg_integer()}) -> {non_neg_integer(), term()}|term(). +keyfolder( + Iterator, + {StartKey, EndKey}, + {AccFun, InitAcc, Now}, + {SegCheckFun, LastModRange, KeyLimit}) -> + % The in-memory dump of keys in this range, may go beyond the end key - so + % strip these back before starting the fold + StripIMMFun = + fun(MemIter) -> + lists:reverse( + lists:dropwhile( + fun({K, _V}) -> leveled_codec:endkey_passed(EndKey, K) end, + lists:reverse(MemIter))) + end, + MaxKeys = + case KeyLimit of + -1 -> unlimited; + KeyLimit when is_integer(KeyLimit), KeyLimit >= 0 -> KeyLimit + end, + keyfolder( + maps:update_with(-1, StripIMMFun, Iterator), + InitAcc, + MaxKeys, + {?FOLD_SCANWIDTH, lists:sort(maps:keys(Iterator))}, + {{StartKey, EndKey}, LastModRange, SegCheckFun}, + {AccFun, Now}). 
+ +-spec keyfolder( + sst_iterator()|no_more_keys, + term(), + max_keys(), + {pos_integer(), list(iterator_level())}, + search_info(), + {pclacc_fun(), integer()}) -> {non_neg_integer(), term()}|term(). %% @doc -%% The keyfolder will compare an iterator across the immutable in-memory cache -%% of the Penciller (the IMMiter), with an iterator across the persisted part -%% (the SSTiter). -%% -%% A Segment List and a MaxKeys may be passed. Every time something is added -%% to the accumulator MaxKeys is reduced - so set MaxKeys to -1 if it is -%% intended to be infinite. -%% -%% The basic principle is to take the next key in the IMMiter and compare it -%% to the next key in the SSTiter, and decide which one should be added to the -%% accumulator. The iterators are advanced if they either win (i.e. are the -%% next key), or are dominated. This goes on until the iterators are empty. +%% The keyfolder takes an iterator - a map with an entry for each level, from +%% level -1 (the in-memory cache of keys) through to level 7 (the theoretical) +%% maximum level. %% -%% To advance the SSTiter the find_nextkey/4 function is used, as the SSTiter -%% is an iterator across multiple levels - and so needs to do its own -%% comparisons to pop the next result. -keyfolder(_Iterators, - _KeyRange, - {_AccFun, Acc, _Now}, - {_SegmentList, _LastModRange, MaxKeys}) when MaxKeys == 0 -> - {0, Acc}; -keyfolder({[], SSTiter}, KeyRange, {AccFun, Acc, Now}, - {SegmentList, LastModRange, MaxKeys}) -> - {StartKey, EndKey} = KeyRange, - case find_nextkey(SSTiter, StartKey, EndKey, - SegmentList, element(1, LastModRange)) of - no_more_keys -> - case MaxKeys > 0 of - true -> - % This query had a max count, so we must respond with the - % remainder on the count - {MaxKeys, Acc}; - false -> - % This query started with a MaxKeys set to -1. 
Query is - % not interested in having MaxKeys in Response - Acc - end; - {NxSSTiter, {SSTKey, SSTVal}} -> - {Acc1, MK1} = - maybe_accumulate(SSTKey, SSTVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - keyfolder({[], NxSSTiter}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}) +%% The find_nextkeys function is used to scan the iterators to find the next +%% set of W keys. These can then be accumulated. If there is a MaxKeys set +%% (i.e. a maximum number of KV pairs to be accumulated), then this must be +%% tracked so the keyfolder never asks for more than the remainder from +%% find_nextkeys +keyfolder(no_more_keys, Acc, MaxKeys, _LevelInfo, _SearchInfo, _AccDetails) -> + case MaxKeys of + unlimited -> Acc; + MaxKeys -> {MaxKeys, Acc} end; -keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SSTiterator}, - KeyRange, - {AccFun, Acc, Now}, - {SegmentList, LastModRange, MaxKeys}) -> - {StartKey, EndKey} = KeyRange, - case {IMMKey < StartKey, leveled_codec:endkey_passed(EndKey, IMMKey)} of - {false, true} -> - % There are no more keys in-range in the in-memory - % iterator, so take action as if this iterator is empty - % (see above) - keyfolder({[], SSTiterator}, - KeyRange, - {AccFun, Acc, Now}, - {SegmentList, LastModRange, MaxKeys}); - {false, false} -> - case find_nextkey(SSTiterator, StartKey, EndKey, - SegmentList, element(1, LastModRange)) of - no_more_keys -> - % No more keys in range in the persisted store, so use the - % in-memory KV as the next - {Acc1, MK1} = - maybe_accumulate(IMMKey, IMMVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - keyfolder({NxIMMiterator, - []}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}); - {NxSSTiterator, {SSTKey, SSTVal}} -> - % There is a next key, so need to know which is the - % next key between the two (and handle two keys - % with different sequence numbers). 
- case leveled_codec:key_dominates({IMMKey, - IMMVal}, - {SSTKey, - SSTVal}) of - left_hand_first -> - {Acc1, MK1} = - maybe_accumulate(IMMKey, IMMVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - % Stow the previous best result away at Level -1 - % so that there is no need to iterate to it again - NewEntry = {-1, [{SSTKey, SSTVal}]}, - keyfolder({NxIMMiterator, - lists:keystore(-1, - 1, - NxSSTiterator, - NewEntry)}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}); - right_hand_first -> - {Acc1, MK1} = - maybe_accumulate(SSTKey, SSTVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], - NxSSTiterator}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}); - left_hand_dominant -> - {Acc1, MK1} = - maybe_accumulate(IMMKey, IMMVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - % We can add to the accumulator here. As the SST - % key was the most dominant across all SST levels, - % so there is no need to hold off until the IMMKey - % is left hand first. - keyfolder({NxIMMiterator, - NxSSTiterator}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}) - end - end - end. - --spec maybe_accumulate(leveled_codec:ledger_key(), - leveled_codec:ledger_value(), - {any(), pclacc_fun(), pos_integer()}, - integer(), - {non_neg_integer(), non_neg_integer()|infinity}) - -> any(). -%% @doc -%% Make an accumulation decision based one the date range -maybe_accumulate(LK, LV, - {Acc, AccFun, QueryStartTime}, - MaxKeys, - {LowLastMod, HighLastMod}) -> - {_SQN, _SH, LMD} = leveled_codec:strip_to_indexdetails({LK, LV}), - RunAcc = - (LMD == undefined) or - ((LMD >= LowLastMod) and (LMD =< HighLastMod)), - case RunAcc and leveled_codec:is_active(LK, LV, QueryStartTime) of - true -> - {AccFun(LK, LV, Acc), MaxKeys - 1}; - false -> - {Acc, MaxKeys} - end. 
- - --spec find_nextkey( - iterator(), - leveled_codec:ledger_key(), - leveled_codec:ledger_key(), - list(non_neg_integer())|false, - non_neg_integer()) - -> no_more_keys|{iterator(), leveled_codec:ledger_kv()}. -%% @doc -%% Looks to find the best choice for the next key across the levels (other -%% than in-memory table) -%% In finding the best choice, the next key in a given level may be a next -%% block or next file pointer which will need to be expanded -find_nextkey(QueryArray, StartKey, EndKey, SegmentList, LowLastMod) -> - find_nextkey(QueryArray, - -1, - {null, null}, - StartKey, EndKey, - SegmentList, - LowLastMod, - ?ITERATOR_SCANWIDTH). - -find_nextkey(_QueryArray, LCnt, - {null, null}, - _StartKey, _EndKey, - _SegList, _LowLastMod, _Width) when LCnt > ?MAX_LEVELS -> - % The array has been scanned wihtout finding a best key - must be - % exhausted - respond to indicate no more keys to be found by the - % iterator - no_more_keys; -find_nextkey(QueryArray, LCnt, - {BKL, BestKV}, - _StartKey, _EndKey, - _SegList, _LowLastMod, _Width) when LCnt > ?MAX_LEVELS -> - % All levels have been scanned, so need to remove the best result from - % the array, and return that array along with the best key/sqn/status - % combination - {BKL, [BestKV|Tail]} = lists:keyfind(BKL, 1, QueryArray), - {lists:keyreplace(BKL, 1, QueryArray, {BKL, Tail}), BestKV}; -find_nextkey(QueryArray, LCnt, - {BestKeyLevel, BestKV}, - StartKey, EndKey, - SegList, LowLastMod, Width) -> - % Get the next key at this level - {NextKey, RestOfKeys} = - case lists:keyfind(LCnt, 1, QueryArray) of - false -> - {null, null}; - {LCnt, []} -> - {null, null}; - {LCnt, [NK|ROfKs]} -> - {NK, ROfKs} +keyfolder(_Iter, Acc, 0, _LevelInfo, _SearchInfo, _AccDetails) -> + {0, Acc}; +keyfolder( + Iter, + Acc, + MaxKeys, + {W, Ls}=LevelInfo, + {_KR, LastModRange, _SCF}=SearchInfo, + {AccFun, Now}=AccDetails) -> + {IterUpd, FoundKVs} = + find_nextkeys( + Iter, + {Ls, ?NULL_KEY}, + [], + Ls, + {fetch_size(MaxKeys, W), 
scan_size(MaxKeys)}, + SearchInfo), + {UpdAcc, KeyCount} = + leveled_codec:maybe_accumulate( + lists:reverse(FoundKVs), Acc, 0, {Now, LastModRange}, AccFun), + MaxKeysLeft = + case MaxKeys of + unlimited -> unlimited; + MaxKeys -> MaxKeys - KeyCount end, - % Compare the next key at this level with the best key - case {NextKey, BestKeyLevel, BestKV} of - {null, BKL, BKV} -> - % There is no key at this level - go to the next level - find_nextkey(QueryArray, - LCnt + 1, - {BKL, BKV}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{next, Owner, _SK}, BKL, BKV} -> - % The first key at this level is pointer to a file - need to query - % the file to expand this level out before proceeding + keyfolder(IterUpd, UpdAcc, MaxKeysLeft, LevelInfo, SearchInfo, AccDetails). + +-spec fetch_size(max_keys(), pos_integer()) -> pos_integer(). +fetch_size(unlimited, W) -> W; +fetch_size(MaxKeys, W) -> min(MaxKeys, W). + +-spec scan_size(max_keys()) -> pos_integer(). +scan_size(unlimited) -> + ?ITERATOR_SCANWIDTH; +scan_size(MaxKeys) -> + min(?ITERATOR_SCANWIDTH, max(?ITERATOR_MINSCANWIDTH, MaxKeys div 256)). + +-spec find_nextkeys( + sst_iterator(), + {list(iterator_level()), + {null|iterator_level(), null|leveled_codec:ledger_kv()}}, + list(leveled_codec:ledger_kv()), + list(iterator_level()), + {pos_integer(), pos_integer()}, + search_info()) -> + {no_more_keys, list(leveled_codec:ledger_kv())}| + {sst_iterator(), list(leveled_codec:ledger_kv())}. 
+%% @doc +%% Looks to find up to W keys, where for each key every level is checked, +%% comparing keys to find the best key for that loop +find_nextkeys( + _Iter, {[], ?NULL_KEY}, FoundKVs, _Ls, _BatchInfo, _SearchInfo) -> + % Each level checked and best key still NULL => no_more_keys + {no_more_keys, FoundKVs}; +find_nextkeys( + Iter, {[], {BKL, BestKV}}, FoundKVs, _Ls, {W, _SW}, _SearchInfo) + when length(FoundKVs) == W - 1 -> + % All levels scanned, and there are now W keys (W - 1 previously found plus + % the latest best key) + {maps:update_with(BKL, fun tl/1, Iter), [BestKV|FoundKVs]}; +find_nextkeys( + Iter, {[], {BKL, BestKV}}, FoundKVs, Ls, BatchInfo, SearchInfo) -> + % All levels scanned so this is the best key ... now loop to find more + find_nextkeys( + maps:update_with(BKL, fun tl/1, Iter), + {Ls, ?NULL_KEY}, + [BestKV|FoundKVs], + Ls, BatchInfo, SearchInfo); +find_nextkeys( + Iter, + {[LCnt|OtherLevels]=LoopLs, {BKL, BKV}=PrevBest}, + FoundKVs, + Ls, + {_W, ScanWidth}=BI, + {{StartKey, EndKey}, {LowLastMod, _High}, SegChecker}=SI) -> + case maps:get(LCnt, Iter) of + [] -> + find_nextkeys( + Iter, + {OtherLevels, PrevBest}, + FoundKVs, + Ls -- [LCnt], BI, SI); + [{next, Owner, _SK}|RestOfKeys] -> + % Expansion required Pointer = {next, Owner, StartKey, EndKey}, - UpdList = leveled_sst:sst_expandpointer(Pointer, - RestOfKeys, - Width, - SegList, - LowLastMod), - NewEntry = {LCnt, UpdList}, + UpdList = + leveled_sst:sst_expandpointer( + Pointer, RestOfKeys, ScanWidth, SegChecker, LowLastMod), % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level - find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), - LCnt, - {BKL, BKV}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{pointer, SSTPid, Slot, PSK, PEK}, BKL, BKV} -> - % The first key at this level is pointer within a file - need to - % query the file to expand this level out before proceeding + find_nextkeys( + maps:update(LCnt, UpdList, Iter), + 
{LoopLs, PrevBest}, + FoundKVs, + Ls, BI, SI); + [{pointer, SSTPid, Slot, PSK, PEK}|RestOfKeys] -> + % Expansion required Pointer = {pointer, SSTPid, Slot, PSK, PEK}, - UpdList = leveled_sst:sst_expandpointer(Pointer, - RestOfKeys, - Width, - SegList, - LowLastMod), - NewEntry = {LCnt, UpdList}, + UpdList = + leveled_sst:sst_expandpointer( + Pointer, RestOfKeys, ScanWidth, SegChecker, LowLastMod), % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level - find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), - LCnt, - {BKL, BKV}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{Key, Val}, null, null} -> - % No best key set - so can assume that this key is the best key, - % and check the lower levels - find_nextkey(QueryArray, - LCnt + 1, - {LCnt, {Key, Val}}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{Key, Val}, _BKL, {BestKey, _BestVal}} when Key < BestKey -> - % There is a real key and a best key to compare, and the real key - % at this level is before the best key, and so is now the new best - % key - % The QueryArray is not modified until we have checked all levels - find_nextkey(QueryArray, - LCnt + 1, - {LCnt, {Key, Val}}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{Key, Val}, BKL, {BestKey, BestVal}} when Key == BestKey -> - SQN = leveled_codec:strip_to_seqonly({Key, Val}), - BestSQN = leveled_codec:strip_to_seqonly({BestKey, BestVal}), - if - SQN =< BestSQN -> - % This is a dominated key, so we need to skip over it - NewQArray = lists:keyreplace(LCnt, - 1, - QueryArray, - {LCnt, RestOfKeys}), - find_nextkey(NewQArray, - LCnt + 1, - {BKL, {BestKey, BestVal}}, - StartKey, EndKey, - SegList, LowLastMod, Width); - SQN > BestSQN -> - % There is a real key at the front of this level and it has - % a higher SQN than the best key, so we should use this as - % the best key - % But we also need to remove the dominated key from the - % lower level in the query array - OldBestEntry = 
lists:keyfind(BKL, 1, QueryArray), - {BKL, [{BestKey, BestVal}|BestTail]} = OldBestEntry, - find_nextkey(lists:keyreplace(BKL, - 1, - QueryArray, - {BKL, BestTail}), - LCnt + 1, - {LCnt, {Key, Val}}, - StartKey, EndKey, - SegList, LowLastMod, Width) - end; - {_, BKL, BKV} -> - % This is not the best key - find_nextkey(QueryArray, - LCnt + 1, - {BKL, BKV}, - StartKey, EndKey, - SegList, LowLastMod, Width) + find_nextkeys( + maps:update(LCnt, UpdList, Iter), + {LoopLs, PrevBest}, + FoundKVs, + Ls, BI, SI); + [{Key, Val}|_RestOfKeys] when BKV == null -> + find_nextkeys( + Iter, + {OtherLevels, {LCnt, {Key, Val}}}, + FoundKVs, + Ls, BI, SI); + [{Key, Val}|_RestOfKeys] when Key < element(1, BKV) -> + find_nextkeys( + Iter, + {OtherLevels, {LCnt, {Key, Val}}}, + FoundKVs, + Ls, BI, SI); + [{Key, _Val}|_RestOfKeys] when Key > element(1, BKV) -> + find_nextkeys( + Iter, + {OtherLevels, PrevBest}, + FoundKVs, + Ls, BI, SI); + [{Key, Val}|_RestOfKeys] -> + case leveled_codec:key_dominates({Key, Val}, BKV) of + true -> + find_nextkeys( + maps:update_with(BKL, fun tl/1, Iter), + {OtherLevels, {LCnt, {Key, Val}}}, + FoundKVs, + Ls, BI, SI); + false -> + find_nextkeys( + maps:update_with(LCnt, fun tl/1, Iter), + {OtherLevels, PrevBest}, + FoundKVs, + Ls, BI, SI) + end end. --spec maybelog_fetch_timing( - leveled_monitor:monitor(), - memory|leveled_pmanifest:lsm_level(), - leveled_monitor:timing(), - boolean()) -> ok. -maybelog_fetch_timing(_Monitor, _Level, no_timing, _NF) -> - ok; -maybelog_fetch_timing({Pid, _StatsFreq}, _Level, FetchTime, true) -> - leveled_monitor:add_stat(Pid, {pcl_fetch_update, not_found, FetchTime}); -maybelog_fetch_timing({Pid, _StatsFreq}, Level, FetchTime, _NF) -> - leveled_monitor:add_stat(Pid, {pcl_fetch_update, Level, FetchTime}). 
- - %%%============================================================================ %%% Test %%%============================================================================ @@ -1962,15 +1864,32 @@ pcl_fetch(Pid, Key) -> gen_server:call(Pid, {fetch, Key, Hash, true}, infinity) end. -keyfolder(IMMiter, SSTiter, StartKey, EndKey, {AccFun, Acc, Now}) -> - keyfolder({IMMiter, SSTiter}, - {StartKey, EndKey}, - {AccFun, Acc, Now}, - {false, {0, infinity}, -1}). +keyfolder_test(IMMiter, SSTiter, StartKey, EndKey, {AccFun, Acc, Now}) -> + keyfolder( + maps:put(-1, IMMiter, SSTiter), + {StartKey, EndKey}, + {AccFun, Acc, Now}, + {false, {0, infinity}, -1}). + +convert_qmanifest_tomap(SSTiter) -> + maps:from_list(SSTiter). find_nextkey(QueryArray, StartKey, EndKey) -> - find_nextkey(QueryArray, StartKey, EndKey, false, 0). - + {UpdArray, NextKeys} = + find_nextkeys( + QueryArray, + {maps:keys(QueryArray), ?NULL_KEY}, + [], + maps:keys(QueryArray), + {1, 1}, + {{StartKey, EndKey}, {0, infinity}, false}), + case UpdArray of + no_more_keys -> + no_more_keys; + UpdArray -> + [NextKey] = NextKeys, + {UpdArray, NextKey} + end. generate_randomkeys({Count, StartSQN}) -> generate_randomkeys(Count, StartSQN, []). @@ -1988,7 +1907,6 @@ generate_randomkeys(Count, SQN, Acc) -> leveled_codec:segment_hash(K), null}}, generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). - clean_testdir(RootPath) -> clean_subdir(sst_rootpath(RootPath)), @@ -2008,7 +1926,6 @@ clean_subdir(DirPath) -> ok end. 
- maybe_pause_push(PCL, KL) -> T0 = [], I0 = leveled_pmem:new_index(), @@ -2196,30 +2113,22 @@ simple_server_test() -> ?assertMatch(Key2, pcl_fetch(PclSnap, {o,"Bucket0002", "Key0002", null})), ?assertMatch(Key3, pcl_fetch(PclSnap, {o,"Bucket0003", "Key0003", null})), ?assertMatch(Key4, pcl_fetch(PclSnap, {o,"Bucket0004", "Key0004", null})), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0001", - "Key0001", - null}, - 1)), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0002", - "Key0002", - null}, - 1002)), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0003", - "Key0003", - null}, - 2003)), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0004", - "Key0004", - null}, - 3004)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0001", "Key0001", null}, 1)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0002", "Key0002", null}, 1002)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0003", "Key0003", null}, 2003)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0004", "Key0004", null}, 3004)), % Add some more keys and confirm that check sequence number still % sees the old version in the previous snapshot, but will see the new @@ -2231,12 +2140,10 @@ simple_server_test() -> KL1A = generate_randomkeys({2000, 4006}), ok = maybe_pause_push(PCLr, [Key1A]), ok = maybe_pause_push(PCLr, KL1A), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0001", - "Key0001", - null}, - 1)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0001", "Key0001", null}, 1)), ok = pcl_close(PclSnap), {ok, PclSnap2, null} = @@ -2249,36 +2156,31 @@ simple_server_test() -> undefined, false), - ?assertMatch(replaced, pcl_checksequencenumber(PclSnap2, - {o, - "Bucket0001", - "Key0001", - null}, - 1)), - ?assertMatch(current, 
pcl_checksequencenumber(PclSnap2, - {o, - "Bucket0001", - "Key0001", - null}, - 4005)), - ?assertMatch(current, pcl_checksequencenumber(PclSnap2, - {o, - "Bucket0002", - "Key0002", - null}, - 1002)), + ?assertMatch( + replaced, + pcl_checksequencenumber( + PclSnap2, {o, "Bucket0001", "Key0001", null}, 1)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap2, {o, "Bucket0001", "Key0001", null}, 4005)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap2, {o, "Bucket0002", "Key0002", null}, 1002)), ok = pcl_close(PclSnap2), ok = pcl_close(PCLr), clean_testdir(RootPath). simple_findnextkey_test() -> - QueryArray = [ + QueryArrayAsList = [ {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, {{o, "Bucket1", "Key5", null}, {4, {active, infinity}, {0, 0}, null}}]}, {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, {5, [{{o, "Bucket1", "Key2", null}, {2, {active, infinity}, {0, 0}, null}}]} ], + QueryArray = convert_qmanifest_tomap(QueryArrayAsList), {Array2, KV1} = find_nextkey(QueryArray, {o, "Bucket1", "Key0", null}, {o, "Bucket1", "Key5", null}), @@ -2309,12 +2211,13 @@ simple_findnextkey_test() -> ?assertMatch(no_more_keys, ER). sqnoverlap_findnextkey_test() -> - QueryArray = [ + QueryArrayAsList = [ {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, {{o, "Bucket1", "Key5", null}, {4, {active, infinity}, {0, 0}, null}}]}, {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, {5, [{{o, "Bucket1", "Key5", null}, {2, {active, infinity}, {0, 0}, null}}]} ], + QueryArray = convert_qmanifest_tomap(QueryArrayAsList), {Array2, KV1} = find_nextkey(QueryArray, {o, "Bucket1", "Key0", null}, {o, "Bucket1", "Key5", null}), @@ -2339,12 +2242,13 @@ sqnoverlap_findnextkey_test() -> ?assertMatch(no_more_keys, ER). 
sqnoverlap_otherway_findnextkey_test() -> - QueryArray = [ + QueryArrayAsList = [ {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, {{o, "Bucket1", "Key5", null}, {1, {active, infinity}, {0, 0}, null}}]}, {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, {5, [{{o, "Bucket1", "Key5", null}, {2, {active, infinity}, {0, 0}, null}}]} ], + QueryArray = convert_qmanifest_tomap(QueryArrayAsList), {Array2, KV1} = find_nextkey(QueryArray, {o, "Bucket1", "Key0", null}, {o, "Bucket1", "Key5", null}), @@ -2370,7 +2274,7 @@ sqnoverlap_otherway_findnextkey_test() -> foldwithimm_simple_test() -> Now = leveled_util:integer_now(), - QueryArray = [ + QueryArrayAsList = [ {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, 0, null}}, {{o, "Bucket1", "Key5", null}, @@ -2380,6 +2284,7 @@ foldwithimm_simple_test() -> {5, [{{o, "Bucket1", "Key5", null}, {2, {active, infinity}, 0, null}}]} ], + QueryArray = convert_qmanifest_tomap(QueryArrayAsList), KL1A = [{{o, "Bucket1", "Key6", null}, {7, {active, infinity}, 0, null}}, {{o, "Bucket1", "Key1", null}, {8, {active, infinity}, 0, null}}, {{o, "Bucket1", "Key8", null}, {9, {active, infinity}, 0, null}}], @@ -2389,7 +2294,7 @@ foldwithimm_simple_test() -> IMM2), AccFun = fun(K, V, Acc) -> SQN = leveled_codec:strip_to_seqonly({K, V}), Acc ++ [{K, SQN}] end, - Acc = keyfolder(IMMiter, + Acc = keyfolder_test(IMMiter, QueryArray, {o, "Bucket1", "Key1", null}, {o, "Bucket1", "Key6", null}, {AccFun, [], Now}), @@ -2400,7 +2305,7 @@ foldwithimm_simple_test() -> IMMiterA = [{{o, "Bucket1", "Key1", null}, {8, {active, infinity}, 0, null}}], - AccA = keyfolder(IMMiterA, + AccA = keyfolder_test(IMMiterA, QueryArray, {o, "Bucket1", "Key1", null}, {o, "Bucket1", "Key6", null}, @@ -2416,7 +2321,7 @@ foldwithimm_simple_test() -> {o, null, null, null}, IMM3), io:format("Compare IMM3 with QueryArrary~n"), - AccB = keyfolder(IMMiterB, + AccB = keyfolder_test(IMMiterB, QueryArray, {o, "Bucket1", 
"Key1", null}, {o, "Bucket1", "Key6", null}, {AccFun, [], Now}), @@ -2453,7 +2358,6 @@ slow_fetch_test() -> ?assertMatch(not_present, log_slowfetch(2, not_present, "fake", 0, 1)), ?assertMatch("value", log_slowfetch(2, "value", "fake", 0, 1)). - coverage_cheat_test() -> {noreply, _State0} = handle_info(timeout, #state{}), {ok, _State1} = code_change(null, #state{}, null). @@ -2533,4 +2437,4 @@ loop() -> ok end. --endif. +-endif. \ No newline at end of file diff --git a/src/leveled_pmanifest.erl b/src/leveled_pmanifest.erl index eb59ad0f..7f051224 100644 --- a/src/leveled_pmanifest.erl +++ b/src/leveled_pmanifest.erl @@ -451,23 +451,28 @@ key_lookup(Manifest, LevelIdx, Key) -> -spec query_manifest( manifest(), leveled_codec:ledger_key(), - leveled_codec:ledger_key()) -> list(). + leveled_codec:ledger_key()) + -> list( + {lsm_level(), + list({next, manifest_entry(), leveled_codec:ledger_key()})}). query_manifest(Manifest, StartKey, EndKey) -> SetupFoldFun = fun(Level, Acc) -> - Pointers = - range_lookup(Manifest, Level, StartKey, EndKey), - case Pointers of - [] -> Acc; - PL -> Acc ++ [{Level, PL}] + case range_lookup(Manifest, Level, StartKey, EndKey) of + [] -> + Acc; + Pointers -> + [{Level, Pointers}|Acc] end end, lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)). --spec range_lookup(manifest(), - integer(), - leveled_codec:ledger_key(), - leveled_codec:ledger_key()) -> list(). +-spec range_lookup( + manifest(), + integer(), + leveled_codec:ledger_key(), + leveled_codec:ledger_key()) + -> list({next, manifest_entry(), leveled_codec:ledger_key()}). %% @doc %% Return a list of manifest_entry pointers at this level which cover the %% key query range. @@ -478,10 +483,11 @@ range_lookup(Manifest, LevelIdx, StartKey, EndKey) -> end, range_lookup_int(Manifest, LevelIdx, StartKey, EndKey, MakePointerFun). --spec merge_lookup(manifest(), - integer(), - leveled_codec:ledger_key(), - leveled_codec:ledger_key()) -> list(). 
+-spec merge_lookup( + manifest(), + integer(), + leveled_codec:ledger_key(), + leveled_codec:ledger_key()) -> list({next, manifest_entry(), all}). %% @doc %% Return a list of manifest_entry pointers at this level which cover the %% key query range, only all keys in the files should be included in the @@ -494,8 +500,8 @@ merge_lookup(Manifest, LevelIdx, StartKey, EndKey) -> range_lookup_int(Manifest, LevelIdx, StartKey, EndKey, MakePointerFun). --spec mergefile_selector(manifest(), integer(), selector_strategy()) - -> manifest_entry(). +-spec mergefile_selector( + manifest(), integer(), selector_strategy()) -> manifest_entry(). %% @doc %% An algorithm for discovering which files to merge .... %% We can find the most optimal file: @@ -511,13 +517,15 @@ mergefile_selector(Manifest, LevelIdx, _Strategy) when LevelIdx =< 1 -> Level = array:get(LevelIdx, Manifest#manifest.levels), lists:nth(leveled_rand:uniform(length(Level)), Level); mergefile_selector(Manifest, LevelIdx, random) -> - Level = leveled_tree:to_list(array:get(LevelIdx, - Manifest#manifest.levels)), + Level = + leveled_tree:to_list( + array:get(LevelIdx, Manifest#manifest.levels)), {_SK, ME} = lists:nth(leveled_rand:uniform(length(Level)), Level), ME; mergefile_selector(Manifest, LevelIdx, {grooming, ScoringFun}) -> - Level = leveled_tree:to_list(array:get(LevelIdx, - Manifest#manifest.levels)), + Level = + leveled_tree:to_list( + array:get(LevelIdx, Manifest#manifest.levels)), SelectorFun = fun(_I, Acc) -> {_SK, ME} = lists:nth(leveled_rand:uniform(length(Level)), Level), @@ -555,12 +563,12 @@ add_snapshot(Manifest, Pid, Timeout) -> ManSQN = Manifest#manifest.manifest_sqn, case Manifest#manifest.min_snapshot_sqn of 0 -> - Manifest#manifest{snapshots = SnapList0, - min_snapshot_sqn = ManSQN}; + Manifest#manifest{ + snapshots = SnapList0, min_snapshot_sqn = ManSQN}; N -> N0 = min(N, ManSQN), - Manifest#manifest{snapshots = SnapList0, - min_snapshot_sqn = N0} + Manifest#manifest{ + snapshots = SnapList0, 
min_snapshot_sqn = N0} end. -spec release_snapshot(manifest(), pid()|atom()) -> manifest(). diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 9e1b8a3f..2f5bb885 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -43,7 +43,7 @@ -define(MAX_CACHE_LINES, 31). % Must be less than 128 --type index_array() :: list(array:array())|[]|none. +-type index_array() :: list(array:array())|[]|none. -export_type([index_array/0]). @@ -71,7 +71,7 @@ prepare_for_index(IndexArray, no_lookup) -> prepare_for_index(IndexArray, Hash) -> {Slot, H0} = split_hash(Hash), Bin = array:get(Slot, IndexArray), - array:set(Slot, <>, IndexArray). + array:set(Slot, <>, IndexArray). -spec add_to_index(array:array(), index_array(), integer()) -> index_array(). %% @doc @@ -201,16 +201,16 @@ merge_trees(StartKey, EndKey, TreeList, LevelMinus1) -> find_pos(<<>>, _Hash) -> false; -find_pos(<<1:1/integer, Hash:23/integer, _T/binary>>, Hash) -> +find_pos(<>, Hash) -> true; -find_pos(<<1:1/integer, _Miss:23/integer, T/binary>>, Hash) -> +find_pos(<<_Miss:24/integer, T/binary>>, Hash) -> find_pos(T, Hash). split_hash({SegmentID, ExtraHash}) -> Slot = SegmentID band 255, H0 = (SegmentID bsr 8) bor (ExtraHash bsl 8), - {Slot, H0 band 8388607}. + {Slot, H0 band 16#FFFFFF}. check_slotlist(Key, _Hash, CheckList, TreeList) -> SlotCheckFun = diff --git a/src/leveled_runner.erl b/src/leveled_runner.erl index 86f753db..a9d337aa 100644 --- a/src/leveled_runner.erl +++ b/src/leveled_runner.erl @@ -65,6 +65,7 @@ -type mp() :: {re_pattern, term(), term(), term(), term()}. +-export_type([acc_fun/0, mp/0]). 
%%%============================================================================ %%% External functions @@ -146,15 +147,7 @@ bucket_list(SnapFun, Tag, FoldBucketsFun, InitAcc, MaxBuckets) -> %% for a timeout index_query(SnapFun, {StartKey, EndKey, TermHandling}, FoldAccT) -> {FoldKeysFun, InitAcc} = FoldAccT, - {ReturnTerms, TermRegex} = TermHandling, - AddFun = - case ReturnTerms of - true -> - fun add_terms/2; - _ -> - fun add_keys/2 - end, - + Runner = fun() -> {ok, LedgerSnapshot, _JournalSnapshot, AfterFun} = SnapFun(), @@ -163,7 +156,7 @@ index_query(SnapFun, {StartKey, EndKey, TermHandling}, FoldAccT) -> LedgerSnapshot, StartKey, EndKey, - accumulate_index(TermRegex, AddFun, FoldKeysFun), + leveled_codec:accumulate_index(TermHandling, FoldKeysFun), InitAcc, by_runner), wrap_runner(Folder, AfterFun) @@ -680,47 +673,20 @@ check_presence(Key, Value, InkerClone) -> false end. +accumulate_keys(FoldKeysFun, undefined) -> + fun(Key, _Value, Acc) -> + {B, K} = leveled_codec:from_ledgerkey(Key), + FoldKeysFun(B, K, Acc) + end; accumulate_keys(FoldKeysFun, TermRegex) -> - AccFun = - fun(Key, _Value, Acc) -> - {B, K} = leveled_codec:from_ledgerkey(Key), - case TermRegex of - undefined -> - FoldKeysFun(B, K, Acc); - Re -> - case re:run(K, Re) of - nomatch -> - Acc; - _ -> - FoldKeysFun(B, K, Acc) - end - end - end, - AccFun. - -add_keys(ObjKey, _IdxValue) -> - ObjKey. - -add_terms(ObjKey, IdxValue) -> - {IdxValue, ObjKey}. 
- -accumulate_index(TermRe, AddFun, FoldKeysFun) -> - case TermRe of - undefined -> - fun(Key, _Value, Acc) -> - {Bucket, ObjKey, IdxValue} = leveled_codec:from_ledgerkey(Key), - FoldKeysFun(Bucket, AddFun(ObjKey, IdxValue), Acc) - end; - TermRe -> - fun(Key, _Value, Acc) -> - {Bucket, ObjKey, IdxValue} = leveled_codec:from_ledgerkey(Key), - case re:run(IdxValue, TermRe) of - nomatch -> - Acc; - _ -> - FoldKeysFun(Bucket, AddFun(ObjKey, IdxValue), Acc) - end - end + fun(Key, _Value, Acc) -> + {B, K} = leveled_codec:from_ledgerkey(Key), + case re:run(K, TermRegex) of + nomatch -> + Acc; + _ -> + FoldKeysFun(B, K, Acc) + end end. -spec wrap_runner(fun(), fun()) -> any(). diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index c11c36be..b9a705bc 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -79,16 +79,17 @@ -define(DELETE_TIMEOUT, 10000). -define(TREE_TYPE, idxt). -define(TREE_SIZE, 16). --define(TIMING_SAMPLECOUNTDOWN, 20000). --define(TIMING_SAMPLESIZE, 100). -define(BLOCK_LENGTHS_LENGTH, 20). -define(LMD_LENGTH, 4). -define(FLIPPER32, 4294967295). -define(DOUBLESIZE_LEVEL, 3). -define(INDEX_MODDATE, true). -define(TOMB_COUNT, true). --define(USE_SET_FOR_SPEED, 64). +-define(USE_SET_FOR_SPEED, 32). -define(STARTUP_TIMEOUT, 10000). +-define(MIN_HASH, 32768). +-define(MAX_HASH, 65535). +-define(LOG_BUILDTIMINGS_LEVELS, [3]). -ifdef(TEST). -define(HIBERNATE_TIMEOUT, 5000). @@ -128,13 +129,14 @@ -export([sst_newmerge/10]). --export([tune_seglist/1, extract_hash/1, member_check/2]). +-export([tune_seglist/1, extract_hash/1, segment_checker/1]). -export([in_range/3]). --record(slot_index_value, {slot_id :: integer(), - start_position :: integer(), - length :: integer()}). +-record(slot_index_value, + {slot_id :: integer(), + start_position :: integer(), + length :: integer()}). -record(summary, {first_key :: tuple(), @@ -143,62 +145,61 @@ size :: integer(), max_sqn :: integer()}). 
%% DO NOT CHANGE - %% The summary record is persisted as part of the sile format - %% Any chnage to this record will mean the change cannot be rolled back + %% The summary record is persisted as part of the file format + %% Any change to this record will mean the change cannot be rolled back +-type slot_index_value() + :: #slot_index_value{}. -type press_method() - :: lz4|native|none. + :: lz4|native|none. -type range_endpoint() - :: all|leveled_codec:ledger_key(). + :: all|leveled_codec:ledger_key(). -type slot_pointer() - :: {pointer, pid(), integer(), range_endpoint(), range_endpoint()}. + :: {pointer, + pid(), slot_index_value(), range_endpoint(), range_endpoint()}. -type sst_pointer() - % Used in sst_new - :: {next, - leveled_pmanifest:manifest_entry(), - range_endpoint()}. + % Used in sst_new + :: {next, + leveled_pmanifest:manifest_entry(), + range_endpoint()}. -type sst_closed_pointer() - % used in expand_list_by_pointer - % (close point is added by maybe_expand_pointer - :: {next, - leveled_pmanifest:manifest_entry(), - range_endpoint(), - range_endpoint()}. + % used in expand_list_by_pointer + % (close point is added by maybe_expand_pointer + :: {next, + leveled_pmanifest:manifest_entry(), + range_endpoint(), + range_endpoint()}. -type expandable_pointer() - :: slot_pointer()|sst_pointer()|sst_closed_pointer(). + :: slot_pointer()|sst_pointer()|sst_closed_pointer(). -type expanded_pointer() - :: leveled_codec:ledger_kv()|expandable_pointer(). --type binaryslot_element() - :: {tuple(), tuple()}|{binary(), integer(), tuple(), tuple()}. + :: leveled_codec:ledger_kv()|expandable_pointer(). +-type expanded_slot() :: + {binary(), non_neg_integer(), range_endpoint(), range_endpoint()}. -type tuned_seglist() - :: false| - {sets, sets:set(non_neg_integer())}| - {list, list(non_neg_integer())}. + :: false | list(non_neg_integer()). -type sst_options() - :: #sst_options{}. + :: #sst_options{}. 
-type binary_slot() - :: {binary(), binary(), list(integer()), leveled_codec:ledger_key()}. + :: {binary(), binary(), list(integer()), leveled_codec:ledger_key()}. -type sst_summary() - :: #summary{}. + :: #summary{}. -type blockindex_cache() - :: {non_neg_integer(), array:array(), non_neg_integer()}. + :: {non_neg_integer(), array:array(), non_neg_integer()}. -type fetch_cache() - :: array:array()|no_cache. + :: array:array()|no_cache. -type cache_size() - :: no_cache|4|32|64. + :: no_cache|4|32|64. -type cache_hash() - :: no_cache|non_neg_integer(). --type level() - :: non_neg_integer(). + :: no_cache|non_neg_integer(). -type summary_filter() - :: fun((leveled_codec:ledger_key()) -> any()). - -%% yield_blockquery is used to determine if the work necessary to process a -%% range query beyond the fetching the slot should be managed from within -%% this process, or should be handled by the calling process. -%% Handling within the calling process may lead to extra binary heap garbage -%% see Issue 52. Handling within the SST process may lead to contention and -%% extra copying. Files at the top of the tree yield, those lower down don't. + :: fun((leveled_codec:ledger_key()) -> any()). +-type segment_check_fun() + :: non_neg_integer() + | {non_neg_integer(), non_neg_integer(), + fun((non_neg_integer()) -> boolean())} + | false. +-type fetch_levelzero_fun() + :: fun((pos_integer(), leveled_penciller:levelzero_returnfun()) -> ok). 
-record(state, {summary, @@ -206,7 +207,6 @@ penciller :: pid() | undefined | false, root_path, filename, - yield_blockquery = false :: boolean(), blockindex_cache :: blockindex_cache() | undefined | redacted, compression_method = native :: press_method(), @@ -215,7 +215,7 @@ fetch_cache = no_cache :: fetch_cache() | redacted, new_slots :: list()|undefined, deferred_startup_tuple :: tuple()|undefined, - level :: level()|undefined, + level :: leveled_pmanifest:lsm_level()|undefined, tomb_count = not_counted :: non_neg_integer()|not_counted, high_modified_date :: non_neg_integer()|undefined, @@ -226,18 +226,19 @@ {slot_hashlist = 0 :: integer(), slot_serialise = 0 :: integer(), slot_finish = 0 :: integer(), - fold_toslot = 0 :: integer()}). + fold_toslot = 0 :: integer(), + last_timestamp = os:timestamp() :: erlang:timestamp()}). --type sst_state() :: #state{}. -type build_timings() :: no_timing|#build_timings{}. --export_type([expandable_pointer/0, press_method/0]). +-export_type([expandable_pointer/0, press_method/0, segment_check_fun/0]). %%%============================================================================ %%% API %%%============================================================================ --spec sst_open(string(), string(), sst_options(), level()) +-spec sst_open( + string(), string(), sst_options(), leveled_pmanifest:lsm_level()) -> {ok, pid(), {leveled_codec:ledger_key(), leveled_codec:ledger_key()}, binary()}. @@ -257,7 +258,7 @@ sst_open(RootPath, Filename, OptsSST, Level) -> {ok, Pid, {SK, EK}, Bloom} end. 
--spec sst_new(string(), string(), level(), +-spec sst_new(string(), string(), leveled_pmanifest:lsm_level(), list(leveled_codec:ledger_kv()), integer(), sst_options()) -> {ok, pid(), @@ -294,7 +295,7 @@ sst_new(RootPath, Filename, Level, KVList, MaxSQN, OptsSST, IndexModDate) -> -spec sst_newmerge(string(), string(), list(leveled_codec:ledger_kv()|sst_pointer()), list(leveled_codec:ledger_kv()|sst_pointer()), - boolean(), level(), + boolean(), leveled_pmanifest:lsm_level(), integer(), sst_options()) -> empty|{ok, pid(), {{list(leveled_codec:ledger_kv()), @@ -337,39 +338,35 @@ sst_newmerge(RootPath, Filename, empty; _ -> {ok, Pid} = gen_statem:start_link(?MODULE, [], ?START_OPTS), - case gen_statem:call(Pid, {sst_new, - RootPath, - Filename, - Level, - {SlotList, FK}, - MaxSQN, - OptsSST0, - IndexModDate, - CountOfTombs, - self()}, - infinity) of - {ok, {SK, EK}, Bloom} -> - {ok, Pid, {{Rem1, Rem2}, SK, EK}, Bloom} - end + {ok, {SK, EK}, Bloom} = + gen_statem:call( + Pid, + {sst_new, + RootPath, + Filename, + Level, + {SlotList, FK}, + MaxSQN, + OptsSST0, + IndexModDate, + CountOfTombs, self()}, + infinity), + {ok, Pid, {{Rem1, Rem2}, SK, EK}, Bloom} end. --spec sst_newlevelzero(string(), string(), - integer(), - fun((pos_integer(), - leveled_penciller:levelzero_returnfun()) - -> ok)| - list(), - pid()|undefined, - integer(), - sst_options()) -> - {ok, pid(), noreply}. +-spec sst_newlevelzero( + string(), string(), + integer(), + fetch_levelzero_fun()|list(), + pid()|undefined, + integer(), + sst_options()) -> {ok, pid(), noreply}. %% @doc %% Start a new file at level zero. At this level the file size is not fixed - %% it will be as big as the input. 
Also the KVList is not passed in, it is %% fetched slot by slot using the FetchFun -sst_newlevelzero(RootPath, Filename, - Slots, Fetcher, Penciller, - MaxSQN, OptsSST) -> +sst_newlevelzero( + RootPath, Filename, Slots, Fetcher, Penciller, MaxSQN, OptsSST) -> OptsSST0 = update_options(OptsSST, 0), {ok, Pid} = gen_statem:start_link(?MODULE, [], ?START_OPTS), %% Initiate the file into the "starting" state @@ -383,14 +380,10 @@ sst_newlevelzero(RootPath, Filename, infinity), ok = case Fetcher of - FetchSlots when is_list(Fetcher) -> - gen_statem:cast(Pid, {complete_l0startup, FetchSlots}); - _ -> - % Fetcher is a function - gen_statem:cast(Pid, {sst_returnslot, none, Fetcher, Slots}) - % Start the fetch loop (async). Having the fetch loop running - % on async message passing means that the SST file can now be - % closed while the fetch loop is still completing + SlotList when is_list(SlotList) -> + gen_statem:cast(Pid, {complete_l0startup, SlotList}); + FetchFun when is_function(FetchFun, 2) -> + gen_statem:cast(Pid, {sst_returnslot, none, FetchFun, Slots}) end, {ok, Pid, noreply}. @@ -427,22 +420,21 @@ sst_getsqn(Pid, LedgerKey, Hash) -> sst_getmaxsequencenumber(Pid) -> gen_statem:call(Pid, get_maxsequencenumber, infinity). --spec sst_expandpointer(expandable_pointer(), - list(expandable_pointer()), - pos_integer(), - leveled_codec:segment_list(), - non_neg_integer()) - -> list(expanded_pointer()). +-spec sst_expandpointer( + expandable_pointer(), + list(expandable_pointer()), + pos_integer(), + segment_check_fun(), + non_neg_integer()) -> list(expanded_pointer()). %% @doc %% Expand out a list of pointer to return a list of Keys and Values with a %% tail of pointers (once the ScanWidth has been satisfied). %% Folding over keys in a store uses this function, although this function %% does not directly call the gen_server - it does so by sst_getfilteredslots %% or sst_getfilteredrange depending on the nature of the pointer. 
-sst_expandpointer(Pointer, MorePointers, ScanWidth, SegmentList, LowLastMod) -> - expand_list_by_pointer(Pointer, MorePointers, ScanWidth, - SegmentList, LowLastMod). - +sst_expandpointer(Pointer, MorePointers, ScanWidth, SegChecker, LowLastMod) -> + expand_list_by_pointer( + Pointer, MorePointers, ScanWidth, SegChecker, LowLastMod). -spec sst_setfordelete(pid(), pid()|false) -> ok. %% @doc @@ -456,7 +448,7 @@ sst_setfordelete(Pid, Penciller) -> -spec sst_gettombcount(pid()) -> non_neg_integer()|not_counted. %% @doc -%% Get the count of tomb stones in this SST file, returning not_counted if this +%% Get the count of tombstones in this SST file, returning not_counted if this %% file was created with a version which did not support tombstone counting, or %% could also be because the file is L0 (which aren't counted as being chosen %% for merge is inevitable) @@ -477,9 +469,8 @@ sst_clear(Pid) -> sst_deleteconfirmed(Pid) -> gen_statem:cast(Pid, close). --spec sst_checkready(pid()) -> {ok, string(), - leveled_codec:ledger_key(), - leveled_codec:ledger_key()}. +-spec sst_checkready(pid()) -> + {ok, string(), leveled_codec:ledger_key(), leveled_codec:ledger_key()}. %% @doc %% If a file has been set to be built, check that it has been built. Returns %% the filename and the {startKey, EndKey} for the manifest. @@ -491,8 +482,7 @@ sst_checkready(Pid) -> %% @doc %% Notify the SST file that it is now working at a new level %% This simply prompts a GC on the PID now (as this may now be a long-lived -%% file, so don't want all the startup state to be held on memory - want to -%% proactively drop it +%% file, so don't want all the startup state to be held on memory) sst_switchlevels(Pid, NewLevel) -> gen_statem:cast(Pid, {switch_levels, NewLevel}). @@ -502,8 +492,6 @@ sst_switchlevels(Pid, NewLevel) -> sst_close(Pid) -> gen_statem:call(Pid, close). 
- - %%%============================================================================ %%% gen_statem callbacks %%%============================================================================ @@ -546,22 +534,23 @@ starting({call, From}, {Length, SlotIndex, BlockEntries, SlotsBin, Bloom} = build_all_slots(SlotList), {_, BlockIndex, HighModDate} = - update_blockindex_cache(true, - BlockEntries, - new_blockindex_cache(Length), - undefined, - IdxModDate), + update_blockindex_cache( + BlockEntries, + new_blockindex_cache(Length), + undefined, + IdxModDate), SummaryBin = - build_table_summary(SlotIndex, Level, FirstKey, Length, - MaxSQN, Bloom, CountOfTombs), + build_table_summary( + SlotIndex, Level, FirstKey, Length, MaxSQN, Bloom, CountOfTombs), ActualFilename = - write_file(RootPath, Filename, SummaryBin, SlotsBin, - PressMethod, IdxModDate, CountOfTombs), - YBQ = Level =< 2, + write_file( + RootPath, Filename, SummaryBin, SlotsBin, + PressMethod, IdxModDate, CountOfTombs), {UpdState, Bloom} = - read_file(ActualFilename, - State#state{root_path=RootPath, yield_blockquery=YBQ}, - OptsSST#sst_options.pagecache_level >= Level), + read_file( + ActualFilename, + State#state{root_path=RootPath}, + OptsSST#sst_options.pagecache_level >= Level), Summary = UpdState#state.summary, leveled_log:log_timer( sst08, [ActualFilename, Level, Summary#summary.max_sqn], SW), @@ -620,17 +609,17 @@ starting(cast, complete_l0startup, State) -> {SlotCount, SlotIndex, BlockEntries, SlotsBin,Bloom} = build_all_slots(SlotList), {_, BlockIndex, HighModDate} = - update_blockindex_cache(true, - BlockEntries, - new_blockindex_cache(SlotCount), - undefined, - IdxModDate), + update_blockindex_cache( + BlockEntries, + new_blockindex_cache(SlotCount), + undefined, + IdxModDate), Time2 = timer:now_diff(os:timestamp(), SW2), SW3 = os:timestamp(), SummaryBin = - build_table_summary(SlotIndex, 0, FirstKey, SlotCount, - MaxSQN, Bloom, not_counted), + build_table_summary( + SlotIndex, 0, FirstKey, SlotCount, 
MaxSQN, Bloom, not_counted), Time3 = timer:now_diff(os:timestamp(), SW3), SW4 = os:timestamp(), @@ -638,14 +627,13 @@ starting(cast, complete_l0startup, State) -> write_file(RootPath, Filename, SummaryBin, SlotsBin, PressMethod, IdxModDate, not_counted), {UpdState, Bloom} = - read_file(ActualFilename, - State#state{root_path=RootPath, - yield_blockquery=true, - % Important to empty this from state rather - % than carry it through to the next stage - new_slots=undefined, - deferred_startup_tuple=undefined}, - true), + read_file( + ActualFilename, + State#state{ + root_path=RootPath, + new_slots=undefined, % Important to empty this from state + deferred_startup_tuple=undefined}, + true), Summary = UpdState#state.summary, Time4 = timer:now_diff(os:timestamp(), SW4), @@ -690,16 +678,14 @@ starting(cast, {sst_returnslot, FetchedSlot, FetchFun, SlotCount}, State) -> Self = self(), ReturnFun = fun(NextSlot) -> - gen_statem:cast(Self, - {sst_returnslot, NextSlot, - FetchFun, SlotCount}) + gen_statem:cast( + Self, {sst_returnslot, NextSlot, FetchFun, SlotCount}) end, FetchFun(length(FetchedSlots) + 1, ReturnFun), {keep_state, State#state{new_slots = FetchedSlots}} end. 
- reader({call, From}, {get_kv, LedgerKey, Hash, Filter}, State) -> % Get a KV value and potentially take sample timings Monitor = @@ -746,68 +732,30 @@ reader({call, From}, {get_kv, LedgerKey, Hash, Filter}, State) -> [hibernate, {reply, From, Result}]} end; reader({call, From}, - {get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod}, + {fetch_range, StartKey, EndKey, LowLastMod}, State) -> - ReadNeeded = - check_modified(State#state.high_modified_date, - LowLastMod, - State#state.index_moddate), - {NeedBlockIdx, SlotsToFetchBinList, SlotsToPoint} = - case ReadNeeded of - true -> - fetch_range(StartKey, EndKey, ScanWidth, - SegList, LowLastMod, - State); - false -> - {false, [], []} - end, - PressMethod = State#state.compression_method, - IdxModDate = State#state.index_moddate, - - case State#state.yield_blockquery of - true -> - {keep_state_and_data, - [{reply, - From, - {yield, - SlotsToFetchBinList, - SlotsToPoint, - PressMethod, - IdxModDate} - }]}; - false -> - {L, FoundBIC} = - binaryslot_reader( - SlotsToFetchBinList, PressMethod, IdxModDate, SegList), - {UpdateCache, BlockIdxC0, HighModDate} = - update_blockindex_cache(NeedBlockIdx, - FoundBIC, - State#state.blockindex_cache, - State#state.high_modified_date, - State#state.index_moddate), - case UpdateCache of - true -> - {keep_state, - State#state{ - blockindex_cache = BlockIdxC0, - high_modified_date = HighModDate}, - [{reply, From, L ++ SlotsToPoint}]}; - false -> - {keep_state_and_data, - [hibernate, {reply, From, L ++ SlotsToPoint}]} - end - end; -reader({call, From}, {get_slots, SlotList, SegList, LowLastMod}, State) -> + SlotsToPoint = + fetch_range( + StartKey, + EndKey, + State#state.summary, + State#state.filter_fun, + check_modified( + State#state.high_modified_date, + LowLastMod, + State#state.index_moddate) + ), + {keep_state_and_data, [{reply, From, SlotsToPoint}]}; +reader({call, From}, {get_slots, SlotList, SegChecker, LowLastMod}, State) -> PressMethod = 
State#state.compression_method, IdxModDate = State#state.index_moddate, {NeedBlockIdx, SlotBins} = - read_slots(State#state.handle, - SlotList, - {SegList, - LowLastMod, - State#state.blockindex_cache}, - State#state.compression_method, - State#state.index_moddate), + read_slots( + State#state.handle, + SlotList, + {SegChecker, LowLastMod, State#state.blockindex_cache}, + State#state.compression_method, + State#state.index_moddate), {keep_state_and_data, [{reply, From, {NeedBlockIdx, SlotBins, PressMethod, IdxModDate}}]}; reader({call, From}, get_maxsequencenumber, State) -> @@ -838,9 +786,11 @@ reader({call, From}, close, State) -> {stop_and_reply, normal, [{reply, From, ok}], State}; reader(cast, {switch_levels, NewLevel}, State) -> - FreshCache = new_cache(NewLevel), {keep_state, - State#state{level = NewLevel, fetch_cache = FreshCache}, + State#state{ + level = NewLevel, + fetch_cache = new_cache(NewLevel) + }, [hibernate]}; reader(info, {update_blockindex_cache, BIC}, State) -> handle_update_blockindex_cache(BIC, State); @@ -890,33 +840,33 @@ delete_pending({call, From}, {get_kv, LedgerKey, Hash, Filter}, State) -> {keep_state_and_data, [{reply, From, Result}, ?DELETE_TIMEOUT]}; delete_pending( {call, From}, - {get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod}, + {fetch_range, StartKey, EndKey, LowLastMod}, State) -> - {_NeedBlockIdx, SlotsToFetchBinList, SlotsToPoint} = - fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State), - % Always yield as about to clear and de-reference - PressMethod = State#state.compression_method, - IdxModDate = State#state.index_moddate, - {keep_state_and_data, - [{reply, From, - {yield, - SlotsToFetchBinList, - SlotsToPoint, - PressMethod, - IdxModDate}}, - ?DELETE_TIMEOUT]}; + SlotsToPoint = + fetch_range( + StartKey, + EndKey, + State#state.summary, + State#state.filter_fun, + check_modified( + State#state.high_modified_date, + LowLastMod, + State#state.index_moddate) + ), + {keep_state_and_data, 
[{reply, From, SlotsToPoint}, ?DELETE_TIMEOUT]}; delete_pending( {call, From}, - {get_slots, SlotList, SegList, LowLastMod}, + {get_slots, SlotList, SegChecker, LowLastMod}, State) -> PressMethod = State#state.compression_method, IdxModDate = State#state.index_moddate, {_NeedBlockIdx, SlotBins} = - read_slots(State#state.handle, - SlotList, - {SegList, LowLastMod, State#state.blockindex_cache}, - PressMethod, - IdxModDate), + read_slots( + State#state.handle, + SlotList, + {SegChecker, LowLastMod, State#state.blockindex_cache}, + PressMethod, + IdxModDate), {keep_state_and_data, [{reply, From, {false, SlotBins, PressMethod, IdxModDate}}, ?DELETE_TIMEOUT]}; @@ -952,17 +902,21 @@ delete_pending(timeout, _, State) -> {keep_state_and_data, [leveled_rand:uniform(10) * ?DELETE_TIMEOUT]}. handle_update_blockindex_cache(BIC, State) -> - {_, BlockIndexCache, HighModDate} = - update_blockindex_cache(true, - BIC, - State#state.blockindex_cache, - State#state.high_modified_date, - State#state.index_moddate), - {keep_state, - State#state{ - blockindex_cache = BlockIndexCache, - high_modified_date = HighModDate}}. - + {NeedBlockIdx, BlockIndexCache, HighModDate} = + update_blockindex_cache( + BIC, + State#state.blockindex_cache, + State#state.high_modified_date, + State#state.index_moddate), + case NeedBlockIdx of + true -> + {keep_state, + State#state{ + blockindex_cache = BlockIndexCache, + high_modified_date = HighModDate}}; + false -> + keep_state_and_data + end. terminate(normal, delete_pending, _State) -> ok; @@ -983,10 +937,10 @@ format_status(terminate, [_PDict, _, State]) -> %%% External Functions %%%============================================================================ --spec expand_list_by_pointer(expandable_pointer(), - list(expandable_pointer()), - pos_integer()) - -> list(expanded_pointer()). +-spec expand_list_by_pointer( + expandable_pointer(), + list(expandable_pointer()), + pos_integer()) -> list(expanded_pointer()). 
%% @doc %% Expand a list of pointers, maybe ending up with a list of keys and values %% with a tail of pointers @@ -996,114 +950,96 @@ format_status(terminate, [_PDict, _, State]) -> %% skip those slots not containing any information over the low last modified %% date expand_list_by_pointer(Pointer, Tail, Width) -> - expand_list_by_pointer(Pointer, Tail, Width, false). - -%% TODO until leveled_penciller updated -expand_list_by_pointer(Pointer, Tail, Width, SegList) -> - expand_list_by_pointer(Pointer, Tail, Width, SegList, 0). - --spec expand_list_by_pointer(expandable_pointer(), - list(expandable_pointer()), - pos_integer(), - leveled_codec:segment_list(), - non_neg_integer()) - -> list(expanded_pointer()). + expand_list_by_pointer(Pointer, Tail, Width, false, 0). + +-spec expand_list_by_pointer( + expandable_pointer(), + list(expandable_pointer()), + pos_integer(), + segment_check_fun(), + non_neg_integer()) -> list(expanded_pointer()). %% @doc %% With filters (as described in expand_list_by_pointer/3 -expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey}, - Tail, Width, SegList, LowLastMod) -> - FoldFun = - fun(X, {Pointers, Remainder}) -> - case length(Pointers) of - L when L < Width -> - case X of - {pointer, SSTPid, S, SK, EK} -> - {Pointers ++ [{pointer, S, SK, EK}], Remainder}; - _ -> - {Pointers, Remainder ++ [X]} - end; - _ -> - {Pointers, Remainder ++ [X]} - end +expand_list_by_pointer( + {pointer, SSTPid, Slot, StartKey, EndKey}, + Tail, Width, SegChecker, LowLastMod) -> + {PotentialPointers, Remainder} = + lists:split(min(Width - 1, length(Tail)), Tail), + {LocalPointers, OtherPointers} = + lists:partition( + fun(Pointer) -> + case Pointer of + {pointer, SSTPid, _S, _SK, _EK} -> + true; + _ -> + false + end end, - InitAcc = {[{pointer, Slot, StartKey, EndKey}], []}, - {AccPointers, AccTail} = lists:foldl(FoldFun, InitAcc, Tail), - ExpPointers = sst_getfilteredslots(SSTPid, - AccPointers, - SegList, - LowLastMod), - lists:append(ExpPointers, 
AccTail); -expand_list_by_pointer({next, ManEntry, StartKey, EndKey}, - Tail, Width, SegList, LowLastMod) -> + PotentialPointers + ), + sst_getfilteredslots( + SSTPid, + [{pointer, SSTPid, Slot, StartKey, EndKey}|LocalPointers], + SegChecker, + LowLastMod, + OtherPointers ++ Remainder + ); +expand_list_by_pointer( + {next, ManEntry, StartKey, EndKey}, + Tail, _Width, _SegChecker, LowLastMod) -> + % The first pointer is a pointer to a file - expand_list_by_pointer will + % in this case convert this into list of pointers within that SST file + % i.e. of the form {pointer, SSTPid, Slot, StartKey, EndKey} + % This can then be further expanded by calling again to + % expand_list_by_pointer SSTPid = ManEntry#manifest_entry.owner, leveled_log:log(sst10, [SSTPid, is_process_alive(SSTPid)]), - ExpPointer = sst_getfilteredrange(SSTPid, - StartKey, - EndKey, - Width, - SegList, - LowLastMod), + ExpPointer = sst_getfilteredrange(SSTPid, StartKey, EndKey, LowLastMod), ExpPointer ++ Tail. --spec sst_getfilteredrange(pid(), - range_endpoint(), - range_endpoint(), - integer(), - leveled_codec:segment_list(), - non_neg_integer()) - -> list(leveled_codec:ledger_kv()|slot_pointer()). +-spec sst_getfilteredrange( + pid(), + range_endpoint(), + range_endpoint(), + non_neg_integer()) -> list(slot_pointer()). %% @doc -%% Get a range of {Key, Value} pairs as a list between StartKey and EndKey -%% (inclusive). The ScanWidth is the maximum size of the range, a pointer -%% will be placed on the tail of the resulting list if results expand beyond -%% the Scan Width -%% -%% To make the range open-ended (either to start, end or both) the all atom -%% can be used in place of the Key tuple. -%% -%% A segment list can also be passed, which inidcates a subset of segment -%% hashes of interest in the query. 
-%% -%% TODO: Optimise this so that passing a list of segments that tune to the -%% same hash is faster - perhaps provide an exportable function in -%% leveled_tictac -sst_getfilteredrange(Pid, StartKey, EndKey, ScanWidth, SegList, LowLastMod) -> - SegList0 = tune_seglist(SegList), - case gen_statem:call(Pid, {get_kvrange, - StartKey, EndKey, - ScanWidth, SegList0, LowLastMod}, - infinity) of - {yield, SlotsToFetchBinList, SlotsToPoint, PressMethod, IdxModDate} -> - {L, _BIC} = - binaryslot_reader(SlotsToFetchBinList, - PressMethod, IdxModDate, SegList0), - L ++ SlotsToPoint; - Reply -> - Reply - end. - - --spec sst_getfilteredslots(pid(), - list(slot_pointer()), - leveled_codec:segment_list(), - non_neg_integer()) - -> list(leveled_codec:ledger_kv()). +%% Get a list of slot_pointers that contain the information to look into those +%% slots to find the actual {K, V} pairs between the range endpoints. +%% Expanding these slot_pointers can be done using sst_getfilteredslots/5 +%% +%% Use segment_checker/1 to produce a segment_check_fun if the hashes of the +%% keys to be found are known. The LowLastMod integer will skip any blocks +%% where all keys were modified before thta date. +sst_getfilteredrange(Pid, StartKey, EndKey, LowLastMod) -> + gen_statem:call( + Pid, {fetch_range, StartKey, EndKey, LowLastMod}, infinity). + + +-spec sst_getfilteredslots( + pid(), + list(slot_pointer()), + segment_check_fun(), + non_neg_integer(), + list(expandable_pointer())) -> list(leveled_codec:ledger_kv()). %% @doc %% Get a list of slots by their ID. The slot will be converted from the binary -%% to term form outside of the FSM loop +%% to term form outside of the FSM loop, unless a segment_check_fun is passed, +%% and this process has cached the index to be used by the segment_check_fun, +%% and in this case the list of Slotbins will include the actual {K, V} pairs. 
%% -%% A list of 16-bit integer Segment IDs can be passed to filter the keys -%% returned (not precisely - with false results returned in addition). Use -%% false as a SegList to not filter. -%% An integer can be provided which gives a floor for the LastModified Date -%% of the object, if the object is to be covered by the query -sst_getfilteredslots(Pid, SlotList, SegList, LowLastMod) -> - SegL0 = tune_seglist(SegList), +%% Use segment_checker/1 to produce a segment_check_fun if the hashes of the +%% keys to be found are known. The LowLastMod integer will skip any blocks +%% where all keys were modified before thta date, but the results may still +%% contain older values (the calling function should still filter by modified +%% date as required). +sst_getfilteredslots(Pid, SlotList, SegChecker, LowLastMod, Pointers) -> {NeedBlockIdx, SlotBins, PressMethod, IdxModDate} = gen_statem:call( - Pid, {get_slots, SlotList, SegL0, LowLastMod}, infinity), - {L, BIC} = binaryslot_reader(SlotBins, PressMethod, IdxModDate, SegL0), + Pid, {get_slots, SlotList, SegChecker, LowLastMod}, infinity), + {L, BIC} = + binaryslot_reader( + SlotBins, PressMethod, IdxModDate, SegChecker, Pointers), case NeedBlockIdx of true -> erlang:send(Pid, {update_blockindex_cache, BIC}); @@ -1112,45 +1048,67 @@ sst_getfilteredslots(Pid, SlotList, SegList, LowLastMod) -> end, L. - --spec find_pos(binary(), - non_neg_integer()| - {list, list(non_neg_integer())}| - {sets, sets:set(non_neg_integer())}, - list(non_neg_integer()), - non_neg_integer()) -> list(non_neg_integer()). +-spec find_pos( + binary(), segment_check_fun()) -> list(non_neg_integer()). %% @doc %% Find a list of positions where there is an element with a matching segment %% ID to the expected segments (which can either be a single segment, a list of -%% segments or a set of segments depending on size. 
-find_pos(<<1:1/integer, PotentialHit:15/integer, T/binary>>,
-            Checker, PosList, Count) ->
-    case member_check(PotentialHit, Checker) of
+%% segments or a set of segments depending on size). The segment_check_fun
+%% will do the matching. Segments are 15-bits of the hash of the key.
+find_pos(Bin, H) when is_integer(H) ->
+    find_posint(Bin, H, [], 0);
+find_pos(Bin, {Min, Max, CheckFun}) ->
+    find_posmlt(Bin, Min, Max, CheckFun, [], 0).
+
+find_posint(<<H:16/integer, T/binary>>, H, PosList, Count) ->
+    find_posint(T, H, [Count|PosList], Count + 1);
+find_posint(<<Miss:16/integer, T/binary>>, H, PosList, Count)
+        when Miss >= ?MIN_HASH ->
+    find_posint(T, H, PosList, Count + 1);
+find_posint(<<NHC:8/integer, T/binary>>, H, PosList, Count) when NHC < 128 ->
+    find_posint(T, H, PosList, Count + NHC + 1);
+find_posint(_BinRem, _H, PosList, _Count) ->
+    lists:reverse(PosList).
+
+find_posmlt(<<H:16/integer, T/binary>>, Min, Max, CheckFun, PosList, Count)
+        when H >= Min, H =< Max ->
+    case CheckFun(H) of
         true ->
-            find_pos(T, Checker, PosList ++ [Count], Count + 1);
+            find_posmlt(T, Min, Max, CheckFun, [Count|PosList], Count + 1);
         false ->
-            find_pos(T, Checker, PosList, Count + 1)
+            find_posmlt(T, Min, Max, CheckFun, PosList, Count + 1)
     end;
-find_pos(<<0:1/integer, NHC:7/integer, T/binary>>, Checker, PosList, Count) ->
-    find_pos(T, Checker, PosList, Count + NHC + 1);
-find_pos(_BinRem, _Hash, PosList, _Count) ->
-    %% Expect this to be <<>> - i.e. at end of binary, but if there is
-    %% corruption, could be some other value - so return as well in this
-    %% case
-    PosList.
-
-
--spec member_check(non_neg_integer(),
-                    non_neg_integer()|
-                        {list, list(non_neg_integer())}|
-                        {sets, sets:set(non_neg_integer())}) -> boolean().
-member_check(Hash, Hash) ->
-    true;
-member_check(Hash, {list, HashList}) ->
-    lists:member(Hash, HashList);
-member_check(Hash, {sets, HashSet}) ->
-    sets:is_element(Hash, HashSet);
-member_check(_Miss, _Checker) ->
+find_posmlt(<<Miss:16/integer, T/binary>>, Min, Max, CheckFun, PosList, Count)
+        when Miss >= ?MIN_HASH ->
+    find_posmlt(T, Min, Max, CheckFun, PosList, Count + 1);
+find_posmlt(<<NHC:8/integer, T/binary>>, Min, Max, CheckFun, PosList, Count)
+        when NHC < 128 ->
+    find_posmlt(T, Min, Max, CheckFun, PosList, Count + NHC + 1);
+find_posmlt(_BinRem, _Min, _Max, _CheckFun, PosList, _Count) ->
+    lists:reverse(PosList).
+
+
+-spec segment_checker(
+    non_neg_integer()| list(non_neg_integer())| false)
+        -> segment_check_fun().
+segment_checker(Hash) when is_integer(Hash) ->
+    Hash;
+segment_checker(HashList) when is_list(HashList) ->
+    %% Note that commonly segments will be close together numerically. The
+    %% guess/estimate process for checking vnode size selects a contiguous
+    %% range. Also the kv_index_tictactree segment selector tries to group
+    %% segment IDs close together. Hence checking the bounds first is
+    %% generally much faster than a straight membership test.
+    Min = lists:min(HashList),
+    Max = lists:max(HashList),
+    case length(HashList) > ?USE_SET_FOR_SPEED of
+        true ->
+            HashSet = sets:from_list(HashList),
+            {Min, Max, fun(H) -> sets:is_element(H, HashSet) end};
+        false ->
+            {Min, Max, fun(H) -> lists:member(H, HashList) end}
+    end;
+segment_checker(false) ->
     false.

 -spec sqn_only(leveled_codec:ledger_kv()|not_present)
@@ -1160,13 +1118,15 @@ sqn_only(not_present) ->
 sqn_only(KV) ->
     leveled_codec:strip_to_seqonly(KV).

+-spec extract_hash(
+    leveled_codec:segment_hash()) -> non_neg_integer()|no_lookup.
 extract_hash({SegHash, _ExtraHash}) when is_integer(SegHash) ->
     tune_hash(SegHash);
 extract_hash(NotHash) ->
     NotHash.

--spec new_cache(level()) -> fetch_cache().
+-spec new_cache(leveled_pmanifest:lsm_level()) -> fetch_cache().
new_cache(Level) -> case cache_size(Level) of no_cache -> @@ -1188,7 +1148,7 @@ cache_hash({_SegHash, ExtraHash}, Level) when is_integer(ExtraHash) -> %% as each level has more files than the previous level. Load tests with %% any sort of pareto distribution show far better cost/benefit ratios for %% cache at higher levels. --spec cache_size(level()) -> cache_size(). +-spec cache_size(leveled_pmanifest:lsm_level()) -> cache_size(). cache_size(N) when N < 3 -> 64; cache_size(3) -> @@ -1220,25 +1180,18 @@ add_to_cache(CacheHash, KV, FetchCache) -> array:set(CacheHash, KV, FetchCache). --spec tune_hash(non_neg_integer()) -> non_neg_integer(). +-spec tune_hash(non_neg_integer()) -> ?MIN_HASH..?MAX_HASH. %% @doc -%% Only 15 bits of the hash is ever interesting +%% Only 15 bits of the hash is ever interesting, and this is converted +%% into a 16-bit hash for matching by adding 2 ^ 15 (i.e. a leading 1) tune_hash(SegHash) -> - SegHash band 32767. + ?MIN_HASH + (SegHash band (?MIN_HASH - 1)). -spec tune_seglist(leveled_codec:segment_list()) -> tuned_seglist(). -%% @doc -%% Only 15 bits of the hash is ever interesting tune_seglist(SegList) -> case is_list(SegList) of true -> - SL0 = lists:usort(lists:map(fun tune_hash/1, SegList)), - case length(SL0) > ?USE_SET_FOR_SPEED of - true -> - {sets, sets:from_list(SL0)}; - false -> - {list, SL0} - end; + lists:usort(lists:map(fun tune_hash/1, SegList)); false -> false end. @@ -1289,11 +1242,12 @@ updatebic_foldfun(HMDRequired) -> end. -spec update_blockindex_cache( - boolean(), list({integer(), binary()}), - blockindex_cache(), non_neg_integer()|undefined, + list({integer(), binary()}), + blockindex_cache(), + non_neg_integer()|undefined, boolean()) -> {boolean(), blockindex_cache(), non_neg_integer()|undefined}. 
-update_blockindex_cache(true, Entries, BIC, HighModDate, IdxModDate) -> +update_blockindex_cache(Entries, BIC, HighModDate, IdxModDate) -> case {element(1, BIC), array:size(element(2, BIC))} of {N, N} -> {false, BIC, HighModDate}; @@ -1315,9 +1269,7 @@ update_blockindex_cache(true, Entries, BIC, HighModDate, IdxModDate) -> _ -> {true, BIC0, undefined} end - end; -update_blockindex_cache(_Needed, _Entries, BIC, HighModDate, _IdxModDate) -> - {false, BIC, HighModDate}. + end. -spec check_modified(non_neg_integer()|undefined, non_neg_integer(), @@ -1345,8 +1297,8 @@ check_modified(_, _, _) -> blockindex_cache()|no_update, non_neg_integer()|undefined|no_update, fetch_cache()|no_update}. + %% @doc -%% %% Fetch a key from the store, potentially taking timings. Result should be %% not_present if the key is not in the store. fetch(LedgerKey, Hash, @@ -1366,11 +1318,8 @@ fetch(LedgerKey, Hash, binaryslot_get( SlotBin, LedgerKey, Hash, PressMethod, IndexModDate), {_UpdateState, BIC0, HMD0} = - update_blockindex_cache(true, - [{SlotID, Header}], - BIC, - HighModDate, - IndexModDate), + update_blockindex_cache( + [{SlotID, Header}], BIC, HighModDate, IndexModDate), case Result of not_present -> maybelog_fetch_timing( @@ -1381,7 +1330,8 @@ fetch(LedgerKey, Hash, end, {Result, BIC0, HMD0, no_update}; {BlockLengths, _LMD, PosBin} -> - PosList = find_pos(PosBin, extract_hash(Hash), [], 0), + PosList = + find_pos(PosBin, segment_checker(extract_hash(Hash))), case PosList of [] -> maybelog_fetch_timing(Monitor, Level, not_found, SW0), @@ -1396,14 +1346,15 @@ fetch(LedgerKey, Hash, _ -> StartPos = Slot#slot_index_value.start_position, Result = - check_blocks(PosList, - {Handle, StartPos}, - BlockLengths, - byte_size(PosBin), - LedgerKey, - PressMethod, - IndexModDate, - not_present), + check_blocks( + PosList, + {Handle, StartPos}, + BlockLengths, + byte_size(PosBin), + LedgerKey, + PressMethod, + IndexModDate, + not_present), case Result of not_present -> maybelog_fetch_timing( @@ 
-1424,92 +1375,65 @@ fetch(LedgerKey, Hash, end. --spec fetch_range(tuple(), tuple(), integer(), - leveled_codec:segment_list(), non_neg_integer(), - sst_state()) -> - {boolean(), list(), list()}. +-spec fetch_range( + range_endpoint(), + range_endpoint(), + sst_summary(), + summary_filter(), + boolean()) -> list(slot_pointer()). %% @doc -%% Fetch the contents of the SST file for a given key range. This will -%% pre-fetch some results, and append pointers for additional results. -%% -%% A filter can be provided based on the Segment ID (usable for hashable -%% objects not no_lookup entries) to accelerate the query if the 5-arity -%% version is used -fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State) -> - Summary = State#state.summary, - Handle = State#state.handle, +%% Fetch pointers to the slots the SST file covered by a given key range. +fetch_range(StartKey, EndKey, Summary, FilterFun, true) -> {Slots, RTrim} = lookup_slots( StartKey, EndKey, Summary#summary.index, - State#state.filter_fun), + FilterFun), Self = self(), SL = length(Slots), - - ExpandedSlots = - case SL of - 1 -> - [Slot] = Slots, - case RTrim of - true -> - [{pointer, Self, Slot, StartKey, EndKey}]; - false -> - [{pointer, Self, Slot, StartKey, all}] - end; - N -> - {LSlot, MidSlots, RSlot} = - case N of - 2 -> - [Slot1, Slot2] = Slots, - {Slot1, [], Slot2}; - N -> - [Slot1|_Rest] = Slots, - SlotN = lists:last(Slots), - {Slot1, lists:sublist(Slots, 2, N - 2), SlotN} - end, - MidSlotPointers = lists:map(fun(S) -> - {pointer, Self, S, all, all} - end, - MidSlots), - case RTrim of - true -> - [{pointer, Self, LSlot, StartKey, all}] ++ - MidSlotPointers ++ - [{pointer, Self, RSlot, all, EndKey}]; - false -> - [{pointer, Self, LSlot, StartKey, all}] ++ - MidSlotPointers ++ - [{pointer, Self, RSlot, all, all}] - end - end, - {SlotsToFetch, SlotsToPoint} = - case ScanWidth of - SW when SW >= SL -> - {ExpandedSlots, []}; - _ -> - lists:split(ScanWidth, ExpandedSlots) - end, - - 
{NeededBlockIdx, SlotsToFetchBinList} = - read_slots(Handle, - SlotsToFetch, - {SegList, LowLastMod, State#state.blockindex_cache}, - State#state.compression_method, - State#state.index_moddate), - {NeededBlockIdx, SlotsToFetchBinList, SlotsToPoint}. + case SL of + 1 -> + [Slot] = Slots, + case RTrim of + true -> + [{pointer, Self, Slot, StartKey, EndKey}]; + false -> + [{pointer, Self, Slot, StartKey, all}] + end; + N -> + {LSlot, MidSlots, RSlot} = + {hd(Slots), lists:sublist(Slots, 2, N - 2), lists:last(Slots)}, + MidSlotPointers = + lists:map( + fun(S) -> {pointer, Self, S, all, all} end, + MidSlots), + case RTrim of + true -> + [{pointer, Self, LSlot, StartKey, all}] ++ + MidSlotPointers ++ + [{pointer, Self, RSlot, all, EndKey}]; + false -> + [{pointer, Self, LSlot, StartKey, all}] ++ + MidSlotPointers ++ + [{pointer, Self, RSlot, all, all}] + end + end; +fetch_range(_StartKey, _EndKey, _Summary, _FilterFun, false) -> + []. -spec compress_level( non_neg_integer(), non_neg_integer(), press_method()) -> press_method(). %% @doc -%% disable compression at higher levels for improved performance +%% Disable compression at higher levels for improved performance compress_level( Level, LevelToCompress, _PressMethod) when Level < LevelToCompress -> none; compress_level(_Level, _LevelToCompress, PressMethod) -> PressMethod. --spec maxslots_level(level(), pos_integer()) -> pos_integer(). +-spec maxslots_level( + leveled_pmanifest:lsm_level(), pos_integer()) -> pos_integer(). maxslots_level(Level, MaxSlotCount) when Level < ?DOUBLESIZE_LEVEL -> MaxSlotCount; maxslots_level(_Level, MaxSlotCount) -> @@ -1627,17 +1551,18 @@ open_reader(Filename, LoadPageCache) -> {ok, SummaryBin} = file:pread(Handle, SlotsLength + 9, SummaryLength), {Handle, FileVersion, SummaryBin}. 
-build_table_summary(SlotIndex, _Level, FirstKey,
-                        SlotCount, MaxSQN, Bloom, CountOfTombs) ->
+build_table_summary(
+    SlotIndex, _Level, FirstKey, SlotCount, MaxSQN, Bloom, CountOfTombs) ->
     [{LastKey, _LastV}|_Rest] = SlotIndex,
-    Summary = #summary{first_key = FirstKey,
-                        last_key = LastKey,
-                        size = SlotCount,
-                        max_sqn = MaxSQN},
+    Summary =
+        #summary{
+            first_key = FirstKey,
+            last_key = LastKey,
+            size = SlotCount,
+            max_sqn = MaxSQN},
     SummBin0 =
-        term_to_binary({Summary, Bloom, lists:reverse(SlotIndex)},
-                        ?BINARY_SETTINGS),
-
+        term_to_binary(
+            {Summary, Bloom, lists:reverse(SlotIndex)}, ?BINARY_SETTINGS),
     SummBin =
         case CountOfTombs of
             not_counted ->
@@ -1645,7 +1570,6 @@ build_table_summary(SlotIndex, _Level, FirstKey,
             I ->
                 <<I:32/integer, SummBin0/binary>>
         end,
-
     SummCRC = hmac(SummBin),
     <<SummCRC:32/integer, SummBin/binary>>.

@@ -1665,8 +1589,8 @@ read_table_summary(BinWithCheck, TombCount) ->
             % If not might it might be possible to rebuild from all the slots
             case TombCount of
                 not_counted ->
-                    erlang:append_element(binary_to_term(SummBin),
-                                            not_counted);
+                    erlang:append_element(
+                        binary_to_term(SummBin), not_counted);
                 _ ->
                     <<I:32/integer, SummBin0/binary>> = SummBin,
                     erlang:append_element(binary_to_term(SummBin0), I)
@@ -1677,33 +1601,32 @@ build_all_slots(SlotList) ->
     SlotCount = length(SlotList),
     {SlotIndex, BlockIndex, SlotsBin, HashLists} =
-        build_all_slots(SlotList,
-                        9,
-                        1,
-                        [],
-                        [],
-                        <<>>,
-                        []),
+        build_all_slots(
+            SlotList, 9, 1, [], [], <<>>, []),
     Bloom = leveled_ebloom:create_bloom(HashLists),
     {SlotCount, SlotIndex, BlockIndex, SlotsBin, Bloom}.
-build_all_slots([], _Pos, _SlotID,
-                SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
+build_all_slots(
+    [],
+    _Pos, _SlotID, SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
     {SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists};
-build_all_slots([SlotD|Rest], Pos, SlotID,
-                SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
+build_all_slots(
+    [SlotD|Rest],
+    Pos, SlotID, SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
     {BlockIdx, SlotBin, HashList, LastKey} = SlotD,
     Length = byte_size(SlotBin),
-    SlotIndexV = #slot_index_value{slot_id = SlotID,
-                                    start_position = Pos,
-                                    length = Length},
-    build_all_slots(Rest,
-                    Pos + Length,
-                    SlotID + 1,
-                    [{LastKey, SlotIndexV}|SlotIdxAcc],
-                    [{SlotID, BlockIdx}|BlockIdxAcc],
-                    <<SlotBinAcc/binary, SlotBin/binary>>,
-                    lists:append(HashLists, HashList)).
+    SlotIndexV =
+        #slot_index_value{
+            slot_id = SlotID, start_position = Pos, length = Length},
+    build_all_slots(
+        Rest,
+        Pos + Length,
+        SlotID + 1,
+        [{LastKey, SlotIndexV}|SlotIdxAcc],
+        [{SlotID, BlockIdx}|BlockIdxAcc],
+        <<SlotBinAcc/binary, SlotBin/binary>>,
+        lists:append(HashList, HashLists)
+    ).


 generate_filenames(RootFilename) ->
@@ -1767,8 +1690,6 @@ deserialise_checkedblock(Bin, _Other) ->
     % native or none can be treated the same
     binary_to_term(Bin).

-
-
 -spec hmac(binary()|integer()) -> integer().
 %% @doc
 %% Perform a CRC check on an input
@@ -1911,7 +1832,7 @@ lookup_slots(StartKey, EndKey, Tree, FilterFun) ->
 %% binary_to_term is an often repeated task, and this is better with smaller
 %% slots.
 %%
-%% The outcome has been to divide the slot into four small blocks to minimise
+%% The outcome has been to divide the slot into five small blocks to minimise
 %% the binary_to_term time. A binary index is provided for the slot for all
 %% Keys that are directly fetchable (i.e. standard keys not index keys).
 %%
@@ -1919,54 +1840,58 @@ lookup_slots(StartKey, EndKey, Tree, FilterFun) ->
 %% compared to using a 128-member gb:tree.
 %%
 %% The binary index is cacheable and doubles as a not_present filter, as it is
-%% based on a 17-bit hash (so 0.0039 fpr).
-
-
--spec accumulate_positions(leveled_codec:ledger_kv(),
-                            {binary(),
-                                non_neg_integer(),
-                                list(non_neg_integer()),
-                                leveled_codec:last_moddate()}) ->
-                                    {binary(),
-                                        non_neg_integer(),
-                                        list(non_neg_integer()),
-                                        leveled_codec:last_moddate()}.
+%% based on a 15-bit hash.
+
+
+-spec accumulate_positions(
+    list(leveled_codec:ledger_kv()),
+    {binary(),
+        non_neg_integer(),
+        list(leveled_codec:segment_hash()),
+        leveled_codec:last_moddate()}) ->
+            {binary(),
+                non_neg_integer(),
+                list(leveled_codec:segment_hash()),
+                leveled_codec:last_moddate()}.
 %% @doc
 %% Fold function use to accumulate the position information needed to
 %% populate the summary of the slot
-accumulate_positions({K, V}, {PosBinAcc, NoHashCount, HashAcc, LMDAcc}) ->
+accumulate_positions([], Acc) ->
+    Acc;
+accumulate_positions([{K, V}|T], {PosBin, NoHashCount, HashAcc, LMDAcc}) ->
     {_SQN, H1, LMD} = leveled_codec:strip_to_indexdetails({K, V}),
     LMDAcc0 = take_max_lastmoddate(LMD, LMDAcc),
-    PosH1 = extract_hash(H1),
-    case is_integer(PosH1) of
-        true ->
+    case extract_hash(H1) of
+        PosH1 when is_integer(PosH1) ->
             case NoHashCount of
                 0 ->
-                    {<<1:1/integer, PosH1:15/integer,PosBinAcc/binary>>,
-                        0,
-                        [H1|HashAcc],
-                        LMDAcc0};
-                N ->
+                    accumulate_positions(
+                        T,
+                        {<<PosH1:16/integer, PosBin/binary>>,
+                            0,
+                            [H1|HashAcc],
+                            LMDAcc0}
+                    );
+                N when N =< 128 ->
                     % The No Hash Count is an integer between 0 and 127
                     % and so at read time should count NHC + 1
                     NHC = N - 1,
-                    {<<1:1/integer,
-                            PosH1:15/integer,
-                            0:1/integer,
-                            NHC:7/integer,
-                            PosBinAcc/binary>>,
-                        0,
-                        [H1|HashAcc],
-                        LMDAcc0}
+                    accumulate_positions(
+                        T,
+                        {<<PosH1:16/integer, 0:1/integer, NHC:7/integer, PosBin/binary>>,
+                            0,
+                            [H1|HashAcc],
+                            LMDAcc0})
             end;
-        false ->
-            {PosBinAcc, NoHashCount + 1, HashAcc, LMDAcc0}
+        _ ->
+            accumulate_positions(
+                T, {PosBin, NoHashCount + 1, HashAcc, LMDAcc0})
     end.
--spec take_max_lastmoddate(leveled_codec:last_moddate(), - leveled_codec:last_moddate()) -> - leveled_codec:last_moddate(). +-spec take_max_lastmoddate( + leveled_codec:last_moddate(), leveled_codec:last_moddate()) + -> leveled_codec:last_moddate(). %% @doc %% Get the last modified date. If no Last Modified Date on any object, can't %% add the accelerator and should check each object in turn @@ -1975,26 +1900,33 @@ take_max_lastmoddate(undefined, _LMDAcc) -> take_max_lastmoddate(LMD, LMDAcc) -> max(LMD, LMDAcc). --spec generate_binary_slot(leveled_codec:maybe_lookup(), - list(leveled_codec:ledger_kv()), - press_method(), - boolean(), - build_timings()) -> - {binary_slot(), - build_timings()}. +-spec generate_binary_slot( + leveled_codec:maybe_lookup(), + {forward|reverse, list(leveled_codec:ledger_kv())}, + press_method(), + boolean(), + build_timings()) -> {binary_slot(), build_timings()}. %% @doc %% Generate the serialised slot to be used when storing this sublist of keys %% and values -generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) -> - - SW0 = os:timestamp(), +generate_binary_slot( + Lookup, {DR, KVL0}, PressMethod, IndexModDate, BuildTimings0) -> + % The slot should be received reversed - get last key before flipping + % accumulate_positions/2 should use the reversed KVL for efficiency + {KVL, KVLr} = + case DR of + forward -> + {KVL0, lists:reverse(KVL0)}; + reverse -> + {lists:reverse(KVL0), KVL0} + end, + LastKey = element(1, hd(KVLr)), {HashL, PosBinIndex, LMD} = case Lookup of lookup -> - InitAcc = {<<>>, 0, [], 0}, {PosBinIndex0, NHC, HashL0, LMD0} = - lists:foldr(fun accumulate_positions/2, InitAcc, KVL), + accumulate_positions(KVLr, {<<>>, 0, [], 0}), PosBinIndex1 = case NHC of 0 -> @@ -2008,8 +1940,7 @@ generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) -> {[], <<0:1/integer, 127:7/integer>>, 0} end, - BuildTimings1 = update_buildtimings(SW0, BuildTimings0, slot_hashlist), - SW1 = os:timestamp(), 
+ BuildTimings1 = update_buildtimings(BuildTimings0, slot_hashlist), {SideBlockSize, MidBlockSize} = case Lookup of @@ -2063,8 +1994,7 @@ generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) -> serialise_block(KVLE, PressMethod)} end, - BuildTimings2 = update_buildtimings(SW1, BuildTimings1, slot_serialise), - SW2 = os:timestamp(), + BuildTimings2 = update_buildtimings(BuildTimings1, slot_serialise), B1P = case IndexModDate of @@ -2102,9 +2032,7 @@ generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) -> CheckH:32/integer, Header/binary, B1/binary, B2/binary, B3/binary, B4/binary, B5/binary>>, - {LastKey, _LV} = lists:last(KVL), - - BuildTimings3 = update_buildtimings(SW2, BuildTimings2, slot_finish), + BuildTimings3 = update_buildtimings(BuildTimings2, slot_finish), {{Header, SlotBin, HashL, LastKey}, BuildTimings3}. @@ -2201,27 +2129,22 @@ read_slot(Handle, Slot) -> Slot#slot_index_value.length), SlotBin. - -pointer_mapfun(Pointer) -> - {Slot, SK, EK} = - case Pointer of - {pointer, _Pid, Slot0, SK0, EK0} -> - {Slot0, SK0, EK0}; - {pointer, Slot0, SK0, EK0} -> - {Slot0, SK0, EK0} - end, - +-spec pointer_mapfun( + slot_pointer()) -> + {non_neg_integer(), non_neg_integer(), non_neg_integer(), + range_endpoint(), range_endpoint()}. +pointer_mapfun({pointer, _Pid, Slot, SK, EK}) -> {Slot#slot_index_value.start_position, Slot#slot_index_value.length, Slot#slot_index_value.slot_id, SK, EK}. - -type slotbin_fun() :: fun(({non_neg_integer(), non_neg_integer(), non_neg_integer(), - range_endpoint(), range_endpoint()}) -> - {binary(), non_neg_integer(), range_endpoint(), range_endpoint()}). + range_endpoint(), range_endpoint()}) + -> expanded_slot() + ). -spec binarysplit_mapfun(binary(), integer()) -> slotbin_fun(). %% @doc @@ -2234,18 +2157,15 @@ binarysplit_mapfun(MultiSlotBin, StartPos) -> {SlotBin, ID, SK, EK} end. 
- --spec read_slots(file:io_device(), list(), - {false|list(), non_neg_integer(), blockindex_cache()}, - press_method(), boolean()) -> - {boolean(), list(binaryslot_element())}. +-spec read_slots( + file:io_device(), + list(), + {segment_check_fun(), non_neg_integer(), blockindex_cache()}, + press_method(), + boolean()) + -> {boolean(), list(expanded_slot()|leveled_codec:ledger_kv())}. %% @doc -%% The reading of sots will return a list of either 2-tuples containing -%% {K, V} pairs - or 3-tuples containing {Binary, SK, EK}. The 3 tuples -%% can be exploded into lists of {K, V} pairs using the binaryslot_reader/4 -%% function -%% -%% Reading slots is generally unfiltered, but in the sepcial case when +%% Reading slots is generally unfiltered, but in the special case when %% querting across slots when only matching segment IDs are required the %% BlockIndexCache can be used %% @@ -2258,12 +2178,13 @@ read_slots(Handle, SlotList, {false, 0, _BlockIndexCache}, % No list of segments passed or useful Low LastModified Date % Just read slots in SlotList {false, read_slotlist(SlotList, Handle)}; -read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, +read_slots(Handle, SlotList, {SegChecker, LowLastMod, BlockIndexCache}, PressMethod, IdxModDate) -> - % List of segments passed so only {K, V} pairs matching those segments - % should be returned. This required the {K, V} pair to have been added - % with the appropriate hash - if the pair were added with no_lookup as - % the hash value this will fail unexpectedly. + % Potentially need to check the low last modified date, and also the + % segment_check_fun against the index. 
If the index is cached, return the + % KV pairs at this point, otherwise return the slot pointer so that the + % term_to_binary work can be conducted by the fold process and not impact + % the heap of this SST process BinMapFun = fun(Pointer, {NeededBlockIdx, Acc}) -> {SP, _L, ID, SK, EK} = pointer_mapfun(Pointer), @@ -2273,17 +2194,13 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, % If there is an attempt to use the seg list query and the % index block cache isn't cached for any part this may be % slower as each slot will be read in turn - {true, Acc ++ read_slotlist([Pointer], Handle)}; + {true, read_slotlist([Pointer], Handle) ++ Acc}; {BlockLengths, LMD, BlockIdx} -> % If there is a BlockIndex cached then we can use it to % check to see if any of the expected segments are % present without lifting the slot off disk. Also the % fact that we know position can be used to filter out - % other keys - % - % Note that LMD will be 0 if the indexing of last mod - % date was not enable at creation time. So in this - % case the filter should always map + % blocks. 
case LowLastMod > LMD of true -> % The highest LMD on the slot was before the @@ -2292,51 +2209,48 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, % slot - it is all too old {NeededBlockIdx, Acc}; false -> - case SegList of + case SegChecker of false -> - % Need all the slot now + % No SegChecker - need all the slot now {NeededBlockIdx, - Acc ++ - read_slotlist([Pointer], Handle)}; - _SL -> - % Need to find just the right keys - PositionList = - find_pos(BlockIdx, SegList, [], 0), - % Note check_blocks should return [] if - % PositionList is empty (which it may be) - KVL = - check_blocks(PositionList, - {Handle, SP}, - BlockLengths, - byte_size(BlockIdx), - false, - PressMethod, - IdxModDate, - []), - % There is no range passed through to the - % binaryslot_reader, so these results need - % to be filtered - FilterFun = - fun(KV) -> in_range(KV, SK, EK) end, - {NeededBlockIdx, - Acc ++ lists:filter(FilterFun, KVL)} + read_slotlist([Pointer], Handle) ++ Acc + }; + SegChecker -> + TrimmedKVL = + checkblocks_segandrange( + BlockIdx, + {Handle, SP}, + BlockLengths, + PressMethod, + IdxModDate, + SegChecker, + {SK, EK}), + {NeededBlockIdx, TrimmedKVL ++ Acc} end end end end, - lists:foldl(BinMapFun, {false, []}, SlotList). - - --spec in_range(leveled_codec:ledger_kv(), - range_endpoint(), range_endpoint()) -> boolean(). -%% @doc -%% Is the ledger key in the range. -in_range({_LK, _LV}, all, all) -> - true; -in_range({LK, _LV}, all, EK) -> - not leveled_codec:endkey_passed(EK, LK); -in_range({LK, LV}, SK, EK) -> - (LK >= SK) and in_range({LK, LV}, all, EK). + lists:foldr(BinMapFun, {false, []}, SlotList). + + +-spec checkblocks_segandrange( + binary(), + binary()|{file:io_device(), integer()}, + binary(), + press_method(), + boolean(), + segment_check_fun(), + {range_endpoint(), range_endpoint()}) + -> list(leveled_codec:ledger_kv()). 
+checkblocks_segandrange( + BlockIdx, SlotOrHandle, BlockLengths, + PressMethod, IdxModDate, SegChecker, {StartKey, EndKey}) -> + PositionList = find_pos(BlockIdx, SegChecker), + KVL = + check_blocks( + PositionList, SlotOrHandle, BlockLengths, byte_size(BlockIdx), + false, PressMethod, IdxModDate, []), + in_range(KVL, StartKey, EndKey). read_slotlist(SlotList, Handle) -> @@ -2345,12 +2259,13 @@ read_slotlist(SlotList, Handle) -> lists:map(binarysplit_mapfun(MultiSlotBin, StartPos), LengthList). --spec binaryslot_reader(list(binaryslot_element()), - press_method(), - boolean(), - leveled_codec:segment_list()) - -> {list({tuple(), tuple()}), - list({integer(), binary()})}. +-spec binaryslot_reader( + list(expanded_slot()), + press_method(), + boolean(), + segment_check_fun(), + list(expandable_pointer())) + -> {list({tuple(), tuple()}), list({integer(), binary()})}. %% @doc %% Read the binary slots converting them to {K, V} pairs if they were not %% already {K, V} pairs. If they are already {K, V} pairs it is assumed @@ -2358,11 +2273,12 @@ read_slotlist(SlotList, Handle) -> %% %% Keys which are still to be extracted from the slot, are accompanied at %% this function by the range against which the keys need to be checked. -%% This range is passed with the slot to binaryslot_trimmedlist which should -%% open the slot block by block, filtering individual keys where the endpoints -%% of the block are outside of the range, and leaving blocks already proven to -%% be outside of the range unopened. -binaryslot_reader(SlotBinsToFetch, PressMethod, IdxModDate, SegList) -> +%% This range is passed with the slot to binaryslot_trimmed which +%% should open the slot block by block, filtering individual keys where the +%% endpoints of the block are outside of the range, and leaving blocks already +%% proven to be outside of the range unopened. +binaryslot_reader( + SlotBinsToFetch, PressMethod, IdxModDate, SegChecker, SlotsToPoint) -> % Two accumulators are added. 
% One to collect the list of keys and values found in the binary slots % (subject to range filtering if the slot is still deserialised at this @@ -2372,37 +2288,34 @@ binaryslot_reader(SlotBinsToFetch, PressMethod, IdxModDate, SegList) -> % of get_kvreader calls. This means that slots which are only used in % range queries can still populate their block_index caches (on the FSM % loop state), and those caches can be used for future queries. - binaryslot_reader(SlotBinsToFetch, - PressMethod, IdxModDate, SegList, [], []). + {Acc, BIAcc} = + binaryslot_reader( + SlotBinsToFetch, PressMethod, IdxModDate, SegChecker, [], []), + {lists:reverse(lists:reverse(SlotsToPoint) ++ Acc), BIAcc}. -binaryslot_reader([], _PressMethod, _IdxModDate, _SegList, Acc, BIAcc) -> +binaryslot_reader([], _PressMethod, _IdxModDate, _SegChecker, Acc, BIAcc) -> {Acc, BIAcc}; -binaryslot_reader([{SlotBin, ID, SK, EK}|Tail], - PressMethod, IdxModDate, SegList, Acc, BIAcc) -> +binaryslot_reader( + [{SlotBin, ID, SK, EK}|Tail], + PressMethod, IdxModDate, SegChecker, Acc, BIAcc) -> % The start key and end key here, may not the start key and end key the % application passed into the query. If the slot is known to lie entirely % inside the range, on either of both sides, the SK and EK may be % substituted for the 'all' key work to indicate there is no need for % entries in this slot to be trimmed from either or both sides. 
{TrimmedL, BICache} = - binaryslot_trimmedlist(SlotBin, - SK, EK, - PressMethod, - IdxModDate, - SegList), - binaryslot_reader(Tail, - PressMethod, - IdxModDate, - SegList, - Acc ++ TrimmedL, - [{ID, BICache}|BIAcc]); -binaryslot_reader([{K, V}|Tail], - PressMethod, IdxModDate, SegList, Acc, BIAcc) -> + binaryslot_trimmed( + SlotBin, SK, EK, PressMethod, IdxModDate, SegChecker), + binaryslot_reader( + Tail, PressMethod, IdxModDate, SegChecker, + lists:reverse(TrimmedL) ++ Acc, [{ID, BICache}|BIAcc]); +binaryslot_reader(L, PressMethod, IdxModDate, SegChecker, Acc, BIAcc) -> + {KVs, Tail} = lists:splitwith(fun(SR) -> tuple_size(SR) == 2 end, L), % These entries must already have been filtered for membership inside any % range used in the query. - binaryslot_reader(Tail, - PressMethod, IdxModDate, SegList, - Acc ++ [{K, V}], BIAcc). + binaryslot_reader( + Tail, PressMethod, IdxModDate, SegChecker, + lists:reverse(KVs) ++ Acc, BIAcc). read_length_list(Handle, LengthList) -> @@ -2413,8 +2326,8 @@ read_length_list(Handle, LengthList) -> {MultiSlotBin, StartPos}. --spec extract_header(binary()|none, boolean()) -> - {binary(), non_neg_integer(), binary()}|none. +-spec extract_header( + binary()|none, boolean()) -> {binary(), non_neg_integer(), binary()}|none. %% @doc %% Helper for extracting the binaries from the header ignoring the missing LMD %% if LMD is not indexed @@ -2434,10 +2347,8 @@ binaryslot_get(FullBin, Key, Hash, PressMethod, IdxModDate) -> {Header, Blocks} -> {BlockLengths, _LMD, PosBinIndex} = extract_header(Header, IdxModDate), - PosList = find_pos(PosBinIndex, - extract_hash(Hash), - [], - 0), + PosList = + find_pos(PosBinIndex, segment_checker(extract_hash(Hash))), {fetch_value(PosList, BlockLengths, Blocks, Key, PressMethod), Header}; crc_wonky -> @@ -2445,90 +2356,63 @@ binaryslot_get(FullBin, Key, Hash, PressMethod, IdxModDate) -> none} end. 
-binaryslot_tolist(FullBin, PressMethod, IdxModDate) ->
-    BlockFetchFun =
-        fun(Length, {Acc, Bin}) ->
-            case Length of
-                0 ->
-                    {Acc, Bin};
-                _ ->
-                    <<Block:Length/binary, Rest/binary>> = Bin,
-                    {Acc ++ deserialise_block(Block, PressMethod), Rest}
-            end
-        end,
-
-    {Out, _Rem} =
-        case crc_check_slot(FullBin) of
-            {Header, Blocks} ->
-                {BlockLengths, _LMD, _PosBinIndex} =
-                    extract_header(Header, IdxModDate),
-                <<B1L:32/integer, B2L:32/integer, B3L:32/integer, B4L:32/integer, B5L:32/integer>> = BlockLengths,
-                lists:foldl(BlockFetchFun,
-                            {[], Blocks},
-                            [B1L, B2L, B3L, B4L, B5L]);
-            crc_wonky ->
-                {[], <<>>}
-        end,
-    Out.
+-spec binaryslot_blockstolist(
+    list(non_neg_integer()),
+    binary(),
+    press_method(),
+    list(leveled_codec:ledger_kv())) -> list(leveled_codec:ledger_kv()).
+binaryslot_blockstolist([], _Bin, _PressMethod, Acc) ->
+    Acc;
+binaryslot_blockstolist([0|RestLengths], RestBin, PressMethod, Acc) ->
+    binaryslot_blockstolist(RestLengths, RestBin, PressMethod, Acc);
+binaryslot_blockstolist([L|RestLengths], Bin, PressMethod, Acc) ->
+    <<Block:L/binary, RestBin/binary>> = Bin,
+    binaryslot_blockstolist(
+        RestLengths,
+        RestBin,
+        PressMethod,
+        Acc ++ deserialise_block(Block, PressMethod)).
+
+-spec binaryslot_tolist(
+    binary(), press_method(), boolean())
+        -> list(leveled_codec:ledger_kv()).
+binaryslot_tolist(FullBin, PressMethod, IdxModDate) ->
+    case crc_check_slot(FullBin) of
+        {Header, Blocks} ->
+            {BlockLengths, _LMD, _PosBinIndex} =
+                extract_header(Header, IdxModDate),
+            <<B1L:32/integer, B2L:32/integer, B3L:32/integer, B4L:32/integer, B5L:32/integer>> = BlockLengths,
+            binaryslot_blockstolist(
+                [B1L, B2L, B3L, B4L, B5L], Blocks, PressMethod, []);
+        crc_wonky ->
+            []
+    end.

-binaryslot_trimmedlist(FullBin, all, all,
-                        PressMethod, IdxModDate, false) ->
+-spec binaryslot_trimmed(
+    binary(),
+    range_endpoint(),
+    range_endpoint(),
+    press_method(),
+    boolean(),
+    segment_check_fun()) ->
+        {list(leveled_codec:ledger_kv()),
+            list({integer(), binary()})|none}.
+%% @doc +%% Must return a trimmed and reversed list of results in the range +binaryslot_trimmed( + FullBin, all, all, PressMethod, IdxModDate, false) -> {binaryslot_tolist(FullBin, PressMethod, IdxModDate), none}; -binaryslot_trimmedlist(FullBin, StartKey, EndKey, - PressMethod, IdxModDate, SegList) -> - LTrimFun = fun({K, _V}) -> K < StartKey end, - RTrimFun = fun({K, _V}) -> not leveled_codec:endkey_passed(EndKey, K) end, - BlockCheckFun = - fun(Block, {Acc, Continue}) -> - case {Block, Continue} of - {<<>>, _} -> - {Acc, false}; - {_, true} -> - BlockList = - case is_binary(Block) of - true -> - deserialise_block(Block, PressMethod); - false -> - Block - end, - case fetchends_rawblock(BlockList) of - {_, LastKey} when StartKey > LastKey -> - %% This includes the case when LastKey is - %% not_present due to corruption in the BlockList - %% as tuple is > not_present. - {Acc, true}; - {_, LastKey} -> - {_LDrop, RKeep} = lists:splitwith(LTrimFun, - BlockList), - case leveled_codec:endkey_passed(EndKey, - LastKey) of - true -> - {LKeep, _RDrop} - = lists:splitwith(RTrimFun, RKeep), - {Acc ++ LKeep, false}; - false -> - {Acc ++ RKeep, true} - end - end; - {_ , false} -> - {Acc, false} - end - end, - - case {crc_check_slot(FullBin), SegList} of - % It will be more effecient to check a subset of blocks. 
To work out - % the best subset we always look in the middle block of 5, and based on - % the first and last keys of that middle block when compared to the Start - % and EndKey of the query determines a subset of blocks - % - % This isn't perfectly efficient, esepcially if the query overlaps Block2 - % and Block3 (as Block 1 will also be checked), but finessing this last - % scenario is hard to do in concise code +binaryslot_trimmed( + FullBin, StartKey, EndKey, PressMethod, IdxModDate, SegmentChecker) -> + case {crc_check_slot(FullBin), SegmentChecker} of + % Get a trimmed list of keys in the slot based on the range, trying + % to minimise the number of blocks which are deserialised by + % checking the middle block first. {{Header, Blocks}, false} -> {BlockLengths, _LMD, _PosBinIndex} = extract_header(Header, IdxModDate), @@ -2540,66 +2424,141 @@ binaryslot_trimmedlist(FullBin, StartKey, EndKey, <> = Blocks, - BlocksToCheck = - blocks_required({StartKey, EndKey}, - [Block1, Block2, MidBlock, Block4, Block5], - PressMethod), - {Acc, _Continue} = - lists:foldl(BlockCheckFun, {[], true}, BlocksToCheck), - {Acc, none}; - {{Header, _Blocks}, SegList} -> + TrimmedKVL = + blocks_required( + {StartKey, EndKey}, + Block1, Block2, MidBlock, Block4, Block5, + PressMethod), + {TrimmedKVL, none}; + {{Header, _Blocks}, SegmentChecker} -> {BlockLengths, _LMD, BlockIdx} = extract_header(Header, IdxModDate), - PosList = find_pos(BlockIdx, SegList, [], 0), - KVL = check_blocks(PosList, - FullBin, - BlockLengths, - byte_size(BlockIdx), - false, - PressMethod, - IdxModDate, - []), - {KVL, Header}; + TrimmedKVL = + checkblocks_segandrange( + BlockIdx, + FullBin, + BlockLengths, + PressMethod, + IdxModDate, + SegmentChecker, + {StartKey, EndKey}), + {TrimmedKVL, Header}; {crc_wonky, _} -> {[], none} end. 
- -blocks_required({StartKey, EndKey}, [B1, B2, MidBlock, B4, B5], PressMethod) -> +-spec blocks_required( + {range_endpoint(), range_endpoint()}, + binary(), binary(), binary(), binary(), binary(), + press_method()) -> list(leveled_codec:ledger_kv()). +blocks_required( + {StartKey, EndKey}, B1, B2, MidBlock, B4, B5, PressMethod) -> MidBlockList = deserialise_block(MidBlock, PressMethod), - filter_blocks_required(fetchends_rawblock(MidBlockList), - {StartKey, EndKey}, - [B1, B2, MidBlockList, B4, B5]). - -filter_blocks_required({not_present, not_present}, _RangeKeys, AllBlocks) -> - AllBlocks; -filter_blocks_required({_MidFirst, MidLast}, {StartKey, _EndKey}, - [_Block1, _Block2, _MidBlockList, Block4, Block5]) - when StartKey > MidLast -> - [Block4, Block5]; -filter_blocks_required({MidFirst, MidLast}, {StartKey, EndKey}, - [_Block1, _Block2, MidBlockList, Block4, Block5]) - when StartKey >= MidFirst -> - NoneAfter = leveled_codec:endkey_passed(EndKey, MidLast), - case NoneAfter of + case filterby_midblock( + fetchends_rawblock(MidBlockList), {StartKey, EndKey}) of + empty -> + in_range(deserialise_block(B1, PressMethod), StartKey, EndKey) + ++ in_range(deserialise_block(B2, PressMethod), StartKey, EndKey) + ++ in_range(deserialise_block(B4, PressMethod), StartKey, EndKey) + ++ in_range(deserialise_block(B5, PressMethod), StartKey, EndKey); + all_blocks -> + get_lefthand_blocks(B1, B2, PressMethod, StartKey) + ++ MidBlockList + ++ get_righthand_blocks(B4, B5, PressMethod, EndKey); + lt_mid -> + in_range( + get_lefthand_blocks(B1, B2, PressMethod, StartKey), + all, + EndKey); + le_mid -> + get_lefthand_blocks(B1, B2, PressMethod, StartKey) + ++ in_range(MidBlockList, all, EndKey); + mid_only -> + in_range(MidBlockList, StartKey, EndKey); + ge_mid -> + in_range(MidBlockList, StartKey, all) + ++ get_righthand_blocks(B4, B5, PressMethod, EndKey); + gt_mid -> + in_range( + get_righthand_blocks(B4, B5, PressMethod, EndKey), + StartKey, + all) + end. 
+ +get_lefthand_blocks(B1, B2, PressMethod, StartKey) -> + BlockList2 = deserialise_block(B2, PressMethod), + case previous_block_required( + fetchends_rawblock(BlockList2), StartKey) of true -> - [MidBlockList]; + in_range(deserialise_block(B1, PressMethod), StartKey, all) + ++ BlockList2; false -> - [MidBlockList, Block4, Block5] + in_range(BlockList2, StartKey, all) + end. + +get_righthand_blocks(B4, B5, PressMethod, EndKey) -> + BlockList4 = deserialise_block(B4, PressMethod), + case next_block_required( + fetchends_rawblock(BlockList4), EndKey) of + true -> + BlockList4 + ++ in_range(deserialise_block(B5, PressMethod), all, EndKey); + false -> + in_range(BlockList4, all, EndKey) + end. + +filterby_midblock({not_present, not_present}, _RangeKeys) -> + empty; +filterby_midblock( + {_MidFirst, MidLast}, {StartKey, _EndKey}) when StartKey > MidLast -> + gt_mid; +filterby_midblock( + {MidFirst, MidLast}, {StartKey, EndKey}) when StartKey >= MidFirst -> + case leveled_codec:endkey_passed(EndKey, MidLast) of + true -> + mid_only; + false -> + ge_mid end; -filter_blocks_required({MidFirst, MidLast}, {_StartKey, EndKey}, - [Block1, Block2, MidBlockList, Block4, Block5]) -> +filterby_midblock({MidFirst, MidLast}, {_StartKey, EndKey}) -> AllBefore = leveled_codec:endkey_passed(EndKey, MidFirst), NoneAfter = leveled_codec:endkey_passed(EndKey, MidLast), case {AllBefore, NoneAfter} of {true, true} -> - [Block1, Block2]; + lt_mid; {false, true} -> - [Block1, Block2, MidBlockList]; + le_mid; {false, false} -> - [Block1, Block2, MidBlockList, Block4, Block5] + all_blocks end. +previous_block_required({not_present, not_present}, _SK) -> + true; +previous_block_required({FK, _LK}, StartKey) when FK < StartKey -> + false; +previous_block_required(_BlockEnds, _StartKey) -> + true. + +next_block_required({not_present, not_present}, _EK) -> + true; +next_block_required({_FK, LK}, EndKey) -> + not leveled_codec:endkey_passed(EndKey, LK). 
+ +-spec in_range( + list(leveled_codec:ledger_kv()), + range_endpoint(), + range_endpoint()) -> list(leveled_codec:ledger_kv()). +%% @doc +%% Is the ledger key in the range. +in_range(KVL, all, all) -> + KVL; +in_range(KVL, all, EK) -> + lists:takewhile( + fun({K, _V}) -> not leveled_codec:endkey_passed(EK, K) end, KVL); +in_range(KVL, SK, all) -> + lists:dropwhile(fun({K, _V}) -> K < SK end, KVL); +in_range(KVL, SK, EK) -> + in_range(in_range(KVL, SK, all), all, EK). crc_check_slot(FullBin) -> < fetch_value(Rest, BlockLengths, Blocks, Key, PressMethod) end. --spec fetchfrom_rawblock(pos_integer(), list(leveled_codec:ledger_kv())) -> - not_present|leveled_codec:ledger_kv(). +-spec fetchfrom_rawblock( + pos_integer(), list(leveled_codec:ledger_kv())) + -> not_present|leveled_codec:ledger_kv(). %% @doc %% Fetch from a deserialised block, but accounting for potential corruption %% in that block which may lead to it returning as an empty list if that @@ -2676,19 +2636,19 @@ fetchfrom_rawblock(BlockPos, RawBlock) when BlockPos > length(RawBlock) -> fetchfrom_rawblock(BlockPos, RawBlock) -> lists:nth(BlockPos, RawBlock). --spec fetchends_rawblock(list(leveled_codec:ledger_kv())) -> - {not_present, not_present}| - {leveled_codec:ledger_key(), leveled_codec:ledger_key()}. +-spec fetchends_rawblock( + list(leveled_codec:ledger_kv())) + -> {not_present, not_present}| + {leveled_codec:ledger_key(), leveled_codec:ledger_key()}. %% @doc %% Fetch the first and last key from a block, and not_present if the block %% is empty (rather than crashing) fetchends_rawblock([]) -> {not_present, not_present}; fetchends_rawblock(RawBlock) -> - {element(1, lists:nth(1, RawBlock)), + {element(1, hd(RawBlock)), element(1, lists:last(RawBlock))}. - revert_position(Pos) -> {SideBlockSize, MidBlockSize} = ?LOOK_BLOCKSIZE, case Pos < 2 * SideBlockSize of @@ -2705,8 +2665,6 @@ revert_position(Pos) -> end end. 
- - %%%============================================================================ %%% Merge Functions %%%============================================================================ @@ -2755,21 +2713,20 @@ merge_lists(KVList1, SSTOpts, IdxModDate) -> element(1, lists:nth(1, KVList1)), not_counted}. - split_lists([], SlotLists, 0, _PressMethod, _IdxModDate) -> lists:reverse(SlotLists); split_lists(LastPuff, SlotLists, 0, PressMethod, IdxModDate) -> {SlotD, _} = generate_binary_slot( - lookup, LastPuff, PressMethod, IdxModDate, no_timing), + lookup, {forward, LastPuff}, PressMethod, IdxModDate, no_timing), lists:reverse([SlotD|SlotLists]); split_lists(KVList1, SlotLists, N, PressMethod, IdxModDate) -> {Slot, KVListRem} = lists:split(?LOOK_SLOTSIZE, KVList1), {SlotD, _} = - generate_binary_slot(lookup, Slot, PressMethod, IdxModDate, no_timing), + generate_binary_slot( + lookup, {forward, Slot}, PressMethod, IdxModDate, no_timing), split_lists(KVListRem, [SlotD|SlotLists], N - 1, PressMethod, IdxModDate). - -spec merge_lists( list(expanded_pointer()), list(expanded_pointer()), @@ -2785,17 +2742,22 @@ split_lists(KVList1, SlotLists, N, PressMethod, IdxModDate) -> %% provided may include pointers to fetch more Keys/Values from the source %% file merge_lists( - KVList1, KVList2, LevelInfo, SSTOpts, IndexModDate, SaveTombCount) -> + KVList1, KVList2, {IsBase, L}, SSTOpts, IndexModDate, SaveTombCount) -> InitTombCount = case SaveTombCount of true -> 0; false -> not_counted end, - merge_lists(KVList1, KVList2, - LevelInfo, - [], null, 0, - SSTOpts#sst_options.max_sstslots, - SSTOpts#sst_options.press_method, - IndexModDate, - InitTombCount, - #build_timings{}). 
+ BuildTimings = + case IsBase orelse lists:member(L, ?LOG_BUILDTIMINGS_LEVELS) of + true -> + #build_timings{}; + false -> + no_timing + end, + merge_lists( + KVList1, KVList2, + {IsBase, L}, [], null, 0, + SSTOpts#sst_options.max_sstslots, SSTOpts#sst_options.press_method, + IndexModDate, InitTombCount, + BuildTimings). -spec merge_lists( @@ -2829,10 +2791,9 @@ merge_lists(KVL1, KVL2, LI, SlotList, FirstKey, SlotCount, MaxSlots, PressMethod, IdxModDate, CountOfTombs, T0) -> % Form a slot by merging the two lists until the next 128 K/V pairs have % been determined - SW = os:timestamp(), {KVRem1, KVRem2, Slot, FK0} = form_slot(KVL1, KVL2, LI, no_lookup, 0, [], FirstKey), - T1 = update_buildtimings(SW, T0, fold_toslot), + T1 = update_buildtimings(T0, fold_toslot), case Slot of {_, []} -> % There were no actual keys in the slot (maybe some expired) @@ -2851,7 +2812,8 @@ merge_lists(KVL1, KVL2, LI, SlotList, FirstKey, SlotCount, MaxSlots, % Convert the list of KVs for the slot into a binary, and related % metadata {SlotD, T2} = - generate_binary_slot(Lookup, KVL, PressMethod, IdxModDate, T1), + generate_binary_slot( + Lookup, {reverse, KVL}, PressMethod, IdxModDate, T1), merge_lists(KVRem1, KVRem2, LI, @@ -2861,30 +2823,10 @@ merge_lists(KVL1, KVL2, LI, SlotList, FirstKey, SlotCount, MaxSlots, MaxSlots, PressMethod, IdxModDate, - count_tombs(KVL, CountOfTombs), + leveled_codec:count_tombs(KVL, CountOfTombs), T2) end. - --spec count_tombs( - list(leveled_codec:ledger_kv()), non_neg_integer()|not_counted) -> - non_neg_integer()|not_counted. -%% @doc -%% Count the tombstones in a list of KVs -count_tombs(_KVL, not_counted) -> - not_counted; -count_tombs(KVL, InitCount) -> - FoldFun = - fun(KV, Count) -> - case leveled_codec:strip_to_statusonly(KV) of - tomb -> - Count + 1; - _ -> - Count - end - end, - lists:foldl(FoldFun, InitCount, KVL). 
- -spec form_slot(list(expanded_pointer()), list(expanded_pointer()), {boolean(), non_neg_integer()}, @@ -2896,164 +2838,168 @@ count_tombs(KVL, InitCount) -> {lookup|no_lookup, list(leveled_codec:ledger_kv())}, leveled_codec:ledger_key()}. %% @doc -%% Merge together Key Value lists to provide an ordered slot of KVs +%% Merge together Key Value lists to provide a reverse-ordered slot of KVs form_slot([], [], _LI, Type, _Size, Slot, FK) -> - {[], [], {Type, lists:reverse(Slot)}, FK}; + {[], [], {Type, Slot}, FK}; form_slot(KVList1, KVList2, _LI, lookup, ?LOOK_SLOTSIZE, Slot, FK) -> - {KVList1, KVList2, {lookup, lists:reverse(Slot)}, FK}; + {KVList1, KVList2, {lookup, Slot}, FK}; form_slot(KVList1, KVList2, _LI, no_lookup, ?NOLOOK_SLOTSIZE, Slot, FK) -> - {KVList1, KVList2, {no_lookup, lists:reverse(Slot)}, FK}; -form_slot(KVList1, KVList2, {IsBasement, TS}, lookup, Size, Slot, FK) -> - case {key_dominates(KVList1, KVList2, {IsBasement, TS}), FK} of - {{{next_key, TopKV}, Rem1, Rem2}, _} -> - form_slot(Rem1, - Rem2, - {IsBasement, TS}, - lookup, - Size + 1, - [TopKV|Slot], - FK); - {{skipped_key, Rem1, Rem2}, _} -> - form_slot(Rem1, Rem2, {IsBasement, TS}, lookup, Size, Slot, FK) + {KVList1, KVList2, {no_lookup, Slot}, FK}; +form_slot(KVList1, KVList2, LevelInfo, lookup, Size, Slot, FK) -> + case key_dominates(KVList1, KVList2, LevelInfo) of + {{next_key, TopKV}, Rem1, Rem2} -> + form_slot( + Rem1, Rem2, LevelInfo, lookup, Size + 1, [TopKV|Slot], FK); + {skipped_key, Rem1, Rem2} -> + form_slot(Rem1, Rem2, LevelInfo, lookup, Size, Slot, FK) end; -form_slot(KVList1, KVList2, {IsBasement, TS}, no_lookup, Size, Slot, FK) -> - case key_dominates(KVList1, KVList2, {IsBasement, TS}) of +form_slot(KVList1, KVList2, LevelInfo, no_lookup, Size, Slot, FK) -> + case key_dominates(KVList1, KVList2, LevelInfo) of {{next_key, {TopK, TopV}}, Rem1, Rem2} -> - FK0 = - case FK of - null -> - TopK; - _ -> - FK - end, + FK0 = case FK of null -> TopK; _ -> FK end, case 
leveled_codec:to_lookup(TopK) of no_lookup -> - form_slot(Rem1, - Rem2, - {IsBasement, TS}, - no_lookup, - Size + 1, - [{TopK, TopV}|Slot], - FK0); + form_slot( + Rem1, + Rem2, + LevelInfo, + no_lookup, + Size + 1, + [{TopK, TopV}|Slot], + FK0); lookup -> case Size >= ?LOOK_SLOTSIZE of true -> - {KVList1, - KVList2, - {no_lookup, lists:reverse(Slot)}, - FK}; + {KVList1, KVList2, {no_lookup, Slot}, FK}; false -> - form_slot(Rem1, - Rem2, - {IsBasement, TS}, - lookup, - Size + 1, - [{TopK, TopV}|Slot], - FK0) + form_slot( + Rem1, + Rem2, + LevelInfo, + lookup, + Size + 1, + [{TopK, TopV}|Slot], + FK0) end end; {skipped_key, Rem1, Rem2} -> - form_slot(Rem1, Rem2, {IsBasement, TS}, no_lookup, Size, Slot, FK) + form_slot(Rem1, Rem2, LevelInfo, no_lookup, Size, Slot, FK) end. +-spec key_dominates( + list(expanded_pointer()), + list(expanded_pointer()), + {boolean()|undefined, leveled_pmanifest:lsm_level()}) + -> + {{next_key, leveled_codec:ledger_kv()}|skipped_key, + list(expanded_pointer()), + list(expanded_pointer())}. 
+key_dominates([{pointer, SSTPid, Slot, StartKey, all}|T1], KL2, Level) -> + key_dominates( + expand_list_by_pointer( + {pointer, SSTPid, Slot, StartKey, all}, T1, ?MERGE_SCANWIDTH), + KL2, + Level); +key_dominates([{next, ManEntry, StartKey}|T1], KL2, Level) -> + key_dominates( + expand_list_by_pointer( + {next, ManEntry, StartKey, all}, T1, ?MERGE_SCANWIDTH), + KL2, + Level); +key_dominates(KL1, [{pointer, SSTPid, Slot, StartKey, all}|T2], Level) -> + key_dominates( + KL1, + expand_list_by_pointer( + {pointer, SSTPid, Slot, StartKey, all}, T2, ?MERGE_SCANWIDTH), + Level); +key_dominates(KL1, [{next, ManEntry, StartKey}|T2], Level) -> + key_dominates( + KL1, + expand_list_by_pointer( + {next, ManEntry, StartKey, all}, T2, ?MERGE_SCANWIDTH), + Level); +key_dominates( + [{K1, _V1}|_T1]=Rest1, [{K2, V2}|Rest2], {false, _TS}) when K2 < K1 -> + {{next_key, {K2, V2}}, Rest1, Rest2}; +key_dominates( + [{K1, V1}|Rest1], [{K2, _V2}|_T2]=Rest2, {false, _TS}) when K1 < K2 -> + {{next_key, {K1, V1}}, Rest1, Rest2}; key_dominates(KL1, KL2, Level) -> - key_dominates_expanded(maybe_expand_pointer(KL1), - maybe_expand_pointer(KL2), - Level). 
- -key_dominates_expanded([H1|T1], [], Level) -> - case leveled_codec:maybe_reap_expiredkey(H1, Level) of - true -> - {skipped_key, T1, []}; - false -> - {{next_key, H1}, T1, []} - end; -key_dominates_expanded([], [H2|T2], Level) -> - case leveled_codec:maybe_reap_expiredkey(H2, Level) of - true -> - {skipped_key, [], T2}; - false -> - {{next_key, H2}, [], T2} - end; -key_dominates_expanded([H1|T1], [H2|T2], Level) -> - case leveled_codec:key_dominates(H1, H2) of - left_hand_first -> - case leveled_codec:maybe_reap_expiredkey(H1, Level) of + case key_dominates_expanded(KL1, KL2) of + {{next_key, NKV}, Rest1, Rest2} -> + case leveled_codec:maybe_reap_expiredkey(NKV, Level) of true -> - {skipped_key, T1, [H2|T2]}; + {skipped_key, Rest1, Rest2}; false -> - {{next_key, H1}, T1, [H2|T2]} + {{next_key, NKV}, Rest1, Rest2} end; - right_hand_first -> - case leveled_codec:maybe_reap_expiredkey(H2, Level) of - true -> - {skipped_key, [H1|T1], T2}; - false -> - {{next_key, H2}, [H1|T1], T2} - end; - left_hand_dominant -> + {skipped_key, Rest1, Rest2} -> + {skipped_key, Rest1, Rest2} + end. + +-spec key_dominates_expanded( + list(expanded_pointer()), list(expanded_pointer())) + -> {{next_key, leveled_codec:ledger_kv()}|skipped_key, + list(expanded_pointer()), + list(expanded_pointer())}. +key_dominates_expanded([H1|T1], []) -> + {{next_key, H1}, T1, []}; +key_dominates_expanded([], [H2|T2]) -> + {{next_key, H2}, [], T2}; +key_dominates_expanded([{K1, _V1}|_T1]=LHL, [{K2, V2}|T2]) when K2 < K1 -> + {{next_key, {K2, V2}}, LHL, T2}; +key_dominates_expanded([{K1, V1}|T1], [{K2, _V2}|_T2]=RHL) when K1 < K2 -> + {{next_key, {K1, V1}}, T1, RHL}; +key_dominates_expanded([H1|T1], [H2|T2]) -> + case leveled_codec:key_dominates(H1, H2) of + true -> {skipped_key, [H1|T1], T2}; - right_hand_dominant -> + false -> {skipped_key, T1, [H2|T2]} end. 
-%% When a list is provided it may include a pointer to gain another batch of -%% entries from the same file, or a new batch of entries from another file -%% -%% This resultant list should include the Tail of any pointers added at the -%% end of the list - -maybe_expand_pointer([]) -> - []; -maybe_expand_pointer([{pointer, SSTPid, Slot, StartKey, all}|Tail]) -> - expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, all}, - Tail, - ?MERGE_SCANWIDTH); -maybe_expand_pointer([{next, ManEntry, StartKey}|Tail]) -> - expand_list_by_pointer({next, ManEntry, StartKey, all}, - Tail, - ?MERGE_SCANWIDTH); -maybe_expand_pointer(List) -> - List. - - %%%============================================================================ %%% Timing Functions %%%============================================================================ --spec update_buildtimings( - erlang:timestamp(), build_timings(), atom()) -> build_timings(). +-spec update_buildtimings(build_timings(), atom()) -> build_timings(). %% @doc %% %% Timings taken from the build of a SST file. %% %% There is no sample window, but the no_timing status is still used for %% level zero files where we're not breaking down the build time in this way. -update_buildtimings(_SW, no_timing, _Stage) -> +update_buildtimings(no_timing, _Stage) -> no_timing; -update_buildtimings(SW, Timings, Stage) -> - Timer = timer:now_diff(os:timestamp(), SW), - case Stage of - slot_hashlist -> - HLT = Timings#build_timings.slot_hashlist + Timer, - Timings#build_timings{slot_hashlist = HLT}; - slot_serialise -> - SST = Timings#build_timings.slot_serialise + Timer, - Timings#build_timings{slot_serialise = SST}; - slot_finish -> - SFT = Timings#build_timings.slot_finish + Timer, - Timings#build_timings{slot_finish = SFT}; - fold_toslot -> - FST = Timings#build_timings.fold_toslot + Timer, - Timings#build_timings{fold_toslot = FST} - end. 
+update_buildtimings(Timings, Stage) -> + LastTS = Timings#build_timings.last_timestamp, + ThisTS = os:timestamp(), + Timer = timer:now_diff(ThisTS, LastTS), + NewTimings = + case Stage of + slot_hashlist -> + HLT = Timings#build_timings.slot_hashlist + Timer, + Timings#build_timings{slot_hashlist = HLT}; + slot_serialise -> + SST = Timings#build_timings.slot_serialise + Timer, + Timings#build_timings{slot_serialise = SST}; + slot_finish -> + SFT = Timings#build_timings.slot_finish + Timer, + Timings#build_timings{slot_finish = SFT}; + fold_toslot -> + FST = Timings#build_timings.fold_toslot + Timer, + Timings#build_timings{fold_toslot = FST} + end, + NewTimings#build_timings{last_timestamp = ThisTS}. -spec log_buildtimings(build_timings(), tuple()) -> ok. %% @doc %% %% Log out the time spent during the merge lists part of the SST build +log_buildtimings(no_timing, _LI) -> + ok; log_buildtimings(Timings, LI) -> leveled_log:log( sst13, @@ -3084,27 +3030,36 @@ maybelog_fetch_timing({Pid, _SlotFreq}, Level, Type, SW) -> -define(TEST_AREA, "test/test_area/"). --spec sst_getkvrange(pid(), - range_endpoint(), - range_endpoint(), - integer()) - -> list(leveled_codec:ledger_kv()|slot_pointer()). + +sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> + sst_getkvrange(Pid, StartKey, EndKey, ScanWidth, false, 0). + +-spec sst_getkvrange( + pid(), + range_endpoint(), + range_endpoint(), + integer(), + segment_check_fun(), + non_neg_integer()) -> list(leveled_codec:ledger_kv()|slot_pointer()). %% @doc %% Get a range of {Key, Value} pairs as a list between StartKey and EndKey %% (inclusive). The ScanWidth is the maximum size of the range, a pointer %% will be placed on the tail of the resulting list if results expand beyond %% the Scan Width -sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> - sst_getfilteredrange(Pid, StartKey, EndKey, ScanWidth, false, 0). - --spec sst_getslots(pid(), list(slot_pointer())) - -> list(leveled_codec:ledger_kv()). 
+sst_getkvrange(Pid, StartKey, EndKey, ScanWidth, SegChecker, LowLastMod) -> + [Pointer|MorePointers] = + sst_getfilteredrange(Pid, StartKey, EndKey, LowLastMod), + sst_expandpointer( + Pointer, MorePointers, ScanWidth, SegChecker, LowLastMod). + +-spec sst_getslots( + pid(), list(slot_pointer())) -> list(leveled_codec:ledger_kv()). %% @doc %% Get a list of slots by their ID. The slot will be converted from the binary %% to term form outside of the FSM loop, this is to stop the copying of the %% converted term to the calling process. sst_getslots(Pid, SlotList) -> - sst_getfilteredslots(Pid, SlotList, false, 0). + sst_getfilteredslots(Pid, SlotList, false, 0, []). testsst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod) -> OptsSST = @@ -3266,7 +3221,8 @@ indexed_list_test() -> SW0 = os:timestamp(), {{_PosBinIndex1, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, KVL1, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, KVL1}, native, ?INDEX_MODDATE, no_timing), io:format(user, "Indexed list created slot in ~w microseconds of size ~w~n", [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]), @@ -3295,7 +3251,8 @@ indexed_list_mixedkeys_test() -> Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), {{_PosBinIndex1, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, ?INDEX_MODDATE, no_timing), {TestK1, TestV1} = lists:nth(4, KVL1), MH1 = leveled_codec:segment_hash(TestK1), @@ -3322,7 +3279,8 @@ indexed_list_mixedkeys2_test() -> % this isn't actually ordered correctly Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2, {{_Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, ?INDEX_MODDATE, no_timing), lists:foreach(fun({K, V}) -> MH = leveled_codec:segment_hash(K), test_binary_slot(FullBin, 
K, MH, {K, V}) @@ -3333,9 +3291,11 @@ indexed_list_allindexkeys_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), ?LOOK_SLOTSIZE), {{HeaderT, FullBinT, HL, LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, true, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, true, no_timing), {{HeaderF, FullBinF, HL, LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, false, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, false, no_timing), EmptySlotSize = ?LOOK_SLOTSIZE - 1, LMD = ?FLIPPER32, ?assertMatch(<<_BL:20/binary, LMD:32/integer, EmptySlotSize:8/integer>>, @@ -3348,92 +3308,92 @@ indexed_list_allindexkeys_test() -> % io:format(user, % "Indexed list flattened in ~w microseconds ~n", % [timer:now_diff(os:timestamp(), SW)]), + io:format("BinToListT ~p~n", [BinToListT]), ?assertMatch(Keys, BinToListT), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBinT, - all, all, - native, - true, - false)), + ?assertMatch( + {Keys, none}, + binaryslot_trimmed( + FullBinT, all, all, native, true, false)), ?assertMatch(Keys, BinToListF), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBinF, - all, all, - native, - false, - false)). + ?assertMatch( + {Keys, none}, + binaryslot_trimmed( + FullBinF, all, all, native, false, false)). 
indexed_list_allindexkeys_nolookup_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(1000)), ?NOLOOK_SLOTSIZE), {{Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(no_lookup, Keys, native, ?INDEX_MODDATE,no_timing), + generate_binary_slot( + no_lookup, {forward, Keys}, native, ?INDEX_MODDATE,no_timing), ?assertMatch(<<_BL:20/binary, _LMD:32/integer, 127:8/integer>>, Header), % SW = os:timestamp(), - BinToList = binaryslot_tolist(FullBin, native, ?INDEX_MODDATE), + BinToList = + binaryslot_tolist(FullBin, native, ?INDEX_MODDATE), % io:format(user, % "Indexed list flattened in ~w microseconds ~n", % [timer:now_diff(os:timestamp(), SW)]), ?assertMatch(Keys, BinToList), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBin, - all, all, - native, - ?INDEX_MODDATE, - false)). + ?assertMatch( + {Keys, none}, + binaryslot_trimmed(FullBin, all, all, native, ?INDEX_MODDATE, false)). indexed_list_allindexkeys_trimmed_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), ?LOOK_SLOTSIZE), {{Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE,no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, ?INDEX_MODDATE, no_timing), EmptySlotSize = ?LOOK_SLOTSIZE - 1, - ?assertMatch(<<_BL:20/binary, _LMD:32/integer, EmptySlotSize:8/integer>>, - Header), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBin, - {i, - "Bucket", - {"t1_int", 0}, - null}, - {i, - "Bucket", - {"t1_int", 99999}, - null}, - native, - ?INDEX_MODDATE, - false)), + ?assertMatch( + <<_BL:20/binary, _LMD:32/integer, EmptySlotSize:8/integer>>, + Header), + ?assertMatch( + {Keys, none}, + binaryslot_trimmed( + FullBin, + {i, "Bucket", {"t1_int", 0}, null}, + {i, "Bucket", {"t1_int", 99999}, null}, + native, + ?INDEX_MODDATE, + false)), {SK1, _} = lists:nth(10, Keys), {EK1, _} = lists:nth(100, Keys), R1 = lists:sublist(Keys, 10, 91), - {O1, none} = binaryslot_trimmedlist(FullBin, SK1, EK1, - 
native, ?INDEX_MODDATE, false), + {O1, none} = + binaryslot_trimmed( + FullBin, SK1, EK1, native, ?INDEX_MODDATE, false), ?assertMatch(91, length(O1)), ?assertMatch(R1, O1), {SK2, _} = lists:nth(10, Keys), {EK2, _} = lists:nth(20, Keys), R2 = lists:sublist(Keys, 10, 11), - {O2, none} = binaryslot_trimmedlist(FullBin, SK2, EK2, - native, ?INDEX_MODDATE, false), + {O2, none} = + binaryslot_trimmed(FullBin, SK2, EK2, native, ?INDEX_MODDATE, false), ?assertMatch(11, length(O2)), ?assertMatch(R2, O2), {SK3, _} = lists:nth(?LOOK_SLOTSIZE - 1, Keys), {EK3, _} = lists:nth(?LOOK_SLOTSIZE, Keys), R3 = lists:sublist(Keys, ?LOOK_SLOTSIZE - 1, 2), - {O3, none} = binaryslot_trimmedlist(FullBin, SK3, EK3, - native, ?INDEX_MODDATE, false), + {O3, none} = + binaryslot_trimmed(FullBin, SK3, EK3, native, ?INDEX_MODDATE, false), ?assertMatch(2, length(O3)), ?assertMatch(R3, O3). findposfrag_test() -> - ?assertMatch([], find_pos(<<128:8/integer>>, 1, [], 0)). + ?assertMatch([], find_pos(<<128:8/integer>>, segment_checker(1))). 
indexed_list_mixedkeys_bitflip_test() -> KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)), KVL1 = lists:sublist(KVL0, 33), Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), {{Header, SlotBin, _HL, LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, ?INDEX_MODDATE, no_timing), ?assertMatch(LK, element(1, lists:last(Keys))), @@ -3452,11 +3412,12 @@ indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin, TestKey1, MH1, lists:nth(1, KVL1)), test_binary_slot(SlotBin, TestKey2, MH2, lists:nth(33, KVL1)), - ToList = binaryslot_tolist(SlotBin, native, ?INDEX_MODDATE), + ToList = + binaryslot_tolist(SlotBin, native, ?INDEX_MODDATE), ?assertMatch(Keys, ToList), - [Pos1] = find_pos(PosBin, extract_hash(MH1), [], 0), - [Pos2] = find_pos(PosBin, extract_hash(MH2), [], 0), + [Pos1] = find_pos(PosBin, segment_checker(extract_hash(MH1))), + [Pos2] = find_pos(PosBin, segment_checker(extract_hash(MH2))), {BN1, _BP1} = revert_position(Pos1), {BN2, _BP2} = revert_position(Pos2), {Offset1, Length1} = block_offsetandlength(Header, BN1), @@ -3471,8 +3432,10 @@ indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin1, TestKey1, MH1, not_present), test_binary_slot(SlotBin2, TestKey2, MH2, not_present), - ToList1 = binaryslot_tolist(SlotBin1, native, ?INDEX_MODDATE), - ToList2 = binaryslot_tolist(SlotBin2, native, ?INDEX_MODDATE), + ToList1 = + binaryslot_tolist(SlotBin1, native, ?INDEX_MODDATE), + ToList2 = + binaryslot_tolist(SlotBin2, native, ?INDEX_MODDATE), ?assertMatch(true, is_list(ToList1)), ?assertMatch(true, is_list(ToList2)), @@ -3485,8 +3448,8 @@ indexed_list_mixedkeys_bitflip_test() -> {SK1, _} = lists:nth(10, Keys), {EK1, _} = lists:nth(20, Keys), - {O1, none} = binaryslot_trimmedlist(SlotBin3, SK1, EK1, - native, ?INDEX_MODDATE, false), + {O1, none} = + binaryslot_trimmed(SlotBin3, SK1, EK1, native, ?INDEX_MODDATE, false), ?assertMatch([], 
O1), SlotBin4 = flip_byte(SlotBin, 0, 20), @@ -3494,14 +3457,16 @@ indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin4, TestKey1, MH1, not_present), test_binary_slot(SlotBin5, TestKey1, MH1, not_present), - ToList4 = binaryslot_tolist(SlotBin4, native, ?INDEX_MODDATE), - ToList5 = binaryslot_tolist(SlotBin5, native, ?INDEX_MODDATE), + ToList4 = + binaryslot_tolist(SlotBin4, native, ?INDEX_MODDATE), + ToList5 = + binaryslot_tolist(SlotBin5, native, ?INDEX_MODDATE), ?assertMatch([], ToList4), ?assertMatch([], ToList5), - {O4, none} = binaryslot_trimmedlist(SlotBin4, SK1, EK1, - native, ?INDEX_MODDATE, false), - {O5, none} = binaryslot_trimmedlist(SlotBin4, SK1, EK1, - native, ?INDEX_MODDATE, false), + {O4, none} = + binaryslot_trimmed(SlotBin4, SK1, EK1, native, ?INDEX_MODDATE, false), + {O5, none} = + binaryslot_trimmed(SlotBin4, SK1, EK1, native, ?INDEX_MODDATE, false), ?assertMatch([], O4), ?assertMatch([], O5). @@ -3716,11 +3681,12 @@ simple_persisted_rangesegfilter_tester(SSTNewFun) -> SegList = lists:map(GetSegFun, [SK1, SK2, SK3, SK4, SK5, EK1, EK2, EK3, EK4, EK5]), + SegChecker = segment_checker(tune_seglist(SegList)), TestFun = fun(StartKey, EndKey, OutList) -> RangeKVL = - sst_getfilteredrange(Pid, StartKey, EndKey, 4, SegList, 0), + sst_getkvrange(Pid, StartKey, EndKey, 4, SegChecker, 0), RangeKL = lists:map(fun({LK0, _LV0}) -> LK0 end, RangeKVL), ?assertMatch(true, lists:member(StartKey, RangeKL)), ?assertMatch(true, lists:member(EndKey, RangeKL)), @@ -4196,7 +4162,8 @@ hashmatching_bytreesize_test() -> end, KVL = lists:map(GenKeyFun, lists:seq(1, 128)), {{PosBinIndex1, _FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, KVL, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, KVL}, native, ?INDEX_MODDATE, no_timing), check_segment_match(PosBinIndex1, KVL, small), check_segment_match(PosBinIndex1, KVL, medium). 
@@ -4208,8 +4175,8 @@ check_segment_match(PosBinIndex1, KVL, TreeSize) -> leveled_tictac:get_segment( leveled_tictac:keyto_segment32(<>), TreeSize), - SegList0 = tune_seglist([Seg]), - PosList = find_pos(PosBinIndex1, SegList0, [], 0), + SegChecker = segment_checker(tune_seglist([Seg])), + PosList = find_pos(PosBinIndex1, SegChecker), ?assertMatch(true, length(PosList) >= 1) end, lists:foreach(CheckFun, KVL). @@ -4271,21 +4238,13 @@ corrupted_block_rangetester(PressMethod, TestCount) -> CheckFun = fun({SK, EK}) -> - InputBlocks = + [CB1, CB2, CBMid, CB4, CB5] = lists:map(CorruptBlockFun, [B1, B2, MidBlock, B4, B5]), - BR = blocks_required({SK, EK}, InputBlocks, PressMethod), - ?assertMatch(true, length(BR) =< 5), - BlockListFun = - fun(B) -> - case is_binary(B) of - true -> - deserialise_block(B, PressMethod); - false -> - B - end - end, - BRL = lists:flatten(lists:map(BlockListFun, BR)), - lists:foreach(fun({_K, _V}) -> ok end, BRL) + BR = + blocks_required( + {SK, EK}, CB1, CB2, CBMid, CB4, CB5, PressMethod), + ?assertMatch(true, length(BR) =< 100), + lists:foreach(fun({_K, _V}) -> ok end, BR) end, lists:foreach(CheckFun, RandomRanges). 
@@ -4299,7 +4258,8 @@ corrupted_block_fetch_tester(PressMethod) -> KVL1 = lists:ukeysort(1, generate_randomkeys(1, KC, 1, 2)), {{Header, SlotBin, _HashL, _LastKey}, _BT} = - generate_binary_slot(lookup, KVL1, PressMethod, false, no_timing), + generate_binary_slot( + lookup, {forward, KVL1}, PressMethod, false, no_timing), < HeaderTS = <<0:160/integer, Now:32/integer, 0:32/integer>>, HeaderNoTS = <<0:192>>, BIC = new_blockindex_cache(8), - {_, BIC0, undefined} = - update_blockindex_cache(false, EntriesNoTS, BIC, undefined, false), - {_, BIC1, undefined} = - update_blockindex_cache(false, EntriesTS, BIC, undefined, true), {_, BIC2, undefined} = - update_blockindex_cache(true, EntriesNoTS, BIC, undefined, false), + update_blockindex_cache(EntriesNoTS, BIC, undefined, false), {ETSP1, ETSP2} = lists:split(6, EntriesTS), {_, BIC3, undefined} = - update_blockindex_cache(true, ETSP1, BIC, undefined, true), + update_blockindex_cache(ETSP1, BIC, undefined, true), {_, BIC3, undefined} = - update_blockindex_cache(true, ETSP1, BIC3, undefined, true), + update_blockindex_cache(ETSP1, BIC3, undefined, true), {_, BIC4, LMD4} = - update_blockindex_cache(true, ETSP2, BIC3, undefined, true), + update_blockindex_cache(ETSP2, BIC3, undefined, true), {_, BIC4, LMD4} = - update_blockindex_cache(true, ETSP2, BIC4, LMD4, true), - - ?assertMatch(none, array:get(0, element(2, BIC0))), - ?assertMatch(none, array:get(0, element(2, BIC1))), + update_blockindex_cache(ETSP2, BIC4, LMD4, true), ?assertMatch(HeaderNoTS, array:get(0, element(2, BIC2))), ?assertMatch(HeaderTS, array:get(0, element(2, BIC3))), ?assertMatch(HeaderTS, array:get(0, element(2, BIC4))), @@ -5088,4 +5041,108 @@ start_sst_fun(ProcessToInform) -> ProcessToInform ! {sst_pid, P1}. --endif. 
+blocks_required_test() -> + B = <<"Bucket">>, + Idx = <<"idx_bin">>, + Chunk = leveled_rand:rand_bytes(32), + KeyFun = + fun(I) -> + list_to_binary(io_lib:format("B~6..0B", [I])) + end, + IdxKey = + fun(I) -> + {?IDX_TAG, B, {Idx, KeyFun(I)}, KeyFun(I)} + end, + StdKey = + fun(I) -> {?STD_TAG, B, KeyFun(I), null} end, + MetaValue = + fun(I) -> + element( + 3, + leveled_codec:generate_ledgerkv( + StdKey(I), I, Chunk, 32, infinity)) + end, + IdxValue = + fun(I) -> + element( + 3, + leveled_codec:generate_ledgerkv( + IdxKey(I), I, null, 0, infinity)) + end, + Block1L = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(1, 16)), + Block2L = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(17, 32)), + MidBlockL = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(33, 48)), + Block4L = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(49, 64)), + Block5L = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(65, 70)) + ++ + lists:map(fun(I) -> {StdKey(I), MetaValue(I)} end, lists:seq(1, 8)), + B1 = serialise_block(Block1L, native), + B2 = serialise_block(Block2L, native), + B3 = serialise_block(MidBlockL, native), + B4 = serialise_block(Block4L, native), + B5 = serialise_block(Block5L, native), + Empty = serialise_block([], native), + + TestFun = + fun(SK, EK, Exp) -> + KVL = blocks_required({SK, EK}, B1, B2, B3, B4, B5, native), + io:format( + "Length KVL ~w First ~p Last ~p~n", + [length(KVL), hd(KVL), lists:last(KVL)]), + ?assert(length(KVL) == Exp) + end, + + TestFun( + {?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}, + 68 + ), + TestFun( + {?IDX_TAG, B, {Idx, KeyFun(35)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}, + 36 + ), + TestFun( + {?IDX_TAG, B, {Idx, KeyFun(68)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}, + 3 + ), + KVL1 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, B2, Empty, B4, B5, native), + 
?assertMatch(52, length(KVL1)), + KVL2 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, B2, Empty, Empty, Empty, native), + ?assertMatch(30, length(KVL2)), + KVL3 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, Empty, Empty, Empty, Empty, native), + ?assertMatch(14, length(KVL3)), + KVL4 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, Empty, B3, B4, B5, native), + ?assertMatch(52, length(KVL4)), + KVL5 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, B2, B3, Empty, B5, native), + ?assertMatch(52, length(KVL5)) + . + + +-endif. \ No newline at end of file diff --git a/src/leveled_util.erl b/src/leveled_util.erl index 247f047b..8a2162c3 100644 --- a/src/leveled_util.erl +++ b/src/leveled_util.erl @@ -5,8 +5,6 @@ -module(leveled_util). --include("include/leveled.hrl"). - -export([generate_uuid/0, integer_now/0, integer_time/1, @@ -42,7 +40,7 @@ integer_time(TS) -> calendar:datetime_to_gregorian_seconds(DT). --spec magic_hash(any()) -> integer(). +-spec magic_hash(any()) -> 0..16#FFFFFFFF. %% @doc %% Use DJ Bernstein magic hash function. Note, this is more expensive than %% phash2 but provides a much more balanced result. @@ -52,7 +50,7 @@ integer_time(TS) -> %% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function magic_hash({binary, BinaryKey}) -> H = 5381, - hash1(H, BinaryKey) band 16#FFFFFFFF; + hash1(H, BinaryKey); magic_hash(AnyKey) -> BK = t2b(AnyKey), magic_hash({binary, BK}). @@ -60,7 +58,7 @@ magic_hash(AnyKey) -> hash1(H, <<>>) -> H; hash1(H, <>) -> - H1 = H * 33, + H1 = (H * 33) band 16#FFFFFFFF, H2 = H1 bxor B, hash1(H2, Rest). 
diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index bd41b2d2..217d209f 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -34,8 +34,8 @@ expiring_indexes(_Config) -> % before). Confirm that replacing an object has the expected outcome, if % the IndexSpecs are updated as part of the request. KeyCount = 50000, - Future = 60, - % 1 minute - if running tests on a slow machine, may need to increase + Future = 120, + % 2 minutes - if running tests on a slow machine, may need to increase % this value RootPath = testutil:reset_filestructure(), StartOpts1 = @@ -44,13 +44,30 @@ expiring_indexes(_Config) -> {max_journalobjectcount, 30000}, {sync_strategy, testutil:sync_strategy()}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), - + SW1 = os:timestamp(), + timer:sleep(1000), + + V9 = testutil:get_compressiblevalue(), + Indexes9 = testutil:get_randomindexes_generator(2), + TempRiakObjects = + testutil:generate_objects( + KeyCount, binary_uuid, [], V9, Indexes9, "riakBucket"), + IBKL1 = testutil:stdload_expiring(Bookie1, KeyCount, Future), + lists:foreach( + fun({_RN, Obj, Spc}) -> + testutil:book_tempriakput( + Bookie1, Obj, Spc, leveled_util:integer_now() + Future) + end, + TempRiakObjects + ), timer:sleep(1000), % Wait a second after last key so that none loaded in the last second LoadTime = timer:now_diff(os:timestamp(), SW1)/1000000, io:format("Load of ~w std objects in ~w seconds~n", [KeyCount, LoadTime]), + + timer:sleep(1000), SW2 = os:timestamp(), FilterFun = fun({I, _B, _K}) -> lists:member(I, [5, 6, 7, 8]) end, @@ -76,6 +93,25 @@ expiring_indexes(_Config) -> {async, I0Counter1} = CountI0Fold(), I0Count1 = I0Counter1(), + HeadFold = + fun(LowTS, HighTS) -> + leveled_bookie:book_headfold( + Bookie1, + ?RIAK_TAG, + {range, <<"riakBucket">>, all}, + {fun(_B, _K, _V, Acc) -> Acc + 1 end, 0}, + false, true, false, + {testutil:convert_to_seconds(LowTS), + 
testutil:convert_to_seconds(HighTS)}, + false + ) + end, + {async, HeadCount0Fun} = HeadFold(SW1, SW2), + {async, HeadCount1Fun} = HeadFold(SW2, os:timestamp()), + HeadCounts = {HeadCount0Fun(), HeadCount1Fun()}, + io:format("HeadCounts ~w before expiry~n", [HeadCounts]), + {KeyCount, 0} = HeadCounts, + FoldFun = fun(BF, {IdxV, KeyF}, Acc) -> [{IdxV, BF, KeyF}|Acc] end, InitAcc = [], IndexFold = @@ -145,6 +181,12 @@ expiring_indexes(_Config) -> true = QR4 == [], true = QR5 == [], + {async, HeadCount0ExpFun} = HeadFold(SW1, SW2), + {async, HeadCount1ExpFun} = HeadFold(SW2, os:timestamp()), + HeadCountsExp = {HeadCount0ExpFun(), HeadCount1ExpFun()}, + io:format("HeadCounts ~w after expiry~n", [HeadCountsExp]), + {0, 0} = HeadCountsExp, + ok = leveled_bookie:book_close(Bookie1), testutil:reset_filestructure(). @@ -379,12 +421,14 @@ single_object_with2i(_Config) -> %% @TODO replace all index queries with new Top-Level API if tests %% pass - {async, IdxFolder1} = leveled_bookie:book_indexfold(Bookie1, - "Bucket1", - {fun testutil:foldkeysfun/3, []}, - {list_to_binary("binary_bin"), - <<99:32/integer>>, <<101:32/integer>>}, - {true, undefined}), + {async, IdxFolder1} = + leveled_bookie:book_indexfold( + Bookie1, + "Bucket1", + {fun testutil:foldkeysfun/3, []}, + {list_to_binary("binary_bin"), + <<99:32/integer>>, <<101:32/integer>>}, + {true, undefined}), R1 = IdxFolder1(), io:format("R1 of ~w~n", [R1]), true = [{<<100:32/integer>>,"Key1"}] == R1, diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl index f8d786a1..2c8dfc7e 100644 --- a/test/end_to_end/testutil.erl +++ b/test/end_to_end/testutil.erl @@ -3,6 +3,7 @@ -include("../include/leveled.hrl"). -export([book_riakput/3, + book_tempriakput/4, book_riakdelete/4, book_riakget/3, book_riakhead/3, @@ -182,6 +183,16 @@ book_riakput(Pid, RiakObject, IndexSpecs) -> IndexSpecs, ?RIAK_TAG). 
+book_tempriakput(Pid, RiakObject, IndexSpecs, TTL) -> + leveled_bookie:book_tempput( + Pid, + RiakObject#r_object.bucket, + RiakObject#r_object.key, + to_binary(v1, RiakObject), + IndexSpecs, + ?RIAK_TAG, + TTL). + book_riakdelete(Pid, Bucket, Key, IndexSpecs) -> leveled_bookie:book_put(Pid, Bucket, Key, delete, IndexSpecs, ?RIAK_TAG).