diff --git a/include/leveled.hrl b/include/leveled.hrl index 8f79da6e..55b82816 100644 --- a/include/leveled.hrl +++ b/include/leveled.hrl @@ -84,7 +84,7 @@ end_key :: tuple() | undefined, owner :: pid()|list(), filename :: string() | undefined, - bloom :: binary() | none | undefined}). + bloom = none :: leveled_ebloom:bloom() | none}). -record(cdb_options, {max_size :: pos_integer() | undefined, diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 462a3956..1c92344b 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -18,7 +18,6 @@ strip_to_keyseqonly/1, strip_to_indexdetails/1, striphead_to_v1details/1, - is_active/3, endkey_passed/2, key_dominates/2, maybe_reap_expiredkey/2, @@ -48,7 +47,10 @@ to_lookup/1, next_key/1, return_proxy/4, - get_metadata/1]). + get_metadata/1, + maybe_accumulate/5, + accumulate_index/2, + count_tombs/2]). -define(LMD_FORMAT, "~4..0w~2..0w~2..0w~2..0w~2..0w"). -define(NRT_IDX, "$aae."). @@ -251,22 +253,79 @@ striphead_to_v1details(V) -> get_metadata(LV) -> element(4, LV). --spec key_dominates(ledger_kv(), ledger_kv()) -> - left_hand_first|right_hand_first|left_hand_dominant|right_hand_dominant. +-spec maybe_accumulate( + list(leveled_codec:ledger_kv()), + term(), + non_neg_integer(), + {pos_integer(), {non_neg_integer(), non_neg_integer()|infinity}}, + leveled_penciller:pclacc_fun()) + -> {term(), non_neg_integer()}. +%% @doc +%% Make an accumulation decision based on the date range and also the expiry +%% status of the ledger key and value. Needs to handle v1 and v2 values. When +%% folding over heads -> v2 values, index-keys -> v1 values. 
+maybe_accumulate([], Acc, Count, _Filter, _Fun) -> + {Acc, Count}; +maybe_accumulate( + [{K, {_SQN, {active, TS}, _SH, _MD, undefined}=V}|T], + Acc, Count, {Now, _ModRange}=Filter, AccFun) + when TS >= Now -> + maybe_accumulate(T, AccFun(K, V, Acc), Count + 1, Filter, AccFun); +maybe_accumulate( + [{K, {_SQN, {active, TS}, _SH, _MD}=V}|T], + Acc, Count, {Now, _ModRange}=Filter, AccFun) + when TS >= Now -> + maybe_accumulate(T, AccFun(K, V, Acc), Count + 1, Filter, AccFun); +maybe_accumulate( + [{_K, {_SQN, tomb, _SH, _MD, _LMD}}|T], + Acc, Count, Filter, AccFun) -> + maybe_accumulate(T, Acc, Count, Filter, AccFun); +maybe_accumulate( + [{_K, {_SQN, tomb, _SH, _MD}}|T], + Acc, Count, Filter, AccFun) -> + maybe_accumulate(T, Acc, Count, Filter, AccFun); +maybe_accumulate( + [{K, {_SQN, {active, TS}, _SH, _MD, LMD}=V}|T], + Acc, Count, {Now, {LowDate, HighDate}}=Filter, AccFun) + when TS >= Now, LMD >= LowDate, LMD =< HighDate -> + maybe_accumulate(T, AccFun(K, V, Acc), Count + 1, Filter, AccFun); +maybe_accumulate( + [_LV|T], + Acc, Count, Filter, AccFun) -> + maybe_accumulate(T, Acc, Count, Filter, AccFun). + +-spec accumulate_index( + {boolean(), undefined|leveled_runner:mp()}, leveled_runner:acc_fun()) + -> any(). +accumulate_index({false, undefined}, FoldKeysFun) -> + fun({?IDX_TAG, Bucket, _IndexInfo, ObjKey}, _Value, Acc) -> + FoldKeysFun(Bucket, ObjKey, Acc) + end; +accumulate_index({true, undefined}, FoldKeysFun) -> + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc) + end; +accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + case re:run(IdxValue, TermRegex) of + nomatch -> + Acc; + _ -> + case AddTerm of + true -> + FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc); + false -> + FoldKeysFun(Bucket, ObjKey, Acc) + end + end + end. + +-spec key_dominates(ledger_kv(), ledger_kv()) -> boolean(). 
%% @doc %% When comparing two keys in the ledger need to find if one key comes before %% the other, or if the match, which key is "better" and should be the winner -key_dominates({LK, _LVAL}, {RK, _RVAL}) when LK < RK -> - left_hand_first; -key_dominates({LK, _LVAL}, {RK, _RVAL}) when RK < LK -> - right_hand_first; key_dominates(LObj, RObj) -> - case strip_to_seqonly(LObj) >= strip_to_seqonly(RObj) of - true -> - left_hand_dominant; - false -> - right_hand_dominant - end. + strip_to_seqonly(LObj) >= strip_to_seqonly(RObj). -spec maybe_reap_expiredkey(ledger_kv(), {boolean(), integer()}) -> boolean(). %% @doc @@ -286,20 +345,18 @@ maybe_reap(tomb, {true, _CurrTS}) -> maybe_reap(_, _) -> false. --spec is_active(ledger_key(), ledger_value(), non_neg_integer()) -> boolean(). -%% @doc -%% Is this an active KV pair or has the timestamp expired -is_active(Key, Value, Now) -> - case strip_to_statusonly({Key, Value}) of - {active, infinity} -> - true; - tomb -> - false; - {active, TS} when TS >= Now -> - true; - {active, _TS} -> - false - end. +-spec count_tombs( + list(ledger_kv()), non_neg_integer()|not_counted) -> + non_neg_integer()|not_counted. +count_tombs(_List, not_counted) -> + not_counted; +count_tombs([], Count) -> + Count; +count_tombs([{_K, V}|T], Count) when element(2, V) == tomb -> + count_tombs(T, Count + 1); +count_tombs([_KV|T], Count) -> + count_tombs(T, Count). + -spec from_ledgerkey(atom(), tuple()) -> false|tuple(). %% @doc diff --git a/src/leveled_ebloom.erl b/src/leveled_ebloom.erl index f5050edc..17cb9384 100644 --- a/src/leveled_ebloom.erl +++ b/src/leveled_ebloom.erl @@ -1,23 +1,36 @@ %% -------- TinyBloom --------- %% -%% A fixed size bloom that supports 32K keys only, made to try and minimise -%% the cost of producing the bloom -%% - +%% A 1-byte per key bloom filter with a 5% fpr. 
Pre-prepared segment hashes +%% (a leveled codec type) are used for building and checking - the filter +%% splits a single hash into a 1 byte slot identifier, and 2 x 12 bit hashes +%% (so k=2, although only a single hash is used). +%% +%% The filter is designed to support a maximum of 64K keys; larger numbers of +%% keys will see higher fprs - with a 40% fpr at 250K keys. +%% +%% The filter uses the second "Extra Hash" part of the segment-hash to ensure +%% no overlap of fpr with the leveled_sst find_pos function. +%% +%% The completed bloom is a binary - to minimise the cost of copying between +%% processes and holding in memory. -module(leveled_ebloom). --include("include/leveled.hrl"). - -export([ create_bloom/1, check_hash/2 ]). --define(BLOOM_SIZE_BYTES, 512). --define(INTEGER_SIZE, 4096). --define(BAND_MASK, ?INTEGER_SIZE - 1). - +-define(BLOOM_SLOTSIZE_BYTES, 512). +-define(INTEGER_SLICE_SIZE, 64). +-define(INTEGER_SLICES, 64). + % i.e. ?INTEGER_SLICES * ?INTEGER_SLICE_SIZE = ?BLOOM_SLOTSIZE_BYTES * 8 +-define(MASK_BSR, 6). + % i.e. 2 ^ (12 - 6) = ?INTEGER_SLICES +-define(MASK_BAND, 63). + % i.e. integer slice size - 1 +-define(SPLIT_BAND, 4095). + % i.e. (?BLOOM_SLOTSIZE_BYTES * 8) - 1 -type bloom() :: binary(). @@ -29,64 +42,39 @@ -spec create_bloom(list(leveled_codec:segment_hash())) -> bloom(). %% @doc -%% Create a binary bloom filter from a list of hashes +%% Create a binary bloom filter from a list of hashes. 
In the leveled +%% implementation the hashes are leveled_codec:segment_hash/0 type, but only +%% a single 32-bit hash (the second element of the tuple) is actually used in +%% the building of the bloom filter. create_bloom(HashList) -> - case length(HashList) of - 0 -> - <<>>; - L when L > 32768 -> - {HL0, HL1} = - lists:partition(fun({_, Hash}) -> Hash band 32 == 0 end, - HashList), - Bin1 = - add_hashlist(HL0, - 32, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0), - Bin2 = - add_hashlist(HL1, - 32, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0), - <>; - L when L > 16384 -> - add_hashlist(HashList, - 32, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0); - L when L > 4096 -> - add_hashlist(HashList, - 16, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0); - L when L > 2048 -> - add_hashlist(HashList, 4, 0, 0, 0, 0); - _ -> - add_hashlist(HashList, 2, 0, 0) - end. - + SlotCount = + case length(HashList) of + 0 -> + 0; + L -> + min(128, max(2, (L - 1) div 512)) + end, + SlotHashes = + map_hashes( + HashList, + list_to_tuple(lists:duplicate(SlotCount, [])), + SlotCount + ), + build_bloom(SlotHashes, SlotCount). -spec check_hash(leveled_codec:segment_hash(), bloom()) -> boolean(). %% @doc -%% Check for the presence of a given hash within a bloom +%% Check for the presence of a given hash within a bloom. Only the second +%% element of the leveled_codec:segment_hash/0 type is used - a 32-bit hash. 
check_hash(_Hash, <<>>) -> false; -check_hash({_SegHash, Hash}, BloomBin) -> - SlotSplit = byte_size(BloomBin) div ?BLOOM_SIZE_BYTES, - {Slot, Hashes} = split_hash(Hash, SlotSplit), - Mask = get_mask(Hashes), - Pos = Slot * ?BLOOM_SIZE_BYTES, - IntSize = ?INTEGER_SIZE, - <<_H:Pos/binary, CheckInt:IntSize/integer, _T/binary>> = BloomBin, - case CheckInt band Mask of - Mask -> - true; +check_hash({_SegHash, Hash}, BloomBin) when is_binary(BloomBin)-> + SlotSplit = byte_size(BloomBin) div ?BLOOM_SLOTSIZE_BYTES, + {Slot, [H0, H1]} = split_hash(Hash, SlotSplit), + Pos = ((Slot + 1) * ?BLOOM_SLOTSIZE_BYTES) - 1, + case match_hash(BloomBin, Pos - (H0 div 8), H0 rem 8) of + true -> + match_hash(BloomBin, Pos - (H1 div 8), H1 rem 8); _ -> false end. @@ -95,408 +83,78 @@ check_hash({_SegHash, Hash}, BloomBin) -> %%% Internal Functions %%%============================================================================ +-type slot_count() :: 0|2..128. +-type bloom_hash() :: 0..16#FFF. +-type external_hash() :: 0..16#FFFFFFFF. + +-spec map_hashes( + list(leveled_codec:segment_hash()), tuple(), slot_count()) -> tuple(). +map_hashes([], HashListTuple, _SlotCount) -> + HashListTuple; +map_hashes([Hash|Rest], HashListTuple, SlotCount) -> + {Slot, Hashes} = split_hash(element(2, Hash), SlotCount), + SlotHL = element(Slot + 1, HashListTuple), + map_hashes( + Rest, + setelement(Slot + 1, HashListTuple, Hashes ++ SlotHL), + SlotCount). + +-spec split_hash(external_hash(), slot_count()) + -> {non_neg_integer(), [bloom_hash()]}. split_hash(Hash, SlotSplit) -> Slot = (Hash band 255) rem SlotSplit, - H0 = (Hash bsr 8) band (?BAND_MASK), - H1 = (Hash bsr 20) band (?BAND_MASK), + H0 = (Hash bsr 8) band ?SPLIT_BAND, + H1 = (Hash bsr 20) band ?SPLIT_BAND, {Slot, [H0, H1]}. -get_mask([H0, H1]) -> - (1 bsl H0) bor (1 bsl H1). 
- - -%% This looks ugly and clunky, but in tests it was quicker than modifying an -%% Erlang term like an array as it is passed around the loop - -add_hashlist([], _S, S0, S1) -> - IntSize = ?INTEGER_SIZE, - <>; -add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1) -> - {Slot, Hashes} = split_hash(TopHash, SlotSplit), - Mask = get_mask(Hashes), - case Slot of - 0 -> - add_hashlist(T, SlotSplit, S0 bor Mask, S1); - 1 -> - add_hashlist(T, SlotSplit, S0, S1 bor Mask) - end. - -add_hashlist([], _S, S0, S1, S2, S3) -> - IntSize = ?INTEGER_SIZE, - <>; -add_hashlist([{_SegHash, TopHash}|T], SlotSplit, S0, S1, S2, S3) -> - {Slot, Hashes} = split_hash(TopHash, SlotSplit), - Mask = get_mask(Hashes), - case Slot of - 0 -> - add_hashlist(T, SlotSplit, S0 bor Mask, S1, S2, S3); - 1 -> - add_hashlist(T, SlotSplit, S0, S1 bor Mask, S2, S3); - 2 -> - add_hashlist(T, SlotSplit, S0, S1, S2 bor Mask, S3); - 3 -> - add_hashlist(T, SlotSplit, S0, S1, S2, S3 bor Mask) - end. - -add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15) -> - IntSize = ?INTEGER_SIZE, - <>; -add_hashlist([{_SegHash, TopHash}|T], - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15) -> - {Slot, Hashes} = split_hash(TopHash, SlotSplit), - Mask = get_mask(Hashes), - case Slot of - 0 -> - add_hashlist(T, - SlotSplit, - S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 1 -> - add_hashlist(T, - SlotSplit, - S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 2 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 3 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 4 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 5 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5 
bor Mask, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 6 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, - S8, S9, S10, S11, S12, S13, S14, S15); - 7 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, - S8, S9, S10, S11, S12, S13, S14, S15); - 8 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8 bor Mask, S9, S10, S11, S12, S13, S14, S15); - 9 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9 bor Mask, S10, S11, S12, S13, S14, S15); - 10 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10 bor Mask, S11, S12, S13, S14, S15); - 11 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11 bor Mask, S12, S13, S14, S15); - 12 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12 bor Mask, S13, S14, S15); - 13 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13 bor Mask, S14, S15); - 14 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14 bor Mask, S15); - 15 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15 bor Mask) - end. 
- - -add_hashlist([], _S, S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31) -> - IntSize = ?INTEGER_SIZE, - <>; -add_hashlist([{_SegHash, TopHash}|T], - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31) -> - {Slot, Hashes} = split_hash(TopHash, SlotSplit), - Mask = get_mask(Hashes), - case Slot of - 0 -> - add_hashlist(T, - SlotSplit, - S0 bor Mask, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 1 -> - add_hashlist(T, - SlotSplit, - S0, S1 bor Mask, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 2 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2 bor Mask, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 3 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3 bor Mask, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 4 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4 bor Mask, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 5 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5 bor Mask, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 6 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6 bor Mask, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 7 -> - add_hashlist(T, - 
SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7 bor Mask, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 8 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8 bor Mask, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 9 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9 bor Mask, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 10 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10 bor Mask, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 11 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11 bor Mask, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 12 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12 bor Mask, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 13 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13 bor Mask, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 14 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14 bor Mask, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 15 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15 bor Mask, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 16 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16 bor Mask, S17, S18, S19, S20, S21, S22, S23, - 
S24, S25, S26, S27, S28, S29, S30, S31); - 17 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17 bor Mask, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 18 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18 bor Mask, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 19 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19 bor Mask, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 20 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20 bor Mask, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 21 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21 bor Mask, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 22 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22 bor Mask, S23, - S24, S25, S26, S27, S28, S29, S30, S31); - 23 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23 bor Mask, - S24, S25, S26, S27, S28, S29, S30, S31); - 24 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24 bor Mask, S25, S26, S27, S28, S29, S30, S31); - 25 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25 bor Mask, S26, S27, S28, S29, S30, S31); - 26 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, 
S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26 bor Mask, S27, S28, S29, S30, S31); - 27 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27 bor Mask, S28, S29, S30, S31); - 28 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28 bor Mask, S29, S30, S31); - 29 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29 bor Mask, S30, S31); - 30 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30 bor Mask, S31); - 31 -> - add_hashlist(T, - SlotSplit, - S0, S1, S2, S3, S4, S5, S6, S7, - S8, S9, S10, S11, S12, S13, S14, S15, - S16, S17, S18, S19, S20, S21, S22, S23, - S24, S25, S26, S27, S28, S29, S30, S31 bor Mask) - - end. - +-spec match_hash(bloom(), non_neg_integer(), 0..16#FF) -> boolean(). +match_hash(BloomBin, Pos, Hash) -> + <<_Pre:Pos/binary, CheckInt:8/integer, _Rest/binary>> = BloomBin, + (CheckInt bsr Hash) band 1 == 1. + +-spec build_bloom(tuple(), slot_count()) -> bloom(). +build_bloom(_SlotHashes, 0) -> + <<>>; +build_bloom(SlotHashes, SlotCount) when SlotCount > 0 -> + lists:foldr( + fun(I, AccBin) -> + HashList = element(I, SlotHashes), + SlotBin = + add_hashlist( + lists:usort(HashList), 0, 1, ?INTEGER_SLICES, <<>>), + <> + end, + <<>>, + lists:seq(1, SlotCount) + ). + +-spec add_hashlist( + list(bloom_hash()), + non_neg_integer(), + non_neg_integer(), + 0..?INTEGER_SLICES, + binary()) -> bloom(). 
+add_hashlist([], ThisSlice, SliceCount, SliceCount, AccBin) -> + <>; +add_hashlist([], ThisSlice, SliceNumber, SliceCount, AccBin) -> + add_hashlist( + [], + 0, + SliceNumber + 1, + SliceCount, + <>); +add_hashlist([H0|Rest], ThisSlice, SliceNumber, SliceCount, AccBin) + when ((H0 bsr ?MASK_BSR) + 1) == SliceNumber -> + Mask0 = 1 bsl (H0 band (?MASK_BAND)), + add_hashlist( + Rest, ThisSlice bor Mask0, SliceNumber, SliceCount, AccBin); +add_hashlist(Rest, ThisSlice, SliceNumber, SliceCount, AccBin) -> + add_hashlist( + Rest, + 0, + SliceNumber + 1, + SliceCount, + <>). %%%============================================================================ %%% Test @@ -507,11 +165,7 @@ add_hashlist([{_SegHash, TopHash}|T], -include_lib("eunit/include/eunit.hrl"). generate_orderedkeys(Seqn, Count, BucketRangeLow, BucketRangeHigh) -> - generate_orderedkeys(Seqn, - Count, - [], - BucketRangeLow, - BucketRangeHigh). + generate_orderedkeys(Seqn, Count, [], BucketRangeLow, BucketRangeHigh). generate_orderedkeys(_Seqn, 0, Acc, _BucketLow, _BucketHigh) -> Acc; @@ -521,17 +175,12 @@ generate_orderedkeys(Seqn, Count, Acc, BucketLow, BucketHigh) -> io_lib:format("K~4..0B", [BucketLow + BNumber]), KeyExt = io_lib:format("K~8..0B", [Seqn * 100 + leveled_rand:uniform(100)]), - LK = leveled_codec:to_ledgerkey("Bucket" ++ BucketExt, "Key" ++ KeyExt, o), Chunk = leveled_rand:rand_bytes(16), {_B, _K, MV, _H, _LMs} = leveled_codec:generate_ledgerkv(LK, Seqn, Chunk, 64, infinity), - generate_orderedkeys(Seqn + 1, - Count - 1, - [{LK, MV}|Acc], - BucketLow, - BucketHigh). - + generate_orderedkeys( + Seqn + 1, Count - 1, [{LK, MV}|Acc], BucketLow, BucketHigh). get_hashlist(N) -> KVL = generate_orderedkeys(1, N, 1, 20), @@ -560,16 +209,16 @@ check_neg_hashes(BloomBin, HashList, Counters) -> end, lists:foldl(CheckFun, Counters, HashList). - empty_bloom_test() -> BloomBin0 = create_bloom([]), - ?assertMatch({0, 4}, - check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})). 
+ ?assertMatch( + {0, 4}, check_neg_hashes(BloomBin0, [0, 10, 100, 100000], {0, 0})). bloom_test_() -> {timeout, 120, fun bloom_test_ranges/0}. bloom_test_ranges() -> + test_bloom(250000, 2), test_bloom(80000, 4), test_bloom(60000, 4), test_bloom(40000, 4), @@ -577,7 +226,8 @@ bloom_test_ranges() -> test_bloom(20000, 4), test_bloom(10000, 4), test_bloom(5000, 4), - test_bloom(2000, 4). + test_bloom(2000, 4), + test_bloom(1000, 4). test_bloom(N, Runs) -> ListOfHashLists = @@ -599,35 +249,44 @@ test_bloom(N, Runs) -> SWa = os:timestamp(), ListOfBlooms = - lists:map(fun({HL, _ML}) -> create_bloom(HL) end, - SplitListOfHashLists), + lists:map( + fun({HL, _ML}) -> create_bloom(HL) end, SplitListOfHashLists), TSa = timer:now_diff(os:timestamp(), SWa)/Runs, SWb = os:timestamp(), - lists:foreach(fun(Nth) -> - {HL, _ML} = lists:nth(Nth, SplitListOfHashLists), - BB = lists:nth(Nth, ListOfBlooms), - check_all_hashes(BB, HL) - end, - lists:seq(1, Runs)), - TSb = timer:now_diff(os:timestamp(), SWb)/Runs, - + PosChecks = + lists:foldl( + fun(Nth, ChecksMade) -> + {HL, _ML} = lists:nth(Nth, SplitListOfHashLists), + BB = lists:nth(Nth, ListOfBlooms), + check_all_hashes(BB, HL), + ChecksMade + length(HL) + end, + 0, + lists:seq(1, Runs)), + TSb = timer:now_diff(os:timestamp(), SWb), + SWc = os:timestamp(), {Pos, Neg} = - lists:foldl(fun(Nth, Acc) -> - {_HL, ML} = lists:nth(Nth, SplitListOfHashLists), - BB = lists:nth(Nth, ListOfBlooms), - check_neg_hashes(BB, ML, Acc) - end, - {0, 0}, - lists:seq(1, Runs)), + lists:foldl( + fun(Nth, Acc) -> + {_HL, ML} = lists:nth(Nth, SplitListOfHashLists), + BB = lists:nth(Nth, ListOfBlooms), + check_neg_hashes(BB, ML, Acc) + end, + {0, 0}, + lists:seq(1, Runs)), FPR = Pos / (Pos + Neg), - TSc = timer:now_diff(os:timestamp(), SWc)/Runs, - - io:format(user, - "Test with size ~w has microsecond timings: -" - ++ " build ~w check ~w neg_check ~w and fpr ~w~n", - [N, TSa, TSb, TSc, FPR]). 
+ TSc = timer:now_diff(os:timestamp(), SWc), + + BytesPerKey = + (lists:sum(lists:map(fun byte_size/1, ListOfBlooms)) div 4) / N, + io:format( + user, + "Test with size ~w has microsecond timings: - " + "build in ~w then ~.3f per pos-check, ~.3f per neg-check, " + "fpr ~.3f with bytes-per-key ~.3f~n", + [N, round(TSa), TSb / PosChecks, TSc / (Pos + Neg), FPR, BytesPerKey]). -endif. diff --git a/src/leveled_pclerk.erl b/src/leveled_pclerk.erl index 4e5fc38d..c242dbef 100644 --- a/src/leveled_pclerk.erl +++ b/src/leveled_pclerk.erl @@ -107,7 +107,7 @@ clerk_removelogs(Pid, ForcedLogs) -> -spec clerk_close(pid()) -> ok. clerk_close(Pid) -> - gen_server:call(Pid, close, 20000). + gen_server:call(Pid, close, 60000). %%%============================================================================ %%% gen_server callbacks diff --git a/src/leveled_penciller.erl b/src/leveled_penciller.erl index 97d76188..eeb358fa 100644 --- a/src/leveled_penciller.erl +++ b/src/leveled_penciller.erl @@ -204,8 +204,7 @@ -export([pcl_getsstpids/1, pcl_getclerkpid/1]). -ifdef(TEST). --export([ - clean_testdir/1]). +-export([clean_testdir/1]). -endif. -define(MAX_WORK_WAIT, 300). @@ -220,7 +219,9 @@ -define(WORKQUEUE_BACKLOG_TOLERANCE, 4). -define(COIN_SIDECOUNT, 4). -define(SLOW_FETCH, 500000). % Log a very slow fetch - longer than 500ms +-define(FOLD_SCANWIDTH, 32). -define(ITERATOR_SCANWIDTH, 4). +-define(ITERATOR_MINSCANWIDTH, 1). -define(TIMING_SAMPLECOUNTDOWN, 10000). -define(TIMING_SAMPLESIZE, 100). -define(SHUTDOWN_LOOPS, 10). @@ -287,10 +288,6 @@ -type pcl_state() :: #state{}. -type levelzero_cacheentry() :: {pos_integer(), leveled_tree:leveled_tree()}. -type levelzero_cache() :: list(levelzero_cacheentry()). --type iterator_entry() - :: {pos_integer(), - list(leveled_codec:ledger_kv()|leveled_sst:expandable_pointer())}. --type iterator() :: list(iterator_entry()). -type bad_ledgerkey() :: list(). -type sqn_check() :: current|replaced|missing. 
-type sst_fetchfun() :: @@ -303,10 +300,14 @@ -type pclacc_fun() :: fun((leveled_codec:ledger_key(), leveled_codec:ledger_value(), - any()) -> any()). + term()) -> term()). -type sst_options() :: #sst_options{}. --export_type([levelzero_cacheentry/0, levelzero_returnfun/0, sqn_check/0]). +-export_type( + [levelzero_cacheentry/0, + levelzero_returnfun/0, + sqn_check/0, + pclacc_fun/0]). %%%============================================================================ %%% API @@ -421,7 +422,7 @@ pcl_fetchkeys(Pid, StartKey, EndKey, AccFun, InitAcc, By) -> %% all keys in the range - so must only be run against snapshots of the %% penciller to avoid blocking behaviour. %% -%% This version allows an additional input of a SegmentList. This is a list +%% This version allows an additional input of a SegChecker. This is a list %% of 16-bit integers representing the segment IDs band ((2 ^ 16) -1) that %% are interesting to the fetch %% @@ -771,25 +772,32 @@ handle_call({fetch_keys, L0AsList = case State#state.levelzero_astree of undefined -> - leveled_pmem:merge_trees(StartKey, - EndKey, - State#state.levelzero_cache, - leveled_tree:empty(?CACHE_TYPE)); + leveled_pmem:merge_trees( + StartKey, + EndKey, + State#state.levelzero_cache, + leveled_tree:empty(?CACHE_TYPE)); List -> List end, + SegChecker = + leveled_sst:segment_checker(leveled_sst:tune_seglist(SegmentList)), FilteredL0 = - case SegmentList of + case SegChecker of false -> L0AsList; - _ -> - TunedList = leveled_sst:tune_seglist(SegmentList), + {Min, Max, CheckFun} -> FilterFun = fun(LKV) -> CheckSeg = leveled_sst:extract_hash( leveled_codec:strip_to_segmentonly(LKV)), - leveled_sst:member_check(CheckSeg, TunedList) + case CheckSeg of + CheckSeg when CheckSeg >= Min, CheckSeg =< Max -> + CheckFun(CheckSeg); + _ -> + false + end end, lists:filter(FilterFun, L0AsList) end, @@ -809,13 +817,14 @@ handle_call({fetch_keys, QueryManifest end, SnapshotTime = State#state.snapshot_time, - + PersistedIterator = 
maps:from_list(SSTiter), Folder = fun() -> - keyfolder({FilteredL0, SSTiter}, - {StartKey, EndKey}, - {AccFun, InitAcc, SnapshotTime}, - {SegmentList, LastModRange0, MaxKeys}) + keyfolder( + maps:put(-1, FilteredL0, PersistedIterator), + {StartKey, EndKey}, + {AccFun, InitAcc, SnapshotTime}, + {SegChecker, LastModRange0, MaxKeys}) end, case By of as_pcl -> @@ -1072,10 +1081,9 @@ handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) -> filename=FN, bloom=Bloom}, ManifestSQN = leveled_pmanifest:get_manifest_sqn(State#state.manifest) + 1, - UpdMan = leveled_pmanifest:insert_manifest_entry(State#state.manifest, - ManifestSQN, - 0, - ManEntry), + UpdMan = + leveled_pmanifest:insert_manifest_entry( + State#state.manifest, ManifestSQN, 0, ManEntry), % Prompt clerk to ask about work - do this for every L0 roll ok = leveled_pclerk:clerk_prompt(State#state.clerk), {noreply, State#state{levelzero_cache=[], @@ -1209,7 +1217,6 @@ handle_cast({complete_shutdown, ShutdownType, From}, State) -> end, {stop, normal, State}. - %% handle the bookie stopping and stop this snapshot handle_info({'DOWN', BookieMonRef, process, _BookiePid, _Info}, State=#state{bookie_monref = BookieMonRef}) -> @@ -1226,10 +1233,11 @@ terminate(Reason, _State) -> format_status(normal, [_PDict, State]) -> State; format_status(terminate, [_PDict, State]) -> - State#state{manifest = redacted, - levelzero_cache = redacted, - levelzero_index = redacted, - levelzero_astree = redacted}. + State#state{ + manifest = redacted, + levelzero_cache = redacted, + levelzero_index = redacted, + levelzero_astree = redacted}. code_change(_OldVsn, State, _Extra) -> @@ -1280,15 +1288,17 @@ start_from_file(PCLopts) -> % vnode syncronisation issues (e.g. 
stop them all by default merging to % level zero concurrently) - InitState = #state{clerk = MergeClerk, - root_path = RootPath, - levelzero_maxcachesize = MaxTableSize, - levelzero_cointoss = CoinToss, - levelzero_index = [], - snaptimeout_short = SnapTimeoutShort, - snaptimeout_long = SnapTimeoutLong, - sst_options = OptsSST, - monitor = Monitor}, + InitState = + #state{ + clerk = MergeClerk, + root_path = RootPath, + levelzero_maxcachesize = MaxTableSize, + levelzero_cointoss = CoinToss, + levelzero_index = [], + snaptimeout_short = SnapTimeoutShort, + snaptimeout_long = SnapTimeoutLong, + sst_options = OptsSST, + monitor = Monitor}, %% Open manifest Manifest0 = leveled_pmanifest:open_manifest(RootPath), @@ -1311,33 +1321,34 @@ start_from_file(PCLopts) -> case filelib:is_file(filename:join(sst_rootpath(RootPath), L0FN)) of true -> leveled_log:log(p0015, [L0FN]), - L0Open = leveled_sst:sst_open(sst_rootpath(RootPath), - L0FN, - OptsSST, - 0), + L0Open = + leveled_sst:sst_open( + sst_rootpath(RootPath), L0FN, OptsSST, 0), {ok, L0Pid, {L0StartKey, L0EndKey}, Bloom} = L0Open, L0SQN = leveled_sst:sst_getmaxsequencenumber(L0Pid), - L0Entry = #manifest_entry{start_key = L0StartKey, - end_key = L0EndKey, - filename = L0FN, - owner = L0Pid, - bloom = Bloom}, + L0Entry = + #manifest_entry{ + start_key = L0StartKey, + end_key = L0EndKey, + filename = L0FN, + owner = L0Pid, + bloom = Bloom}, Manifest2 = - leveled_pmanifest:insert_manifest_entry(Manifest1, - ManSQN + 1, - 0, - L0Entry), + leveled_pmanifest:insert_manifest_entry( + Manifest1, ManSQN + 1, 0, L0Entry), leveled_log:log(p0016, [L0SQN]), LedgerSQN = max(MaxSQN, L0SQN), - {InitState#state{manifest = Manifest2, - ledger_sqn = LedgerSQN, - persisted_sqn = LedgerSQN}, + {InitState#state{ + manifest = Manifest2, + ledger_sqn = LedgerSQN, + persisted_sqn = LedgerSQN}, [L0FN|FileList]}; false -> leveled_log:log(p0017, []), - {InitState#state{manifest = Manifest1, - ledger_sqn = MaxSQN, - persisted_sqn = MaxSQN}, + 
{InitState#state{ + manifest = Manifest1, + ledger_sqn = MaxSQN, + persisted_sqn = MaxSQN}, FileList} end, ok = archive_files(RootPath, FileList0), @@ -1373,7 +1384,6 @@ shutdown_manifest(Manifest, L0Constructor) -> leveled_pmanifest:close_manifest(Manifest, EntryCloseFun), EntryCloseFun(L0Constructor). - -spec check_alive(pid()|undefined) -> boolean(). %% @doc %% Double-check a processis active before attempting to terminate @@ -1382,7 +1392,6 @@ check_alive(Owner) when is_pid(Owner) -> check_alive(_Owner) -> false. - -spec archive_files(list(), list()) -> ok. %% @doc %% Archive any sst files in the folder that have not been used to build the @@ -1483,7 +1492,6 @@ roll_memory(NextManSQN, LedgerSQN, RootPath, L0Cache, CL, SSTOpts, true) -> L0Path, L0FN, 0, KVList, LedgerSQN, SSTOpts), {Constructor, Bloom}. - -spec timed_fetch_mem( tuple(), {integer(), integer()}, @@ -1507,7 +1515,6 @@ timed_fetch_mem(Key, Hash, Manifest, L0Cache, L0Index, Monitor) -> maybelog_fetch_timing(Monitor, Level, TS0, R == not_present), R. - -spec fetch_sqn( leveled_codec:ledger_key(), leveled_codec:segment_hash(), @@ -1587,7 +1594,6 @@ log_slowfetch(T0, R, PID, Level, FetchTolerance) -> R end. - -spec compare_to_sqn( leveled_codec:ledger_kv()|leveled_codec:sqn()|not_present, integer()) -> sqn_check(). @@ -1609,341 +1615,237 @@ compare_to_sqn(ObjSQN, _SQN) when is_integer(ObjSQN) -> compare_to_sqn(Obj, SQN) -> compare_to_sqn(leveled_codec:strip_to_seqonly(Obj), SQN). +-spec maybelog_fetch_timing( + leveled_monitor:monitor(), + memory|leveled_pmanifest:lsm_level(), + leveled_monitor:timing(), + boolean()) -> ok. +maybelog_fetch_timing(_Monitor, _Level, no_timing, _NF) -> + ok; +maybelog_fetch_timing({Pid, _StatsFreq}, _Level, FetchTime, true) -> + leveled_monitor:add_stat(Pid, {pcl_fetch_update, not_found, FetchTime}); +maybelog_fetch_timing({Pid, _StatsFreq}, Level, FetchTime, _NF) -> + leveled_monitor:add_stat(Pid, {pcl_fetch_update, Level, FetchTime}). 
%%%============================================================================ -%%% Iterator functions -%%% -%%% TODO - move to dedicated module with extended unit testing +%%% Key folder %%%============================================================================ +-type sst_iterator() + :: #{ + leveled_pmanifest:lsm_level() => + list(leveled_sst:expandable_pointer()|leveled_codec:ledger_kv()), + -1 => + list(leveled_codec:ledger_kv())}. +-type max_keys() :: unlimited|non_neg_integer(). +-type iterator_level() :: -1|leveled_pmanifest:lsm_level(). +-type search_info() :: + {{leveled_codec:ledger_key(), leveled_codec:ledger_key()}, + {non_neg_integer(), pos_integer()|infinity}, + leveled_sst:segment_check_fun()}. + +-define(NULL_KEY, {null, null}). -spec keyfolder( - {list(), list()}, + sst_iterator(), {leveled_codec:ledger_key(), leveled_codec:ledger_key()}, {pclacc_fun(), any(), pos_integer()}, - {boolean(), {non_neg_integer(), pos_integer()|infinity}, integer()}) - -> any(). + {leveled_sst:segment_check_fun(), + {non_neg_integer(), pos_integer()|infinity}, + -1|non_neg_integer()}) -> {non_neg_integer(), term()}|term(). +keyfolder( + Iterator, + {StartKey, EndKey}, + {AccFun, InitAcc, Now}, + {SegCheckFun, LastModRange, KeyLimit}) -> + % The in-memory dump of keys in this range, may go beyond the end key - so + % strip these back before starting the fold + StripIMMFun = + fun(MemIter) -> + lists:reverse( + lists:dropwhile( + fun({K, _V}) -> leveled_codec:endkey_passed(EndKey, K) end, + lists:reverse(MemIter))) + end, + MaxKeys = + case KeyLimit of + -1 -> unlimited; + KeyLimit when is_integer(KeyLimit), KeyLimit >= 0 -> KeyLimit + end, + keyfolder( + maps:update_with(-1, StripIMMFun, Iterator), + InitAcc, + MaxKeys, + {?FOLD_SCANWIDTH, lists:sort(maps:keys(Iterator))}, + {{StartKey, EndKey}, LastModRange, SegCheckFun}, + {AccFun, Now}). 
+ +-spec keyfolder( + sst_iterator()|no_more_keys, + term(), + max_keys(), + {pos_integer(), list(iterator_level())}, + search_info(), + {pclacc_fun(), integer()}) -> {non_neg_integer(), term()}|term(). %% @doc -%% The keyfolder will compare an iterator across the immutable in-memory cache -%% of the Penciller (the IMMiter), with an iterator across the persisted part -%% (the SSTiter). -%% -%% A Segment List and a MaxKeys may be passed. Every time something is added -%% to the accumulator MaxKeys is reduced - so set MaxKeys to -1 if it is -%% intended to be infinite. -%% -%% The basic principle is to take the next key in the IMMiter and compare it -%% to the next key in the SSTiter, and decide which one should be added to the -%% accumulator. The iterators are advanced if they either win (i.e. are the -%% next key), or are dominated. This goes on until the iterators are empty. +%% The keyfolder takes an iterator - a map with an entry for each level, from +%% level -1 (the in-memory cache of keys) through to level 7 (the theoretical) +%% maximum level. %% -%% To advance the SSTiter the find_nextkey/4 function is used, as the SSTiter -%% is an iterator across multiple levels - and so needs to do its own -%% comparisons to pop the next result. -keyfolder(_Iterators, - _KeyRange, - {_AccFun, Acc, _Now}, - {_SegmentList, _LastModRange, MaxKeys}) when MaxKeys == 0 -> - {0, Acc}; -keyfolder({[], SSTiter}, KeyRange, {AccFun, Acc, Now}, - {SegmentList, LastModRange, MaxKeys}) -> - {StartKey, EndKey} = KeyRange, - case find_nextkey(SSTiter, StartKey, EndKey, - SegmentList, element(1, LastModRange)) of - no_more_keys -> - case MaxKeys > 0 of - true -> - % This query had a max count, so we must respond with the - % remainder on the count - {MaxKeys, Acc}; - false -> - % This query started with a MaxKeys set to -1. 
Query is - % not interested in having MaxKeys in Response - Acc - end; - {NxSSTiter, {SSTKey, SSTVal}} -> - {Acc1, MK1} = - maybe_accumulate(SSTKey, SSTVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - keyfolder({[], NxSSTiter}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}) +%% The find_nextkeys function is used to scan the iterators to find the next +%% set of W keys. These can then be accumulated. If there is a MaxKeys set +%% (i.e. a maximum number of KV pairs to be accumulated), then this must be +%% tracked so the keyfolder never asks for more than the remainder from +%% find_nextkeys +keyfolder(no_more_keys, Acc, MaxKeys, _LevelInfo, _SearchInfo, _AccDetails) -> + case MaxKeys of + unlimited -> Acc; + MaxKeys -> {MaxKeys, Acc} end; -keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], SSTiterator}, - KeyRange, - {AccFun, Acc, Now}, - {SegmentList, LastModRange, MaxKeys}) -> - {StartKey, EndKey} = KeyRange, - case {IMMKey < StartKey, leveled_codec:endkey_passed(EndKey, IMMKey)} of - {false, true} -> - % There are no more keys in-range in the in-memory - % iterator, so take action as if this iterator is empty - % (see above) - keyfolder({[], SSTiterator}, - KeyRange, - {AccFun, Acc, Now}, - {SegmentList, LastModRange, MaxKeys}); - {false, false} -> - case find_nextkey(SSTiterator, StartKey, EndKey, - SegmentList, element(1, LastModRange)) of - no_more_keys -> - % No more keys in range in the persisted store, so use the - % in-memory KV as the next - {Acc1, MK1} = - maybe_accumulate(IMMKey, IMMVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - keyfolder({NxIMMiterator, - []}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}); - {NxSSTiterator, {SSTKey, SSTVal}} -> - % There is a next key, so need to know which is the - % next key between the two (and handle two keys - % with different sequence numbers). 
- case leveled_codec:key_dominates({IMMKey, - IMMVal}, - {SSTKey, - SSTVal}) of - left_hand_first -> - {Acc1, MK1} = - maybe_accumulate(IMMKey, IMMVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - % Stow the previous best result away at Level -1 - % so that there is no need to iterate to it again - NewEntry = {-1, [{SSTKey, SSTVal}]}, - keyfolder({NxIMMiterator, - lists:keystore(-1, - 1, - NxSSTiterator, - NewEntry)}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}); - right_hand_first -> - {Acc1, MK1} = - maybe_accumulate(SSTKey, SSTVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - keyfolder({[{IMMKey, IMMVal}|NxIMMiterator], - NxSSTiterator}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}); - left_hand_dominant -> - {Acc1, MK1} = - maybe_accumulate(IMMKey, IMMVal, - {Acc, AccFun, Now}, - MaxKeys, LastModRange), - % We can add to the accumulator here. As the SST - % key was the most dominant across all SST levels, - % so there is no need to hold off until the IMMKey - % is left hand first. - keyfolder({NxIMMiterator, - NxSSTiterator}, - KeyRange, - {AccFun, Acc1, Now}, - {SegmentList, LastModRange, MK1}) - end - end - end. - --spec maybe_accumulate(leveled_codec:ledger_key(), - leveled_codec:ledger_value(), - {any(), pclacc_fun(), pos_integer()}, - integer(), - {non_neg_integer(), non_neg_integer()|infinity}) - -> any(). -%% @doc -%% Make an accumulation decision based one the date range -maybe_accumulate(LK, LV, - {Acc, AccFun, QueryStartTime}, - MaxKeys, - {LowLastMod, HighLastMod}) -> - {_SQN, _SH, LMD} = leveled_codec:strip_to_indexdetails({LK, LV}), - RunAcc = - (LMD == undefined) or - ((LMD >= LowLastMod) and (LMD =< HighLastMod)), - case RunAcc and leveled_codec:is_active(LK, LV, QueryStartTime) of - true -> - {AccFun(LK, LV, Acc), MaxKeys - 1}; - false -> - {Acc, MaxKeys} - end. 
- - --spec find_nextkey( - iterator(), - leveled_codec:ledger_key(), - leveled_codec:ledger_key(), - list(non_neg_integer())|false, - non_neg_integer()) - -> no_more_keys|{iterator(), leveled_codec:ledger_kv()}. -%% @doc -%% Looks to find the best choice for the next key across the levels (other -%% than in-memory table) -%% In finding the best choice, the next key in a given level may be a next -%% block or next file pointer which will need to be expanded -find_nextkey(QueryArray, StartKey, EndKey, SegmentList, LowLastMod) -> - find_nextkey(QueryArray, - -1, - {null, null}, - StartKey, EndKey, - SegmentList, - LowLastMod, - ?ITERATOR_SCANWIDTH). - -find_nextkey(_QueryArray, LCnt, - {null, null}, - _StartKey, _EndKey, - _SegList, _LowLastMod, _Width) when LCnt > ?MAX_LEVELS -> - % The array has been scanned wihtout finding a best key - must be - % exhausted - respond to indicate no more keys to be found by the - % iterator - no_more_keys; -find_nextkey(QueryArray, LCnt, - {BKL, BestKV}, - _StartKey, _EndKey, - _SegList, _LowLastMod, _Width) when LCnt > ?MAX_LEVELS -> - % All levels have been scanned, so need to remove the best result from - % the array, and return that array along with the best key/sqn/status - % combination - {BKL, [BestKV|Tail]} = lists:keyfind(BKL, 1, QueryArray), - {lists:keyreplace(BKL, 1, QueryArray, {BKL, Tail}), BestKV}; -find_nextkey(QueryArray, LCnt, - {BestKeyLevel, BestKV}, - StartKey, EndKey, - SegList, LowLastMod, Width) -> - % Get the next key at this level - {NextKey, RestOfKeys} = - case lists:keyfind(LCnt, 1, QueryArray) of - false -> - {null, null}; - {LCnt, []} -> - {null, null}; - {LCnt, [NK|ROfKs]} -> - {NK, ROfKs} +keyfolder(_Iter, Acc, 0, _LevelInfo, _SearchInfo, _AccDetails) -> + {0, Acc}; +keyfolder( + Iter, + Acc, + MaxKeys, + {W, Ls}=LevelInfo, + {_KR, LastModRange, _SCF}=SearchInfo, + {AccFun, Now}=AccDetails) -> + {IterUpd, FoundKVs} = + find_nextkeys( + Iter, + {Ls, ?NULL_KEY}, + [], + Ls, + {fetch_size(MaxKeys, W), 
scan_size(MaxKeys)}, + SearchInfo), + {UpdAcc, KeyCount} = + leveled_codec:maybe_accumulate( + lists:reverse(FoundKVs), Acc, 0, {Now, LastModRange}, AccFun), + MaxKeysLeft = + case MaxKeys of + unlimited -> unlimited; + MaxKeys -> MaxKeys - KeyCount end, - % Compare the next key at this level with the best key - case {NextKey, BestKeyLevel, BestKV} of - {null, BKL, BKV} -> - % There is no key at this level - go to the next level - find_nextkey(QueryArray, - LCnt + 1, - {BKL, BKV}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{next, Owner, _SK}, BKL, BKV} -> - % The first key at this level is pointer to a file - need to query - % the file to expand this level out before proceeding + keyfolder(IterUpd, UpdAcc, MaxKeysLeft, LevelInfo, SearchInfo, AccDetails). + +-spec fetch_size(max_keys(), pos_integer()) -> pos_integer(). +fetch_size(unlimited, W) -> W; +fetch_size(MaxKeys, W) -> min(MaxKeys, W). + +-spec scan_size(max_keys()) -> pos_integer(). +scan_size(unlimited) -> + ?ITERATOR_SCANWIDTH; +scan_size(MaxKeys) -> + min(?ITERATOR_SCANWIDTH, max(?ITERATOR_MINSCANWIDTH, MaxKeys div 256)). + +-spec find_nextkeys( + sst_iterator(), + {list(iterator_level()), + {null|iterator_level(), null|leveled_codec:ledger_kv()}}, + list(leveled_codec:ledger_kv()), + list(iterator_level()), + {pos_integer(), pos_integer()}, + search_info()) -> + {no_more_keys, list(leveled_codec:ledger_kv())}| + {sst_iterator(), list(leveled_codec:ledger_kv())}. 
+%% @doc +%% Looks to find up to W keys, where for each key every level is checked, +%% comparing keys to find the best key for that loop +find_nextkeys( + _Iter, {[], ?NULL_KEY}, FoundKVs, _Ls, _BatchInfo, _SearchInfo) -> + % Each level checked and best key still NULL => no_more_keys + {no_more_keys, FoundKVs}; +find_nextkeys( + Iter, {[], {BKL, BestKV}}, FoundKVs, _Ls, {W, _SW}, _SearchInfo) + when length(FoundKVs) == W - 1 -> + % All levels scanned, and there are now W keys (W - 1 previously found plus + % the latest best key) + {maps:update_with(BKL, fun tl/1, Iter), [BestKV|FoundKVs]}; +find_nextkeys( + Iter, {[], {BKL, BestKV}}, FoundKVs, Ls, BatchInfo, SearchInfo) -> + % All levels scanned so this is the best key ... now loop to find more + find_nextkeys( + maps:update_with(BKL, fun tl/1, Iter), + {Ls, ?NULL_KEY}, + [BestKV|FoundKVs], + Ls, BatchInfo, SearchInfo); +find_nextkeys( + Iter, + {[LCnt|OtherLevels]=LoopLs, {BKL, BKV}=PrevBest}, + FoundKVs, + Ls, + {_W, ScanWidth}=BI, + {{StartKey, EndKey}, {LowLastMod, _High}, SegChecker}=SI) -> + case maps:get(LCnt, Iter) of + [] -> + find_nextkeys( + Iter, + {OtherLevels, PrevBest}, + FoundKVs, + Ls -- [LCnt], BI, SI); + [{next, Owner, _SK}|RestOfKeys] -> + % Expansion required Pointer = {next, Owner, StartKey, EndKey}, - UpdList = leveled_sst:sst_expandpointer(Pointer, - RestOfKeys, - Width, - SegList, - LowLastMod), - NewEntry = {LCnt, UpdList}, + UpdList = + leveled_sst:sst_expandpointer( + Pointer, RestOfKeys, ScanWidth, SegChecker, LowLastMod), % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level - find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), - LCnt, - {BKL, BKV}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{pointer, SSTPid, Slot, PSK, PEK}, BKL, BKV} -> - % The first key at this level is pointer within a file - need to - % query the file to expand this level out before proceeding + find_nextkeys( + maps:update(LCnt, UpdList, Iter), + 
{LoopLs, PrevBest}, + FoundKVs, + Ls, BI, SI); + [{pointer, SSTPid, Slot, PSK, PEK}|RestOfKeys] -> + % Expansion required Pointer = {pointer, SSTPid, Slot, PSK, PEK}, - UpdList = leveled_sst:sst_expandpointer(Pointer, - RestOfKeys, - Width, - SegList, - LowLastMod), - NewEntry = {LCnt, UpdList}, + UpdList = + leveled_sst:sst_expandpointer( + Pointer, RestOfKeys, ScanWidth, SegChecker, LowLastMod), % Need to loop around at this level (LCnt) as we have not yet % examined a real key at this level - find_nextkey(lists:keyreplace(LCnt, 1, QueryArray, NewEntry), - LCnt, - {BKL, BKV}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{Key, Val}, null, null} -> - % No best key set - so can assume that this key is the best key, - % and check the lower levels - find_nextkey(QueryArray, - LCnt + 1, - {LCnt, {Key, Val}}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{Key, Val}, _BKL, {BestKey, _BestVal}} when Key < BestKey -> - % There is a real key and a best key to compare, and the real key - % at this level is before the best key, and so is now the new best - % key - % The QueryArray is not modified until we have checked all levels - find_nextkey(QueryArray, - LCnt + 1, - {LCnt, {Key, Val}}, - StartKey, EndKey, - SegList, LowLastMod, Width); - {{Key, Val}, BKL, {BestKey, BestVal}} when Key == BestKey -> - SQN = leveled_codec:strip_to_seqonly({Key, Val}), - BestSQN = leveled_codec:strip_to_seqonly({BestKey, BestVal}), - if - SQN =< BestSQN -> - % This is a dominated key, so we need to skip over it - NewQArray = lists:keyreplace(LCnt, - 1, - QueryArray, - {LCnt, RestOfKeys}), - find_nextkey(NewQArray, - LCnt + 1, - {BKL, {BestKey, BestVal}}, - StartKey, EndKey, - SegList, LowLastMod, Width); - SQN > BestSQN -> - % There is a real key at the front of this level and it has - % a higher SQN than the best key, so we should use this as - % the best key - % But we also need to remove the dominated key from the - % lower level in the query array - OldBestEntry = 
lists:keyfind(BKL, 1, QueryArray), - {BKL, [{BestKey, BestVal}|BestTail]} = OldBestEntry, - find_nextkey(lists:keyreplace(BKL, - 1, - QueryArray, - {BKL, BestTail}), - LCnt + 1, - {LCnt, {Key, Val}}, - StartKey, EndKey, - SegList, LowLastMod, Width) - end; - {_, BKL, BKV} -> - % This is not the best key - find_nextkey(QueryArray, - LCnt + 1, - {BKL, BKV}, - StartKey, EndKey, - SegList, LowLastMod, Width) + find_nextkeys( + maps:update(LCnt, UpdList, Iter), + {LoopLs, PrevBest}, + FoundKVs, + Ls, BI, SI); + [{Key, Val}|_RestOfKeys] when BKV == null -> + find_nextkeys( + Iter, + {OtherLevels, {LCnt, {Key, Val}}}, + FoundKVs, + Ls, BI, SI); + [{Key, Val}|_RestOfKeys] when Key < element(1, BKV) -> + find_nextkeys( + Iter, + {OtherLevels, {LCnt, {Key, Val}}}, + FoundKVs, + Ls, BI, SI); + [{Key, _Val}|_RestOfKeys] when Key > element(1, BKV) -> + find_nextkeys( + Iter, + {OtherLevels, PrevBest}, + FoundKVs, + Ls, BI, SI); + [{Key, Val}|_RestOfKeys] -> + case leveled_codec:key_dominates({Key, Val}, BKV) of + true -> + find_nextkeys( + maps:update_with(BKL, fun tl/1, Iter), + {OtherLevels, {LCnt, {Key, Val}}}, + FoundKVs, + Ls, BI, SI); + false -> + find_nextkeys( + maps:update_with(LCnt, fun tl/1, Iter), + {OtherLevels, PrevBest}, + FoundKVs, + Ls, BI, SI) + end end. --spec maybelog_fetch_timing( - leveled_monitor:monitor(), - memory|leveled_pmanifest:lsm_level(), - leveled_monitor:timing(), - boolean()) -> ok. -maybelog_fetch_timing(_Monitor, _Level, no_timing, _NF) -> - ok; -maybelog_fetch_timing({Pid, _StatsFreq}, _Level, FetchTime, true) -> - leveled_monitor:add_stat(Pid, {pcl_fetch_update, not_found, FetchTime}); -maybelog_fetch_timing({Pid, _StatsFreq}, Level, FetchTime, _NF) -> - leveled_monitor:add_stat(Pid, {pcl_fetch_update, Level, FetchTime}). 
- - %%%============================================================================ %%% Test %%%============================================================================ @@ -1962,15 +1864,32 @@ pcl_fetch(Pid, Key) -> gen_server:call(Pid, {fetch, Key, Hash, true}, infinity) end. -keyfolder(IMMiter, SSTiter, StartKey, EndKey, {AccFun, Acc, Now}) -> - keyfolder({IMMiter, SSTiter}, - {StartKey, EndKey}, - {AccFun, Acc, Now}, - {false, {0, infinity}, -1}). +keyfolder_test(IMMiter, SSTiter, StartKey, EndKey, {AccFun, Acc, Now}) -> + keyfolder( + maps:put(-1, IMMiter, SSTiter), + {StartKey, EndKey}, + {AccFun, Acc, Now}, + {false, {0, infinity}, -1}). + +convert_qmanifest_tomap(SSTiter) -> + maps:from_list(SSTiter). find_nextkey(QueryArray, StartKey, EndKey) -> - find_nextkey(QueryArray, StartKey, EndKey, false, 0). - + {UpdArray, NextKeys} = + find_nextkeys( + QueryArray, + {maps:keys(QueryArray), ?NULL_KEY}, + [], + maps:keys(QueryArray), + {1, 1}, + {{StartKey, EndKey}, {0, infinity}, false}), + case UpdArray of + no_more_keys -> + no_more_keys; + UpdArray -> + [NextKey] = NextKeys, + {UpdArray, NextKey} + end. generate_randomkeys({Count, StartSQN}) -> generate_randomkeys(Count, StartSQN, []). @@ -1988,7 +1907,6 @@ generate_randomkeys(Count, SQN, Acc) -> leveled_codec:segment_hash(K), null}}, generate_randomkeys(Count - 1, SQN + 1, [RandKey|Acc]). - clean_testdir(RootPath) -> clean_subdir(sst_rootpath(RootPath)), @@ -2008,7 +1926,6 @@ clean_subdir(DirPath) -> ok end. 
- maybe_pause_push(PCL, KL) -> T0 = [], I0 = leveled_pmem:new_index(), @@ -2196,30 +2113,22 @@ simple_server_test() -> ?assertMatch(Key2, pcl_fetch(PclSnap, {o,"Bucket0002", "Key0002", null})), ?assertMatch(Key3, pcl_fetch(PclSnap, {o,"Bucket0003", "Key0003", null})), ?assertMatch(Key4, pcl_fetch(PclSnap, {o,"Bucket0004", "Key0004", null})), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0001", - "Key0001", - null}, - 1)), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0002", - "Key0002", - null}, - 1002)), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0003", - "Key0003", - null}, - 2003)), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0004", - "Key0004", - null}, - 3004)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0001", "Key0001", null}, 1)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0002", "Key0002", null}, 1002)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0003", "Key0003", null}, 2003)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0004", "Key0004", null}, 3004)), % Add some more keys and confirm that check sequence number still % sees the old version in the previous snapshot, but will see the new @@ -2231,12 +2140,10 @@ simple_server_test() -> KL1A = generate_randomkeys({2000, 4006}), ok = maybe_pause_push(PCLr, [Key1A]), ok = maybe_pause_push(PCLr, KL1A), - ?assertMatch(current, pcl_checksequencenumber(PclSnap, - {o, - "Bucket0001", - "Key0001", - null}, - 1)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap, {o, "Bucket0001", "Key0001", null}, 1)), ok = pcl_close(PclSnap), {ok, PclSnap2, null} = @@ -2249,36 +2156,31 @@ simple_server_test() -> undefined, false), - ?assertMatch(replaced, pcl_checksequencenumber(PclSnap2, - {o, - "Bucket0001", - "Key0001", - null}, - 1)), - ?assertMatch(current, 
pcl_checksequencenumber(PclSnap2, - {o, - "Bucket0001", - "Key0001", - null}, - 4005)), - ?assertMatch(current, pcl_checksequencenumber(PclSnap2, - {o, - "Bucket0002", - "Key0002", - null}, - 1002)), + ?assertMatch( + replaced, + pcl_checksequencenumber( + PclSnap2, {o, "Bucket0001", "Key0001", null}, 1)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap2, {o, "Bucket0001", "Key0001", null}, 4005)), + ?assertMatch( + current, + pcl_checksequencenumber( + PclSnap2, {o, "Bucket0002", "Key0002", null}, 1002)), ok = pcl_close(PclSnap2), ok = pcl_close(PCLr), clean_testdir(RootPath). simple_findnextkey_test() -> - QueryArray = [ + QueryArrayAsList = [ {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, {{o, "Bucket1", "Key5", null}, {4, {active, infinity}, {0, 0}, null}}]}, {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, {5, [{{o, "Bucket1", "Key2", null}, {2, {active, infinity}, {0, 0}, null}}]} ], + QueryArray = convert_qmanifest_tomap(QueryArrayAsList), {Array2, KV1} = find_nextkey(QueryArray, {o, "Bucket1", "Key0", null}, {o, "Bucket1", "Key5", null}), @@ -2309,12 +2211,13 @@ simple_findnextkey_test() -> ?assertMatch(no_more_keys, ER). sqnoverlap_findnextkey_test() -> - QueryArray = [ + QueryArrayAsList = [ {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, {{o, "Bucket1", "Key5", null}, {4, {active, infinity}, {0, 0}, null}}]}, {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, {5, [{{o, "Bucket1", "Key5", null}, {2, {active, infinity}, {0, 0}, null}}]} ], + QueryArray = convert_qmanifest_tomap(QueryArrayAsList), {Array2, KV1} = find_nextkey(QueryArray, {o, "Bucket1", "Key0", null}, {o, "Bucket1", "Key5", null}), @@ -2339,12 +2242,13 @@ sqnoverlap_findnextkey_test() -> ?assertMatch(no_more_keys, ER). 
sqnoverlap_otherway_findnextkey_test() -> - QueryArray = [ + QueryArrayAsList = [ {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, {0, 0}, null}}, {{o, "Bucket1", "Key5", null}, {1, {active, infinity}, {0, 0}, null}}]}, {3, [{{o, "Bucket1", "Key3", null}, {3, {active, infinity}, {0, 0}, null}}]}, {5, [{{o, "Bucket1", "Key5", null}, {2, {active, infinity}, {0, 0}, null}}]} ], + QueryArray = convert_qmanifest_tomap(QueryArrayAsList), {Array2, KV1} = find_nextkey(QueryArray, {o, "Bucket1", "Key0", null}, {o, "Bucket1", "Key5", null}), @@ -2370,7 +2274,7 @@ sqnoverlap_otherway_findnextkey_test() -> foldwithimm_simple_test() -> Now = leveled_util:integer_now(), - QueryArray = [ + QueryArrayAsList = [ {2, [{{o, "Bucket1", "Key1", null}, {5, {active, infinity}, 0, null}}, {{o, "Bucket1", "Key5", null}, @@ -2380,6 +2284,7 @@ foldwithimm_simple_test() -> {5, [{{o, "Bucket1", "Key5", null}, {2, {active, infinity}, 0, null}}]} ], + QueryArray = convert_qmanifest_tomap(QueryArrayAsList), KL1A = [{{o, "Bucket1", "Key6", null}, {7, {active, infinity}, 0, null}}, {{o, "Bucket1", "Key1", null}, {8, {active, infinity}, 0, null}}, {{o, "Bucket1", "Key8", null}, {9, {active, infinity}, 0, null}}], @@ -2389,7 +2294,7 @@ foldwithimm_simple_test() -> IMM2), AccFun = fun(K, V, Acc) -> SQN = leveled_codec:strip_to_seqonly({K, V}), Acc ++ [{K, SQN}] end, - Acc = keyfolder(IMMiter, + Acc = keyfolder_test(IMMiter, QueryArray, {o, "Bucket1", "Key1", null}, {o, "Bucket1", "Key6", null}, {AccFun, [], Now}), @@ -2400,7 +2305,7 @@ foldwithimm_simple_test() -> IMMiterA = [{{o, "Bucket1", "Key1", null}, {8, {active, infinity}, 0, null}}], - AccA = keyfolder(IMMiterA, + AccA = keyfolder_test(IMMiterA, QueryArray, {o, "Bucket1", "Key1", null}, {o, "Bucket1", "Key6", null}, @@ -2416,7 +2321,7 @@ foldwithimm_simple_test() -> {o, null, null, null}, IMM3), io:format("Compare IMM3 with QueryArrary~n"), - AccB = keyfolder(IMMiterB, + AccB = keyfolder_test(IMMiterB, QueryArray, {o, "Bucket1", 
"Key1", null}, {o, "Bucket1", "Key6", null}, {AccFun, [], Now}), @@ -2453,7 +2358,6 @@ slow_fetch_test() -> ?assertMatch(not_present, log_slowfetch(2, not_present, "fake", 0, 1)), ?assertMatch("value", log_slowfetch(2, "value", "fake", 0, 1)). - coverage_cheat_test() -> {noreply, _State0} = handle_info(timeout, #state{}), {ok, _State1} = code_change(null, #state{}, null). @@ -2533,4 +2437,4 @@ loop() -> ok end. --endif. +-endif. \ No newline at end of file diff --git a/src/leveled_pmanifest.erl b/src/leveled_pmanifest.erl index eb59ad0f..7f051224 100644 --- a/src/leveled_pmanifest.erl +++ b/src/leveled_pmanifest.erl @@ -451,23 +451,28 @@ key_lookup(Manifest, LevelIdx, Key) -> -spec query_manifest( manifest(), leveled_codec:ledger_key(), - leveled_codec:ledger_key()) -> list(). + leveled_codec:ledger_key()) + -> list( + {lsm_level(), + list({next, manifest_entry(), leveled_codec:ledger_key()})}). query_manifest(Manifest, StartKey, EndKey) -> SetupFoldFun = fun(Level, Acc) -> - Pointers = - range_lookup(Manifest, Level, StartKey, EndKey), - case Pointers of - [] -> Acc; - PL -> Acc ++ [{Level, PL}] + case range_lookup(Manifest, Level, StartKey, EndKey) of + [] -> + Acc; + Pointers -> + [{Level, Pointers}|Acc] end end, lists:foldl(SetupFoldFun, [], lists:seq(0, ?MAX_LEVELS - 1)). --spec range_lookup(manifest(), - integer(), - leveled_codec:ledger_key(), - leveled_codec:ledger_key()) -> list(). +-spec range_lookup( + manifest(), + integer(), + leveled_codec:ledger_key(), + leveled_codec:ledger_key()) + -> list({next, manifest_entry(), leveled_codec:ledger_key()}). %% @doc %% Return a list of manifest_entry pointers at this level which cover the %% key query range. @@ -478,10 +483,11 @@ range_lookup(Manifest, LevelIdx, StartKey, EndKey) -> end, range_lookup_int(Manifest, LevelIdx, StartKey, EndKey, MakePointerFun). --spec merge_lookup(manifest(), - integer(), - leveled_codec:ledger_key(), - leveled_codec:ledger_key()) -> list(). 
+-spec merge_lookup( + manifest(), + integer(), + leveled_codec:ledger_key(), + leveled_codec:ledger_key()) -> list({next, manifest_entry(), all}). %% @doc %% Return a list of manifest_entry pointers at this level which cover the %% key query range, only all keys in the files should be included in the @@ -494,8 +500,8 @@ merge_lookup(Manifest, LevelIdx, StartKey, EndKey) -> range_lookup_int(Manifest, LevelIdx, StartKey, EndKey, MakePointerFun). --spec mergefile_selector(manifest(), integer(), selector_strategy()) - -> manifest_entry(). +-spec mergefile_selector( + manifest(), integer(), selector_strategy()) -> manifest_entry(). %% @doc %% An algorithm for discovering which files to merge .... %% We can find the most optimal file: @@ -511,13 +517,15 @@ mergefile_selector(Manifest, LevelIdx, _Strategy) when LevelIdx =< 1 -> Level = array:get(LevelIdx, Manifest#manifest.levels), lists:nth(leveled_rand:uniform(length(Level)), Level); mergefile_selector(Manifest, LevelIdx, random) -> - Level = leveled_tree:to_list(array:get(LevelIdx, - Manifest#manifest.levels)), + Level = + leveled_tree:to_list( + array:get(LevelIdx, Manifest#manifest.levels)), {_SK, ME} = lists:nth(leveled_rand:uniform(length(Level)), Level), ME; mergefile_selector(Manifest, LevelIdx, {grooming, ScoringFun}) -> - Level = leveled_tree:to_list(array:get(LevelIdx, - Manifest#manifest.levels)), + Level = + leveled_tree:to_list( + array:get(LevelIdx, Manifest#manifest.levels)), SelectorFun = fun(_I, Acc) -> {_SK, ME} = lists:nth(leveled_rand:uniform(length(Level)), Level), @@ -555,12 +563,12 @@ add_snapshot(Manifest, Pid, Timeout) -> ManSQN = Manifest#manifest.manifest_sqn, case Manifest#manifest.min_snapshot_sqn of 0 -> - Manifest#manifest{snapshots = SnapList0, - min_snapshot_sqn = ManSQN}; + Manifest#manifest{ + snapshots = SnapList0, min_snapshot_sqn = ManSQN}; N -> N0 = min(N, ManSQN), - Manifest#manifest{snapshots = SnapList0, - min_snapshot_sqn = N0} + Manifest#manifest{ + snapshots = SnapList0, 
min_snapshot_sqn = N0} end. -spec release_snapshot(manifest(), pid()|atom()) -> manifest(). diff --git a/src/leveled_pmem.erl b/src/leveled_pmem.erl index 9e1b8a3f..2f5bb885 100644 --- a/src/leveled_pmem.erl +++ b/src/leveled_pmem.erl @@ -43,7 +43,7 @@ -define(MAX_CACHE_LINES, 31). % Must be less than 128 --type index_array() :: list(array:array())|[]|none. +-type index_array() :: list(array:array())|[]|none. -export_type([index_array/0]). @@ -71,7 +71,7 @@ prepare_for_index(IndexArray, no_lookup) -> prepare_for_index(IndexArray, Hash) -> {Slot, H0} = split_hash(Hash), Bin = array:get(Slot, IndexArray), - array:set(Slot, <>, IndexArray). + array:set(Slot, <>, IndexArray). -spec add_to_index(array:array(), index_array(), integer()) -> index_array(). %% @doc @@ -201,16 +201,16 @@ merge_trees(StartKey, EndKey, TreeList, LevelMinus1) -> find_pos(<<>>, _Hash) -> false; -find_pos(<<1:1/integer, Hash:23/integer, _T/binary>>, Hash) -> +find_pos(<>, Hash) -> true; -find_pos(<<1:1/integer, _Miss:23/integer, T/binary>>, Hash) -> +find_pos(<<_Miss:24/integer, T/binary>>, Hash) -> find_pos(T, Hash). split_hash({SegmentID, ExtraHash}) -> Slot = SegmentID band 255, H0 = (SegmentID bsr 8) bor (ExtraHash bsl 8), - {Slot, H0 band 8388607}. + {Slot, H0 band 16#FFFFFF}. check_slotlist(Key, _Hash, CheckList, TreeList) -> SlotCheckFun = diff --git a/src/leveled_runner.erl b/src/leveled_runner.erl index 86f753db..a9d337aa 100644 --- a/src/leveled_runner.erl +++ b/src/leveled_runner.erl @@ -65,6 +65,7 @@ -type mp() :: {re_pattern, term(), term(), term(), term()}. +-export_type([acc_fun/0, mp/0]). 
%%%============================================================================ %%% External functions @@ -146,15 +147,7 @@ bucket_list(SnapFun, Tag, FoldBucketsFun, InitAcc, MaxBuckets) -> %% for a timeout index_query(SnapFun, {StartKey, EndKey, TermHandling}, FoldAccT) -> {FoldKeysFun, InitAcc} = FoldAccT, - {ReturnTerms, TermRegex} = TermHandling, - AddFun = - case ReturnTerms of - true -> - fun add_terms/2; - _ -> - fun add_keys/2 - end, - + Runner = fun() -> {ok, LedgerSnapshot, _JournalSnapshot, AfterFun} = SnapFun(), @@ -163,7 +156,7 @@ index_query(SnapFun, {StartKey, EndKey, TermHandling}, FoldAccT) -> LedgerSnapshot, StartKey, EndKey, - accumulate_index(TermRegex, AddFun, FoldKeysFun), + leveled_codec:accumulate_index(TermHandling, FoldKeysFun), InitAcc, by_runner), wrap_runner(Folder, AfterFun) @@ -680,47 +673,20 @@ check_presence(Key, Value, InkerClone) -> false end. +accumulate_keys(FoldKeysFun, undefined) -> + fun(Key, _Value, Acc) -> + {B, K} = leveled_codec:from_ledgerkey(Key), + FoldKeysFun(B, K, Acc) + end; accumulate_keys(FoldKeysFun, TermRegex) -> - AccFun = - fun(Key, _Value, Acc) -> - {B, K} = leveled_codec:from_ledgerkey(Key), - case TermRegex of - undefined -> - FoldKeysFun(B, K, Acc); - Re -> - case re:run(K, Re) of - nomatch -> - Acc; - _ -> - FoldKeysFun(B, K, Acc) - end - end - end, - AccFun. - -add_keys(ObjKey, _IdxValue) -> - ObjKey. - -add_terms(ObjKey, IdxValue) -> - {IdxValue, ObjKey}. 
- -accumulate_index(TermRe, AddFun, FoldKeysFun) -> - case TermRe of - undefined -> - fun(Key, _Value, Acc) -> - {Bucket, ObjKey, IdxValue} = leveled_codec:from_ledgerkey(Key), - FoldKeysFun(Bucket, AddFun(ObjKey, IdxValue), Acc) - end; - TermRe -> - fun(Key, _Value, Acc) -> - {Bucket, ObjKey, IdxValue} = leveled_codec:from_ledgerkey(Key), - case re:run(IdxValue, TermRe) of - nomatch -> - Acc; - _ -> - FoldKeysFun(Bucket, AddFun(ObjKey, IdxValue), Acc) - end - end + fun(Key, _Value, Acc) -> + {B, K} = leveled_codec:from_ledgerkey(Key), + case re:run(K, TermRegex) of + nomatch -> + Acc; + _ -> + FoldKeysFun(B, K, Acc) + end end. -spec wrap_runner(fun(), fun()) -> any(). diff --git a/src/leveled_sst.erl b/src/leveled_sst.erl index c11c36be..b9a705bc 100644 --- a/src/leveled_sst.erl +++ b/src/leveled_sst.erl @@ -79,16 +79,17 @@ -define(DELETE_TIMEOUT, 10000). -define(TREE_TYPE, idxt). -define(TREE_SIZE, 16). --define(TIMING_SAMPLECOUNTDOWN, 20000). --define(TIMING_SAMPLESIZE, 100). -define(BLOCK_LENGTHS_LENGTH, 20). -define(LMD_LENGTH, 4). -define(FLIPPER32, 4294967295). -define(DOUBLESIZE_LEVEL, 3). -define(INDEX_MODDATE, true). -define(TOMB_COUNT, true). --define(USE_SET_FOR_SPEED, 64). +-define(USE_SET_FOR_SPEED, 32). -define(STARTUP_TIMEOUT, 10000). +-define(MIN_HASH, 32768). +-define(MAX_HASH, 65535). +-define(LOG_BUILDTIMINGS_LEVELS, [3]). -ifdef(TEST). -define(HIBERNATE_TIMEOUT, 5000). @@ -128,13 +129,14 @@ -export([sst_newmerge/10]). --export([tune_seglist/1, extract_hash/1, member_check/2]). +-export([tune_seglist/1, extract_hash/1, segment_checker/1]). -export([in_range/3]). --record(slot_index_value, {slot_id :: integer(), - start_position :: integer(), - length :: integer()}). +-record(slot_index_value, + {slot_id :: integer(), + start_position :: integer(), + length :: integer()}). -record(summary, {first_key :: tuple(), @@ -143,62 +145,61 @@ size :: integer(), max_sqn :: integer()}). 
%% DO NOT CHANGE - %% The summary record is persisted as part of the sile format - %% Any chnage to this record will mean the change cannot be rolled back + %% The summary record is persisted as part of the file format + %% Any change to this record will mean the change cannot be rolled back +-type slot_index_value() + :: #slot_index_value{}. -type press_method() - :: lz4|native|none. + :: lz4|native|none. -type range_endpoint() - :: all|leveled_codec:ledger_key(). + :: all|leveled_codec:ledger_key(). -type slot_pointer() - :: {pointer, pid(), integer(), range_endpoint(), range_endpoint()}. + :: {pointer, + pid(), slot_index_value(), range_endpoint(), range_endpoint()}. -type sst_pointer() - % Used in sst_new - :: {next, - leveled_pmanifest:manifest_entry(), - range_endpoint()}. + % Used in sst_new + :: {next, + leveled_pmanifest:manifest_entry(), + range_endpoint()}. -type sst_closed_pointer() - % used in expand_list_by_pointer - % (close point is added by maybe_expand_pointer - :: {next, - leveled_pmanifest:manifest_entry(), - range_endpoint(), - range_endpoint()}. + % used in expand_list_by_pointer + % (close point is added by maybe_expand_pointer + :: {next, + leveled_pmanifest:manifest_entry(), + range_endpoint(), + range_endpoint()}. -type expandable_pointer() - :: slot_pointer()|sst_pointer()|sst_closed_pointer(). + :: slot_pointer()|sst_pointer()|sst_closed_pointer(). -type expanded_pointer() - :: leveled_codec:ledger_kv()|expandable_pointer(). --type binaryslot_element() - :: {tuple(), tuple()}|{binary(), integer(), tuple(), tuple()}. + :: leveled_codec:ledger_kv()|expandable_pointer(). +-type expanded_slot() :: + {binary(), non_neg_integer(), range_endpoint(), range_endpoint()}. -type tuned_seglist() - :: false| - {sets, sets:set(non_neg_integer())}| - {list, list(non_neg_integer())}. + :: false | list(non_neg_integer()). -type sst_options() - :: #sst_options{}. + :: #sst_options{}. 
-type binary_slot() - :: {binary(), binary(), list(integer()), leveled_codec:ledger_key()}. + :: {binary(), binary(), list(integer()), leveled_codec:ledger_key()}. -type sst_summary() - :: #summary{}. + :: #summary{}. -type blockindex_cache() - :: {non_neg_integer(), array:array(), non_neg_integer()}. + :: {non_neg_integer(), array:array(), non_neg_integer()}. -type fetch_cache() - :: array:array()|no_cache. + :: array:array()|no_cache. -type cache_size() - :: no_cache|4|32|64. + :: no_cache|4|32|64. -type cache_hash() - :: no_cache|non_neg_integer(). --type level() - :: non_neg_integer(). + :: no_cache|non_neg_integer(). -type summary_filter() - :: fun((leveled_codec:ledger_key()) -> any()). - -%% yield_blockquery is used to determine if the work necessary to process a -%% range query beyond the fetching the slot should be managed from within -%% this process, or should be handled by the calling process. -%% Handling within the calling process may lead to extra binary heap garbage -%% see Issue 52. Handling within the SST process may lead to contention and -%% extra copying. Files at the top of the tree yield, those lower down don't. + :: fun((leveled_codec:ledger_key()) -> any()). +-type segment_check_fun() + :: non_neg_integer() + | {non_neg_integer(), non_neg_integer(), + fun((non_neg_integer()) -> boolean())} + | false. +-type fetch_levelzero_fun() + :: fun((pos_integer(), leveled_penciller:levelzero_returnfun()) -> ok). 
-record(state, {summary, @@ -206,7 +207,6 @@ penciller :: pid() | undefined | false, root_path, filename, - yield_blockquery = false :: boolean(), blockindex_cache :: blockindex_cache() | undefined | redacted, compression_method = native :: press_method(), @@ -215,7 +215,7 @@ fetch_cache = no_cache :: fetch_cache() | redacted, new_slots :: list()|undefined, deferred_startup_tuple :: tuple()|undefined, - level :: level()|undefined, + level :: leveled_pmanifest:lsm_level()|undefined, tomb_count = not_counted :: non_neg_integer()|not_counted, high_modified_date :: non_neg_integer()|undefined, @@ -226,18 +226,19 @@ {slot_hashlist = 0 :: integer(), slot_serialise = 0 :: integer(), slot_finish = 0 :: integer(), - fold_toslot = 0 :: integer()}). + fold_toslot = 0 :: integer(), + last_timestamp = os:timestamp() :: erlang:timestamp()}). --type sst_state() :: #state{}. -type build_timings() :: no_timing|#build_timings{}. --export_type([expandable_pointer/0, press_method/0]). +-export_type([expandable_pointer/0, press_method/0, segment_check_fun/0]). %%%============================================================================ %%% API %%%============================================================================ --spec sst_open(string(), string(), sst_options(), level()) +-spec sst_open( + string(), string(), sst_options(), leveled_pmanifest:lsm_level()) -> {ok, pid(), {leveled_codec:ledger_key(), leveled_codec:ledger_key()}, binary()}. @@ -257,7 +258,7 @@ sst_open(RootPath, Filename, OptsSST, Level) -> {ok, Pid, {SK, EK}, Bloom} end. 
--spec sst_new(string(), string(), level(), +-spec sst_new(string(), string(), leveled_pmanifest:lsm_level(), list(leveled_codec:ledger_kv()), integer(), sst_options()) -> {ok, pid(), @@ -294,7 +295,7 @@ sst_new(RootPath, Filename, Level, KVList, MaxSQN, OptsSST, IndexModDate) -> -spec sst_newmerge(string(), string(), list(leveled_codec:ledger_kv()|sst_pointer()), list(leveled_codec:ledger_kv()|sst_pointer()), - boolean(), level(), + boolean(), leveled_pmanifest:lsm_level(), integer(), sst_options()) -> empty|{ok, pid(), {{list(leveled_codec:ledger_kv()), @@ -337,39 +338,35 @@ sst_newmerge(RootPath, Filename, empty; _ -> {ok, Pid} = gen_statem:start_link(?MODULE, [], ?START_OPTS), - case gen_statem:call(Pid, {sst_new, - RootPath, - Filename, - Level, - {SlotList, FK}, - MaxSQN, - OptsSST0, - IndexModDate, - CountOfTombs, - self()}, - infinity) of - {ok, {SK, EK}, Bloom} -> - {ok, Pid, {{Rem1, Rem2}, SK, EK}, Bloom} - end + {ok, {SK, EK}, Bloom} = + gen_statem:call( + Pid, + {sst_new, + RootPath, + Filename, + Level, + {SlotList, FK}, + MaxSQN, + OptsSST0, + IndexModDate, + CountOfTombs, self()}, + infinity), + {ok, Pid, {{Rem1, Rem2}, SK, EK}, Bloom} end. --spec sst_newlevelzero(string(), string(), - integer(), - fun((pos_integer(), - leveled_penciller:levelzero_returnfun()) - -> ok)| - list(), - pid()|undefined, - integer(), - sst_options()) -> - {ok, pid(), noreply}. +-spec sst_newlevelzero( + string(), string(), + integer(), + fetch_levelzero_fun()|list(), + pid()|undefined, + integer(), + sst_options()) -> {ok, pid(), noreply}. %% @doc %% Start a new file at level zero. At this level the file size is not fixed - %% it will be as big as the input. 
Also the KVList is not passed in, it is %% fetched slot by slot using the FetchFun -sst_newlevelzero(RootPath, Filename, - Slots, Fetcher, Penciller, - MaxSQN, OptsSST) -> +sst_newlevelzero( + RootPath, Filename, Slots, Fetcher, Penciller, MaxSQN, OptsSST) -> OptsSST0 = update_options(OptsSST, 0), {ok, Pid} = gen_statem:start_link(?MODULE, [], ?START_OPTS), %% Initiate the file into the "starting" state @@ -383,14 +380,10 @@ sst_newlevelzero(RootPath, Filename, infinity), ok = case Fetcher of - FetchSlots when is_list(Fetcher) -> - gen_statem:cast(Pid, {complete_l0startup, FetchSlots}); - _ -> - % Fetcher is a function - gen_statem:cast(Pid, {sst_returnslot, none, Fetcher, Slots}) - % Start the fetch loop (async). Having the fetch loop running - % on async message passing means that the SST file can now be - % closed while the fetch loop is still completing + SlotList when is_list(SlotList) -> + gen_statem:cast(Pid, {complete_l0startup, SlotList}); + FetchFun when is_function(FetchFun, 2) -> + gen_statem:cast(Pid, {sst_returnslot, none, FetchFun, Slots}) end, {ok, Pid, noreply}. @@ -427,22 +420,21 @@ sst_getsqn(Pid, LedgerKey, Hash) -> sst_getmaxsequencenumber(Pid) -> gen_statem:call(Pid, get_maxsequencenumber, infinity). --spec sst_expandpointer(expandable_pointer(), - list(expandable_pointer()), - pos_integer(), - leveled_codec:segment_list(), - non_neg_integer()) - -> list(expanded_pointer()). +-spec sst_expandpointer( + expandable_pointer(), + list(expandable_pointer()), + pos_integer(), + segment_check_fun(), + non_neg_integer()) -> list(expanded_pointer()). %% @doc %% Expand out a list of pointer to return a list of Keys and Values with a %% tail of pointers (once the ScanWidth has been satisfied). %% Folding over keys in a store uses this function, although this function %% does not directly call the gen_server - it does so by sst_getfilteredslots %% or sst_getfilteredrange depending on the nature of the pointer. 
-sst_expandpointer(Pointer, MorePointers, ScanWidth, SegmentList, LowLastMod) -> - expand_list_by_pointer(Pointer, MorePointers, ScanWidth, - SegmentList, LowLastMod). - +sst_expandpointer(Pointer, MorePointers, ScanWidth, SegChecker, LowLastMod) -> + expand_list_by_pointer( + Pointer, MorePointers, ScanWidth, SegChecker, LowLastMod). -spec sst_setfordelete(pid(), pid()|false) -> ok. %% @doc @@ -456,7 +448,7 @@ sst_setfordelete(Pid, Penciller) -> -spec sst_gettombcount(pid()) -> non_neg_integer()|not_counted. %% @doc -%% Get the count of tomb stones in this SST file, returning not_counted if this +%% Get the count of tombstones in this SST file, returning not_counted if this %% file was created with a version which did not support tombstone counting, or %% could also be because the file is L0 (which aren't counted as being chosen %% for merge is inevitable) @@ -477,9 +469,8 @@ sst_clear(Pid) -> sst_deleteconfirmed(Pid) -> gen_statem:cast(Pid, close). --spec sst_checkready(pid()) -> {ok, string(), - leveled_codec:ledger_key(), - leveled_codec:ledger_key()}. +-spec sst_checkready(pid()) -> + {ok, string(), leveled_codec:ledger_key(), leveled_codec:ledger_key()}. %% @doc %% If a file has been set to be built, check that it has been built. Returns %% the filename and the {startKey, EndKey} for the manifest. @@ -491,8 +482,7 @@ sst_checkready(Pid) -> %% @doc %% Notify the SST file that it is now working at a new level %% This simply prompts a GC on the PID now (as this may now be a long-lived -%% file, so don't want all the startup state to be held on memory - want to -%% proactively drop it +%% file, so don't want all the startup state to be held on memory) sst_switchlevels(Pid, NewLevel) -> gen_statem:cast(Pid, {switch_levels, NewLevel}). @@ -502,8 +492,6 @@ sst_switchlevels(Pid, NewLevel) -> sst_close(Pid) -> gen_statem:call(Pid, close). 
- - %%%============================================================================ %%% gen_statem callbacks %%%============================================================================ @@ -546,22 +534,23 @@ starting({call, From}, {Length, SlotIndex, BlockEntries, SlotsBin, Bloom} = build_all_slots(SlotList), {_, BlockIndex, HighModDate} = - update_blockindex_cache(true, - BlockEntries, - new_blockindex_cache(Length), - undefined, - IdxModDate), + update_blockindex_cache( + BlockEntries, + new_blockindex_cache(Length), + undefined, + IdxModDate), SummaryBin = - build_table_summary(SlotIndex, Level, FirstKey, Length, - MaxSQN, Bloom, CountOfTombs), + build_table_summary( + SlotIndex, Level, FirstKey, Length, MaxSQN, Bloom, CountOfTombs), ActualFilename = - write_file(RootPath, Filename, SummaryBin, SlotsBin, - PressMethod, IdxModDate, CountOfTombs), - YBQ = Level =< 2, + write_file( + RootPath, Filename, SummaryBin, SlotsBin, + PressMethod, IdxModDate, CountOfTombs), {UpdState, Bloom} = - read_file(ActualFilename, - State#state{root_path=RootPath, yield_blockquery=YBQ}, - OptsSST#sst_options.pagecache_level >= Level), + read_file( + ActualFilename, + State#state{root_path=RootPath}, + OptsSST#sst_options.pagecache_level >= Level), Summary = UpdState#state.summary, leveled_log:log_timer( sst08, [ActualFilename, Level, Summary#summary.max_sqn], SW), @@ -620,17 +609,17 @@ starting(cast, complete_l0startup, State) -> {SlotCount, SlotIndex, BlockEntries, SlotsBin,Bloom} = build_all_slots(SlotList), {_, BlockIndex, HighModDate} = - update_blockindex_cache(true, - BlockEntries, - new_blockindex_cache(SlotCount), - undefined, - IdxModDate), + update_blockindex_cache( + BlockEntries, + new_blockindex_cache(SlotCount), + undefined, + IdxModDate), Time2 = timer:now_diff(os:timestamp(), SW2), SW3 = os:timestamp(), SummaryBin = - build_table_summary(SlotIndex, 0, FirstKey, SlotCount, - MaxSQN, Bloom, not_counted), + build_table_summary( + SlotIndex, 0, FirstKey, SlotCount, 
MaxSQN, Bloom, not_counted), Time3 = timer:now_diff(os:timestamp(), SW3), SW4 = os:timestamp(), @@ -638,14 +627,13 @@ starting(cast, complete_l0startup, State) -> write_file(RootPath, Filename, SummaryBin, SlotsBin, PressMethod, IdxModDate, not_counted), {UpdState, Bloom} = - read_file(ActualFilename, - State#state{root_path=RootPath, - yield_blockquery=true, - % Important to empty this from state rather - % than carry it through to the next stage - new_slots=undefined, - deferred_startup_tuple=undefined}, - true), + read_file( + ActualFilename, + State#state{ + root_path=RootPath, + new_slots=undefined, % Important to empty this from state + deferred_startup_tuple=undefined}, + true), Summary = UpdState#state.summary, Time4 = timer:now_diff(os:timestamp(), SW4), @@ -690,16 +678,14 @@ starting(cast, {sst_returnslot, FetchedSlot, FetchFun, SlotCount}, State) -> Self = self(), ReturnFun = fun(NextSlot) -> - gen_statem:cast(Self, - {sst_returnslot, NextSlot, - FetchFun, SlotCount}) + gen_statem:cast( + Self, {sst_returnslot, NextSlot, FetchFun, SlotCount}) end, FetchFun(length(FetchedSlots) + 1, ReturnFun), {keep_state, State#state{new_slots = FetchedSlots}} end. 
- reader({call, From}, {get_kv, LedgerKey, Hash, Filter}, State) -> % Get a KV value and potentially take sample timings Monitor = @@ -746,68 +732,30 @@ reader({call, From}, {get_kv, LedgerKey, Hash, Filter}, State) -> [hibernate, {reply, From, Result}]} end; reader({call, From}, - {get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod}, + {fetch_range, StartKey, EndKey, LowLastMod}, State) -> - ReadNeeded = - check_modified(State#state.high_modified_date, - LowLastMod, - State#state.index_moddate), - {NeedBlockIdx, SlotsToFetchBinList, SlotsToPoint} = - case ReadNeeded of - true -> - fetch_range(StartKey, EndKey, ScanWidth, - SegList, LowLastMod, - State); - false -> - {false, [], []} - end, - PressMethod = State#state.compression_method, - IdxModDate = State#state.index_moddate, - - case State#state.yield_blockquery of - true -> - {keep_state_and_data, - [{reply, - From, - {yield, - SlotsToFetchBinList, - SlotsToPoint, - PressMethod, - IdxModDate} - }]}; - false -> - {L, FoundBIC} = - binaryslot_reader( - SlotsToFetchBinList, PressMethod, IdxModDate, SegList), - {UpdateCache, BlockIdxC0, HighModDate} = - update_blockindex_cache(NeedBlockIdx, - FoundBIC, - State#state.blockindex_cache, - State#state.high_modified_date, - State#state.index_moddate), - case UpdateCache of - true -> - {keep_state, - State#state{ - blockindex_cache = BlockIdxC0, - high_modified_date = HighModDate}, - [{reply, From, L ++ SlotsToPoint}]}; - false -> - {keep_state_and_data, - [hibernate, {reply, From, L ++ SlotsToPoint}]} - end - end; -reader({call, From}, {get_slots, SlotList, SegList, LowLastMod}, State) -> + SlotsToPoint = + fetch_range( + StartKey, + EndKey, + State#state.summary, + State#state.filter_fun, + check_modified( + State#state.high_modified_date, + LowLastMod, + State#state.index_moddate) + ), + {keep_state_and_data, [{reply, From, SlotsToPoint}]}; +reader({call, From}, {get_slots, SlotList, SegChecker, LowLastMod}, State) -> PressMethod = 
State#state.compression_method, IdxModDate = State#state.index_moddate, {NeedBlockIdx, SlotBins} = - read_slots(State#state.handle, - SlotList, - {SegList, - LowLastMod, - State#state.blockindex_cache}, - State#state.compression_method, - State#state.index_moddate), + read_slots( + State#state.handle, + SlotList, + {SegChecker, LowLastMod, State#state.blockindex_cache}, + State#state.compression_method, + State#state.index_moddate), {keep_state_and_data, [{reply, From, {NeedBlockIdx, SlotBins, PressMethod, IdxModDate}}]}; reader({call, From}, get_maxsequencenumber, State) -> @@ -838,9 +786,11 @@ reader({call, From}, close, State) -> {stop_and_reply, normal, [{reply, From, ok}], State}; reader(cast, {switch_levels, NewLevel}, State) -> - FreshCache = new_cache(NewLevel), {keep_state, - State#state{level = NewLevel, fetch_cache = FreshCache}, + State#state{ + level = NewLevel, + fetch_cache = new_cache(NewLevel) + }, [hibernate]}; reader(info, {update_blockindex_cache, BIC}, State) -> handle_update_blockindex_cache(BIC, State); @@ -890,33 +840,33 @@ delete_pending({call, From}, {get_kv, LedgerKey, Hash, Filter}, State) -> {keep_state_and_data, [{reply, From, Result}, ?DELETE_TIMEOUT]}; delete_pending( {call, From}, - {get_kvrange, StartKey, EndKey, ScanWidth, SegList, LowLastMod}, + {fetch_range, StartKey, EndKey, LowLastMod}, State) -> - {_NeedBlockIdx, SlotsToFetchBinList, SlotsToPoint} = - fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State), - % Always yield as about to clear and de-reference - PressMethod = State#state.compression_method, - IdxModDate = State#state.index_moddate, - {keep_state_and_data, - [{reply, From, - {yield, - SlotsToFetchBinList, - SlotsToPoint, - PressMethod, - IdxModDate}}, - ?DELETE_TIMEOUT]}; + SlotsToPoint = + fetch_range( + StartKey, + EndKey, + State#state.summary, + State#state.filter_fun, + check_modified( + State#state.high_modified_date, + LowLastMod, + State#state.index_moddate) + ), + {keep_state_and_data, 
[{reply, From, SlotsToPoint}, ?DELETE_TIMEOUT]}; delete_pending( {call, From}, - {get_slots, SlotList, SegList, LowLastMod}, + {get_slots, SlotList, SegChecker, LowLastMod}, State) -> PressMethod = State#state.compression_method, IdxModDate = State#state.index_moddate, {_NeedBlockIdx, SlotBins} = - read_slots(State#state.handle, - SlotList, - {SegList, LowLastMod, State#state.blockindex_cache}, - PressMethod, - IdxModDate), + read_slots( + State#state.handle, + SlotList, + {SegChecker, LowLastMod, State#state.blockindex_cache}, + PressMethod, + IdxModDate), {keep_state_and_data, [{reply, From, {false, SlotBins, PressMethod, IdxModDate}}, ?DELETE_TIMEOUT]}; @@ -952,17 +902,21 @@ delete_pending(timeout, _, State) -> {keep_state_and_data, [leveled_rand:uniform(10) * ?DELETE_TIMEOUT]}. handle_update_blockindex_cache(BIC, State) -> - {_, BlockIndexCache, HighModDate} = - update_blockindex_cache(true, - BIC, - State#state.blockindex_cache, - State#state.high_modified_date, - State#state.index_moddate), - {keep_state, - State#state{ - blockindex_cache = BlockIndexCache, - high_modified_date = HighModDate}}. - + {NeedBlockIdx, BlockIndexCache, HighModDate} = + update_blockindex_cache( + BIC, + State#state.blockindex_cache, + State#state.high_modified_date, + State#state.index_moddate), + case NeedBlockIdx of + true -> + {keep_state, + State#state{ + blockindex_cache = BlockIndexCache, + high_modified_date = HighModDate}}; + false -> + keep_state_and_data + end. terminate(normal, delete_pending, _State) -> ok; @@ -983,10 +937,10 @@ format_status(terminate, [_PDict, _, State]) -> %%% External Functions %%%============================================================================ --spec expand_list_by_pointer(expandable_pointer(), - list(expandable_pointer()), - pos_integer()) - -> list(expanded_pointer()). +-spec expand_list_by_pointer( + expandable_pointer(), + list(expandable_pointer()), + pos_integer()) -> list(expanded_pointer()). 
%% @doc %% Expand a list of pointers, maybe ending up with a list of keys and values %% with a tail of pointers @@ -996,114 +950,96 @@ format_status(terminate, [_PDict, _, State]) -> %% skip those slots not containing any information over the low last modified %% date expand_list_by_pointer(Pointer, Tail, Width) -> - expand_list_by_pointer(Pointer, Tail, Width, false). - -%% TODO until leveled_penciller updated -expand_list_by_pointer(Pointer, Tail, Width, SegList) -> - expand_list_by_pointer(Pointer, Tail, Width, SegList, 0). - --spec expand_list_by_pointer(expandable_pointer(), - list(expandable_pointer()), - pos_integer(), - leveled_codec:segment_list(), - non_neg_integer()) - -> list(expanded_pointer()). + expand_list_by_pointer(Pointer, Tail, Width, false, 0). + +-spec expand_list_by_pointer( + expandable_pointer(), + list(expandable_pointer()), + pos_integer(), + segment_check_fun(), + non_neg_integer()) -> list(expanded_pointer()). %% @doc %% With filters (as described in expand_list_by_pointer/3 -expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, EndKey}, - Tail, Width, SegList, LowLastMod) -> - FoldFun = - fun(X, {Pointers, Remainder}) -> - case length(Pointers) of - L when L < Width -> - case X of - {pointer, SSTPid, S, SK, EK} -> - {Pointers ++ [{pointer, S, SK, EK}], Remainder}; - _ -> - {Pointers, Remainder ++ [X]} - end; - _ -> - {Pointers, Remainder ++ [X]} - end +expand_list_by_pointer( + {pointer, SSTPid, Slot, StartKey, EndKey}, + Tail, Width, SegChecker, LowLastMod) -> + {PotentialPointers, Remainder} = + lists:split(min(Width - 1, length(Tail)), Tail), + {LocalPointers, OtherPointers} = + lists:partition( + fun(Pointer) -> + case Pointer of + {pointer, SSTPid, _S, _SK, _EK} -> + true; + _ -> + false + end end, - InitAcc = {[{pointer, Slot, StartKey, EndKey}], []}, - {AccPointers, AccTail} = lists:foldl(FoldFun, InitAcc, Tail), - ExpPointers = sst_getfilteredslots(SSTPid, - AccPointers, - SegList, - LowLastMod), - lists:append(ExpPointers, 
AccTail); -expand_list_by_pointer({next, ManEntry, StartKey, EndKey}, - Tail, Width, SegList, LowLastMod) -> + PotentialPointers + ), + sst_getfilteredslots( + SSTPid, + [{pointer, SSTPid, Slot, StartKey, EndKey}|LocalPointers], + SegChecker, + LowLastMod, + OtherPointers ++ Remainder + ); +expand_list_by_pointer( + {next, ManEntry, StartKey, EndKey}, + Tail, _Width, _SegChecker, LowLastMod) -> + % The first pointer is a pointer to a file - expand_list_by_pointer will + % in this case convert this into list of pointers within that SST file + % i.e. of the form {pointer, SSTPid, Slot, StartKey, EndKey} + % This can then be further expanded by calling again to + % expand_list_by_pointer SSTPid = ManEntry#manifest_entry.owner, leveled_log:log(sst10, [SSTPid, is_process_alive(SSTPid)]), - ExpPointer = sst_getfilteredrange(SSTPid, - StartKey, - EndKey, - Width, - SegList, - LowLastMod), + ExpPointer = sst_getfilteredrange(SSTPid, StartKey, EndKey, LowLastMod), ExpPointer ++ Tail. --spec sst_getfilteredrange(pid(), - range_endpoint(), - range_endpoint(), - integer(), - leveled_codec:segment_list(), - non_neg_integer()) - -> list(leveled_codec:ledger_kv()|slot_pointer()). +-spec sst_getfilteredrange( + pid(), + range_endpoint(), + range_endpoint(), + non_neg_integer()) -> list(slot_pointer()). %% @doc -%% Get a range of {Key, Value} pairs as a list between StartKey and EndKey -%% (inclusive). The ScanWidth is the maximum size of the range, a pointer -%% will be placed on the tail of the resulting list if results expand beyond -%% the Scan Width -%% -%% To make the range open-ended (either to start, end or both) the all atom -%% can be used in place of the Key tuple. -%% -%% A segment list can also be passed, which inidcates a subset of segment -%% hashes of interest in the query. 
-%% -%% TODO: Optimise this so that passing a list of segments that tune to the -%% same hash is faster - perhaps provide an exportable function in -%% leveled_tictac -sst_getfilteredrange(Pid, StartKey, EndKey, ScanWidth, SegList, LowLastMod) -> - SegList0 = tune_seglist(SegList), - case gen_statem:call(Pid, {get_kvrange, - StartKey, EndKey, - ScanWidth, SegList0, LowLastMod}, - infinity) of - {yield, SlotsToFetchBinList, SlotsToPoint, PressMethod, IdxModDate} -> - {L, _BIC} = - binaryslot_reader(SlotsToFetchBinList, - PressMethod, IdxModDate, SegList0), - L ++ SlotsToPoint; - Reply -> - Reply - end. - - --spec sst_getfilteredslots(pid(), - list(slot_pointer()), - leveled_codec:segment_list(), - non_neg_integer()) - -> list(leveled_codec:ledger_kv()). +%% Get a list of slot_pointers that contain the information to look into those +%% slots to find the actual {K, V} pairs between the range endpoints. +%% Expanding these slot_pointers can be done using sst_getfilteredslots/5 +%% +%% Use segment_checker/1 to produce a segment_check_fun if the hashes of the +%% keys to be found are known. The LowLastMod integer will skip any blocks +%% where all keys were modified before that date. +sst_getfilteredrange(Pid, StartKey, EndKey, LowLastMod) -> + gen_statem:call( + Pid, {fetch_range, StartKey, EndKey, LowLastMod}, infinity). + + -spec sst_getfilteredslots( + pid(), + list(slot_pointer()), + segment_check_fun(), + non_neg_integer(), + list(expandable_pointer())) -> list(leveled_codec:ledger_kv()). %% @doc %% Get a list of slots by their ID. The slot will be converted from the binary -%% to term form outside of the FSM loop +%% to term form outside of the FSM loop, unless a segment_check_fun is passed, +%% and this process has cached the index to be used by the segment_check_fun, +%% and in this case the list of SlotBins will include the actual {K, V} pairs.
%% -%% A list of 16-bit integer Segment IDs can be passed to filter the keys -%% returned (not precisely - with false results returned in addition). Use -%% false as a SegList to not filter. -%% An integer can be provided which gives a floor for the LastModified Date -%% of the object, if the object is to be covered by the query -sst_getfilteredslots(Pid, SlotList, SegList, LowLastMod) -> - SegL0 = tune_seglist(SegList), +%% Use segment_checker/1 to produce a segment_check_fun if the hashes of the +%% keys to be found are known. The LowLastMod integer will skip any blocks +%% where all keys were modified before that date, but the results may still +%% contain older values (the calling function should still filter by modified +%% date as required). +sst_getfilteredslots(Pid, SlotList, SegChecker, LowLastMod, Pointers) -> {NeedBlockIdx, SlotBins, PressMethod, IdxModDate} = gen_statem:call( - Pid, {get_slots, SlotList, SegL0, LowLastMod}, infinity), - {L, BIC} = binaryslot_reader(SlotBins, PressMethod, IdxModDate, SegL0), + Pid, {get_slots, SlotList, SegChecker, LowLastMod}, infinity), + {L, BIC} = + binaryslot_reader( + SlotBins, PressMethod, IdxModDate, SegChecker, Pointers), case NeedBlockIdx of true -> erlang:send(Pid, {update_blockindex_cache, BIC}); @@ -1112,45 +1048,67 @@ sst_getfilteredslots(Pid, SlotList, SegList, LowLastMod) -> end, L. - --spec find_pos(binary(), - non_neg_integer()| - {list, list(non_neg_integer())}| - {sets, sets:set(non_neg_integer())}, - list(non_neg_integer()), - non_neg_integer()) -> list(non_neg_integer()). +-spec find_pos( + binary(), segment_check_fun()) -> list(non_neg_integer()). %% @doc %% Find a list of positions where there is an element with a matching segment %% ID to the expected segments (which can either be a single segment, a list of
-find_pos(<<1:1/integer, PotentialHit:15/integer, T/binary>>, - Checker, PosList, Count) -> - case member_check(PotentialHit, Checker) of +%% segments or a set of segments depending on size). The segment_check_fun +%% will do the matching. Segments are 15-bits of the hash of the key. +find_pos(Bin, H) when is_integer(H) -> + find_posint(Bin, H, [], 0); +find_pos(Bin, {Min, Max, CheckFun}) -> + find_posmlt(Bin, Min, Max, CheckFun, [], 0). + +find_posint(<>, H, PosList, Count) -> + find_posint(T, H, [Count|PosList], Count + 1); +find_posint(<>, H, PosList, Count) + when Miss >= ?MIN_HASH -> + find_posint(T, H, PosList, Count + 1); +find_posint(<>, H, PosList, Count) when NHC < 128 -> + find_posint(T, H, PosList, Count + NHC + 1); +find_posint(_BinRem, _H, PosList, _Count) -> + lists:reverse(PosList). + +find_posmlt(<>, Min, Max, CheckFun, PosList, Count) + when H >= Min, H =< Max -> + case CheckFun(H) of true -> - find_pos(T, Checker, PosList ++ [Count], Count + 1); + find_posmlt(T, Min, Max, CheckFun, [Count|PosList], Count + 1); false -> - find_pos(T, Checker, PosList, Count + 1) + find_posmlt(T, Min, Max, CheckFun, PosList, Count + 1) end; -find_pos(<<0:1/integer, NHC:7/integer, T/binary>>, Checker, PosList, Count) -> - find_pos(T, Checker, PosList, Count + NHC + 1); -find_pos(_BinRem, _Hash, PosList, _Count) -> - %% Expect this to be <<>> - i.e. at end of binary, but if there is - %% corruption, could be some other value - so return as well in this - %% case - PosList. - - --spec member_check(non_neg_integer(), - non_neg_integer()| - {list, list(non_neg_integer())}| - {sets, sets:set(non_neg_integer())}) -> boolean(). 
-member_check(Hash, Hash) -> - true; -member_check(Hash, {list, HashList}) -> - lists:member(Hash, HashList); -member_check(Hash, {sets, HashSet}) -> - sets:is_element(Hash, HashSet); -member_check(_Miss, _Checker) -> +find_posmlt(<>, Min, Max, CheckFun, PosList, Count) + when Miss >= ?MIN_HASH -> + find_posmlt(T, Min, Max, CheckFun, PosList, Count + 1); +find_posmlt(<>, Min, Max, CheckFun, PosList, Count) + when NHC < 128 -> + find_posmlt(T, Min, Max, CheckFun, PosList, Count + NHC + 1); +find_posmlt(_BinRem, _Min, _Max, _CheckFun, PosList, _Count) -> + lists:reverse(PosList). + + +-spec segment_checker( + non_neg_integer()| list(non_neg_integer())| false) + -> segment_check_fun(). +segment_checker(Hash) when is_integer(Hash) -> + Hash; +segment_checker(HashList) when is_list(HashList) -> + %% Note that commonly segments will be close together numerically. The + %% guess/estimate process for checking vnode size selects a contiguous + %% range. Also the kv_index_tictactree segment selector tries to group + %% segment IDs close together. Hence checking the bounds first is + %% generally much faster than a straight membership test. + Min = lists:min(HashList), + Max = lists:max(HashList), + case length(HashList) > ?USE_SET_FOR_SPEED of + true -> + HashSet = sets:from_list(HashList), + {Min, Max, fun(H) -> sets:is_element(H, HashSet) end}; + false -> + {Min, Max, fun(H) -> lists:member(H, HashList) end} + end; +segment_checker(false) -> false. -spec sqn_only(leveled_codec:ledger_kv()|not_present) @@ -1160,13 +1118,15 @@ sqn_only(not_present) -> sqn_only(KV) -> leveled_codec:strip_to_seqonly(KV). +-spec extract_hash( + leveled_codec:segment_hash()) -> non_neg_integer()|no_lookup. extract_hash({SegHash, _ExtraHash}) when is_integer(SegHash) -> tune_hash(SegHash); extract_hash(NotHash) -> NotHash. --spec new_cache(level()) -> fetch_cache(). +-spec new_cache(leveled_pmanifest:lsm_level()) -> fetch_cache(). 
new_cache(Level) -> case cache_size(Level) of no_cache -> @@ -1188,7 +1148,7 @@ cache_hash({_SegHash, ExtraHash}, Level) when is_integer(ExtraHash) -> %% as each level has more files than the previous level. Load tests with %% any sort of pareto distribution show far better cost/benefit ratios for %% cache at higher levels. --spec cache_size(level()) -> cache_size(). +-spec cache_size(leveled_pmanifest:lsm_level()) -> cache_size(). cache_size(N) when N < 3 -> 64; cache_size(3) -> @@ -1220,25 +1180,18 @@ add_to_cache(CacheHash, KV, FetchCache) -> array:set(CacheHash, KV, FetchCache). --spec tune_hash(non_neg_integer()) -> non_neg_integer(). +-spec tune_hash(non_neg_integer()) -> ?MIN_HASH..?MAX_HASH. %% @doc -%% Only 15 bits of the hash is ever interesting +%% Only 15 bits of the hash is ever interesting, and this is converted +%% into a 16-bit hash for matching by adding 2 ^ 15 (i.e. a leading 1) tune_hash(SegHash) -> - SegHash band 32767. + ?MIN_HASH + (SegHash band (?MIN_HASH - 1)). -spec tune_seglist(leveled_codec:segment_list()) -> tuned_seglist(). -%% @doc -%% Only 15 bits of the hash is ever interesting tune_seglist(SegList) -> case is_list(SegList) of true -> - SL0 = lists:usort(lists:map(fun tune_hash/1, SegList)), - case length(SL0) > ?USE_SET_FOR_SPEED of - true -> - {sets, sets:from_list(SL0)}; - false -> - {list, SL0} - end; + lists:usort(lists:map(fun tune_hash/1, SegList)); false -> false end. @@ -1289,11 +1242,12 @@ updatebic_foldfun(HMDRequired) -> end. -spec update_blockindex_cache( - boolean(), list({integer(), binary()}), - blockindex_cache(), non_neg_integer()|undefined, + list({integer(), binary()}), + blockindex_cache(), + non_neg_integer()|undefined, boolean()) -> {boolean(), blockindex_cache(), non_neg_integer()|undefined}. 
-update_blockindex_cache(true, Entries, BIC, HighModDate, IdxModDate) -> +update_blockindex_cache(Entries, BIC, HighModDate, IdxModDate) -> case {element(1, BIC), array:size(element(2, BIC))} of {N, N} -> {false, BIC, HighModDate}; @@ -1315,9 +1269,7 @@ update_blockindex_cache(true, Entries, BIC, HighModDate, IdxModDate) -> _ -> {true, BIC0, undefined} end - end; -update_blockindex_cache(_Needed, _Entries, BIC, HighModDate, _IdxModDate) -> - {false, BIC, HighModDate}. + end. -spec check_modified(non_neg_integer()|undefined, non_neg_integer(), @@ -1345,8 +1297,8 @@ check_modified(_, _, _) -> blockindex_cache()|no_update, non_neg_integer()|undefined|no_update, fetch_cache()|no_update}. + %% @doc -%% %% Fetch a key from the store, potentially taking timings. Result should be %% not_present if the key is not in the store. fetch(LedgerKey, Hash, @@ -1366,11 +1318,8 @@ fetch(LedgerKey, Hash, binaryslot_get( SlotBin, LedgerKey, Hash, PressMethod, IndexModDate), {_UpdateState, BIC0, HMD0} = - update_blockindex_cache(true, - [{SlotID, Header}], - BIC, - HighModDate, - IndexModDate), + update_blockindex_cache( + [{SlotID, Header}], BIC, HighModDate, IndexModDate), case Result of not_present -> maybelog_fetch_timing( @@ -1381,7 +1330,8 @@ fetch(LedgerKey, Hash, end, {Result, BIC0, HMD0, no_update}; {BlockLengths, _LMD, PosBin} -> - PosList = find_pos(PosBin, extract_hash(Hash), [], 0), + PosList = + find_pos(PosBin, segment_checker(extract_hash(Hash))), case PosList of [] -> maybelog_fetch_timing(Monitor, Level, not_found, SW0), @@ -1396,14 +1346,15 @@ fetch(LedgerKey, Hash, _ -> StartPos = Slot#slot_index_value.start_position, Result = - check_blocks(PosList, - {Handle, StartPos}, - BlockLengths, - byte_size(PosBin), - LedgerKey, - PressMethod, - IndexModDate, - not_present), + check_blocks( + PosList, + {Handle, StartPos}, + BlockLengths, + byte_size(PosBin), + LedgerKey, + PressMethod, + IndexModDate, + not_present), case Result of not_present -> maybelog_fetch_timing( @@ 
-1424,92 +1375,65 @@ fetch(LedgerKey, Hash, end. --spec fetch_range(tuple(), tuple(), integer(), - leveled_codec:segment_list(), non_neg_integer(), - sst_state()) -> - {boolean(), list(), list()}. +-spec fetch_range( + range_endpoint(), + range_endpoint(), + sst_summary(), + summary_filter(), + boolean()) -> list(slot_pointer()). %% @doc -%% Fetch the contents of the SST file for a given key range. This will -%% pre-fetch some results, and append pointers for additional results. -%% -%% A filter can be provided based on the Segment ID (usable for hashable -%% objects not no_lookup entries) to accelerate the query if the 5-arity -%% version is used -fetch_range(StartKey, EndKey, ScanWidth, SegList, LowLastMod, State) -> - Summary = State#state.summary, - Handle = State#state.handle, +%% Fetch pointers to the slots the SST file covered by a given key range. +fetch_range(StartKey, EndKey, Summary, FilterFun, true) -> {Slots, RTrim} = lookup_slots( StartKey, EndKey, Summary#summary.index, - State#state.filter_fun), + FilterFun), Self = self(), SL = length(Slots), - - ExpandedSlots = - case SL of - 1 -> - [Slot] = Slots, - case RTrim of - true -> - [{pointer, Self, Slot, StartKey, EndKey}]; - false -> - [{pointer, Self, Slot, StartKey, all}] - end; - N -> - {LSlot, MidSlots, RSlot} = - case N of - 2 -> - [Slot1, Slot2] = Slots, - {Slot1, [], Slot2}; - N -> - [Slot1|_Rest] = Slots, - SlotN = lists:last(Slots), - {Slot1, lists:sublist(Slots, 2, N - 2), SlotN} - end, - MidSlotPointers = lists:map(fun(S) -> - {pointer, Self, S, all, all} - end, - MidSlots), - case RTrim of - true -> - [{pointer, Self, LSlot, StartKey, all}] ++ - MidSlotPointers ++ - [{pointer, Self, RSlot, all, EndKey}]; - false -> - [{pointer, Self, LSlot, StartKey, all}] ++ - MidSlotPointers ++ - [{pointer, Self, RSlot, all, all}] - end - end, - {SlotsToFetch, SlotsToPoint} = - case ScanWidth of - SW when SW >= SL -> - {ExpandedSlots, []}; - _ -> - lists:split(ScanWidth, ExpandedSlots) - end, - - 
{NeededBlockIdx, SlotsToFetchBinList} = - read_slots(Handle, - SlotsToFetch, - {SegList, LowLastMod, State#state.blockindex_cache}, - State#state.compression_method, - State#state.index_moddate), - {NeededBlockIdx, SlotsToFetchBinList, SlotsToPoint}. + case SL of + 1 -> + [Slot] = Slots, + case RTrim of + true -> + [{pointer, Self, Slot, StartKey, EndKey}]; + false -> + [{pointer, Self, Slot, StartKey, all}] + end; + N -> + {LSlot, MidSlots, RSlot} = + {hd(Slots), lists:sublist(Slots, 2, N - 2), lists:last(Slots)}, + MidSlotPointers = + lists:map( + fun(S) -> {pointer, Self, S, all, all} end, + MidSlots), + case RTrim of + true -> + [{pointer, Self, LSlot, StartKey, all}] ++ + MidSlotPointers ++ + [{pointer, Self, RSlot, all, EndKey}]; + false -> + [{pointer, Self, LSlot, StartKey, all}] ++ + MidSlotPointers ++ + [{pointer, Self, RSlot, all, all}] + end + end; +fetch_range(_StartKey, _EndKey, _Summary, _FilterFun, false) -> + []. -spec compress_level( non_neg_integer(), non_neg_integer(), press_method()) -> press_method(). %% @doc -%% disable compression at higher levels for improved performance +%% Disable compression at higher levels for improved performance compress_level( Level, LevelToCompress, _PressMethod) when Level < LevelToCompress -> none; compress_level(_Level, _LevelToCompress, PressMethod) -> PressMethod. --spec maxslots_level(level(), pos_integer()) -> pos_integer(). +-spec maxslots_level( + leveled_pmanifest:lsm_level(), pos_integer()) -> pos_integer(). maxslots_level(Level, MaxSlotCount) when Level < ?DOUBLESIZE_LEVEL -> MaxSlotCount; maxslots_level(_Level, MaxSlotCount) -> @@ -1627,17 +1551,18 @@ open_reader(Filename, LoadPageCache) -> {ok, SummaryBin} = file:pread(Handle, SlotsLength + 9, SummaryLength), {Handle, FileVersion, SummaryBin}. 
-build_table_summary(SlotIndex, _Level, FirstKey,
-                    SlotCount, MaxSQN, Bloom, CountOfTombs) ->
+build_table_summary(
+        SlotIndex, _Level, FirstKey, SlotCount, MaxSQN, Bloom, CountOfTombs) ->
     [{LastKey, _LastV}|_Rest] = SlotIndex,
-    Summary = #summary{first_key = FirstKey,
-                       last_key = LastKey,
-                       size = SlotCount,
-                       max_sqn = MaxSQN},
+    Summary =
+        #summary{
+            first_key = FirstKey,
+            last_key = LastKey,
+            size = SlotCount,
+            max_sqn = MaxSQN},
     SummBin0 =
-        term_to_binary({Summary, Bloom, lists:reverse(SlotIndex)},
-                        ?BINARY_SETTINGS),
-
+        term_to_binary(
+            {Summary, Bloom, lists:reverse(SlotIndex)}, ?BINARY_SETTINGS),
     SummBin =
         case CountOfTombs of
             not_counted ->
@@ -1645,7 +1570,6 @@ build_table_summary(SlotIndex, _Level, FirstKey,
             I ->
                 <<I:32/integer, SummBin0/binary>>
         end,
-
     SummCRC = hmac(SummBin),
     <<SummCRC:32/integer, SummBin/binary>>.
 
@@ -1665,8 +1589,8 @@ read_table_summary(BinWithCheck, TombCount) ->
             % If not might it might be possible to rebuild from all the slots
             case TombCount of
                 not_counted ->
-                    erlang:append_element(binary_to_term(SummBin),
-                                            not_counted);
+                    erlang:append_element(
+                        binary_to_term(SummBin), not_counted);
                 _ ->
                     <<I:32/integer, SummBin0/binary>> = SummBin,
                     erlang:append_element(binary_to_term(SummBin0), I)
@@ -1677,33 +1601,32 @@ read_table_summary(BinWithCheck, TombCount) ->
 build_all_slots(SlotList) ->
     SlotCount = length(SlotList),
     {SlotIndex, BlockIndex, SlotsBin, HashLists} =
-        build_all_slots(SlotList,
-                        9,
-                        1,
-                        [],
-                        [],
-                        <<>>,
-                        []),
+        build_all_slots(
+            SlotList, 9, 1, [], [], <<>>, []),
     Bloom = leveled_ebloom:create_bloom(HashLists),
     {SlotCount, SlotIndex, BlockIndex, SlotsBin, Bloom}.
-build_all_slots([], _Pos, _SlotID,
-                    SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
+build_all_slots(
+    [],
+    _Pos, _SlotID, SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
     {SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists};
-build_all_slots([SlotD|Rest], Pos, SlotID,
-                    SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
+build_all_slots(
+    [SlotD|Rest],
+    Pos, SlotID, SlotIdxAcc, BlockIdxAcc, SlotBinAcc, HashLists) ->
     {BlockIdx, SlotBin, HashList, LastKey} = SlotD,
     Length = byte_size(SlotBin),
-    SlotIndexV = #slot_index_value{slot_id = SlotID,
-                                    start_position = Pos,
-                                    length = Length},
-    build_all_slots(Rest,
-                    Pos + Length,
-                    SlotID + 1,
-                    [{LastKey, SlotIndexV}|SlotIdxAcc],
-                    [{SlotID, BlockIdx}|BlockIdxAcc],
-                    <<SlotBinAcc/binary, SlotBin/binary>>,
-                    lists:append(HashLists, HashList)).
+    SlotIndexV =
+        #slot_index_value{
+            slot_id = SlotID, start_position = Pos, length = Length},
+    build_all_slots(
+        Rest,
+        Pos + Length,
+        SlotID + 1,
+        [{LastKey, SlotIndexV}|SlotIdxAcc],
+        [{SlotID, BlockIdx}|BlockIdxAcc],
+        <<SlotBinAcc/binary, SlotBin/binary>>,
+        lists:append(HashList, HashLists)
+    ).
 
 
 generate_filenames(RootFilename) ->
@@ -1767,8 +1690,6 @@ deserialise_checkedblock(Bin, _Other) ->
     % native or none can be treated the same
     binary_to_term(Bin).
 
-
-
 -spec hmac(binary()|integer()) -> integer().
 %% @doc
 %% Perform a CRC check on an input
@@ -1911,7 +1832,7 @@ lookup_slots(StartKey, EndKey, Tree, FilterFun) ->
 %% binary_to_term is an often repeated task, and this is better with smaller
 %% slots.
 %%
-%% The outcome has been to divide the slot into four small blocks to minimise
+%% The outcome has been to divide the slot into five small blocks to minimise
 %% the binary_to_term time.  A binary index is provided for the slot for all
 %% Keys that are directly fetchable (i.e. standard keys not index keys).
 %%
@@ -1919,54 +1840,58 @@ lookup_slots(StartKey, EndKey, Tree, FilterFun) ->
 %% compared to using a 128-member gb:tree.
 %%
 %% The binary index is cacheable and doubles as a not_present filter, as it is
-%% based on a 17-bit hash (so 0.0039 fpr).
-
-
--spec accumulate_positions(leveled_codec:ledger_kv(),
-                            {binary(),
-                                non_neg_integer(),
-                                list(non_neg_integer()),
-                                leveled_codec:last_moddate()}) ->
-                                    {binary(),
-                                        non_neg_integer(),
-                                        list(non_neg_integer()),
-                                        leveled_codec:last_moddate()}.
+%% based on a 15-bit hash.
+
+
+-spec accumulate_positions(
+    list(leveled_codec:ledger_kv()),
+    {binary(),
+        non_neg_integer(),
+        list(leveled_codec:segment_hash()),
+        leveled_codec:last_moddate()}) ->
+            {binary(),
+                non_neg_integer(),
+                list(leveled_codec:segment_hash()),
+                leveled_codec:last_moddate()}.
 %% @doc
 %% Fold function use to accumulate the position information needed to
 %% populate the summary of the slot
-accumulate_positions({K, V}, {PosBinAcc, NoHashCount, HashAcc, LMDAcc}) ->
+accumulate_positions([], Acc) ->
+    Acc;
+accumulate_positions([{K, V}|T], {PosBin, NoHashCount, HashAcc, LMDAcc}) ->
     {_SQN, H1, LMD} = leveled_codec:strip_to_indexdetails({K, V}),
     LMDAcc0 = take_max_lastmoddate(LMD, LMDAcc),
-    PosH1 = extract_hash(H1),
-    case is_integer(PosH1) of
-        true ->
+    case extract_hash(H1) of
+        PosH1 when is_integer(PosH1) ->
             case NoHashCount of
                 0 ->
-                    {<<1:1/integer, PosH1:15/integer,PosBinAcc/binary>>,
-                        0,
-                        [H1|HashAcc],
-                        LMDAcc0};
-                N ->
+                    accumulate_positions(
+                        T,
+                        {<<PosH1:16/integer, PosBin/binary>>,
+                            0,
+                            [H1|HashAcc],
+                            LMDAcc0}
+                    );
+                N when N =< 128 ->
                     % The No Hash Count is an integer between 0 and 127
                     % and so at read time should count NHC + 1
                     NHC = N - 1,
-                    {<<1:1/integer,
-                        PosH1:15/integer,
-                        0:1/integer,
-                        NHC:7/integer,
-                        PosBinAcc/binary>>,
-                        0,
-                        [H1|HashAcc],
-                        LMDAcc0}
+                    accumulate_positions(
+                        T,
+                        {<<PosH1:16/integer, NHC:8/integer, PosBin/binary>>,
+                            0,
+                            [H1|HashAcc],
+                            LMDAcc0})
             end;
-        false ->
-            {PosBinAcc, NoHashCount + 1, HashAcc, LMDAcc0}
+        _ ->
+            accumulate_positions(
+                T, {PosBin, NoHashCount + 1, HashAcc, LMDAcc0})
     end.
--spec take_max_lastmoddate(leveled_codec:last_moddate(), - leveled_codec:last_moddate()) -> - leveled_codec:last_moddate(). +-spec take_max_lastmoddate( + leveled_codec:last_moddate(), leveled_codec:last_moddate()) + -> leveled_codec:last_moddate(). %% @doc %% Get the last modified date. If no Last Modified Date on any object, can't %% add the accelerator and should check each object in turn @@ -1975,26 +1900,33 @@ take_max_lastmoddate(undefined, _LMDAcc) -> take_max_lastmoddate(LMD, LMDAcc) -> max(LMD, LMDAcc). --spec generate_binary_slot(leveled_codec:maybe_lookup(), - list(leveled_codec:ledger_kv()), - press_method(), - boolean(), - build_timings()) -> - {binary_slot(), - build_timings()}. +-spec generate_binary_slot( + leveled_codec:maybe_lookup(), + {forward|reverse, list(leveled_codec:ledger_kv())}, + press_method(), + boolean(), + build_timings()) -> {binary_slot(), build_timings()}. %% @doc %% Generate the serialised slot to be used when storing this sublist of keys %% and values -generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) -> - - SW0 = os:timestamp(), +generate_binary_slot( + Lookup, {DR, KVL0}, PressMethod, IndexModDate, BuildTimings0) -> + % The slot should be received reversed - get last key before flipping + % accumulate_positions/2 should use the reversed KVL for efficiency + {KVL, KVLr} = + case DR of + forward -> + {KVL0, lists:reverse(KVL0)}; + reverse -> + {lists:reverse(KVL0), KVL0} + end, + LastKey = element(1, hd(KVLr)), {HashL, PosBinIndex, LMD} = case Lookup of lookup -> - InitAcc = {<<>>, 0, [], 0}, {PosBinIndex0, NHC, HashL0, LMD0} = - lists:foldr(fun accumulate_positions/2, InitAcc, KVL), + accumulate_positions(KVLr, {<<>>, 0, [], 0}), PosBinIndex1 = case NHC of 0 -> @@ -2008,8 +1940,7 @@ generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) -> {[], <<0:1/integer, 127:7/integer>>, 0} end, - BuildTimings1 = update_buildtimings(SW0, BuildTimings0, slot_hashlist), - SW1 = os:timestamp(), 
+ BuildTimings1 = update_buildtimings(BuildTimings0, slot_hashlist), {SideBlockSize, MidBlockSize} = case Lookup of @@ -2063,8 +1994,7 @@ generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) -> serialise_block(KVLE, PressMethod)} end, - BuildTimings2 = update_buildtimings(SW1, BuildTimings1, slot_serialise), - SW2 = os:timestamp(), + BuildTimings2 = update_buildtimings(BuildTimings1, slot_serialise), B1P = case IndexModDate of @@ -2102,9 +2032,7 @@ generate_binary_slot(Lookup, KVL, PressMethod, IndexModDate, BuildTimings0) -> CheckH:32/integer, Header/binary, B1/binary, B2/binary, B3/binary, B4/binary, B5/binary>>, - {LastKey, _LV} = lists:last(KVL), - - BuildTimings3 = update_buildtimings(SW2, BuildTimings2, slot_finish), + BuildTimings3 = update_buildtimings(BuildTimings2, slot_finish), {{Header, SlotBin, HashL, LastKey}, BuildTimings3}. @@ -2201,27 +2129,22 @@ read_slot(Handle, Slot) -> Slot#slot_index_value.length), SlotBin. - -pointer_mapfun(Pointer) -> - {Slot, SK, EK} = - case Pointer of - {pointer, _Pid, Slot0, SK0, EK0} -> - {Slot0, SK0, EK0}; - {pointer, Slot0, SK0, EK0} -> - {Slot0, SK0, EK0} - end, - +-spec pointer_mapfun( + slot_pointer()) -> + {non_neg_integer(), non_neg_integer(), non_neg_integer(), + range_endpoint(), range_endpoint()}. +pointer_mapfun({pointer, _Pid, Slot, SK, EK}) -> {Slot#slot_index_value.start_position, Slot#slot_index_value.length, Slot#slot_index_value.slot_id, SK, EK}. - -type slotbin_fun() :: fun(({non_neg_integer(), non_neg_integer(), non_neg_integer(), - range_endpoint(), range_endpoint()}) -> - {binary(), non_neg_integer(), range_endpoint(), range_endpoint()}). + range_endpoint(), range_endpoint()}) + -> expanded_slot() + ). -spec binarysplit_mapfun(binary(), integer()) -> slotbin_fun(). %% @doc @@ -2234,18 +2157,15 @@ binarysplit_mapfun(MultiSlotBin, StartPos) -> {SlotBin, ID, SK, EK} end. 
- --spec read_slots(file:io_device(), list(), - {false|list(), non_neg_integer(), blockindex_cache()}, - press_method(), boolean()) -> - {boolean(), list(binaryslot_element())}. +-spec read_slots( + file:io_device(), + list(), + {segment_check_fun(), non_neg_integer(), blockindex_cache()}, + press_method(), + boolean()) + -> {boolean(), list(expanded_slot()|leveled_codec:ledger_kv())}. %% @doc -%% The reading of sots will return a list of either 2-tuples containing -%% {K, V} pairs - or 3-tuples containing {Binary, SK, EK}. The 3 tuples -%% can be exploded into lists of {K, V} pairs using the binaryslot_reader/4 -%% function -%% -%% Reading slots is generally unfiltered, but in the sepcial case when +%% Reading slots is generally unfiltered, but in the special case when %% querting across slots when only matching segment IDs are required the %% BlockIndexCache can be used %% @@ -2258,12 +2178,13 @@ read_slots(Handle, SlotList, {false, 0, _BlockIndexCache}, % No list of segments passed or useful Low LastModified Date % Just read slots in SlotList {false, read_slotlist(SlotList, Handle)}; -read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, +read_slots(Handle, SlotList, {SegChecker, LowLastMod, BlockIndexCache}, PressMethod, IdxModDate) -> - % List of segments passed so only {K, V} pairs matching those segments - % should be returned. This required the {K, V} pair to have been added - % with the appropriate hash - if the pair were added with no_lookup as - % the hash value this will fail unexpectedly. + % Potentially need to check the low last modified date, and also the + % segment_check_fun against the index. 
If the index is cached, return the + % KV pairs at this point, otherwise return the slot pointer so that the + % term_to_binary work can be conducted by the fold process and not impact + % the heap of this SST process BinMapFun = fun(Pointer, {NeededBlockIdx, Acc}) -> {SP, _L, ID, SK, EK} = pointer_mapfun(Pointer), @@ -2273,17 +2194,13 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, % If there is an attempt to use the seg list query and the % index block cache isn't cached for any part this may be % slower as each slot will be read in turn - {true, Acc ++ read_slotlist([Pointer], Handle)}; + {true, read_slotlist([Pointer], Handle) ++ Acc}; {BlockLengths, LMD, BlockIdx} -> % If there is a BlockIndex cached then we can use it to % check to see if any of the expected segments are % present without lifting the slot off disk. Also the % fact that we know position can be used to filter out - % other keys - % - % Note that LMD will be 0 if the indexing of last mod - % date was not enable at creation time. So in this - % case the filter should always map + % blocks. 
case LowLastMod > LMD of true -> % The highest LMD on the slot was before the @@ -2292,51 +2209,48 @@ read_slots(Handle, SlotList, {SegList, LowLastMod, BlockIndexCache}, % slot - it is all too old {NeededBlockIdx, Acc}; false -> - case SegList of + case SegChecker of false -> - % Need all the slot now + % No SegChecker - need all the slot now {NeededBlockIdx, - Acc ++ - read_slotlist([Pointer], Handle)}; - _SL -> - % Need to find just the right keys - PositionList = - find_pos(BlockIdx, SegList, [], 0), - % Note check_blocks should return [] if - % PositionList is empty (which it may be) - KVL = - check_blocks(PositionList, - {Handle, SP}, - BlockLengths, - byte_size(BlockIdx), - false, - PressMethod, - IdxModDate, - []), - % There is no range passed through to the - % binaryslot_reader, so these results need - % to be filtered - FilterFun = - fun(KV) -> in_range(KV, SK, EK) end, - {NeededBlockIdx, - Acc ++ lists:filter(FilterFun, KVL)} + read_slotlist([Pointer], Handle) ++ Acc + }; + SegChecker -> + TrimmedKVL = + checkblocks_segandrange( + BlockIdx, + {Handle, SP}, + BlockLengths, + PressMethod, + IdxModDate, + SegChecker, + {SK, EK}), + {NeededBlockIdx, TrimmedKVL ++ Acc} end end end end, - lists:foldl(BinMapFun, {false, []}, SlotList). - - --spec in_range(leveled_codec:ledger_kv(), - range_endpoint(), range_endpoint()) -> boolean(). -%% @doc -%% Is the ledger key in the range. -in_range({_LK, _LV}, all, all) -> - true; -in_range({LK, _LV}, all, EK) -> - not leveled_codec:endkey_passed(EK, LK); -in_range({LK, LV}, SK, EK) -> - (LK >= SK) and in_range({LK, LV}, all, EK). + lists:foldr(BinMapFun, {false, []}, SlotList). + + +-spec checkblocks_segandrange( + binary(), + binary()|{file:io_device(), integer()}, + binary(), + press_method(), + boolean(), + segment_check_fun(), + {range_endpoint(), range_endpoint()}) + -> list(leveled_codec:ledger_kv()). 
+checkblocks_segandrange( + BlockIdx, SlotOrHandle, BlockLengths, + PressMethod, IdxModDate, SegChecker, {StartKey, EndKey}) -> + PositionList = find_pos(BlockIdx, SegChecker), + KVL = + check_blocks( + PositionList, SlotOrHandle, BlockLengths, byte_size(BlockIdx), + false, PressMethod, IdxModDate, []), + in_range(KVL, StartKey, EndKey). read_slotlist(SlotList, Handle) -> @@ -2345,12 +2259,13 @@ read_slotlist(SlotList, Handle) -> lists:map(binarysplit_mapfun(MultiSlotBin, StartPos), LengthList). --spec binaryslot_reader(list(binaryslot_element()), - press_method(), - boolean(), - leveled_codec:segment_list()) - -> {list({tuple(), tuple()}), - list({integer(), binary()})}. +-spec binaryslot_reader( + list(expanded_slot()), + press_method(), + boolean(), + segment_check_fun(), + list(expandable_pointer())) + -> {list({tuple(), tuple()}), list({integer(), binary()})}. %% @doc %% Read the binary slots converting them to {K, V} pairs if they were not %% already {K, V} pairs. If they are already {K, V} pairs it is assumed @@ -2358,11 +2273,12 @@ read_slotlist(SlotList, Handle) -> %% %% Keys which are still to be extracted from the slot, are accompanied at %% this function by the range against which the keys need to be checked. -%% This range is passed with the slot to binaryslot_trimmedlist which should -%% open the slot block by block, filtering individual keys where the endpoints -%% of the block are outside of the range, and leaving blocks already proven to -%% be outside of the range unopened. -binaryslot_reader(SlotBinsToFetch, PressMethod, IdxModDate, SegList) -> +%% This range is passed with the slot to binaryslot_trimmed which +%% should open the slot block by block, filtering individual keys where the +%% endpoints of the block are outside of the range, and leaving blocks already +%% proven to be outside of the range unopened. +binaryslot_reader( + SlotBinsToFetch, PressMethod, IdxModDate, SegChecker, SlotsToPoint) -> % Two accumulators are added. 
% One to collect the list of keys and values found in the binary slots % (subject to range filtering if the slot is still deserialised at this @@ -2372,37 +2288,34 @@ binaryslot_reader(SlotBinsToFetch, PressMethod, IdxModDate, SegList) -> % of get_kvreader calls. This means that slots which are only used in % range queries can still populate their block_index caches (on the FSM % loop state), and those caches can be used for future queries. - binaryslot_reader(SlotBinsToFetch, - PressMethod, IdxModDate, SegList, [], []). + {Acc, BIAcc} = + binaryslot_reader( + SlotBinsToFetch, PressMethod, IdxModDate, SegChecker, [], []), + {lists:reverse(lists:reverse(SlotsToPoint) ++ Acc), BIAcc}. -binaryslot_reader([], _PressMethod, _IdxModDate, _SegList, Acc, BIAcc) -> +binaryslot_reader([], _PressMethod, _IdxModDate, _SegChecker, Acc, BIAcc) -> {Acc, BIAcc}; -binaryslot_reader([{SlotBin, ID, SK, EK}|Tail], - PressMethod, IdxModDate, SegList, Acc, BIAcc) -> +binaryslot_reader( + [{SlotBin, ID, SK, EK}|Tail], + PressMethod, IdxModDate, SegChecker, Acc, BIAcc) -> % The start key and end key here, may not the start key and end key the % application passed into the query. If the slot is known to lie entirely % inside the range, on either of both sides, the SK and EK may be % substituted for the 'all' key work to indicate there is no need for % entries in this slot to be trimmed from either or both sides. 
{TrimmedL, BICache} = - binaryslot_trimmedlist(SlotBin, - SK, EK, - PressMethod, - IdxModDate, - SegList), - binaryslot_reader(Tail, - PressMethod, - IdxModDate, - SegList, - Acc ++ TrimmedL, - [{ID, BICache}|BIAcc]); -binaryslot_reader([{K, V}|Tail], - PressMethod, IdxModDate, SegList, Acc, BIAcc) -> + binaryslot_trimmed( + SlotBin, SK, EK, PressMethod, IdxModDate, SegChecker), + binaryslot_reader( + Tail, PressMethod, IdxModDate, SegChecker, + lists:reverse(TrimmedL) ++ Acc, [{ID, BICache}|BIAcc]); +binaryslot_reader(L, PressMethod, IdxModDate, SegChecker, Acc, BIAcc) -> + {KVs, Tail} = lists:splitwith(fun(SR) -> tuple_size(SR) == 2 end, L), % These entries must already have been filtered for membership inside any % range used in the query. - binaryslot_reader(Tail, - PressMethod, IdxModDate, SegList, - Acc ++ [{K, V}], BIAcc). + binaryslot_reader( + Tail, PressMethod, IdxModDate, SegChecker, + lists:reverse(KVs) ++ Acc, BIAcc). read_length_list(Handle, LengthList) -> @@ -2413,8 +2326,8 @@ read_length_list(Handle, LengthList) -> {MultiSlotBin, StartPos}. --spec extract_header(binary()|none, boolean()) -> - {binary(), non_neg_integer(), binary()}|none. +-spec extract_header( + binary()|none, boolean()) -> {binary(), non_neg_integer(), binary()}|none. %% @doc %% Helper for extracting the binaries from the header ignoring the missing LMD %% if LMD is not indexed @@ -2434,10 +2347,8 @@ binaryslot_get(FullBin, Key, Hash, PressMethod, IdxModDate) -> {Header, Blocks} -> {BlockLengths, _LMD, PosBinIndex} = extract_header(Header, IdxModDate), - PosList = find_pos(PosBinIndex, - extract_hash(Hash), - [], - 0), + PosList = + find_pos(PosBinIndex, segment_checker(extract_hash(Hash))), {fetch_value(PosList, BlockLengths, Blocks, Key, PressMethod), Header}; crc_wonky -> @@ -2445,90 +2356,63 @@ binaryslot_get(FullBin, Key, Hash, PressMethod, IdxModDate) -> none} end. 
-binaryslot_tolist(FullBin, PressMethod, IdxModDate) ->
-    BlockFetchFun =
-        fun(Length, {Acc, Bin}) ->
-            case Length of
-                0 ->
-                    {Acc, Bin};
-                _ ->
-                    <<Block:Length/binary, Rest/binary>> = Bin,
-                    {Acc ++ deserialise_block(Block, PressMethod), Rest}
-            end
-        end,
-
-    {Out, _Rem} =
-        case crc_check_slot(FullBin) of
-            {Header, Blocks} ->
-                {BlockLengths, _LMD, _PosBinIndex} =
-                    extract_header(Header, IdxModDate),
-                <<B1L:32/integer, B2L:32/integer, B3L:32/integer,
-                    B4L:32/integer, B5L:32/integer>> = BlockLengths,
-                lists:foldl(BlockFetchFun,
-                            {[], Blocks},
-                            [B1L, B2L, B3L, B4L, B5L]);
-            crc_wonky ->
-                {[], <<>>}
-        end,
-    Out.
+-spec binaryslot_blockstolist(
+    list(non_neg_integer()),
+    binary(),
+    press_method(),
+    list(leveled_codec:ledger_kv())) -> list(leveled_codec:ledger_kv()).
+binaryslot_blockstolist([], _Bin, _PressMethod, Acc) ->
+    Acc;
+binaryslot_blockstolist([0|RestLengths], RestBin, PressMethod, Acc) ->
+    binaryslot_blockstolist(RestLengths, RestBin, PressMethod, Acc);
+binaryslot_blockstolist([L|RestLengths], Bin, PressMethod, Acc) ->
+    <<Block:L/binary, RestBin/binary>> = Bin,
+    binaryslot_blockstolist(
+        RestLengths,
+        RestBin,
+        PressMethod,
+        Acc ++ deserialise_block(Block, PressMethod)).
+
+-spec binaryslot_tolist(
+    binary(), press_method(), boolean())
+        -> list(leveled_codec:ledger_kv()).
+binaryslot_tolist(FullBin, PressMethod, IdxModDate) ->
+    case crc_check_slot(FullBin) of
+        {Header, Blocks} ->
+            {BlockLengths, _LMD, _PosBinIndex} =
+                extract_header(Header, IdxModDate),
+            <<B1L:32/integer, B2L:32/integer, B3L:32/integer,
+                B4L:32/integer, B5L:32/integer>> = BlockLengths,
+            binaryslot_blockstolist(
+                [B1L, B2L, B3L, B4L, B5L], Blocks, PressMethod, []);
+        crc_wonky ->
+            []
+    end.
 
-binaryslot_trimmedlist(FullBin, all, all,
-                        PressMethod, IdxModDate, false) ->
+-spec binaryslot_trimmed(
+    binary(),
+    range_endpoint(),
+    range_endpoint(),
+    press_method(),
+    boolean(),
+    segment_check_fun()) ->
+        {list(leveled_codec:ledger_kv()),
+            list({integer(), binary()})|none}.
+%% @doc +%% Must return a trimmed and reversed list of results in the range +binaryslot_trimmed( + FullBin, all, all, PressMethod, IdxModDate, false) -> {binaryslot_tolist(FullBin, PressMethod, IdxModDate), none}; -binaryslot_trimmedlist(FullBin, StartKey, EndKey, - PressMethod, IdxModDate, SegList) -> - LTrimFun = fun({K, _V}) -> K < StartKey end, - RTrimFun = fun({K, _V}) -> not leveled_codec:endkey_passed(EndKey, K) end, - BlockCheckFun = - fun(Block, {Acc, Continue}) -> - case {Block, Continue} of - {<<>>, _} -> - {Acc, false}; - {_, true} -> - BlockList = - case is_binary(Block) of - true -> - deserialise_block(Block, PressMethod); - false -> - Block - end, - case fetchends_rawblock(BlockList) of - {_, LastKey} when StartKey > LastKey -> - %% This includes the case when LastKey is - %% not_present due to corruption in the BlockList - %% as tuple is > not_present. - {Acc, true}; - {_, LastKey} -> - {_LDrop, RKeep} = lists:splitwith(LTrimFun, - BlockList), - case leveled_codec:endkey_passed(EndKey, - LastKey) of - true -> - {LKeep, _RDrop} - = lists:splitwith(RTrimFun, RKeep), - {Acc ++ LKeep, false}; - false -> - {Acc ++ RKeep, true} - end - end; - {_ , false} -> - {Acc, false} - end - end, - - case {crc_check_slot(FullBin), SegList} of - % It will be more effecient to check a subset of blocks. 
To work out - % the best subset we always look in the middle block of 5, and based on - % the first and last keys of that middle block when compared to the Start - % and EndKey of the query determines a subset of blocks - % - % This isn't perfectly efficient, esepcially if the query overlaps Block2 - % and Block3 (as Block 1 will also be checked), but finessing this last - % scenario is hard to do in concise code +binaryslot_trimmed( + FullBin, StartKey, EndKey, PressMethod, IdxModDate, SegmentChecker) -> + case {crc_check_slot(FullBin), SegmentChecker} of + % Get a trimmed list of keys in the slot based on the range, trying + % to minimise the number of blocks which are deserialised by + % checking the middle block first. {{Header, Blocks}, false} -> {BlockLengths, _LMD, _PosBinIndex} = extract_header(Header, IdxModDate), @@ -2540,66 +2424,141 @@ binaryslot_trimmedlist(FullBin, StartKey, EndKey, <> = Blocks, - BlocksToCheck = - blocks_required({StartKey, EndKey}, - [Block1, Block2, MidBlock, Block4, Block5], - PressMethod), - {Acc, _Continue} = - lists:foldl(BlockCheckFun, {[], true}, BlocksToCheck), - {Acc, none}; - {{Header, _Blocks}, SegList} -> + TrimmedKVL = + blocks_required( + {StartKey, EndKey}, + Block1, Block2, MidBlock, Block4, Block5, + PressMethod), + {TrimmedKVL, none}; + {{Header, _Blocks}, SegmentChecker} -> {BlockLengths, _LMD, BlockIdx} = extract_header(Header, IdxModDate), - PosList = find_pos(BlockIdx, SegList, [], 0), - KVL = check_blocks(PosList, - FullBin, - BlockLengths, - byte_size(BlockIdx), - false, - PressMethod, - IdxModDate, - []), - {KVL, Header}; + TrimmedKVL = + checkblocks_segandrange( + BlockIdx, + FullBin, + BlockLengths, + PressMethod, + IdxModDate, + SegmentChecker, + {StartKey, EndKey}), + {TrimmedKVL, Header}; {crc_wonky, _} -> {[], none} end. 
- -blocks_required({StartKey, EndKey}, [B1, B2, MidBlock, B4, B5], PressMethod) -> +-spec blocks_required( + {range_endpoint(), range_endpoint()}, + binary(), binary(), binary(), binary(), binary(), + press_method()) -> list(leveled_codec:ledger_kv()). +blocks_required( + {StartKey, EndKey}, B1, B2, MidBlock, B4, B5, PressMethod) -> MidBlockList = deserialise_block(MidBlock, PressMethod), - filter_blocks_required(fetchends_rawblock(MidBlockList), - {StartKey, EndKey}, - [B1, B2, MidBlockList, B4, B5]). - -filter_blocks_required({not_present, not_present}, _RangeKeys, AllBlocks) -> - AllBlocks; -filter_blocks_required({_MidFirst, MidLast}, {StartKey, _EndKey}, - [_Block1, _Block2, _MidBlockList, Block4, Block5]) - when StartKey > MidLast -> - [Block4, Block5]; -filter_blocks_required({MidFirst, MidLast}, {StartKey, EndKey}, - [_Block1, _Block2, MidBlockList, Block4, Block5]) - when StartKey >= MidFirst -> - NoneAfter = leveled_codec:endkey_passed(EndKey, MidLast), - case NoneAfter of + case filterby_midblock( + fetchends_rawblock(MidBlockList), {StartKey, EndKey}) of + empty -> + in_range(deserialise_block(B1, PressMethod), StartKey, EndKey) + ++ in_range(deserialise_block(B2, PressMethod), StartKey, EndKey) + ++ in_range(deserialise_block(B4, PressMethod), StartKey, EndKey) + ++ in_range(deserialise_block(B5, PressMethod), StartKey, EndKey); + all_blocks -> + get_lefthand_blocks(B1, B2, PressMethod, StartKey) + ++ MidBlockList + ++ get_righthand_blocks(B4, B5, PressMethod, EndKey); + lt_mid -> + in_range( + get_lefthand_blocks(B1, B2, PressMethod, StartKey), + all, + EndKey); + le_mid -> + get_lefthand_blocks(B1, B2, PressMethod, StartKey) + ++ in_range(MidBlockList, all, EndKey); + mid_only -> + in_range(MidBlockList, StartKey, EndKey); + ge_mid -> + in_range(MidBlockList, StartKey, all) + ++ get_righthand_blocks(B4, B5, PressMethod, EndKey); + gt_mid -> + in_range( + get_righthand_blocks(B4, B5, PressMethod, EndKey), + StartKey, + all) + end. 
+ +get_lefthand_blocks(B1, B2, PressMethod, StartKey) -> + BlockList2 = deserialise_block(B2, PressMethod), + case previous_block_required( + fetchends_rawblock(BlockList2), StartKey) of true -> - [MidBlockList]; + in_range(deserialise_block(B1, PressMethod), StartKey, all) + ++ BlockList2; false -> - [MidBlockList, Block4, Block5] + in_range(BlockList2, StartKey, all) + end. + +get_righthand_blocks(B4, B5, PressMethod, EndKey) -> + BlockList4 = deserialise_block(B4, PressMethod), + case next_block_required( + fetchends_rawblock(BlockList4), EndKey) of + true -> + BlockList4 + ++ in_range(deserialise_block(B5, PressMethod), all, EndKey); + false -> + in_range(BlockList4, all, EndKey) + end. + +filterby_midblock({not_present, not_present}, _RangeKeys) -> + empty; +filterby_midblock( + {_MidFirst, MidLast}, {StartKey, _EndKey}) when StartKey > MidLast -> + gt_mid; +filterby_midblock( + {MidFirst, MidLast}, {StartKey, EndKey}) when StartKey >= MidFirst -> + case leveled_codec:endkey_passed(EndKey, MidLast) of + true -> + mid_only; + false -> + ge_mid end; -filter_blocks_required({MidFirst, MidLast}, {_StartKey, EndKey}, - [Block1, Block2, MidBlockList, Block4, Block5]) -> +filterby_midblock({MidFirst, MidLast}, {_StartKey, EndKey}) -> AllBefore = leveled_codec:endkey_passed(EndKey, MidFirst), NoneAfter = leveled_codec:endkey_passed(EndKey, MidLast), case {AllBefore, NoneAfter} of {true, true} -> - [Block1, Block2]; + lt_mid; {false, true} -> - [Block1, Block2, MidBlockList]; + le_mid; {false, false} -> - [Block1, Block2, MidBlockList, Block4, Block5] + all_blocks end. +previous_block_required({not_present, not_present}, _SK) -> + true; +previous_block_required({FK, _LK}, StartKey) when FK < StartKey -> + false; +previous_block_required(_BlockEnds, _StartKey) -> + true. + +next_block_required({not_present, not_present}, _EK) -> + true; +next_block_required({_FK, LK}, EndKey) -> + not leveled_codec:endkey_passed(EndKey, LK). 
+ +-spec in_range( + list(leveled_codec:ledger_kv()), + range_endpoint(), + range_endpoint()) -> list(leveled_codec:ledger_kv()). +%% @doc +%% Is the ledger key in the range. +in_range(KVL, all, all) -> + KVL; +in_range(KVL, all, EK) -> + lists:takewhile( + fun({K, _V}) -> not leveled_codec:endkey_passed(EK, K) end, KVL); +in_range(KVL, SK, all) -> + lists:dropwhile(fun({K, _V}) -> K < SK end, KVL); +in_range(KVL, SK, EK) -> + in_range(in_range(KVL, SK, all), all, EK). crc_check_slot(FullBin) -> < fetch_value(Rest, BlockLengths, Blocks, Key, PressMethod) end. --spec fetchfrom_rawblock(pos_integer(), list(leveled_codec:ledger_kv())) -> - not_present|leveled_codec:ledger_kv(). +-spec fetchfrom_rawblock( + pos_integer(), list(leveled_codec:ledger_kv())) + -> not_present|leveled_codec:ledger_kv(). %% @doc %% Fetch from a deserialised block, but accounting for potential corruption %% in that block which may lead to it returning as an empty list if that @@ -2676,19 +2636,19 @@ fetchfrom_rawblock(BlockPos, RawBlock) when BlockPos > length(RawBlock) -> fetchfrom_rawblock(BlockPos, RawBlock) -> lists:nth(BlockPos, RawBlock). --spec fetchends_rawblock(list(leveled_codec:ledger_kv())) -> - {not_present, not_present}| - {leveled_codec:ledger_key(), leveled_codec:ledger_key()}. +-spec fetchends_rawblock( + list(leveled_codec:ledger_kv())) + -> {not_present, not_present}| + {leveled_codec:ledger_key(), leveled_codec:ledger_key()}. %% @doc %% Fetch the first and last key from a block, and not_present if the block %% is empty (rather than crashing) fetchends_rawblock([]) -> {not_present, not_present}; fetchends_rawblock(RawBlock) -> - {element(1, lists:nth(1, RawBlock)), + {element(1, hd(RawBlock)), element(1, lists:last(RawBlock))}. - revert_position(Pos) -> {SideBlockSize, MidBlockSize} = ?LOOK_BLOCKSIZE, case Pos < 2 * SideBlockSize of @@ -2705,8 +2665,6 @@ revert_position(Pos) -> end end. 
- - %%%============================================================================ %%% Merge Functions %%%============================================================================ @@ -2755,21 +2713,20 @@ merge_lists(KVList1, SSTOpts, IdxModDate) -> element(1, lists:nth(1, KVList1)), not_counted}. - split_lists([], SlotLists, 0, _PressMethod, _IdxModDate) -> lists:reverse(SlotLists); split_lists(LastPuff, SlotLists, 0, PressMethod, IdxModDate) -> {SlotD, _} = generate_binary_slot( - lookup, LastPuff, PressMethod, IdxModDate, no_timing), + lookup, {forward, LastPuff}, PressMethod, IdxModDate, no_timing), lists:reverse([SlotD|SlotLists]); split_lists(KVList1, SlotLists, N, PressMethod, IdxModDate) -> {Slot, KVListRem} = lists:split(?LOOK_SLOTSIZE, KVList1), {SlotD, _} = - generate_binary_slot(lookup, Slot, PressMethod, IdxModDate, no_timing), + generate_binary_slot( + lookup, {forward, Slot}, PressMethod, IdxModDate, no_timing), split_lists(KVListRem, [SlotD|SlotLists], N - 1, PressMethod, IdxModDate). - -spec merge_lists( list(expanded_pointer()), list(expanded_pointer()), @@ -2785,17 +2742,22 @@ split_lists(KVList1, SlotLists, N, PressMethod, IdxModDate) -> %% provided may include pointers to fetch more Keys/Values from the source %% file merge_lists( - KVList1, KVList2, LevelInfo, SSTOpts, IndexModDate, SaveTombCount) -> + KVList1, KVList2, {IsBase, L}, SSTOpts, IndexModDate, SaveTombCount) -> InitTombCount = case SaveTombCount of true -> 0; false -> not_counted end, - merge_lists(KVList1, KVList2, - LevelInfo, - [], null, 0, - SSTOpts#sst_options.max_sstslots, - SSTOpts#sst_options.press_method, - IndexModDate, - InitTombCount, - #build_timings{}). 
+ BuildTimings = + case IsBase orelse lists:member(L, ?LOG_BUILDTIMINGS_LEVELS) of + true -> + #build_timings{}; + false -> + no_timing + end, + merge_lists( + KVList1, KVList2, + {IsBase, L}, [], null, 0, + SSTOpts#sst_options.max_sstslots, SSTOpts#sst_options.press_method, + IndexModDate, InitTombCount, + BuildTimings). -spec merge_lists( @@ -2829,10 +2791,9 @@ merge_lists(KVL1, KVL2, LI, SlotList, FirstKey, SlotCount, MaxSlots, PressMethod, IdxModDate, CountOfTombs, T0) -> % Form a slot by merging the two lists until the next 128 K/V pairs have % been determined - SW = os:timestamp(), {KVRem1, KVRem2, Slot, FK0} = form_slot(KVL1, KVL2, LI, no_lookup, 0, [], FirstKey), - T1 = update_buildtimings(SW, T0, fold_toslot), + T1 = update_buildtimings(T0, fold_toslot), case Slot of {_, []} -> % There were no actual keys in the slot (maybe some expired) @@ -2851,7 +2812,8 @@ merge_lists(KVL1, KVL2, LI, SlotList, FirstKey, SlotCount, MaxSlots, % Convert the list of KVs for the slot into a binary, and related % metadata {SlotD, T2} = - generate_binary_slot(Lookup, KVL, PressMethod, IdxModDate, T1), + generate_binary_slot( + Lookup, {reverse, KVL}, PressMethod, IdxModDate, T1), merge_lists(KVRem1, KVRem2, LI, @@ -2861,30 +2823,10 @@ merge_lists(KVL1, KVL2, LI, SlotList, FirstKey, SlotCount, MaxSlots, MaxSlots, PressMethod, IdxModDate, - count_tombs(KVL, CountOfTombs), + leveled_codec:count_tombs(KVL, CountOfTombs), T2) end. - --spec count_tombs( - list(leveled_codec:ledger_kv()), non_neg_integer()|not_counted) -> - non_neg_integer()|not_counted. -%% @doc -%% Count the tombstones in a list of KVs -count_tombs(_KVL, not_counted) -> - not_counted; -count_tombs(KVL, InitCount) -> - FoldFun = - fun(KV, Count) -> - case leveled_codec:strip_to_statusonly(KV) of - tomb -> - Count + 1; - _ -> - Count - end - end, - lists:foldl(FoldFun, InitCount, KVL). 
- -spec form_slot(list(expanded_pointer()), list(expanded_pointer()), {boolean(), non_neg_integer()}, @@ -2896,164 +2838,168 @@ count_tombs(KVL, InitCount) -> {lookup|no_lookup, list(leveled_codec:ledger_kv())}, leveled_codec:ledger_key()}. %% @doc -%% Merge together Key Value lists to provide an ordered slot of KVs +%% Merge together Key Value lists to provide a reverse-ordered slot of KVs form_slot([], [], _LI, Type, _Size, Slot, FK) -> - {[], [], {Type, lists:reverse(Slot)}, FK}; + {[], [], {Type, Slot}, FK}; form_slot(KVList1, KVList2, _LI, lookup, ?LOOK_SLOTSIZE, Slot, FK) -> - {KVList1, KVList2, {lookup, lists:reverse(Slot)}, FK}; + {KVList1, KVList2, {lookup, Slot}, FK}; form_slot(KVList1, KVList2, _LI, no_lookup, ?NOLOOK_SLOTSIZE, Slot, FK) -> - {KVList1, KVList2, {no_lookup, lists:reverse(Slot)}, FK}; -form_slot(KVList1, KVList2, {IsBasement, TS}, lookup, Size, Slot, FK) -> - case {key_dominates(KVList1, KVList2, {IsBasement, TS}), FK} of - {{{next_key, TopKV}, Rem1, Rem2}, _} -> - form_slot(Rem1, - Rem2, - {IsBasement, TS}, - lookup, - Size + 1, - [TopKV|Slot], - FK); - {{skipped_key, Rem1, Rem2}, _} -> - form_slot(Rem1, Rem2, {IsBasement, TS}, lookup, Size, Slot, FK) + {KVList1, KVList2, {no_lookup, Slot}, FK}; +form_slot(KVList1, KVList2, LevelInfo, lookup, Size, Slot, FK) -> + case key_dominates(KVList1, KVList2, LevelInfo) of + {{next_key, TopKV}, Rem1, Rem2} -> + form_slot( + Rem1, Rem2, LevelInfo, lookup, Size + 1, [TopKV|Slot], FK); + {skipped_key, Rem1, Rem2} -> + form_slot(Rem1, Rem2, LevelInfo, lookup, Size, Slot, FK) end; -form_slot(KVList1, KVList2, {IsBasement, TS}, no_lookup, Size, Slot, FK) -> - case key_dominates(KVList1, KVList2, {IsBasement, TS}) of +form_slot(KVList1, KVList2, LevelInfo, no_lookup, Size, Slot, FK) -> + case key_dominates(KVList1, KVList2, LevelInfo) of {{next_key, {TopK, TopV}}, Rem1, Rem2} -> - FK0 = - case FK of - null -> - TopK; - _ -> - FK - end, + FK0 = case FK of null -> TopK; _ -> FK end, case 
leveled_codec:to_lookup(TopK) of no_lookup -> - form_slot(Rem1, - Rem2, - {IsBasement, TS}, - no_lookup, - Size + 1, - [{TopK, TopV}|Slot], - FK0); + form_slot( + Rem1, + Rem2, + LevelInfo, + no_lookup, + Size + 1, + [{TopK, TopV}|Slot], + FK0); lookup -> case Size >= ?LOOK_SLOTSIZE of true -> - {KVList1, - KVList2, - {no_lookup, lists:reverse(Slot)}, - FK}; + {KVList1, KVList2, {no_lookup, Slot}, FK}; false -> - form_slot(Rem1, - Rem2, - {IsBasement, TS}, - lookup, - Size + 1, - [{TopK, TopV}|Slot], - FK0) + form_slot( + Rem1, + Rem2, + LevelInfo, + lookup, + Size + 1, + [{TopK, TopV}|Slot], + FK0) end end; {skipped_key, Rem1, Rem2} -> - form_slot(Rem1, Rem2, {IsBasement, TS}, no_lookup, Size, Slot, FK) + form_slot(Rem1, Rem2, LevelInfo, no_lookup, Size, Slot, FK) end. +-spec key_dominates( + list(expanded_pointer()), + list(expanded_pointer()), + {boolean()|undefined, leveled_pmanifest:lsm_level()}) + -> + {{next_key, leveled_codec:ledger_kv()}|skipped_key, + list(expanded_pointer()), + list(expanded_pointer())}. 
+key_dominates([{pointer, SSTPid, Slot, StartKey, all}|T1], KL2, Level) -> + key_dominates( + expand_list_by_pointer( + {pointer, SSTPid, Slot, StartKey, all}, T1, ?MERGE_SCANWIDTH), + KL2, + Level); +key_dominates([{next, ManEntry, StartKey}|T1], KL2, Level) -> + key_dominates( + expand_list_by_pointer( + {next, ManEntry, StartKey, all}, T1, ?MERGE_SCANWIDTH), + KL2, + Level); +key_dominates(KL1, [{pointer, SSTPid, Slot, StartKey, all}|T2], Level) -> + key_dominates( + KL1, + expand_list_by_pointer( + {pointer, SSTPid, Slot, StartKey, all}, T2, ?MERGE_SCANWIDTH), + Level); +key_dominates(KL1, [{next, ManEntry, StartKey}|T2], Level) -> + key_dominates( + KL1, + expand_list_by_pointer( + {next, ManEntry, StartKey, all}, T2, ?MERGE_SCANWIDTH), + Level); +key_dominates( + [{K1, _V1}|_T1]=Rest1, [{K2, V2}|Rest2], {false, _TS}) when K2 < K1 -> + {{next_key, {K2, V2}}, Rest1, Rest2}; +key_dominates( + [{K1, V1}|Rest1], [{K2, _V2}|_T2]=Rest2, {false, _TS}) when K1 < K2 -> + {{next_key, {K1, V1}}, Rest1, Rest2}; key_dominates(KL1, KL2, Level) -> - key_dominates_expanded(maybe_expand_pointer(KL1), - maybe_expand_pointer(KL2), - Level). 
- -key_dominates_expanded([H1|T1], [], Level) -> - case leveled_codec:maybe_reap_expiredkey(H1, Level) of - true -> - {skipped_key, T1, []}; - false -> - {{next_key, H1}, T1, []} - end; -key_dominates_expanded([], [H2|T2], Level) -> - case leveled_codec:maybe_reap_expiredkey(H2, Level) of - true -> - {skipped_key, [], T2}; - false -> - {{next_key, H2}, [], T2} - end; -key_dominates_expanded([H1|T1], [H2|T2], Level) -> - case leveled_codec:key_dominates(H1, H2) of - left_hand_first -> - case leveled_codec:maybe_reap_expiredkey(H1, Level) of + case key_dominates_expanded(KL1, KL2) of + {{next_key, NKV}, Rest1, Rest2} -> + case leveled_codec:maybe_reap_expiredkey(NKV, Level) of true -> - {skipped_key, T1, [H2|T2]}; + {skipped_key, Rest1, Rest2}; false -> - {{next_key, H1}, T1, [H2|T2]} + {{next_key, NKV}, Rest1, Rest2} end; - right_hand_first -> - case leveled_codec:maybe_reap_expiredkey(H2, Level) of - true -> - {skipped_key, [H1|T1], T2}; - false -> - {{next_key, H2}, [H1|T1], T2} - end; - left_hand_dominant -> + {skipped_key, Rest1, Rest2} -> + {skipped_key, Rest1, Rest2} + end. + +-spec key_dominates_expanded( + list(expanded_pointer()), list(expanded_pointer())) + -> {{next_key, leveled_codec:ledger_kv()}|skipped_key, + list(expanded_pointer()), + list(expanded_pointer())}. +key_dominates_expanded([H1|T1], []) -> + {{next_key, H1}, T1, []}; +key_dominates_expanded([], [H2|T2]) -> + {{next_key, H2}, [], T2}; +key_dominates_expanded([{K1, _V1}|_T1]=LHL, [{K2, V2}|T2]) when K2 < K1 -> + {{next_key, {K2, V2}}, LHL, T2}; +key_dominates_expanded([{K1, V1}|T1], [{K2, _V2}|_T2]=RHL) when K1 < K2 -> + {{next_key, {K1, V1}}, T1, RHL}; +key_dominates_expanded([H1|T1], [H2|T2]) -> + case leveled_codec:key_dominates(H1, H2) of + true -> {skipped_key, [H1|T1], T2}; - right_hand_dominant -> + false -> {skipped_key, T1, [H2|T2]} end. 
-%% When a list is provided it may include a pointer to gain another batch of -%% entries from the same file, or a new batch of entries from another file -%% -%% This resultant list should include the Tail of any pointers added at the -%% end of the list - -maybe_expand_pointer([]) -> - []; -maybe_expand_pointer([{pointer, SSTPid, Slot, StartKey, all}|Tail]) -> - expand_list_by_pointer({pointer, SSTPid, Slot, StartKey, all}, - Tail, - ?MERGE_SCANWIDTH); -maybe_expand_pointer([{next, ManEntry, StartKey}|Tail]) -> - expand_list_by_pointer({next, ManEntry, StartKey, all}, - Tail, - ?MERGE_SCANWIDTH); -maybe_expand_pointer(List) -> - List. - - %%%============================================================================ %%% Timing Functions %%%============================================================================ --spec update_buildtimings( - erlang:timestamp(), build_timings(), atom()) -> build_timings(). +-spec update_buildtimings(build_timings(), atom()) -> build_timings(). %% @doc %% %% Timings taken from the build of a SST file. %% %% There is no sample window, but the no_timing status is still used for %% level zero files where we're not breaking down the build time in this way. -update_buildtimings(_SW, no_timing, _Stage) -> +update_buildtimings(no_timing, _Stage) -> no_timing; -update_buildtimings(SW, Timings, Stage) -> - Timer = timer:now_diff(os:timestamp(), SW), - case Stage of - slot_hashlist -> - HLT = Timings#build_timings.slot_hashlist + Timer, - Timings#build_timings{slot_hashlist = HLT}; - slot_serialise -> - SST = Timings#build_timings.slot_serialise + Timer, - Timings#build_timings{slot_serialise = SST}; - slot_finish -> - SFT = Timings#build_timings.slot_finish + Timer, - Timings#build_timings{slot_finish = SFT}; - fold_toslot -> - FST = Timings#build_timings.fold_toslot + Timer, - Timings#build_timings{fold_toslot = FST} - end. 
+update_buildtimings(Timings, Stage) -> + LastTS = Timings#build_timings.last_timestamp, + ThisTS = os:timestamp(), + Timer = timer:now_diff(ThisTS, LastTS), + NewTimings = + case Stage of + slot_hashlist -> + HLT = Timings#build_timings.slot_hashlist + Timer, + Timings#build_timings{slot_hashlist = HLT}; + slot_serialise -> + SST = Timings#build_timings.slot_serialise + Timer, + Timings#build_timings{slot_serialise = SST}; + slot_finish -> + SFT = Timings#build_timings.slot_finish + Timer, + Timings#build_timings{slot_finish = SFT}; + fold_toslot -> + FST = Timings#build_timings.fold_toslot + Timer, + Timings#build_timings{fold_toslot = FST} + end, + NewTimings#build_timings{last_timestamp = ThisTS}. -spec log_buildtimings(build_timings(), tuple()) -> ok. %% @doc %% %% Log out the time spent during the merge lists part of the SST build +log_buildtimings(no_timing, _LI) -> + ok; log_buildtimings(Timings, LI) -> leveled_log:log( sst13, @@ -3084,27 +3030,36 @@ maybelog_fetch_timing({Pid, _SlotFreq}, Level, Type, SW) -> -define(TEST_AREA, "test/test_area/"). --spec sst_getkvrange(pid(), - range_endpoint(), - range_endpoint(), - integer()) - -> list(leveled_codec:ledger_kv()|slot_pointer()). + +sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> + sst_getkvrange(Pid, StartKey, EndKey, ScanWidth, false, 0). + +-spec sst_getkvrange( + pid(), + range_endpoint(), + range_endpoint(), + integer(), + segment_check_fun(), + non_neg_integer()) -> list(leveled_codec:ledger_kv()|slot_pointer()). %% @doc %% Get a range of {Key, Value} pairs as a list between StartKey and EndKey %% (inclusive). The ScanWidth is the maximum size of the range, a pointer %% will be placed on the tail of the resulting list if results expand beyond %% the Scan Width -sst_getkvrange(Pid, StartKey, EndKey, ScanWidth) -> - sst_getfilteredrange(Pid, StartKey, EndKey, ScanWidth, false, 0). - --spec sst_getslots(pid(), list(slot_pointer())) - -> list(leveled_codec:ledger_kv()). 
+sst_getkvrange(Pid, StartKey, EndKey, ScanWidth, SegChecker, LowLastMod) -> + [Pointer|MorePointers] = + sst_getfilteredrange(Pid, StartKey, EndKey, LowLastMod), + sst_expandpointer( + Pointer, MorePointers, ScanWidth, SegChecker, LowLastMod). + +-spec sst_getslots( + pid(), list(slot_pointer())) -> list(leveled_codec:ledger_kv()). %% @doc %% Get a list of slots by their ID. The slot will be converted from the binary %% to term form outside of the FSM loop, this is to stop the copying of the %% converted term to the calling process. sst_getslots(Pid, SlotList) -> - sst_getfilteredslots(Pid, SlotList, false, 0). + sst_getfilteredslots(Pid, SlotList, false, 0, []). testsst_new(RootPath, Filename, Level, KVList, MaxSQN, PressMethod) -> OptsSST = @@ -3266,7 +3221,8 @@ indexed_list_test() -> SW0 = os:timestamp(), {{_PosBinIndex1, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, KVL1, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, KVL1}, native, ?INDEX_MODDATE, no_timing), io:format(user, "Indexed list created slot in ~w microseconds of size ~w~n", [timer:now_diff(os:timestamp(), SW0), byte_size(FullBin)]), @@ -3295,7 +3251,8 @@ indexed_list_mixedkeys_test() -> Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), {{_PosBinIndex1, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, ?INDEX_MODDATE, no_timing), {TestK1, TestV1} = lists:nth(4, KVL1), MH1 = leveled_codec:segment_hash(TestK1), @@ -3322,7 +3279,8 @@ indexed_list_mixedkeys2_test() -> % this isn't actually ordered correctly Keys = IdxKeys1 ++ KVL1 ++ IdxKeys2, {{_Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, ?INDEX_MODDATE, no_timing), lists:foreach(fun({K, V}) -> MH = leveled_codec:segment_hash(K), test_binary_slot(FullBin, 
K, MH, {K, V}) @@ -3333,9 +3291,11 @@ indexed_list_allindexkeys_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), ?LOOK_SLOTSIZE), {{HeaderT, FullBinT, HL, LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, true, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, true, no_timing), {{HeaderF, FullBinF, HL, LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, false, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, false, no_timing), EmptySlotSize = ?LOOK_SLOTSIZE - 1, LMD = ?FLIPPER32, ?assertMatch(<<_BL:20/binary, LMD:32/integer, EmptySlotSize:8/integer>>, @@ -3348,92 +3308,92 @@ indexed_list_allindexkeys_test() -> % io:format(user, % "Indexed list flattened in ~w microseconds ~n", % [timer:now_diff(os:timestamp(), SW)]), + io:format("BinToListT ~p~n", [BinToListT]), ?assertMatch(Keys, BinToListT), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBinT, - all, all, - native, - true, - false)), + ?assertMatch( + {Keys, none}, + binaryslot_trimmed( + FullBinT, all, all, native, true, false)), ?assertMatch(Keys, BinToListF), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBinF, - all, all, - native, - false, - false)). + ?assertMatch( + {Keys, none}, + binaryslot_trimmed( + FullBinF, all, all, native, false, false)). 
indexed_list_allindexkeys_nolookup_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(1000)), ?NOLOOK_SLOTSIZE), {{Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(no_lookup, Keys, native, ?INDEX_MODDATE,no_timing), + generate_binary_slot( + no_lookup, {forward, Keys}, native, ?INDEX_MODDATE,no_timing), ?assertMatch(<<_BL:20/binary, _LMD:32/integer, 127:8/integer>>, Header), % SW = os:timestamp(), - BinToList = binaryslot_tolist(FullBin, native, ?INDEX_MODDATE), + BinToList = + binaryslot_tolist(FullBin, native, ?INDEX_MODDATE), % io:format(user, % "Indexed list flattened in ~w microseconds ~n", % [timer:now_diff(os:timestamp(), SW)]), ?assertMatch(Keys, BinToList), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBin, - all, all, - native, - ?INDEX_MODDATE, - false)). + ?assertMatch( + {Keys, none}, + binaryslot_trimmed(FullBin, all, all, native, ?INDEX_MODDATE, false)). indexed_list_allindexkeys_trimmed_test() -> Keys = lists:sublist(lists:ukeysort(1, generate_indexkeys(150)), ?LOOK_SLOTSIZE), {{Header, FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE,no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, ?INDEX_MODDATE, no_timing), EmptySlotSize = ?LOOK_SLOTSIZE - 1, - ?assertMatch(<<_BL:20/binary, _LMD:32/integer, EmptySlotSize:8/integer>>, - Header), - ?assertMatch({Keys, none}, binaryslot_trimmedlist(FullBin, - {i, - "Bucket", - {"t1_int", 0}, - null}, - {i, - "Bucket", - {"t1_int", 99999}, - null}, - native, - ?INDEX_MODDATE, - false)), + ?assertMatch( + <<_BL:20/binary, _LMD:32/integer, EmptySlotSize:8/integer>>, + Header), + ?assertMatch( + {Keys, none}, + binaryslot_trimmed( + FullBin, + {i, "Bucket", {"t1_int", 0}, null}, + {i, "Bucket", {"t1_int", 99999}, null}, + native, + ?INDEX_MODDATE, + false)), {SK1, _} = lists:nth(10, Keys), {EK1, _} = lists:nth(100, Keys), R1 = lists:sublist(Keys, 10, 91), - {O1, none} = binaryslot_trimmedlist(FullBin, SK1, EK1, - 
native, ?INDEX_MODDATE, false), + {O1, none} = + binaryslot_trimmed( + FullBin, SK1, EK1, native, ?INDEX_MODDATE, false), ?assertMatch(91, length(O1)), ?assertMatch(R1, O1), {SK2, _} = lists:nth(10, Keys), {EK2, _} = lists:nth(20, Keys), R2 = lists:sublist(Keys, 10, 11), - {O2, none} = binaryslot_trimmedlist(FullBin, SK2, EK2, - native, ?INDEX_MODDATE, false), + {O2, none} = + binaryslot_trimmed(FullBin, SK2, EK2, native, ?INDEX_MODDATE, false), ?assertMatch(11, length(O2)), ?assertMatch(R2, O2), {SK3, _} = lists:nth(?LOOK_SLOTSIZE - 1, Keys), {EK3, _} = lists:nth(?LOOK_SLOTSIZE, Keys), R3 = lists:sublist(Keys, ?LOOK_SLOTSIZE - 1, 2), - {O3, none} = binaryslot_trimmedlist(FullBin, SK3, EK3, - native, ?INDEX_MODDATE, false), + {O3, none} = + binaryslot_trimmed(FullBin, SK3, EK3, native, ?INDEX_MODDATE, false), ?assertMatch(2, length(O3)), ?assertMatch(R3, O3). findposfrag_test() -> - ?assertMatch([], find_pos(<<128:8/integer>>, 1, [], 0)). + ?assertMatch([], find_pos(<<128:8/integer>>, segment_checker(1))). 
indexed_list_mixedkeys_bitflip_test() -> KVL0 = lists:ukeysort(1, generate_randomkeys(1, 50, 1, 4)), KVL1 = lists:sublist(KVL0, 33), Keys = lists:ukeysort(1, generate_indexkeys(60) ++ KVL1), {{Header, SlotBin, _HL, LK}, no_timing} = - generate_binary_slot(lookup, Keys, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, Keys}, native, ?INDEX_MODDATE, no_timing), ?assertMatch(LK, element(1, lists:last(Keys))), @@ -3452,11 +3412,12 @@ indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin, TestKey1, MH1, lists:nth(1, KVL1)), test_binary_slot(SlotBin, TestKey2, MH2, lists:nth(33, KVL1)), - ToList = binaryslot_tolist(SlotBin, native, ?INDEX_MODDATE), + ToList = + binaryslot_tolist(SlotBin, native, ?INDEX_MODDATE), ?assertMatch(Keys, ToList), - [Pos1] = find_pos(PosBin, extract_hash(MH1), [], 0), - [Pos2] = find_pos(PosBin, extract_hash(MH2), [], 0), + [Pos1] = find_pos(PosBin, segment_checker(extract_hash(MH1))), + [Pos2] = find_pos(PosBin, segment_checker(extract_hash(MH2))), {BN1, _BP1} = revert_position(Pos1), {BN2, _BP2} = revert_position(Pos2), {Offset1, Length1} = block_offsetandlength(Header, BN1), @@ -3471,8 +3432,10 @@ indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin1, TestKey1, MH1, not_present), test_binary_slot(SlotBin2, TestKey2, MH2, not_present), - ToList1 = binaryslot_tolist(SlotBin1, native, ?INDEX_MODDATE), - ToList2 = binaryslot_tolist(SlotBin2, native, ?INDEX_MODDATE), + ToList1 = + binaryslot_tolist(SlotBin1, native, ?INDEX_MODDATE), + ToList2 = + binaryslot_tolist(SlotBin2, native, ?INDEX_MODDATE), ?assertMatch(true, is_list(ToList1)), ?assertMatch(true, is_list(ToList2)), @@ -3485,8 +3448,8 @@ indexed_list_mixedkeys_bitflip_test() -> {SK1, _} = lists:nth(10, Keys), {EK1, _} = lists:nth(20, Keys), - {O1, none} = binaryslot_trimmedlist(SlotBin3, SK1, EK1, - native, ?INDEX_MODDATE, false), + {O1, none} = + binaryslot_trimmed(SlotBin3, SK1, EK1, native, ?INDEX_MODDATE, false), ?assertMatch([], 
O1), SlotBin4 = flip_byte(SlotBin, 0, 20), @@ -3494,14 +3457,16 @@ indexed_list_mixedkeys_bitflip_test() -> test_binary_slot(SlotBin4, TestKey1, MH1, not_present), test_binary_slot(SlotBin5, TestKey1, MH1, not_present), - ToList4 = binaryslot_tolist(SlotBin4, native, ?INDEX_MODDATE), - ToList5 = binaryslot_tolist(SlotBin5, native, ?INDEX_MODDATE), + ToList4 = + binaryslot_tolist(SlotBin4, native, ?INDEX_MODDATE), + ToList5 = + binaryslot_tolist(SlotBin5, native, ?INDEX_MODDATE), ?assertMatch([], ToList4), ?assertMatch([], ToList5), - {O4, none} = binaryslot_trimmedlist(SlotBin4, SK1, EK1, - native, ?INDEX_MODDATE, false), - {O5, none} = binaryslot_trimmedlist(SlotBin4, SK1, EK1, - native, ?INDEX_MODDATE, false), + {O4, none} = + binaryslot_trimmed(SlotBin4, SK1, EK1, native, ?INDEX_MODDATE, false), + {O5, none} = + binaryslot_trimmed(SlotBin4, SK1, EK1, native, ?INDEX_MODDATE, false), ?assertMatch([], O4), ?assertMatch([], O5). @@ -3716,11 +3681,12 @@ simple_persisted_rangesegfilter_tester(SSTNewFun) -> SegList = lists:map(GetSegFun, [SK1, SK2, SK3, SK4, SK5, EK1, EK2, EK3, EK4, EK5]), + SegChecker = segment_checker(tune_seglist(SegList)), TestFun = fun(StartKey, EndKey, OutList) -> RangeKVL = - sst_getfilteredrange(Pid, StartKey, EndKey, 4, SegList, 0), + sst_getkvrange(Pid, StartKey, EndKey, 4, SegChecker, 0), RangeKL = lists:map(fun({LK0, _LV0}) -> LK0 end, RangeKVL), ?assertMatch(true, lists:member(StartKey, RangeKL)), ?assertMatch(true, lists:member(EndKey, RangeKL)), @@ -4196,7 +4162,8 @@ hashmatching_bytreesize_test() -> end, KVL = lists:map(GenKeyFun, lists:seq(1, 128)), {{PosBinIndex1, _FullBin, _HL, _LK}, no_timing} = - generate_binary_slot(lookup, KVL, native, ?INDEX_MODDATE, no_timing), + generate_binary_slot( + lookup, {forward, KVL}, native, ?INDEX_MODDATE, no_timing), check_segment_match(PosBinIndex1, KVL, small), check_segment_match(PosBinIndex1, KVL, medium). 
@@ -4208,8 +4175,8 @@ check_segment_match(PosBinIndex1, KVL, TreeSize) -> leveled_tictac:get_segment( leveled_tictac:keyto_segment32(<>), TreeSize), - SegList0 = tune_seglist([Seg]), - PosList = find_pos(PosBinIndex1, SegList0, [], 0), + SegChecker = segment_checker(tune_seglist([Seg])), + PosList = find_pos(PosBinIndex1, SegChecker), ?assertMatch(true, length(PosList) >= 1) end, lists:foreach(CheckFun, KVL). @@ -4271,21 +4238,13 @@ corrupted_block_rangetester(PressMethod, TestCount) -> CheckFun = fun({SK, EK}) -> - InputBlocks = + [CB1, CB2, CBMid, CB4, CB5] = lists:map(CorruptBlockFun, [B1, B2, MidBlock, B4, B5]), - BR = blocks_required({SK, EK}, InputBlocks, PressMethod), - ?assertMatch(true, length(BR) =< 5), - BlockListFun = - fun(B) -> - case is_binary(B) of - true -> - deserialise_block(B, PressMethod); - false -> - B - end - end, - BRL = lists:flatten(lists:map(BlockListFun, BR)), - lists:foreach(fun({_K, _V}) -> ok end, BRL) + BR = + blocks_required( + {SK, EK}, CB1, CB2, CBMid, CB4, CB5, PressMethod), + ?assertMatch(true, length(BR) =< 100), + lists:foreach(fun({_K, _V}) -> ok end, BR) end, lists:foreach(CheckFun, RandomRanges). 
@@ -4299,7 +4258,8 @@ corrupted_block_fetch_tester(PressMethod) -> KVL1 = lists:ukeysort(1, generate_randomkeys(1, KC, 1, 2)), {{Header, SlotBin, _HashL, _LastKey}, _BT} = - generate_binary_slot(lookup, KVL1, PressMethod, false, no_timing), + generate_binary_slot( + lookup, {forward, KVL1}, PressMethod, false, no_timing), < HeaderTS = <<0:160/integer, Now:32/integer, 0:32/integer>>, HeaderNoTS = <<0:192>>, BIC = new_blockindex_cache(8), - {_, BIC0, undefined} = - update_blockindex_cache(false, EntriesNoTS, BIC, undefined, false), - {_, BIC1, undefined} = - update_blockindex_cache(false, EntriesTS, BIC, undefined, true), {_, BIC2, undefined} = - update_blockindex_cache(true, EntriesNoTS, BIC, undefined, false), + update_blockindex_cache(EntriesNoTS, BIC, undefined, false), {ETSP1, ETSP2} = lists:split(6, EntriesTS), {_, BIC3, undefined} = - update_blockindex_cache(true, ETSP1, BIC, undefined, true), + update_blockindex_cache(ETSP1, BIC, undefined, true), {_, BIC3, undefined} = - update_blockindex_cache(true, ETSP1, BIC3, undefined, true), + update_blockindex_cache(ETSP1, BIC3, undefined, true), {_, BIC4, LMD4} = - update_blockindex_cache(true, ETSP2, BIC3, undefined, true), + update_blockindex_cache(ETSP2, BIC3, undefined, true), {_, BIC4, LMD4} = - update_blockindex_cache(true, ETSP2, BIC4, LMD4, true), - - ?assertMatch(none, array:get(0, element(2, BIC0))), - ?assertMatch(none, array:get(0, element(2, BIC1))), + update_blockindex_cache(ETSP2, BIC4, LMD4, true), ?assertMatch(HeaderNoTS, array:get(0, element(2, BIC2))), ?assertMatch(HeaderTS, array:get(0, element(2, BIC3))), ?assertMatch(HeaderTS, array:get(0, element(2, BIC4))), @@ -5088,4 +5041,108 @@ start_sst_fun(ProcessToInform) -> ProcessToInform ! {sst_pid, P1}. --endif. 
+blocks_required_test() -> + B = <<"Bucket">>, + Idx = <<"idx_bin">>, + Chunk = leveled_rand:rand_bytes(32), + KeyFun = + fun(I) -> + list_to_binary(io_lib:format("B~6..0B", [I])) + end, + IdxKey = + fun(I) -> + {?IDX_TAG, B, {Idx, KeyFun(I)}, KeyFun(I)} + end, + StdKey = + fun(I) -> {?STD_TAG, B, KeyFun(I), null} end, + MetaValue = + fun(I) -> + element( + 3, + leveled_codec:generate_ledgerkv( + StdKey(I), I, Chunk, 32, infinity)) + end, + IdxValue = + fun(I) -> + element( + 3, + leveled_codec:generate_ledgerkv( + IdxKey(I), I, null, 0, infinity)) + end, + Block1L = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(1, 16)), + Block2L = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(17, 32)), + MidBlockL = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(33, 48)), + Block4L = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(49, 64)), + Block5L = + lists:map(fun(I) -> {IdxKey(I), IdxValue(I)} end, lists:seq(65, 70)) + ++ + lists:map(fun(I) -> {StdKey(I), MetaValue(I)} end, lists:seq(1, 8)), + B1 = serialise_block(Block1L, native), + B2 = serialise_block(Block2L, native), + B3 = serialise_block(MidBlockL, native), + B4 = serialise_block(Block4L, native), + B5 = serialise_block(Block5L, native), + Empty = serialise_block([], native), + + TestFun = + fun(SK, EK, Exp) -> + KVL = blocks_required({SK, EK}, B1, B2, B3, B4, B5, native), + io:format( + "Length KVL ~w First ~p Last ~p~n", + [length(KVL), hd(KVL), lists:last(KVL)]), + ?assert(length(KVL) == Exp) + end, + + TestFun( + {?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}, + 68 + ), + TestFun( + {?IDX_TAG, B, {Idx, KeyFun(35)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}, + 36 + ), + TestFun( + {?IDX_TAG, B, {Idx, KeyFun(68)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}, + 3 + ), + KVL1 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, B2, Empty, B4, B5, native), + 
?assertMatch(52, length(KVL1)), + KVL2 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, B2, Empty, Empty, Empty, native), + ?assertMatch(30, length(KVL2)), + KVL3 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, Empty, Empty, Empty, Empty, native), + ?assertMatch(14, length(KVL3)), + KVL4 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, Empty, B3, B4, B5, native), + ?assertMatch(52, length(KVL4)), + KVL5 = + blocks_required( + {{?IDX_TAG, B, {Idx, KeyFun(3)}, null}, + {?IDX_TAG, B, {Idx, KeyFun(99)}, null}}, + B1, B2, B3, Empty, B5, native), + ?assertMatch(52, length(KVL5)) + . + + +-endif. \ No newline at end of file diff --git a/src/leveled_util.erl b/src/leveled_util.erl index 247f047b..8a2162c3 100644 --- a/src/leveled_util.erl +++ b/src/leveled_util.erl @@ -5,8 +5,6 @@ -module(leveled_util). --include("include/leveled.hrl"). - -export([generate_uuid/0, integer_now/0, integer_time/1, @@ -42,7 +40,7 @@ integer_time(TS) -> calendar:datetime_to_gregorian_seconds(DT). --spec magic_hash(any()) -> integer(). +-spec magic_hash(any()) -> 0..16#FFFFFFFF. %% @doc %% Use DJ Bernstein magic hash function. Note, this is more expensive than %% phash2 but provides a much more balanced result. @@ -52,7 +50,7 @@ integer_time(TS) -> %% http://stackoverflow.com/questions/10696223/reason-for-5381-number-in-djb-hash-function magic_hash({binary, BinaryKey}) -> H = 5381, - hash1(H, BinaryKey) band 16#FFFFFFFF; + hash1(H, BinaryKey); magic_hash(AnyKey) -> BK = t2b(AnyKey), magic_hash({binary, BK}). @@ -60,7 +58,7 @@ magic_hash(AnyKey) -> hash1(H, <<>>) -> H; hash1(H, <>) -> - H1 = H * 33, + H1 = (H * 33) band 16#FFFFFFFF, H2 = H1 bxor B, hash1(H2, Rest). 
diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index bd41b2d2..217d209f 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -34,8 +34,8 @@ expiring_indexes(_Config) -> % before). Confirm that replacing an object has the expected outcome, if % the IndexSpecs are updated as part of the request. KeyCount = 50000, - Future = 60, - % 1 minute - if running tests on a slow machine, may need to increase + Future = 120, + % 2 minutes - if running tests on a slow machine, may need to increase % this value RootPath = testutil:reset_filestructure(), StartOpts1 = @@ -44,13 +44,30 @@ expiring_indexes(_Config) -> {max_journalobjectcount, 30000}, {sync_strategy, testutil:sync_strategy()}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), - + SW1 = os:timestamp(), + timer:sleep(1000), + + V9 = testutil:get_compressiblevalue(), + Indexes9 = testutil:get_randomindexes_generator(2), + TempRiakObjects = + testutil:generate_objects( + KeyCount, binary_uuid, [], V9, Indexes9, "riakBucket"), + IBKL1 = testutil:stdload_expiring(Bookie1, KeyCount, Future), + lists:foreach( + fun({_RN, Obj, Spc}) -> + testutil:book_tempriakput( + Bookie1, Obj, Spc, leveled_util:integer_now() + Future) + end, + TempRiakObjects + ), timer:sleep(1000), % Wait a second after last key so that none loaded in the last second LoadTime = timer:now_diff(os:timestamp(), SW1)/1000000, io:format("Load of ~w std objects in ~w seconds~n", [KeyCount, LoadTime]), + + timer:sleep(1000), SW2 = os:timestamp(), FilterFun = fun({I, _B, _K}) -> lists:member(I, [5, 6, 7, 8]) end, @@ -76,6 +93,25 @@ expiring_indexes(_Config) -> {async, I0Counter1} = CountI0Fold(), I0Count1 = I0Counter1(), + HeadFold = + fun(LowTS, HighTS) -> + leveled_bookie:book_headfold( + Bookie1, + ?RIAK_TAG, + {range, <<"riakBucket">>, all}, + {fun(_B, _K, _V, Acc) -> Acc + 1 end, 0}, + false, true, false, + {testutil:convert_to_seconds(LowTS), + 
testutil:convert_to_seconds(HighTS)}, + false + ) + end, + {async, HeadCount0Fun} = HeadFold(SW1, SW2), + {async, HeadCount1Fun} = HeadFold(SW2, os:timestamp()), + HeadCounts = {HeadCount0Fun(), HeadCount1Fun()}, + io:format("HeadCounts ~w before expiry~n", [HeadCounts]), + {KeyCount, 0} = HeadCounts, + FoldFun = fun(BF, {IdxV, KeyF}, Acc) -> [{IdxV, BF, KeyF}|Acc] end, InitAcc = [], IndexFold = @@ -145,6 +181,12 @@ expiring_indexes(_Config) -> true = QR4 == [], true = QR5 == [], + {async, HeadCount0ExpFun} = HeadFold(SW1, SW2), + {async, HeadCount1ExpFun} = HeadFold(SW2, os:timestamp()), + HeadCountsExp = {HeadCount0ExpFun(), HeadCount1ExpFun()}, + io:format("HeadCounts ~w after expiry~n", [HeadCountsExp]), + {0, 0} = HeadCountsExp, + ok = leveled_bookie:book_close(Bookie1), testutil:reset_filestructure(). @@ -379,12 +421,14 @@ single_object_with2i(_Config) -> %% @TODO replace all index queries with new Top-Level API if tests %% pass - {async, IdxFolder1} = leveled_bookie:book_indexfold(Bookie1, - "Bucket1", - {fun testutil:foldkeysfun/3, []}, - {list_to_binary("binary_bin"), - <<99:32/integer>>, <<101:32/integer>>}, - {true, undefined}), + {async, IdxFolder1} = + leveled_bookie:book_indexfold( + Bookie1, + "Bucket1", + {fun testutil:foldkeysfun/3, []}, + {list_to_binary("binary_bin"), + <<99:32/integer>>, <<101:32/integer>>}, + {true, undefined}), R1 = IdxFolder1(), io:format("R1 of ~w~n", [R1]), true = [{<<100:32/integer>>,"Key1"}] == R1, diff --git a/test/end_to_end/testutil.erl b/test/end_to_end/testutil.erl index f8d786a1..2c8dfc7e 100644 --- a/test/end_to_end/testutil.erl +++ b/test/end_to_end/testutil.erl @@ -3,6 +3,7 @@ -include("../include/leveled.hrl"). -export([book_riakput/3, + book_tempriakput/4, book_riakdelete/4, book_riakget/3, book_riakhead/3, @@ -182,6 +183,16 @@ book_riakput(Pid, RiakObject, IndexSpecs) -> IndexSpecs, ?RIAK_TAG). 
+book_tempriakput(Pid, RiakObject, IndexSpecs, TTL) -> + leveled_bookie:book_tempput( + Pid, + RiakObject#r_object.bucket, + RiakObject#r_object.key, + to_binary(v1, RiakObject), + IndexSpecs, + ?RIAK_TAG, + TTL). + book_riakdelete(Pid, Bucket, Key, IndexSpecs) -> leveled_bookie:book_put(Pid, Bucket, Key, delete, IndexSpecs, ?RIAK_TAG).