From 733d420c19c31e46f853b2e539a671c4f5c7ad4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hanno=20J=2E=20G=C3=B6decke?=
Date: Wed, 23 Oct 2024 14:30:48 +0200
Subject: [PATCH] Revert "Revert "Search suffix tree implementation""

This reverts commit a16871fef314b6788505f12b76135185e2412a09.
---
 src/CONST.ts                                  |   3 +
 .../Search/SearchRouter/SearchRouter.tsx      |  62 ++++-
 src/libs/FastSearch.ts                        | 140 ++++++++++++
 src/libs/OptionsListUtils.ts                  |  38 +++-
 src/libs/SuffixUkkonenTree/index.ts           | 211 ++++++++++++++++++
 src/libs/SuffixUkkonenTree/utils.ts           | 115 ++++++++++
 tests/unit/FastSearchTest.ts                  | 118 ++++++++++
 tests/unit/SuffixUkkonenTreeTest.ts           |  63 ++++++
 8 files changed, 736 insertions(+), 14 deletions(-)
 create mode 100644 src/libs/FastSearch.ts
 create mode 100644 src/libs/SuffixUkkonenTree/index.ts
 create mode 100644 src/libs/SuffixUkkonenTree/utils.ts
 create mode 100644 tests/unit/FastSearchTest.ts
 create mode 100644 tests/unit/SuffixUkkonenTreeTest.ts

diff --git a/src/CONST.ts b/src/CONST.ts
index 440f942e1244..428b2038588e 100755
--- a/src/CONST.ts
+++ b/src/CONST.ts
@@ -1143,6 +1143,9 @@ const CONST = {
         SEARCH_OPTION_LIST_DEBOUNCE_TIME: 300,
         RESIZE_DEBOUNCE_TIME: 100,
         UNREAD_UPDATE_DEBOUNCE_TIME: 300,
+        SEARCH_CONVERT_SEARCH_VALUES: 'search_convert_search_values',
+        SEARCH_MAKE_TREE: 'search_make_tree',
+        SEARCH_BUILD_TREE: 'search_build_tree',
         SEARCH_FILTER_OPTIONS: 'search_filter_options',
         USE_DEBOUNCED_STATE_DELAY: 300,
     },
diff --git a/src/components/Search/SearchRouter/SearchRouter.tsx b/src/components/Search/SearchRouter/SearchRouter.tsx
index d2cb25c5a5f9..cc9e9c6ca024 100644
--- a/src/components/Search/SearchRouter/SearchRouter.tsx
+++ b/src/components/Search/SearchRouter/SearchRouter.tsx
@@ -12,6 +12,7 @@ import useKeyboardShortcut from '@hooks/useKeyboardShortcut';
 import useLocalize from '@hooks/useLocalize';
 import useResponsiveLayout from '@hooks/useResponsiveLayout';
 import useThemeStyles from '@hooks/useThemeStyles';
+import FastSearch from '@libs/FastSearch';
 import Log from '@libs/Log';
 import * as OptionsListUtils from '@libs/OptionsListUtils';
 import {getAllTaxRates} from '@libs/PolicyUtils';
@@ -63,6 +64,49 @@ function SearchRouter({onRouterClose}: SearchRouterProps) {
         return OptionsListUtils.getSearchOptions(options, '', betas ?? []);
     }, [areOptionsInitialized, betas, options]);
 
+    /**
+     * Builds a suffix tree over the personal details and recent reports and returns a function that searches both. The tree is only rebuilt when one of these two lists changes.
+     */
+    const findInSearchTree = useMemo(() => {
+        const fastSearch = FastSearch.createFastSearch([
+            {
+                data: searchOptions.personalDetails,
+                toSearchableString: (option) => {
+                    const displayName = option.participantsList?.[0]?.displayName ?? '';
+                    return [option.login ?? '', option.login !== displayName ? displayName : ''].join(); // join() without an argument separates the parts with ","
+                },
+            },
+            {
+                data: searchOptions.recentReports,
+                toSearchableString: (option) => {
+                    const searchStringForTree = [option.text ?? '', option.login ?? ''];
+
+                    if (option.isThread) {
+                        if (option.alternateText) {
+                            searchStringForTree.push(option.alternateText);
+                        }
+                    } else if (!!option.isChatRoom || !!option.isPolicyExpenseChat) {
+                        if (option.subtitle) {
+                            searchStringForTree.push(option.subtitle);
+                        }
+                    }
+
+                    return searchStringForTree.join();
+                },
+            },
+        ]);
+        function search(searchInput: string) {
+            const [personalDetails, recentReports] = fastSearch.search(searchInput); // results come back in the same order as the data sets passed above
+
+            return {
+                personalDetails,
+                recentReports,
+            };
+        }
+
+        return search;
+    }, [searchOptions.personalDetails, searchOptions.recentReports]);
+
     const filteredOptions = useMemo(() => {
         if (debouncedInputValue.trim() === '') {
             return {
@@ -73,15 +117,25 @@ function SearchRouter({onRouterClose}: SearchRouterProps) {
         }
 
         Timing.start(CONST.TIMING.SEARCH_FILTER_OPTIONS);
-        const newOptions = OptionsListUtils.filterOptions(searchOptions, debouncedInputValue, {sortByReportTypeInSearch: true, preferChatroomsOverThreads: true});
+        const newOptions = findInSearchTree(debouncedInputValue);
         Timing.end(CONST.TIMING.SEARCH_FILTER_OPTIONS);
 
-        return {
+        const recentReports = newOptions.recentReports.concat(newOptions.personalDetails); // matching personal details are appended to the matching reports (personalDetails is returned empty below)
+
+        const userToInvite = OptionsListUtils.pickUserToInvite({
+            canInviteUser: true,
             recentReports: newOptions.recentReports,
             personalDetails: newOptions.personalDetails,
-            userToInvite: newOptions.userToInvite,
+            searchValue: debouncedInputValue,
+            optionsToExclude: [{login: CONST.EMAIL.NOTIFICATIONS}],
+        });
+
+        return {
+            recentReports,
+            personalDetails: [],
+            userToInvite,
         };
-    }, [debouncedInputValue, searchOptions]);
+    }, [debouncedInputValue, findInSearchTree]);
 
     const recentReports: OptionData[] = useMemo(() => {
         if (debouncedInputValue === '') {
diff --git a/src/libs/FastSearch.ts b/src/libs/FastSearch.ts
new file mode 100644
index 000000000000..59d28dedd449
--- /dev/null
+++ b/src/libs/FastSearch.ts
@@ -0,0 +1,140 @@
+/* eslint-disable rulesdir/prefer-at */
+import CONST from '@src/CONST';
+import Timing from './actions/Timing';
+import SuffixUkkonenTree from './SuffixUkkonenTree';
+
+type SearchableData = {
+    /**
+     * The data that should be searchable
+     */
+    data: T[];
+    /**
+     * A function that generates a string from a data entry. The string's value is used for searching.
+     * If you have multiple fields that should be searchable, simply concat them to the string and return it.
+     */
+    toSearchableString: (data: T) => string;
+};
+
+// Certain characters appear very often in our search data (e.g. in email addresses) and don't need to be searchable, so they are skipped both when indexing the data and when encoding the search input.
+const charSetToSkip = new Set(['@', '.', '#', '$', '%', '&', '*', '+', '-', '/', ':', ';', '<', '=', '>', '?', '_', '~', '!', ' ']);
+
+/**
+ * Creates a new "FastSearch" instance. "FastSearch" uses a suffix tree to search for substrings in a list of strings.
+ * You can provide multiple datasets. The search results will be returned for each dataset.
+ *
+ * Note: Creating a FastSearch instance with a lot of data is computationally expensive. You should create an instance once and reuse it.
+ * Searches will be very fast though, even with a lot of data. Returns an object exposing a `search` function.
+ */
+function createFastSearch(dataSets: Array>) {
+    Timing.start(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);
+    const maxNumericListSize = 400_000; // NOTE(review): fixed capacity; no check is visible here for data that encodes to more entries than this - confirm how larger inputs behave
+    // The user might provide multiple data sets, but internally, the search values will be stored in this one list:
+    let concatenatedNumericList = new Uint8Array(maxNumericListSize);
+    // Here we store the index of the data item in the original data list, so we can map the found occurrences back to the original data:
+    const occurrenceToIndex = new Uint32Array(maxNumericListSize * 4);
+    // As we are working with typed arrays, we need to keep track of the current offset (it starts at 1 because the tree reads its input starting at position 1):
+    const offset = {value: 1};
+    // We store the last offset for a dataSet, so we can map the found occurrences to the correct dataSet:
+    const listOffsets: number[] = [];
+
+    for (const {data, toSearchableString} of dataSets) {
+        // Performance critical: the array parameters are passed by reference, so we don't have to create new arrays every time:
+        dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, offset, {data, toSearchableString});
+        listOffsets.push(offset.value);
+    }
+    concatenatedNumericList[offset.value++] = SuffixUkkonenTree.END_CHAR_CODE;
+    listOffsets[listOffsets.length - 1] = offset.value; // extend the last data set's range so that it also covers the end character
+    Timing.end(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);
+
+    // The list might be larger than necessary, so we clamp it to the actual size:
+    concatenatedNumericList = concatenatedNumericList.slice(0, offset.value);
+
+    // Create & build the suffix tree:
+    Timing.start(CONST.TIMING.SEARCH_MAKE_TREE);
+    const tree = SuffixUkkonenTree.makeTree(concatenatedNumericList);
+    Timing.end(CONST.TIMING.SEARCH_MAKE_TREE);
+
+    Timing.start(CONST.TIMING.SEARCH_BUILD_TREE);
+    tree.build();
+    Timing.end(CONST.TIMING.SEARCH_BUILD_TREE);
+
+    /**
+     * Searches for the given input and returns results for each dataset: the entry at index i holds the matching items of dataSets[i], without duplicates. Throws if an occurrence cannot be mapped back to an item.
+     */
+    function search(searchInput: string): T[][] {
+        const cleanedSearchString = cleanString(searchInput);
+        const {numeric} = SuffixUkkonenTree.stringToNumeric(cleanedSearchString, {
+            charSetToSkip,
+            // stringToNumeric might return a list that is larger than necessary, so we clamp it to the actual size
+            // (otherwise the search could fail as we include in our search empty array values):
+            clamp: true,
+        });
+        const result = tree.findSubstring(Array.from(numeric));
+
+        const resultsByDataSet = Array.from({length: dataSets.length}, () => new Set()); // a Set per data set, so an item that matches at several positions is only returned once
+        // eslint-disable-next-line @typescript-eslint/prefer-for-of
+        for (let i = 0; i < result.length; i++) {
+            const occurrenceIndex = result[i];
+            const itemIndexInDataSet = occurrenceToIndex[occurrenceIndex];
+            const dataSetIndex = listOffsets.findIndex((listOffset) => occurrenceIndex < listOffset); // the first data set whose end offset lies behind the occurrence
+
+            if (dataSetIndex === -1) {
+                throw new Error(`[FastSearch] The occurrence index ${occurrenceIndex} is not in any dataset`);
+            }
+            const item = dataSets[dataSetIndex].data[itemIndexInDataSet];
+            if (!item) {
+                throw new Error(`[FastSearch] The item with index ${itemIndexInDataSet} in dataset ${dataSetIndex} is not defined`);
+            }
+            resultsByDataSet[dataSetIndex].add(item);
+        }
+
+        return resultsByDataSet.map((set) => Array.from(set));
+    }
+
+    return {
+        search,
+    };
+}
+
+/**
+ * The suffix tree can only store string like values, and internally stores those as numbers.
+ * This function converts the user data (which are most likely objects) to a numeric representation.
+ * Additionally a list of the original data and their index position in the numeric list is created, which is used to map the found occurrences back to the original data.
+ */
+function dataToNumericRepresentation(concatenatedNumericList: Uint8Array, occurrenceToIndex: Uint32Array, offset: {value: number}, {data, toSearchableString}: SearchableData): void {
+    data.forEach((option, index) => {
+        const searchStringForTree = toSearchableString(option);
+        const cleanedSearchStringForTree = cleanString(searchStringForTree);
+
+        if (cleanedSearchStringForTree.length === 0) {
+            return; // nothing to index for this item
+        }
+
+        SuffixUkkonenTree.stringToNumeric(cleanedSearchStringForTree, {
+            charSetToSkip,
+            out: {
+                outArray: concatenatedNumericList,
+                offset,
+                outOccurrenceToIndex: occurrenceToIndex,
+                index,
+            },
+        });
+        // eslint-disable-next-line no-param-reassign
+        occurrenceToIndex[offset.value] = index;
+        // eslint-disable-next-line no-param-reassign
+        concatenatedNumericList[offset.value++] = SuffixUkkonenTree.DELIMITER_CHAR_CODE; // every item is terminated by a delimiter
+    });
+}
+
+/**
+ * Everything in the tree is treated as lowercase.
+ */
+function cleanString(input: string) {
+    return input.toLowerCase();
+}
+
+const FastSearch = {
+    createFastSearch,
+};
+
+export default FastSearch;
diff --git a/src/libs/OptionsListUtils.ts b/src/libs/OptionsListUtils.ts
index f61dc47c5662..5ae0fca0a68e 100644
--- a/src/libs/OptionsListUtils.ts
+++ b/src/libs/OptionsListUtils.ts
@@ -2419,6 +2419,31 @@ function getPersonalDetailSearchTerms(item: Partial) {
 function getCurrentUserSearchTerms(item: ReportUtils.OptionData) {
     return [item.text ?? '', item.login ?? '', item.login?.replace(CONST.EMAIL_SEARCH_REGEX, '') ?? ''];
 }
+
+type PickUserToInviteParams = {
+    canInviteUser: boolean;
+    recentReports: ReportUtils.OptionData[];
+    personalDetails: ReportUtils.OptionData[];
+    searchValue: string;
+    config?: FilterOptionsConfig;
+    optionsToExclude: Option[];
+};
+
+const pickUserToInvite = ({canInviteUser, recentReports, personalDetails, searchValue, config, optionsToExclude}: PickUserToInviteParams) => {
+    let userToInvite = null; // stays null unless inviting is allowed and neither list contains a match
+    if (canInviteUser) {
+        if (recentReports.length === 0 && personalDetails.length === 0) {
+            userToInvite = getUserToInviteOption({
+                searchValue,
+                selectedOptions: config?.selectedOptions,
+                optionsToExclude,
+            });
+        }
+    }
+
+    return userToInvite;
+};
+
 /**
  * Filters options based on the search input value
  */
@@ -2506,16 +2531,7 @@ function filterOptions(options: Options, searchInputValue: string, config?: Filt
         recentReports = orderOptions(recentReports, searchValue);
     }
 
-    let userToInvite = null;
-    if (canInviteUser) {
-        if (recentReports.length === 0 && personalDetails.length === 0) {
-            userToInvite = getUserToInviteOption({
-                searchValue,
-                selectedOptions: config?.selectedOptions,
-                optionsToExclude,
-            });
-        }
-    }
+    const userToInvite = pickUserToInvite({canInviteUser, recentReports, personalDetails, searchValue, config, optionsToExclude});
 
     if (maxRecentReportsToShow > 0 && recentReports.length > maxRecentReportsToShow) {
         recentReports.splice(maxRecentReportsToShow);
@@ -2584,6 +2600,7 @@ export {
     formatMemberForList,
     formatSectionsFromSearchTerm,
     getShareLogOptions,
+    orderOptions,
     filterOptions,
     createOptionList,
     createOptionFromReport,
@@ -2597,6 +2614,7 @@ export {
     getEmptyOptions,
     shouldUseBoldText,
     getAlternateText,
+    pickUserToInvite,
     hasReportErrors,
 };
 
diff --git a/src/libs/SuffixUkkonenTree/index.ts b/src/libs/SuffixUkkonenTree/index.ts
new file mode 100644
index 000000000000..bcefd1008493
--- /dev/null
+++ b/src/libs/SuffixUkkonenTree/index.ts
@@ -0,0 +1,211 @@
+/* eslint-disable rulesdir/prefer-at */
+// .at() has a performance overhead we explicitly want to avoid here
+
+/* eslint-disable no-continue */
+import {ALPHABET_SIZE, DELIMITER_CHAR_CODE, END_CHAR_CODE, SPECIAL_CHAR_CODE, stringToNumeric} from './utils';
+
+/**
+ * This implements a suffix tree using Ukkonen's algorithm.
+ * A good visualization to learn about the algorithm can be found here: https://brenden.github.io/ukkonen-animation/
+ * A good video explaining Ukkonen's algorithm can be found here: https://www.youtube.com/watch?v=ALEV0Hc5dDk
+ * Note: This implementation is optimized for performance, not necessarily for readability.
+ *
+ * You probably don't want to use this directly, but rather use @libs/FastSearch.ts as an easy-to-use wrapper around this.
+ */
+
+/**
+ * Creates a new tree instance that can be used to build a suffix tree and search in it.
+ * The input is a numeric representation of the search string, which can be created using {@link stringToNumeric}.
+ * Separate search values must be separated by the {@link DELIMITER_CHAR_CODE}. The search string must end with the {@link END_CHAR_CODE}.
+ *
+ * The tree will be built using the Ukkonen's algorithm: https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+ */
+function makeTree(numericSearchValues: Uint8Array) {
+    // Every leaf represents a suffix. There can't be more than n suffixes.
+    // Every internal node has to have at least 2 children. So the total size of ukkonen tree is not bigger than 2n - 1.
+    // + 1 is because of an extra character at the beginning that offsets the 1-based indexing.
+    const maxNodes = 2 * numericSearchValues.length + 1;
+    /*
+    This array represents all internal nodes in the suffix tree.
+    When building this tree, we'll be given a character in the string, and we need to be able to lookup in constant time
+    if there's any edge connected to a node starting with that character. For example, given a tree like this:
+
+        root
+        / | \
+        a b c
+
+    and the next character in our string is 'd', we need to be able to check if any of the edges from the root node
+    start with the letter 'd', without looping through all the edges.
+
+    To accomplish this, each node gets an array matching the alphabet size.
+    So you can imagine if our alphabet was just [a,b,c,d], then each node would get an array like [0,0,0,0].
+    If we add an edge starting with 'a', then the root node would be [1,0,0,0]
+    So given an arbitrary letter such as 'd', then we can take the position of that letter in its alphabet (position 3 in our example)
+    and check whether that index in the array is 0 or 1. If it's a 1, then there's an edge starting with the letter 'd'.
+
+    Note that for efficiency, all nodes are stored in a single flat array. That's how we end up with (maxNodes * alphabet_size).
+    In the example of a 4-character alphabet, we'd have an array like this:
+
+    root       root.left   root.right            last possible node
+    /  \        /  \        /  \                  /  \
+    [0,0,0,0,   0,0,0,0,    0,0,0,0, ................. 0,0,0,0]
+    */
+    const transitionNodes = new Uint32Array(maxNodes * ALPHABET_SIZE);
+
+    // Storing the range of the original string that each node represents:
+    const rangeStart = new Uint32Array(maxNodes);
+    const rangeEnd = new Uint32Array(maxNodes);
+
+    const parent = new Uint32Array(maxNodes);
+    const suffixLink = new Uint32Array(maxNodes);
+
+    let currentNode = 1; // node 1 is the root (the search below starts with dfs(1, 0))
+    let currentPosition = 1;
+    let nodeCounter = 3; // nodes 1 and 2 are set up in initializeTree, so newly created nodes start at 3
+    let currentIndex = 1;
+
+    function initializeTree() {
+        rangeEnd.fill(numericSearchValues.length);
+        rangeEnd[1] = 0;
+        rangeEnd[2] = 0;
+        suffixLink[1] = 2;
+        for (let i = 0; i < ALPHABET_SIZE; ++i) {
+            transitionNodes[ALPHABET_SIZE * 2 + i] = 1; // every transition of node 2 leads to the root
+        }
+    }
+
+    function processCharacter(char: number) {
+        // eslint-disable-next-line no-constant-condition
+        while (true) {
+            if (rangeEnd[currentNode] < currentPosition) {
+                if (transitionNodes[currentNode * ALPHABET_SIZE + char] === 0) {
+                    createNewLeaf(char);
+                    continue;
+                }
+                currentNode = transitionNodes[currentNode * ALPHABET_SIZE + char];
+                currentPosition = rangeStart[currentNode];
+            }
+            if (currentPosition === 0 || char === numericSearchValues[currentPosition]) {
+                currentPosition++;
+            } else {
+                splitEdge(char);
+                continue;
+            }
+            break;
+        }
+    }
+
+    function createNewLeaf(c: number) {
+        transitionNodes[currentNode * ALPHABET_SIZE + c] = nodeCounter;
+        rangeStart[nodeCounter] = currentIndex;
+        parent[nodeCounter++] = currentNode;
+        currentNode = suffixLink[currentNode];
+
+        currentPosition = rangeEnd[currentNode] + 1;
+    }
+
+    function splitEdge(c: number) {
+        rangeStart[nodeCounter] = rangeStart[currentNode];
+        rangeEnd[nodeCounter] = currentPosition - 1;
+        parent[nodeCounter] = parent[currentNode];
+
+        transitionNodes[nodeCounter * ALPHABET_SIZE + numericSearchValues[currentPosition]] = currentNode;
+        transitionNodes[nodeCounter * ALPHABET_SIZE + c] = nodeCounter + 1;
+        rangeStart[nodeCounter + 1] = currentIndex;
+        parent[nodeCounter + 1] = nodeCounter;
+        rangeStart[currentNode] = currentPosition;
+        parent[currentNode] = nodeCounter;
+
+        transitionNodes[parent[nodeCounter] * ALPHABET_SIZE + numericSearchValues[rangeStart[nodeCounter]]] = nodeCounter;
+        nodeCounter += 2; // two nodes were created: the new internal node and its new leaf
+        handleDescent(nodeCounter);
+    }
+
+    function handleDescent(latestNodeIndex: number) {
+        currentNode = suffixLink[parent[latestNodeIndex - 2]];
+        currentPosition = rangeStart[latestNodeIndex - 2];
+        while (currentPosition <= rangeEnd[latestNodeIndex - 2]) {
+            currentNode = transitionNodes[currentNode * ALPHABET_SIZE + numericSearchValues[currentPosition]];
+            currentPosition += rangeEnd[currentNode] - rangeStart[currentNode] + 1;
+        }
+        if (currentPosition === rangeEnd[latestNodeIndex - 2] + 1) {
+            suffixLink[latestNodeIndex - 2] = currentNode;
+        } else {
+            suffixLink[latestNodeIndex - 2] = latestNodeIndex;
+        }
+        currentPosition = rangeEnd[currentNode] - (currentPosition - rangeEnd[latestNodeIndex - 2]) + 2;
+    }
+
+    function build() {
+        initializeTree();
+        for (currentIndex = 1; currentIndex < numericSearchValues.length; ++currentIndex) {
+            const c = numericSearchValues[currentIndex];
+            processCharacter(c);
+        }
+    }
+
+    /**
+     * Returns all occurrences of the given (sub)string in the input string.
+     *
+     * You can think of the tree that we create as a big string that looks like this:
+     *
+     * "banana$pancake$apple|"
+     * The example delimiter character '$' is used to separate the different strings.
+     * The end character '|' is used to indicate the end of our search string.
+     *
+     * This function will return the index(es) of found occurrences within this big string.
+     * So, when searching for "an", it would return [2, 4, 9] (in no guaranteed order; positions count from 1 because the input array starts with one extra offset element).
+     */
+    function findSubstring(searchValue: number[]) {
+        const occurrences: number[] = [];
+
+        function dfs(node: number, depth: number) {
+            const leftRange = rangeStart[node];
+            const rightRange = rangeEnd[node];
+            const rangeLen = node === 1 ? 0 : rightRange - leftRange + 1;
+
+            for (let i = 0; i < rangeLen && depth + i < searchValue.length && leftRange + i < numericSearchValues.length; i++) {
+                if (searchValue[depth + i] !== numericSearchValues[leftRange + i]) {
+                    return;
+                }
+            }
+
+            let isLeaf = true;
+            for (let i = 0; i < ALPHABET_SIZE; ++i) {
+                const tNode = transitionNodes[node * ALPHABET_SIZE + i];
+
+                // Search speed optimization: don't go through the edge if it's different than the next char:
+                const correctChar = depth + rangeLen >= searchValue.length || i === searchValue[depth + rangeLen];
+
+                if (tNode !== 0 && tNode !== 1 && correctChar) {
+                    isLeaf = false;
+                    dfs(tNode, depth + rangeLen);
+                }
+            }
+
+            if (isLeaf && depth + rangeLen >= searchValue.length) {
+                occurrences.push(numericSearchValues.length - (depth + rangeLen) + 1);
+            }
+        }
+
+        dfs(1, 0);
+        return occurrences;
+    }
+
+    return {
+        build,
+        findSubstring,
+    };
+}
+
+const SuffixUkkonenTree = {
+    makeTree,
+
+    // Re-exported from utils:
+    DELIMITER_CHAR_CODE,
+    SPECIAL_CHAR_CODE,
+    END_CHAR_CODE,
+    stringToNumeric,
+};
+
+export default SuffixUkkonenTree;
diff --git a/src/libs/SuffixUkkonenTree/utils.ts b/src/libs/SuffixUkkonenTree/utils.ts
new file mode 100644
index 000000000000..96ee35b15796
--- /dev/null
+++ b/src/libs/SuffixUkkonenTree/utils.ts
@@ -0,0 +1,115 @@
+/* eslint-disable rulesdir/prefer-at */ // .at() has a performance overhead we explicitly want to avoid here
+/* eslint-disable no-continue */
+
+const CHAR_CODE_A = 'a'.charCodeAt(0);
+const ALPHABET = 'abcdefghijklmnopqrstuvwxyz';
+const LETTER_ALPHABET_SIZE = ALPHABET.length;
+const ALPHABET_SIZE = LETTER_ALPHABET_SIZE + 3; // +3: special char, delimiter char, end char
+const SPECIAL_CHAR_CODE = ALPHABET_SIZE - 3;
+const DELIMITER_CHAR_CODE = ALPHABET_SIZE - 2;
+const END_CHAR_CODE = ALPHABET_SIZE - 1;
+
+// Store the results for a char code in a lookup table to avoid recalculating the same values (performance optimization)
+const base26LookupTable = new Array();
+
+/**
+ * Converts a non-negative integer (here: a char code) into base26 digits, most significant digit first.
+ * Every digit is in the range 0-25, computed like spreadsheet column letters: 1 -> [0], 26 -> [25], 27 -> [0, 0]; 0 yields an empty list.
+ * Results are cached per input in base26LookupTable. Throws an Error if num is negative.
+ */
+function convertToBase26(num: number): number[] {
+    if (base26LookupTable[num]) {
+        return base26LookupTable[num];
+    }
+    if (num < 0) {
+        throw new Error('convertToBase26: Input must be a non-negative integer');
+    }
+
+    const result: number[] = [];
+    let remaining = num; // Work on a copy: `num` must stay untouched because it is the cache key below
+    while (remaining > 0) {
+        remaining--;
+        result.unshift(remaining % 26);
+        // A true division by 26. A `>> 5` shift divides by 32, which lets distinct inputs produce identical digits (e.g. 225 and 251).
+        remaining = Math.floor(remaining / 26);
+    }
+    base26LookupTable[num] = result;
+    return result;
+}
+
+/**
+ * Converts a string to an array of numbers representing the characters of the string.
+ * Every number in the array is in the range [0, ALPHABET_SIZE-1] (0-28).
+ *
+ * The numbers are offset by the character code of 'a' (97).
+ * - This is so that the numbers from a-z are in the range 0-25.
+ * - 26 is for encoding special characters. Character numbers that are not within the range of a-z will be encoded as "specialCharacter + base26(charCode)"
+ * - 27 is for the delimiter character
+ * - 28 is for the end character
+ *
+ * Note: The string should be converted to lowercase first (otherwise uppercase letters get base26'ed taking more space than necessary).
+ */
+function stringToNumeric(
+    // The string we want to convert to a numeric representation
+    input: string,
+    options?: {
+        // A set of characters that should be skipped and not included in the numeric representation
+        charSetToSkip?: Set;
+        // When out is provided, the function will write the result to the provided arrays instead of creating new ones (performance)
+        out?: {
+            outArray: Uint8Array;
+            // As outArray is a typed array that is written to in place, we need to keep track of the current offset
+            offset: {value: number};
+            // Maps every written position (occurrence) to the index of the original data item it belongs to
+            // As the search string can be very long for high traffic accounts (500k+), this has to be big enough, thus it's a Uint32Array
+            outOccurrenceToIndex?: Uint32Array;
+            // The index that will be used in the outOccurrenceToIndex array (this is the index of your original data position)
+            index?: number;
+        };
+        // By default false. By default the outArray may be larger than necessary. If clamp is set to true the outArray will be clamped to the actual size.
+        clamp?: boolean;
+    },
+): {
+    numeric: Uint8Array; // the encoded characters (cut to the written length when clamp is true)
+    occurrenceToIndex: Uint32Array; // for every written position, the `index` it was written for
+    offset: {value: number}; // the position behind the last written value
+} {
+    // The out array might be longer than our input string length, because we encode special characters as multiple numbers using the base26 encoding.
+    // * 6 is because the upper limit of encoding any char in UTF-8 to base26 is at max 6 numbers.
+    const outArray = options?.out?.outArray ?? new Uint8Array(input.length * 6);
+    const offset = options?.out?.offset ?? {value: 0};
+    const occurrenceToIndex = options?.out?.outOccurrenceToIndex ?? new Uint32Array(input.length * 16 * 4);
+    const index = options?.out?.index ?? 0;
+
+    for (let i = 0; i < input.length; i++) {
+        const char = input[i];
+
+        if (options?.charSetToSkip?.has(char)) {
+            continue;
+        }
+
+        if (char >= 'a' && char <= 'z') {
+            // char is an alphabet character
+            occurrenceToIndex[offset.value] = index;
+            outArray[offset.value++] = char.charCodeAt(0) - CHAR_CODE_A;
+        } else {
+            const charCode = input.charCodeAt(i);
+            occurrenceToIndex[offset.value] = index;
+            outArray[offset.value++] = SPECIAL_CHAR_CODE;
+            const asBase26Numeric = convertToBase26(charCode);
+            // eslint-disable-next-line @typescript-eslint/prefer-for-of
+            for (let j = 0; j < asBase26Numeric.length; j++) {
+                occurrenceToIndex[offset.value] = index;
+                outArray[offset.value++] = asBase26Numeric[j];
+            }
+        }
+    }
+
+    return {
+        numeric: options?.clamp ? outArray.slice(0, offset.value) : outArray,
+        occurrenceToIndex,
+        offset,
+    };
+}
+
+export {stringToNumeric, ALPHABET, ALPHABET_SIZE, SPECIAL_CHAR_CODE, DELIMITER_CHAR_CODE, END_CHAR_CODE};
diff --git a/tests/unit/FastSearchTest.ts b/tests/unit/FastSearchTest.ts
new file mode 100644
index 000000000000..029e05e15b1f
--- /dev/null
+++ b/tests/unit/FastSearchTest.ts
@@ -0,0 +1,118 @@
+import FastSearch from '../../src/libs/FastSearch';
+
+describe('FastSearch', () => {
+    it('should insert, and find the word', () => {
+        const {search} = FastSearch.createFastSearch([
+            {
+                data: ['banana'],
+                toSearchableString: (data) => data,
+            },
+        ]);
+        expect(search('an')).toEqual([['banana']]);
+    });
+
+    it('should work with multiple words', () => {
+        const {search} = FastSearch.createFastSearch([
+            {
+                data: ['banana', 'test'],
+                toSearchableString: (data) => data,
+            },
+        ]);
+
+        expect(search('es')).toEqual([['test']]);
+    });
+
+    it('should work when providing two data sets', () => {
+        const {search} = FastSearch.createFastSearch([
+            {
+                data: ['erica', 'banana'],
+                toSearchableString: (data) => data,
+            },
+            {
+                data: ['banana', 'test'],
+                toSearchableString: (data) => data,
+            },
+        ]);
+
+        expect(search('es')).toEqual([[], ['test']]);
+    });
+
+    it('should work with numbers', () => {
+        const {search} = FastSearch.createFastSearch([
+            {
+                data: [1, 2, 3, 4, 5],
+                toSearchableString: (data) => String(data),
+            },
+        ]);
+
+        expect(search('2')).toEqual([[2]]);
+    });
+
+    it('should work with unicodes', () => {
+        const {search} = FastSearch.createFastSearch([
+            {
+                data: ['banana', 'ñèşťǒř', 'test'],
+                toSearchableString: (data) => data,
+            },
+        ]);
+
+        expect(search('èşť')).toEqual([['ñèşťǒř']]);
+    });
+
+    it('should work with words containing "reserved special characters"', () => {
+        const {search} = FastSearch.createFastSearch([
+            {
+                data: ['ba|nana', 'te{st', 'he}llo'],
+                toSearchableString: (data) => data,
+            },
+        ]);
+
+        expect(search('st')).toEqual([['te{st']]);
+        expect(search('llo')).toEqual([['he}llo']]);
+        expect(search('nana')).toEqual([['ba|nana']]);
+    });
+
+    it('should be case insensitive', () => {
+        const {search} = FastSearch.createFastSearch([
+            {
+                data: ['banana', 'TeSt', 'TEST', 'X'],
+                toSearchableString: (data) => data,
+            },
+        ]);
+
+        expect(search('test')).toEqual([['TeSt', 'TEST']]);
+    });
+
+    it('should work with large random data sets', () => {
+        const data = Array.from({length: 1000}, () => {
+            return Array.from({length: Math.floor(Math.random() * 22 + 9)}, () => {
+                const alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789@-_.';
+                return alphabet.charAt(Math.floor(Math.random() * alphabet.length));
+            }).join('');
+        });
+
+        const {search} = FastSearch.createFastSearch([
+            {
+                data,
+                toSearchableString: (x) => x,
+            },
+        ]);
+
+        data.forEach((word) => {
+            expect(search(word)).toEqual([expect.arrayContaining([word])]);
+        });
+    });
+
+    it('should find email addresses without dots', () => {
+        const {search} = FastSearch.createFastSearch([
+            {
+                data: ['test.user@example.com', 'unrelated'],
+                toSearchableString: (data) => data,
+            },
+        ]);
+
+        expect(search('testuser')).toEqual([['test.user@example.com']]);
+        expect(search('test.user')).toEqual([['test.user@example.com']]);
+        expect(search('examplecom')).toEqual([['test.user@example.com']]);
+    });
+});
diff --git a/tests/unit/SuffixUkkonenTreeTest.ts b/tests/unit/SuffixUkkonenTreeTest.ts
new file mode 100644
index 000000000000..c0c556c16e14
--- /dev/null
+++ b/tests/unit/SuffixUkkonenTreeTest.ts
@@ -0,0 +1,63 @@
+import SuffixUkkonenTree from '@libs/SuffixUkkonenTree/index';
+
+describe('SuffixUkkonenTree', () => {
+    // The suffix tree doesn't take strings, but expects a typed array, where strings have been separated by a delimiter.
+    function helperStringsToNumericForTree(strings: string[]) {
+        const numericLists = strings.map((s) => SuffixUkkonenTree.stringToNumeric(s, {clamp: true}));
+        const numericList = numericLists.reduce(
+            (acc, {numeric}) => {
+                acc.push(...numeric, SuffixUkkonenTree.DELIMITER_CHAR_CODE);
+                return acc;
+            },
+            // The value we pass to makeTree needs to be offset by one
+            [0],
+        );
+        numericList.push(SuffixUkkonenTree.END_CHAR_CODE);
+        return Uint8Array.from(numericList);
+    }
+
+    it('should insert, build, and find all occurrences', () => {
+        const strings = ['banana', 'pancake'];
+        const numericIntArray = helperStringsToNumericForTree(strings);
+
+        const tree = SuffixUkkonenTree.makeTree(numericIntArray);
+        tree.build();
+        const searchValue = SuffixUkkonenTree.stringToNumeric('an', {clamp: true}).numeric;
+        expect(tree.findSubstring(Array.from(searchValue))).toEqual(expect.arrayContaining([2, 4, 9]));
+    });
+
+    it('should find by first character', () => {
+        const strings = ['pancake', 'banana'];
+        const numericIntArray = helperStringsToNumericForTree(strings);
+        const tree = SuffixUkkonenTree.makeTree(numericIntArray);
+        tree.build();
+        const searchValue = SuffixUkkonenTree.stringToNumeric('p', {clamp: true}).numeric;
+        expect(tree.findSubstring(Array.from(searchValue))).toEqual(expect.arrayContaining([1]));
+    });
+
+    it('should handle identical words', () => {
+        const strings = ['banana', 'banana', 'x'];
+        const numericIntArray = helperStringsToNumericForTree(strings);
+        const tree = SuffixUkkonenTree.makeTree(numericIntArray);
+        tree.build();
+        const searchValue = SuffixUkkonenTree.stringToNumeric('an', {clamp: true}).numeric;
+        expect(tree.findSubstring(Array.from(searchValue))).toEqual(expect.arrayContaining([2, 4, 9, 11]));
+    });
+
+    it('should convert string to numeric with a list of chars to skip', () => {
+        const {numeric} = SuffixUkkonenTree.stringToNumeric('abcabc', {
+            charSetToSkip: new Set(['b']),
+            clamp: true,
+        });
+        expect(Array.from(numeric)).toEqual([0, 2, 0, 2]);
+    });
+
+    it('should convert string outside of a-z to numeric with clamping', () => {
+        const {numeric} = SuffixUkkonenTree.stringToNumeric('2', {
+            clamp: true,
+        });
+
+        // "2" in ASCII is 50, so base26(50) = [0, 23]
+        expect(Array.from(numeric)).toEqual([SuffixUkkonenTree.SPECIAL_CHAR_CODE, 0, 23]);
+    });
+});