Skip to content

Commit

Permalink
Update regexp with escapes in TextTokenizer
Browse files Browse the repository at this point in the history
Resolves #49
  • Loading branch information
JayPanoz committed Sep 12, 2024
1 parent 753e7d7 commit 68e9b60
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion shared/src/util/tokenizer/TextTokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,13 @@ export class NaiveTextTokenizer {
}
}

// Matches a run of one or more Unicode letters (\p{L}) or Unicode numbers (\p{N}).
// Built from a string pattern through the RegExp constructor (hence the doubled
// backslashes) and created once at module level so every call reuses the same object.
const trimmedMatcher = new RegExp("[\\p{L}\\p{N}]+", "u");

/**
 * Unicode-aware check for whether a string contains anything that can be spoken.
 * "Spoken" in this case means at least one Unicode letter or Unicode number character.
 *
 * @param token - Raw token text to inspect.
 * @returns The token with trailing whitespace removed (leading whitespace is kept),
 * or `null` when the token is empty after trimming or contains no letter or number.
 */
export const speakableToken = (token: string): string | null => {
    const trimmedToken = token.trimEnd();
    if (trimmedToken.length === 0) return null;
    // One check against the shared matcher is enough; the matcher has no "g"/"y"
    // flag, so test() keeps no state between calls.
    if (!trimmedMatcher.test(trimmedToken)) return null;
    return trimmedToken;
};

0 comments on commit 68e9b60

Please sign in to comment.