From 459f931b11e57e166d4a87efcd71651749f8ce47 Mon Sep 17 00:00:00 2001
From: Madhurjya Kalita <88551650+melsonic@users.noreply.github.com>
Date: Mon, 2 Oct 2023 02:52:30 +0530
Subject: [PATCH] feat: added support for sanskrit diacriticals (#503)

---
 .../src/components/tokenizer/languages.ts     |  2 +
 packages/stemmers/lib/sk.js                   | 42 +++++++++++++++++++
 packages/stemmers/package.json                |  5 +++
 packages/stemmers/scripts/build.js            |  1 +
 packages/stopwords/lib/sk.js                  | 34 +++++++++++++++
 packages/stopwords/package.json               |  5 +++
 packages/stopwords/scripts/build.js           |  1 +
 7 files changed, 90 insertions(+)
 create mode 100644 packages/stemmers/lib/sk.js
 create mode 100644 packages/stopwords/lib/sk.js

diff --git a/packages/orama/src/components/tokenizer/languages.ts b/packages/orama/src/components/tokenizer/languages.ts
index ae7016e98..c7d487bdd 100644
--- a/packages/orama/src/components/tokenizer/languages.ts
+++ b/packages/orama/src/components/tokenizer/languages.ts
@@ -27,6 +27,7 @@ export const STEMMERS: Record = {
   tamil: 'ta',
   turkish: 'tr',
   ukrainian: 'uk',
+  sanskrit: 'sk',
 }
 
 export const SPLITTERS: Record = {
@@ -58,6 +59,7 @@ export const SPLITTERS: Record = {
   slovenian: /[^a-z0-9螚ȎŠ]+/gim,
   bulgarian: /[^a-z0-9а-яА-Я]+/gim,
   tamil: /[^a-z0-9அ-ஹ]+/gim,
+  sanskrit: /[^a-z0-9A-Zāīūṛḷṃṁḥśṣṭḍṇṅñḻḹṝ]+/gim
 }
 
 export const SUPPORTED_LANGUAGES = Object.keys(STEMMERS)
diff --git a/packages/stemmers/lib/sk.js b/packages/stemmers/lib/sk.js
new file mode 100644
index 000000000..201bff72b
--- /dev/null
+++ b/packages/stemmers/lib/sk.js
@@ -0,0 +1,42 @@
+/**
+ * Light stemmer for Sanskrit written in Latin script with diacritics.
+ *
+ * Strips at most one ending, taken from a fixed suffix list, from the end of a word.
+ */
+
+class SanskritStemmer {
+  constructor() {
+    // Endings are tried in this order; the first one the word ends with is removed.
+    this.suffixes = ['aḥ', 'āḥ', 'iḥ', 'īḥ', 'uḥ', 'ūḥ', 'am', 'ām', 'im', 'īm', 'um', 'ūm', 'an', 'ān', 'in', 'īn', 'un', 'ūn', 'as', 'ās', 'is', 'īs', 'us', 'ūs'];
+  }
+
+  /**
+   * Removes the first matching suffix from `word`.
+   *
+   * @param {string} word - Token to stem.
+   * @returns {string} The stem, or `word` unchanged when no suffix applies.
+   */
+  stem(word) {
+    for (const suffix of this.suffixes) {
+      // Keep at least one character: a word that is nothing but a suffix
+      // (for example "am") must not be reduced to the empty string.
+      if (word.length > suffix.length && word.endsWith(suffix)) {
+        return word.slice(0, -suffix.length);
+      }
+    }
+
+    return word;
+  }
+}
+
+const stemmerInstance = new SanskritStemmer();
+
+/**
+ * Stems a single Sanskrit token using the shared stemmer instance.
+ *
+ * @param {string} word - Token to stem.
+ * @returns {string} The stemmed token.
+ */
+export function stemmer(word) {
+  return stemmerInstance.stem(word)
+}
diff --git a/packages/stemmers/package.json b/packages/stemmers/package.json
index 00d9c2fe6..2fb0afb42 100644
--- a/packages/stemmers/package.json
+++ b/packages/stemmers/package.json
@@ -146,6 +146,11 @@
       "types": "./dist/uk.d.ts",
       "import": "./dist/uk.js",
       "require": "./dist/uk.cjs"
+    },
+    "./sanskrit": {
+      "types": "./dist/sk.d.ts",
+      "import": "./dist/sk.js",
+      "require": "./dist/sk.cjs"
     }
   },
   "files": [
diff --git a/packages/stemmers/scripts/build.js b/packages/stemmers/scripts/build.js
index 0ae0e67da..adf1b7a73 100644
--- a/packages/stemmers/scripts/build.js
+++ b/packages/stemmers/scripts/build.js
@@ -35,6 +35,7 @@ const stemmers = {
   tamil: 'ta',
   turkish: 'tr',
   ukrainian: 'uk',
+  sanskrit: 'sk',
 }
 
 async function compile(lang, jsExtension, tsExtension, moduleType) {
diff --git a/packages/stopwords/lib/sk.js b/packages/stopwords/lib/sk.js
new file mode 100644
index 000000000..4e0f6209c
--- /dev/null
+++ b/packages/stopwords/lib/sk.js
@@ -0,0 +1,34 @@
+export const stopwords = [
+  'ahaṃ',
+  'tava',
+  'tvayi',
+  'svayam',
+  'vayam',
+  'asmān',
+  'nas',
+  'yat',
+  'yaḥ',
+  'kiṃ',
+  'kaḥ',
+  'saḥ',
+  'taḥ',
+  'tasya',
+  'tasmai',
+  'asya',
+  'tat',
+  'tad',
+  'tatra',
+  'katham',
+  'yadi',
+  'vā',
+  'athavā',
+  'evaṃ',
+  'na',
+  'api',
+  'atha',
+  'sama',
+  'santu',
+  'antaḥ',
+  'antar',
+  'ubhau',
+];
\ No newline at end of file
diff --git a/packages/stopwords/package.json b/packages/stopwords/package.json
index cd26bd188..c70dfebcb 100644
--- a/packages/stopwords/package.json
+++ b/packages/stopwords/package.json
@@ -146,6 +146,11 @@
       "types": "./dist/uk.d.ts",
       "import": "./dist/uk.js",
       "require": "./dist/uk.cjs"
+    },
+    "./sanskrit": {
+      "types": "./dist/sk.d.ts",
+      "import": "./dist/sk.js",
+      "require": "./dist/sk.cjs"
     }
   },
   "files": [
diff --git a/packages/stopwords/scripts/build.js b/packages/stopwords/scripts/build.js
index 097cdccfd..56db6ec2a 100644
--- a/packages/stopwords/scripts/build.js
+++ b/packages/stopwords/scripts/build.js
@@ -35,6 +35,7 @@ const stemmers = {
   tamil: 'ta',
   turkish: 'tr',
   ukrainian: 'uk',
+  sanskrit: 'sk',
 }
 
 async function compile(lang, jsExtension, tsExtension, moduleType) {