Skip to content

Commit

Permalink
feat: added support for sanskrit diacriticals (#503)
Browse files Browse the repository at this point in the history
  • Loading branch information
melsonic authored Oct 1, 2023
1 parent e690808 commit 459f931
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 0 deletions.
2 changes: 2 additions & 0 deletions packages/orama/src/components/tokenizer/languages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ export const STEMMERS: Record<string, string> = {
tamil: 'ta',
turkish: 'tr',
ukrainian: 'uk',
sanskrit: 'sk',
}

export const SPLITTERS: Record<Language, RegExp> = {
Expand Down Expand Up @@ -58,6 +59,7 @@ export const SPLITTERS: Record<Language, RegExp> = {
slovenian: /[^a-z0-9螚ȎŠ]+/gim,
bulgarian: /[^a-z0-9а-яА-Я]+/gim,
tamil: /[^a-z0-9அ-ஹ]+/gim,
sanskrit: /[^a-z0-9A-Zāīūṛḷṃṁḥśṣṭḍṇṅñḻḹṝ]+/gim
}

export const SUPPORTED_LANGUAGES = Object.keys(STEMMERS)
Expand Down
25 changes: 25 additions & 0 deletions packages/stemmers/lib/sk.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/**
 * Light suffix-stripping stemmer for Sanskrit.
 *
 * Stemming removes at most one ending from the tail of a word. Nothing is
 * stripped from the front of the word and the result is not stemmed again.
 */
class SanskritStemmer {
  constructor() {
    // Candidate endings, tried in this order; the first one that matches wins.
    // Each entry is one of a/ā/i/ī/u/ū followed by one of ḥ/m/n/s.
    this.suffixes = [
      'aḥ', 'āḥ', 'iḥ', 'īḥ', 'uḥ', 'ūḥ',
      'am', 'ām', 'im', 'īm', 'um', 'ūm',
      'an', 'ān', 'in', 'īn', 'un', 'ūn',
      'as', 'ās', 'is', 'īs', 'us', 'ūs',
    ];
  }

  /**
   * Removes the first matching ending from `word`.
   *
   * @param {string} word - Token to stem.
   * @returns {string} The word without its ending, or the word unchanged when
   *   no ending matches or when stripping would leave nothing behind.
   */
  stem(word) {
    // Require the word to be longer than the ending so that a token which is
    // itself just an ending (e.g. 'as') is not reduced to an empty string.
    const ending = this.suffixes.find(
      (suffix) => word.length > suffix.length && word.endsWith(suffix)
    );

    return ending === undefined ? word : word.slice(0, -ending.length);
  }
}

// One stemmer is created when the module loads and reused for every call.
const sharedSanskritStemmer = new SanskritStemmer();

/**
 * Module entry point: stems a single word with the shared SanskritStemmer.
 *
 * @param {string} word - Token to stem.
 * @returns {string} Whatever `SanskritStemmer#stem` returns for `word`.
 */
export function stemmer(word) {
  const stemmed = sharedSanskritStemmer.stem(word);
  return stemmed;
}
5 changes: 5 additions & 0 deletions packages/stemmers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@
"types": "./dist/uk.d.ts",
"import": "./dist/uk.js",
"require": "./dist/uk.cjs"
},
"./sanskrit": {
"types": "./dist/sk.d.ts",
"import": "./dist/sk.js",
"require": "./dist/sk.cjs"
}
},
"files": [
Expand Down
1 change: 1 addition & 0 deletions packages/stemmers/scripts/build.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const stemmers = {
tamil: 'ta',
turkish: 'tr',
ukrainian: 'uk',
sanskrit: 'sk',
}

async function compile(lang, jsExtension, tsExtension, moduleType) {
Expand Down
34 changes: 34 additions & 0 deletions packages/stopwords/lib/sk.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/**
 * Stopword list shipped for the `sanskrit` language entry.
 *
 * Entries are lowercase Latin-script forms with diacritics. The order of the
 * entries is kept exactly as originally listed.
 */
export const stopwords = [
  'ahaṃ', 'tava', 'tvayi', 'svayam',
  'vayam', 'asmān', 'nas', 'yat',
  'yaḥ', 'kiṃ', 'kaḥ', 'saḥ',
  'taḥ', 'tasya', 'tasmai', 'asya',
  'tat', 'tad', 'tatra', 'katham',
  'yadi', 'vā', 'athavā', 'evaṃ',
  'na', 'api', 'atha', 'sama',
  'santu', 'antaḥ', 'antar', 'ubhau',
];
5 changes: 5 additions & 0 deletions packages/stopwords/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@
"types": "./dist/uk.d.ts",
"import": "./dist/uk.js",
"require": "./dist/uk.cjs"
},
"./sanskrit": {
"types": "./dist/sk.d.ts",
"import": "./dist/sk.js",
"require": "./dist/sk.cjs"
}
},
"files": [
Expand Down
1 change: 1 addition & 0 deletions packages/stopwords/scripts/build.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const stemmers = {
tamil: 'ta',
turkish: 'tr',
ukrainian: 'uk',
sanskrit: 'sk',
}

async function compile(lang, jsExtension, tsExtension, moduleType) {
Expand Down

1 comment on commit 459f931

@vercel
Copy link

@vercel vercel bot commented on 459f931 Oct 1, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.