🌏 add more language models

this adds some previously erroneously excluded language models
bugbakery · Oct 11, 2023 · 8e0e0d4 · 8e0e0d4
1 parent 10ee41a
commit 8e0e0d4
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 1 deletion.
diff --git a/server/app/models.yml b/server/app/models.yml
@@ -20,6 +20,13 @@ English:
   size: 128M
   type: transcription
   compressed: true
+- name: big-2
+  url: https://alphacephei.com/vosk/models/vosk-model-en-us-0.42-gigaspeech.zip
+  description: Accurate generic US English model trained by Kaldi on <a href="http://kaldi-asr.org/models/m14">Gigaspeech</a>.
+    Mostly for podcasts, not for telephony
+  size: 2.3G
+  type: transcription
+  compressed: true
 Indian English:
 - name: big
   url: https://alphacephei.com/vosk/models/vosk-model-en-in-0.5.zip
@@ -154,6 +161,14 @@ Portuguese/Brazilian Portuguese:
   size: 1.6G
   type: transcription
   compressed: true
+Greek:
+- name: big
+  url: https://alphacephei.com/vosk/models/vosk-model-el-gr-0.7.zip
+  description: Big narrowband Greek model for server processing, not extremely accurate
+    though
+  size: 1.1G
+  type: transcription
+  compressed: true
 Turkish:
 - name: small
   url: https://alphacephei.com/vosk/models/vosk-model-small-tr-0.3.zip
@@ -235,6 +250,13 @@ Farsi:
   size: 47M
   type: transcription
   compressed: true
+- name: big
+  url: https://alphacephei.com/vosk/models/vosk-model-fa-0.5.zip
+  description: Model with large vocabulary, not yet accurate but better than before
+    (Persian)
+  size: 1G
+  type: transcription
+  compressed: true
 - name: small-2
   url: https://alphacephei.com/vosk/models/vosk-model-small-fa-0.5.zip
   description: Bigger small model for desktop application (Persian)

diff --git a/server/scripts/generate_models_list.py b/server/scripts/generate_models_list.py
@@ -20,15 +20,17 @@
 for row in rows:
     if strong := row.find("strong"):
         current_lang = strong.text
+        print(current_lang)
     else:
         assert (
             current_lang is not None
         ), "no previous language heading found, probably the format changed :("
         raw = {k: v for k, v in zip(columns, row.find_all("td"))}
 
-        if current_lang == "English Other" or "not" in raw["Notes"].text.lower():
+        if current_lang == "English Other" or "not recommended" in raw["Notes"].text.lower():
             continue
 
+
         if current_lang == "Speaker identification model":
             continue