Refactor hotwords, support loading hotwords from file (#296)

k2-fsa · Sep 14, 2023 · 47184f9 · 47184f9
1 parent 087367d
commit 47184f9
Show file tree

Hide file tree

Showing 34 changed files with 800 additions and 297 deletions.
diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh
@@ -166,3 +166,8 @@ python3 ./python-api-examples/offline-decode-files.py \
 python3 sherpa-onnx/python/tests/test_offline_recognizer.py --verbose
 
 rm -rf $repo
+
+# test text2token
+git clone https://github.com/pkufool/sherpa-test-data /tmp/sherpa-test-data
+
+python3 sherpa-onnx/python/tests/test_text2token.py --verbose
diff --git a/.github/workflows/test-python-offline-websocket-server.yaml b/.github/workflows/test-python-offline-websocket-server.yaml
@@ -39,7 +39,7 @@ jobs:
       - name: Install Python dependencies
         shell: bash
         run: |
-          python3 -m pip install --upgrade pip numpy
+          python3 -m pip install --upgrade pip numpy sentencepiece
 
       - name: Install sherpa-onnx
         shell: bash

diff --git a/.github/workflows/test-python-online-websocket-server.yaml b/.github/workflows/test-python-online-websocket-server.yaml
@@ -39,7 +39,7 @@ jobs:
       - name: Install Python dependencies
         shell: bash
         run: |
-          python3 -m pip install --upgrade pip numpy
+          python3 -m pip install --upgrade pip numpy sentencepiece
 
       - name: Install sherpa-onnx
         shell: bash

diff --git a/python-api-examples/non_streaming_server.py b/python-api-examples/non_streaming_server.py
@@ -326,6 +326,31 @@ def add_modified_beam_search_args(parser: argparse.ArgumentParser):
     )
 
 
+def add_hotwords_args(parser: argparse.ArgumentParser):
+    """Register the --hotwords-file and --hotwords-score options on parser."""
+    parser.add_argument(
+        "--hotwords-file",
+        type=str,
+        default="",
+        help="""
+        The file containing hotwords, one word/phrase per line; within each
+        phrase the bpe/cjkchar tokens are separated by a space. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
+        """,
+    )
+    parser.add_argument(
+        "--hotwords-score",
+        type=float,
+        default=1.5,
+        help="""
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
+        """,
+    )
+
+
 def check_args(args):
     if not Path(args.tokens).is_file():
         raise ValueError(f"{args.tokens} does not exist")
@@ -342,6 +367,10 @@ def check_args(args):
         assert Path(args.decoder).is_file(), args.decoder
         assert Path(args.joiner).is_file(), args.joiner
 
+    if args.hotwords_file != "":
+        assert args.decoding_method == "modified_beam_search", args.decoding_method
+        assert Path(args.hotwords_file).is_file(), args.hotwords_file
+
 
 def get_args():
     parser = argparse.ArgumentParser(
@@ -351,6 +380,7 @@ def get_args():
     add_model_args(parser)
     add_feature_config_args(parser)
     add_decoding_args(parser)
+    add_hotwords_args(parser)
 
     parser.add_argument(
         "--port",
@@ -792,6 +822,8 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
             feature_dim=args.feat_dim,
             decoding_method=args.decoding_method,
             max_active_paths=args.max_active_paths,
+            hotwords_file=args.hotwords_file,
+            hotwords_score=args.hotwords_score,
         )
     elif args.paraformer:
         assert len(args.nemo_ctc) == 0, args.nemo_ctc

diff --git a/python-api-examples/offline-decode-files.py b/python-api-examples/offline-decode-files.py
@@ -82,7 +82,6 @@
 from typing import List, Tuple
 
 import numpy as np
-import sentencepiece as spm
 import sherpa_onnx
 
 
@@ -98,43 +97,25 @@ def get_args():
     )
 
     parser.add_argument(
-        "--bpe-model",
+        "--hotwords-file",
         type=str,
         default="",
         help="""
-        Path to bpe.model,
-        Used only when --decoding-method=modified_beam_search
-        """,
-    )
+        The file containing hotwords, one word/phrase per line; within each
+        phrase the bpe/cjkchar tokens are separated by a space. For example:
 
-    parser.add_argument(
-        "--modeling-unit",
-        type=str,
-        default="char",
-        help="""
-        The type of modeling unit.
-        Valid values are bpe, bpe+char, char.
-        Note: the char here means characters in CJK languages.
+        ▁HE LL O ▁WORLD
+        你 好 世 界
         """,
     )
 
     parser.add_argument(
-        "--contexts",
-        type=str,
-        default="",
-        help="""
-        The context list, it is a string containing some words/phrases separated
-        with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
-        """,
-    )
-
-    parser.add_argument(
-        "--context-score",
+        "--hotwords-score",
         type=float,
         default=1.5,
         help="""
-        The context score of each token for biasing word/phrase. Used only if
-        --contexts is given.
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
         """,
     )
 
@@ -273,25 +254,6 @@ def assert_file_exists(filename: str):
         "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
     )
 
-
-def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
-    sp = None
-    if "bpe" in args.modeling_unit:
-        assert_file_exists(args.bpe_model)
-        sp = spm.SentencePieceProcessor()
-        sp.load(args.bpe_model)
-    tokens = {}
-    with open(args.tokens, "r", encoding="utf-8") as f:
-        for line in f:
-            toks = line.strip().split()
-            assert len(toks) == 2, len(toks)
-            assert toks[0] not in tokens, f"Duplicate token: {toks} "
-            tokens[toks[0]] = int(toks[1])
-    return sherpa_onnx.encode_contexts(
-        modeling_unit=args.modeling_unit, contexts=contexts, sp=sp, tokens_table=tokens
-    )
-
-
 def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
     """
     Args:
@@ -322,19 +284,13 @@ def main():
     assert_file_exists(args.tokens)
     assert args.num_threads > 0, args.num_threads
 
-    contexts_list = []
     if args.encoder:
         assert len(args.paraformer) == 0, args.paraformer
         assert len(args.nemo_ctc) == 0, args.nemo_ctc
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
         assert len(args.whisper_decoder) == 0, args.whisper_decoder
         assert len(args.tdnn_model) == 0, args.tdnn_model
 
-        contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
-        if contexts:
-            print(f"Contexts list: {contexts}")
-            contexts_list = encode_contexts(args, contexts)
-
         assert_file_exists(args.encoder)
         assert_file_exists(args.decoder)
         assert_file_exists(args.joiner)
@@ -348,7 +304,8 @@ def main():
             sample_rate=args.sample_rate,
             feature_dim=args.feature_dim,
             decoding_method=args.decoding_method,
-            context_score=args.context_score,
+            hotwords_file=args.hotwords_file,
+            hotwords_score=args.hotwords_score,
             debug=args.debug,
         )
     elif args.paraformer:
@@ -425,12 +382,7 @@ def main():
         samples, sample_rate = read_wave(wave_filename)
         duration = len(samples) / sample_rate
         total_duration += duration
-        if contexts_list:
-            assert len(args.paraformer) == 0, args.paraformer
-            assert len(args.nemo_ctc) == 0, args.nemo_ctc
-            s = recognizer.create_stream(contexts_list=contexts_list)
-        else:
-            s = recognizer.create_stream()
+        s = recognizer.create_stream()
         s.accept_waveform(sample_rate, samples)
 
         streams.append(s)

diff --git a/python-api-examples/online-decode-files.py b/python-api-examples/online-decode-files.py
@@ -48,7 +48,6 @@
 from typing import List, Tuple
 
 import numpy as np
-import sentencepiece as spm
 import sherpa_onnx
 
 
@@ -124,46 +123,25 @@ def get_args():
     )
 
     parser.add_argument(
-        "--bpe-model",
+        "--hotwords-file",
         type=str,
         default="",
         help="""
-        Path to bpe.model, it will be used to tokenize contexts biasing phrases.
-        Used only when --decoding-method=modified_beam_search
-        """,
-    )
-
-    parser.add_argument(
-        "--modeling-unit",
-        type=str,
-        default="char",
-        help="""
-        The type of modeling unit, it will be used to tokenize contexts biasing phrases.
-        Valid values are bpe, bpe+char, char.
-        Note: the char here means characters in CJK languages.
-        Used only when --decoding-method=modified_beam_search
-        """,
-    )
+        The file containing hotwords, one word/phrase per line; within each
+        phrase the bpe/cjkchar tokens are separated by a space. For example:
 
-    parser.add_argument(
-        "--contexts",
-        type=str,
-        default="",
-        help="""
-        The context list, it is a string containing some words/phrases separated
-        with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
-        Used only when --decoding-method=modified_beam_search
+        ▁HE LL O ▁WORLD
+        你 好 世 界
         """,
     )
 
     parser.add_argument(
-        "--context-score",
+        "--hotwords-score",
         type=float,
         default=1.5,
         help="""
-        The context score of each token for biasing word/phrase. Used only if
-        --contexts is given.
-        Used only when --decoding-method=modified_beam_search
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
         """,
     )
 
@@ -214,27 +192,6 @@ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
         return samples_float32, f.getframerate()
 
 
-def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
-    sp = None
-    if "bpe" in args.modeling_unit:
-        assert_file_exists(args.bpe_model)
-        sp = spm.SentencePieceProcessor()
-        sp.load(args.bpe_model)
-    tokens = {}
-    with open(args.tokens, "r", encoding="utf-8") as f:
-        for line in f:
-            toks = line.strip().split()
-            assert len(toks) == 2, len(toks)
-            assert toks[0] not in tokens, f"Duplicate token: {toks} "
-            tokens[toks[0]] = int(toks[1])
-    return sherpa_onnx.encode_contexts(
-        modeling_unit=args.modeling_unit,
-        contexts=contexts,
-        sp=sp,
-        tokens_table=tokens,
-    )
-
-
 def main():
     args = get_args()
     assert_file_exists(args.tokens)
@@ -258,7 +215,8 @@ def main():
             feature_dim=80,
             decoding_method=args.decoding_method,
             max_active_paths=args.max_active_paths,
-            context_score=args.context_score,
+            hotwords_file=args.hotwords_file,
+            hotwords_score=args.hotwords_score,
         )
     elif args.paraformer_encoder:
         recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
@@ -277,12 +235,6 @@ def main():
     print("Started!")
     start_time = time.time()
 
-    contexts_list = []
-    contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
-    if contexts:
-        print(f"Contexts list: {contexts}")
-        contexts_list = encode_contexts(args, contexts)
-
     streams = []
     total_duration = 0
     for wave_filename in args.sound_files:
@@ -291,10 +243,7 @@ def main():
         duration = len(samples) / sample_rate
         total_duration += duration
 
-        if contexts_list:
-            s = recognizer.create_stream(contexts_list=contexts_list)
-        else:
-            s = recognizer.create_stream()
+        s = recognizer.create_stream()
 
         s.accept_waveform(sample_rate, samples)
 

diff --git a/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection.py b/python-api-examples/speech-recognition-from-microphone-with-endpoint-detection.py
@@ -79,6 +79,30 @@ def get_args():
         help="Valid values: cpu, cuda, coreml",
     )
 
+    parser.add_argument(
+        "--hotwords-file",
+        type=str,
+        default="",
+        help="""
+        The file containing hotwords, one word/phrase per line; within each
+        phrase the bpe/cjkchar tokens are separated by a space. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
+        """,
+    )
+
+    parser.add_argument(
+        "--hotwords-score",
+        type=float,
+        default=1.5,
+        help="""
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
+        """,
+    )
+
+
     return parser.parse_args()
 
 
@@ -104,6 +128,8 @@ def create_recognizer(args):
         rule3_min_utterance_length=300,  # it essentially disables this rule
         decoding_method=args.decoding_method,
         provider=args.provider,
+        hotwords_file=args.hotwords_file,
+        hotwords_score=args.hotwords_score,
     )
     return recognizer