Skip to content

Commit

Permalink
Refactor hotwords, support loading hotwords from file (#296)
Browse files Browse the repository at this point in the history
  • Loading branch information
pkufool authored Sep 14, 2023
1 parent 087367d commit 47184f9
Show file tree
Hide file tree
Showing 34 changed files with 800 additions and 297 deletions.
5 changes: 5 additions & 0 deletions .github/scripts/test-python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,8 @@ python3 ./python-api-examples/offline-decode-files.py \
python3 sherpa-onnx/python/tests/test_offline_recognizer.py --verbose

rm -rf $repo

# test text2token
git clone https://github.com/pkufool/sherpa-test-data /tmp/sherpa-test-data

python3 sherpa-onnx/python/tests/test_text2token.py --verbose
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
- name: Install Python dependencies
shell: bash
run: |
python3 -m pip install --upgrade pip numpy
python3 -m pip install --upgrade pip numpy sentencepiece
- name: Install sherpa-onnx
shell: bash
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test-python-online-websocket-server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
- name: Install Python dependencies
shell: bash
run: |
python3 -m pip install --upgrade pip numpy
python3 -m pip install --upgrade pip numpy sentencepiece
- name: Install sherpa-onnx
shell: bash
Expand Down
32 changes: 32 additions & 0 deletions python-api-examples/non_streaming_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,31 @@ def add_modified_beam_search_args(parser: argparse.ArgumentParser):
)


def add_hotwords_args(parser: argparse.ArgumentParser):
    """Register the hotwords (contextual biasing) options on ``parser``.

    Two options are added in place:

      --hotwords-file   (str, default "")    path to the hotwords file
      --hotwords-score  (float, default 1.5) per-token biasing score

    Args:
      parser:
        The argument parser that receives the two options.
    """
    parser.add_argument(
        "--hotwords-file",
        type=str,
        default="",
        help="""
        The file containing hotwords, one word/phrase per line, and for each
        phrase the bpe/cjkchar are separated by a space. For example:
        ▁HE LL O ▁WORLD
        你 好 世 界
        Used only when --decoding-method=modified_beam_search.
        """,
    )

    parser.add_argument(
        "--hotwords-score",
        type=float,
        default=1.5,
        help="""
        The hotword score of each token for biasing word/phrase. Used only if
        --hotwords-file is given and --decoding-method=modified_beam_search.
        """,
    )


def check_args(args):
if not Path(args.tokens).is_file():
raise ValueError(f"{args.tokens} does not exist")
Expand All @@ -342,6 +367,10 @@ def check_args(args):
assert Path(args.decoder).is_file(), args.decoder
assert Path(args.joiner).is_file(), args.joiner

if args.hotwords_file != "":
assert args.decoding_method == "modified_beam_search", args.decoding_method
assert Path(args.hotwords_file).is_file(), args.hotwords_file


def get_args():
parser = argparse.ArgumentParser(
Expand All @@ -351,6 +380,7 @@ def get_args():
add_model_args(parser)
add_feature_config_args(parser)
add_decoding_args(parser)
add_hotwords_args(parser)

parser.add_argument(
"--port",
Expand Down Expand Up @@ -792,6 +822,8 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
feature_dim=args.feat_dim,
decoding_method=args.decoding_method,
max_active_paths=args.max_active_paths,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
elif args.paraformer:
assert len(args.nemo_ctc) == 0, args.nemo_ctc
Expand Down
70 changes: 11 additions & 59 deletions python-api-examples/offline-decode-files.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@
from typing import List, Tuple

import numpy as np
import sentencepiece as spm
import sherpa_onnx


Expand All @@ -98,43 +97,25 @@ def get_args():
)

parser.add_argument(
"--bpe-model",
"--hotwords-file",
type=str,
default="",
help="""
Path to bpe.model,
Used only when --decoding-method=modified_beam_search
""",
)
The file containing hotwords, one words/phrases per line, and for each
phrase the bpe/cjkchar are separated by a space. For example:
parser.add_argument(
"--modeling-unit",
type=str,
default="char",
help="""
The type of modeling unit.
Valid values are bpe, bpe+char, char.
Note: the char here means characters in CJK languages.
▁HE LL O ▁WORLD
你 好 世 界
""",
)

parser.add_argument(
"--contexts",
type=str,
default="",
help="""
The context list, it is a string containing some words/phrases separated
with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
""",
)

parser.add_argument(
"--context-score",
"--hotwords-score",
type=float,
default=1.5,
help="""
The context score of each token for biasing word/phrase. Used only if
--contexts is given.
The hotword score of each token for biasing word/phrase. Used only if
--hotwords-file is given.
""",
)

Expand Down Expand Up @@ -273,25 +254,6 @@ def assert_file_exists(filename: str):
"https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
)


def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
    """Encode biasing phrases via ``sherpa_onnx.encode_contexts``.

    A SentencePiece processor is loaded from ``args.bpe_model`` only when
    ``args.modeling_unit`` contains "bpe"; otherwise ``sp`` is passed as None.
    The token table is read from ``args.tokens``, where every line must split
    (on whitespace) into exactly two fields: the token and its integer id.

    Args:
      args:
        Parsed command-line arguments; ``modeling_unit``, ``bpe_model`` and
        ``tokens`` are read.
      contexts:
        The words/phrases to encode.

    Returns:
      Whatever ``sherpa_onnx.encode_contexts`` returns for the given inputs.

    Raises:
      AssertionError: If a line of the tokens file does not have exactly two
        fields, or a token appears more than once.
    """
    sp = None
    uses_bpe = "bpe" in args.modeling_unit
    if uses_bpe:
        assert_file_exists(args.bpe_model)
        sp = spm.SentencePieceProcessor()
        sp.load(args.bpe_model)

    token2id = {}
    with open(args.tokens, "r", encoding="utf-8") as token_file:
        for raw_line in token_file:
            fields = raw_line.strip().split()
            assert len(fields) == 2, len(fields)
            symbol, index = fields
            assert symbol not in token2id, f"Duplicate token: {fields} "
            token2id[symbol] = int(index)

    return sherpa_onnx.encode_contexts(
        modeling_unit=args.modeling_unit,
        contexts=contexts,
        sp=sp,
        tokens_table=token2id,
    )


def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
"""
Args:
Expand Down Expand Up @@ -322,19 +284,13 @@ def main():
assert_file_exists(args.tokens)
assert args.num_threads > 0, args.num_threads

contexts_list = []
if args.encoder:
assert len(args.paraformer) == 0, args.paraformer
assert len(args.nemo_ctc) == 0, args.nemo_ctc
assert len(args.whisper_encoder) == 0, args.whisper_encoder
assert len(args.whisper_decoder) == 0, args.whisper_decoder
assert len(args.tdnn_model) == 0, args.tdnn_model

contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
if contexts:
print(f"Contexts list: {contexts}")
contexts_list = encode_contexts(args, contexts)

assert_file_exists(args.encoder)
assert_file_exists(args.decoder)
assert_file_exists(args.joiner)
Expand All @@ -348,7 +304,8 @@ def main():
sample_rate=args.sample_rate,
feature_dim=args.feature_dim,
decoding_method=args.decoding_method,
context_score=args.context_score,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
debug=args.debug,
)
elif args.paraformer:
Expand Down Expand Up @@ -425,12 +382,7 @@ def main():
samples, sample_rate = read_wave(wave_filename)
duration = len(samples) / sample_rate
total_duration += duration
if contexts_list:
assert len(args.paraformer) == 0, args.paraformer
assert len(args.nemo_ctc) == 0, args.nemo_ctc
s = recognizer.create_stream(contexts_list=contexts_list)
else:
s = recognizer.create_stream()
s = recognizer.create_stream()
s.accept_waveform(sample_rate, samples)

streams.append(s)
Expand Down
73 changes: 11 additions & 62 deletions python-api-examples/online-decode-files.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
from typing import List, Tuple

import numpy as np
import sentencepiece as spm
import sherpa_onnx


Expand Down Expand Up @@ -124,46 +123,25 @@ def get_args():
)

parser.add_argument(
"--bpe-model",
"--hotwords-file",
type=str,
default="",
help="""
Path to bpe.model, it will be used to tokenize contexts biasing phrases.
Used only when --decoding-method=modified_beam_search
""",
)

parser.add_argument(
"--modeling-unit",
type=str,
default="char",
help="""
The type of modeling unit, it will be used to tokenize contexts biasing phrases.
Valid values are bpe, bpe+char, char.
Note: the char here means characters in CJK languages.
Used only when --decoding-method=modified_beam_search
""",
)
The file containing hotwords, one words/phrases per line, and for each
phrase the bpe/cjkchar are separated by a space. For example:
parser.add_argument(
"--contexts",
type=str,
default="",
help="""
The context list, it is a string containing some words/phrases separated
with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
Used only when --decoding-method=modified_beam_search
▁HE LL O ▁WORLD
你 好 世 界
""",
)

parser.add_argument(
"--context-score",
"--hotwords-score",
type=float,
default=1.5,
help="""
The context score of each token for biasing word/phrase. Used only if
--contexts is given.
Used only when --decoding-method=modified_beam_search
The hotword score of each token for biasing word/phrase. Used only if
--hotwords-file is given.
""",
)

Expand Down Expand Up @@ -214,27 +192,6 @@ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
return samples_float32, f.getframerate()


def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
    """Encode biasing phrases via ``sherpa_onnx.encode_contexts``.

    When ``args.modeling_unit`` contains "bpe", a SentencePiece processor is
    loaded from ``args.bpe_model`` (after checking the file exists); otherwise
    no processor is created. The token table is built from ``args.tokens``:
    each line is stripped and split on whitespace into a token and its id.

    Args:
      args:
        Parsed command-line arguments; ``modeling_unit``, ``bpe_model`` and
        ``tokens`` are read.
      contexts:
        The words/phrases to encode.

    Returns:
      Whatever ``sherpa_onnx.encode_contexts`` returns for the given inputs.

    Raises:
      AssertionError: If a line of the tokens file does not have exactly two
        fields, or a token appears more than once.
    """
    if "bpe" in args.modeling_unit:
        assert_file_exists(args.bpe_model)
        sp = spm.SentencePieceProcessor()
        sp.load(args.bpe_model)
    else:
        sp = None

    table = {}
    with open(args.tokens, "r", encoding="utf-8") as fin:
        for entry in fin:
            parts = entry.strip().split()
            assert len(parts) == 2, len(parts)
            assert parts[0] not in table, f"Duplicate token: {parts} "
            table[parts[0]] = int(parts[1])

    encode_kwargs = dict(
        modeling_unit=args.modeling_unit,
        contexts=contexts,
        sp=sp,
        tokens_table=table,
    )
    return sherpa_onnx.encode_contexts(**encode_kwargs)


def main():
args = get_args()
assert_file_exists(args.tokens)
Expand All @@ -258,7 +215,8 @@ def main():
feature_dim=80,
decoding_method=args.decoding_method,
max_active_paths=args.max_active_paths,
context_score=args.context_score,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
elif args.paraformer_encoder:
recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
Expand All @@ -277,12 +235,6 @@ def main():
print("Started!")
start_time = time.time()

contexts_list = []
contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
if contexts:
print(f"Contexts list: {contexts}")
contexts_list = encode_contexts(args, contexts)

streams = []
total_duration = 0
for wave_filename in args.sound_files:
Expand All @@ -291,10 +243,7 @@ def main():
duration = len(samples) / sample_rate
total_duration += duration

if contexts_list:
s = recognizer.create_stream(contexts_list=contexts_list)
else:
s = recognizer.create_stream()
s = recognizer.create_stream()

s.accept_waveform(sample_rate, samples)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,30 @@ def get_args():
help="Valid values: cpu, cuda, coreml",
)

parser.add_argument(
"--hotwords-file",
type=str,
default="",
help="""
The file containing hotwords, one words/phrases per line, and for each
phrase the bpe/cjkchar are separated by a space. For example:
▁HE LL O ▁WORLD
你 好 世 界
""",
)

parser.add_argument(
"--hotwords-score",
type=float,
default=1.5,
help="""
The hotword score of each token for biasing word/phrase. Used only if
--hotwords-file is given.
""",
)


return parser.parse_args()


Expand All @@ -104,6 +128,8 @@ def create_recognizer(args):
rule3_min_utterance_length=300, # it essentially disables this rule
decoding_method=args.decoding_method,
provider=args.provider,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
return recognizer

Expand Down
Loading

0 comments on commit 47184f9

Please sign in to comment.