Add CTC training #3

Merged · 26 commits · Jul 31, 2021
Commits (26)
71c4e29  Add style check tools. (csukuangfj, Jul 15, 2021)
d146a4e  Remove mypy. (csukuangfj, Jul 15, 2021)
40eed74  Download LM for LibriSpeech. (csukuangfj, Jul 15, 2021)
0b19aa0  Compute features of librispeech and musan. (csukuangfj, Jul 19, 2021)
f25eedf  Fixes after review. (csukuangfj, Jul 19, 2021)
e005ea0  Minor fixes after review. (csukuangfj, Jul 20, 2021)
d5e0408  Add prepare_lang.py based on prepare_lang.sh (csukuangfj, Jul 20, 2021)
8a72901  Minor fixes. (csukuangfj, Jul 20, 2021)
a01d08f  Add self-loops to propagate disambiguation symbols. (csukuangfj, Jul 21, 2021)
f3542c7  Add CTC training. (csukuangfj, Jul 24, 2021)
2e33e24  Add CI test. (csukuangfj, Jul 24, 2021)
ee83a3e  Fix CI dependencies installation. (csukuangfj, Jul 24, 2021)
5443618  Fix CI. (csukuangfj, Jul 24, 2021)
a909592  Fix CI test errors. (csukuangfj, Jul 24, 2021)
00f8371  begin to add LM rescoring. (csukuangfj, Jul 24, 2021)
6f9fe5b  Refactor decoding code. (csukuangfj, Jul 24, 2021)
4a66712  Add LM rescoring. (csukuangfj, Jul 25, 2021)
8055bf3  Support DDP training. (csukuangfj, Jul 25, 2021)
78bb65e  Fix an error in DDP training. (csukuangfj, Jul 25, 2021)
d3101fb  Fix loading checkpoint in DDP training. (csukuangfj, Jul 26, 2021)
4ccae50  WIP: Begin to add BPE decoding (csukuangfj, Jul 26, 2021)
f65854c  Add BPE decoding results. (csukuangfj, Jul 27, 2021)
bd69e4b  Use attention decoder for rescoring. (csukuangfj, Jul 28, 2021)
acc63a9  WIP: Add BPE training code. (csukuangfj, Jul 29, 2021)
b94d97d  Disable gradient computation in evaluation mode. (csukuangfj, Jul 29, 2021)
398ed80  Minor fixes to support DDP training. (csukuangfj, Jul 31, 2021)
63 changes: 54 additions & 9 deletions egs/librispeech/ASR/local/prepare_lang.py
@@ -18,15 +18,13 @@
       lexicon = k2.Fsa.from_dict(d)
 
 5. Generate L_disambig.pt, in k2 format.
 
 6. Generate lexicon_disambig.txt
 """
 import math
-import re
-import sys
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Any, Dict, List, Tuple
 
 import k2
 import torch
@@ -90,6 +88,10 @@ def write_lexicon(filename: str, lexicon: Lexicon) -> None:
 def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
     """Write a symbol to ID mapping to a file.
 
+    Note:
+      No need to implement `read_mapping` as it can be done
+      through :func:`k2.SymbolTable.from_file`.
+
     Args:
       filename:
         Filename to save the mapping.
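
(Not part of the diff.) The round trip the note describes would look like this; a minimal sketch, with an arbitrary file name:

    import k2

    # write_mapping() is defined in this file; it writes one "symbol id"
    # pair per line.
    write_mapping("phones.txt", {"<eps>": 0, "SIL": 1, "SPN": 2})

    # k2.SymbolTable.from_file() reads the same format back, so no
    # hand-written read_mapping() is needed.
    sym2id = k2.SymbolTable.from_file("phones.txt")
    assert sym2id["SIL"] == 1  # looking up a symbol yields its ID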
@@ -119,7 +121,7 @@ def get_phones(lexicon: Lexicon) -> List[str]:
     return sorted_ans
 
 
-def get_words(lexicon: List[Tuple[str, List[str]]]) -> List[str]:
+def get_words(lexicon: Lexicon) -> List[str]:
     """Get words from a lexicon.
 
     Args:
@@ -213,12 +215,46 @@ def generate_id_map(symbols: List[str]) -> Dict[str, int]:
     return {sym: i for i, sym in enumerate(symbols)}
 
 
+def add_self_loops(
+    arcs: List[List[Any]], disambig_phone: int, disambig_word: int
+) -> List[List[Any]]:
+    """Adds self-loops to states of an FST to propagate disambiguation symbols
+    through it. They are added on each state with non-epsilon output symbols
+    on at least one arc out of the state.
+
+    See also fstaddselfloops.pl from Kaldi. One difference is that
+    Kaldi uses OpenFst style FSTs and it has multiple final states.
+    This function uses k2 style FSTs and it does not need to add self-loops
+    to the final state.
+
+    Args:
+      arcs:
+        A list-of-list. The sublist contains
+        `[src_state, dest_state, label, aux_label, score]`
+
+    Return:
+      Return new `arcs` that contain self-loops.
+    """
+    states_needs_self_loops = set()
+    for arc in arcs:
+        src, dst, ilable, olable, score = arc
+        if olable != 0:

Collaborator commented on the line above:
    lable -> label

Collaborator (Author):
    Thanks. Fixed.

+            states_needs_self_loops.add(src)
+
+    ans = []
+    for s in states_needs_self_loops:
+        ans.append([s, s, disambig_phone, disambig_word, 0])
+
+    return arcs + ans
+
+
 def lexicon_to_fst(
     lexicon: Lexicon,
     phone2id: Dict[str, int],
     word2id: Dict[str, int],
     sil_phone: str = "SIL",
     sil_prob: float = 0.5,
+    need_self_loops: bool = False,
 ) -> k2.Fsa:
     """Convert a lexicon to an FST (in k2 format) with optional silence at
     the beginning and end of the word.
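
(Illustration only, not part of the diff.) A toy example of what add_self_loops does, under the arc convention documented in its docstring:

    # State 0 has an arc with a non-epsilon output label (5), so it gets a
    # self-loop carrying the disambiguation symbols; state 1 only emits
    # epsilon, so it does not.
    arcs = [
        [0, 1, 3, 5, 0.0],  # [src_state, dest_state, label, aux_label, score]
        [1, 2, 4, 0, 0.0],
    ]
    arcs = add_self_loops(arcs, disambig_phone=10, disambig_word=20)
    # arcs now additionally contains the self-loop [0, 0, 10, 20, 0].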
@@ -235,6 +271,9 @@ def lexicon_to_fst(
       sil_prob:
         The probability for adding a silence at the beginning and end
         of the word.
+      need_self_loops:
+        If True, add self-loop to states with non-epsilon output symbols
+        on at least one arc out of the state.
     Returns:
       Return an instance of `k2.Fsa` representing the given lexicon.
     """
@@ -285,6 +324,15 @@ def lexicon_to_fst(
         arcs.append([cur_state, loop_state, prons[i], w, no_sil_score])
         arcs.append([cur_state, sil_state, prons[i], w, sil_score])
 
+    if need_self_loops:
+        disambig_phone = phone2id["#0"]
+        disambig_word = word2id["#0"]
+        arcs = add_self_loops(
+            arcs,
+            disambig_phone=disambig_phone,
+            disambig_word=disambig_word,
+        )
+
     final_state = next_state
     arcs.append([loop_state, final_state, -1, -1, 0])
     arcs.append([final_state])
@@ -346,13 +394,10 @@ def main():
         word2id=word2id,
         sil_phone=sil_phone,
         sil_prob=sil_prob,
+        need_self_loops=True,
     )
 
-    # TODO(fangjun): add self-loops to L_disambig
-    # whose ilabel is phone2id['#0'] and olable is word2id['#0']
-    # Need to implement it in k2
-
-    if True:
+    if False:
         # Just for debugging, will remove it
         torch.save(L.as_dict(), out_dir / "L.pt")
         torch.save(L_disambig.as_dict(), out_dir / "L_disambig.pt")
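
(Not part of the diff.) When the debugging branch is enabled, the saved FSTs can be loaded back exactly as the module docstring describes; a minimal sketch, assuming the script wrote its output under data/lang:

    import k2
    import torch

    # Reload the serialized lexicon FST saved by the debugging branch above.
    d = torch.load("data/lang/L_disambig.pt")
    L_disambig = k2.Fsa.from_dict(d)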
12 changes: 12 additions & 0 deletions egs/librispeech/ASR/prepare.sh
@@ -75,3 +75,15 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
 fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  echo "Stage 5: Prepare phone based lang"
+  # TODO: add BPE based lang

Collaborator commented on the TODO above:
    Incidentally, something I want to try (I was working on this in Snowfall) is to generate a BPE lexicon parallel to the phone-based lexicon, and generate a dual lexicon that contains both phones (with disambig symbols) and BPE symbols. So we could call this a "dual lang directory". It would have words.txt, phones.txt, bpe.txt.

Collaborator:
    ... then, I was thinking, we could train on both phone and BPE symbols -- perhaps alternating them on different minibatches, if time is a concern. We can even decode like this, by manipulating/rescoring lattices.

Collaborator:
    Cool idea.
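
(Illustration only.) A rough sketch of the alternating-minibatch idea from this thread; phone_graph_compiler, bpe_graph_compiler, and compute_ctc_loss are hypothetical names invented here, not existing icefall APIs:

    for batch_idx, batch in enumerate(train_dl):
        # Alternate supervision units between minibatches:
        # phones on even batches, BPE pieces on odd ones.
        compiler = phone_graph_compiler if batch_idx % 2 == 0 else bpe_graph_compiler
        decoding_graph = compiler.compile(batch["supervisions"]["text"])

        loss = compute_ctc_loss(model, batch, decoding_graph)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()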

+  mkdir -p data/lang
+
+  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
+    cat - data/lm/librispeech-lexicon.txt |
+    sort | uniq > data/lang/lexicon.txt
+
+  ./local/prepare_lang.py
+fi
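
The pipeline in this stage prepends pseudo-word entries for silence and spoken noise to the downloaded LibriSpeech lexicon before sorting. The first lines of the resulting data/lang/lexicon.txt would look roughly like this (the real-word pronunciations are illustrative):

    !SIL SIL
    <SPOKEN_NOISE> SPN
    <UNK> SPN
    A AH0
    ...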