split decoder into sosoa and vocoder
Yosshi999 committed Oct 6, 2024
1 parent 4a243d4 commit 6611f67
Showing 3 changed files with 60 additions and 59 deletions.
23 changes: 18 additions & 5 deletions README.md
@@ -42,9 +42,8 @@ python run.py \
```

## Converting the models to ONNX
* Running `python convert.py --yukarin_s_model_dir "model/yukarin_s" --yukarin_sa_model_dir "model/yukarin_sa" --yukarin_sosoa_model_dir "model/yukarin_sosoa" --hifigan_model_dir "model/hifigan"` converts the models to ONNX. The ONNX files are saved into the yukarin_s, yukarin_sa, and yukarin_sosoa folders inside the model folder
* Running `python convert.py --yukarin_s_model_dir "model/yukarin_s" --yukarin_sa_model_dir "model/yukarin_sa" --yukarin_sosoa_model_dir "model/yukarin_sosoa" --hifigan_model_dir "model/hifigan"` converts the models to ONNX. The ONNX files are saved into the folder given by `--working_dir` (default: model)
- Any number may be passed to the `speaker_ids` option. Whichever value you specify, the generated ONNX models support every `speaker_id`, so there is no need to rerun with a different value or to pass multiple ids.
- The yukarin_sosoa folder receives a `decode.onnx` fused with hifi_gan
- yukarin_sosf is optional; to include it, pass something like `--yukarin_sosf_model_dir "model/yukarin_sosf"`

* To run with ONNX, run `run.py` with `--method=onnx`; `python run.py --yukarin_s_model_dir "model" --yukarin_sa_model_dir "model" --yukarin_sosoa_model_dir "model" --hifigan_model_dir "model" --speaker_ids 5 --method=onnx`
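
After conversion, a quick hedged check of the expected outputs (file names taken from `run()` in convert.py further down; contour.onnx appears only when yukarin_sosf is supplied):

```
# Hedged sketch: verify the working dir (default "model") holds the optimized models.
from pathlib import Path

expected = ["duration.onnx", "intonation.onnx", "spectrogram.onnx", "vocoder.onnx"]
missing = [n for n in expected if not (Path("model") / n).exists()]
print("missing:", missing)  # contour.onnx is optional (yukarin_sosf)
```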
@@ -86,7 +85,7 @@ python run.py \

## Building ONNX files from your own trained models

Building VOICEVOX requires the following 3 ONNX files.
Building VOICEVOX requires the following 4 ONNX files.
(predict_contour is optional.)

- predict_duration.onnx
@@ -161,7 +160,7 @@ Building VOICEVOX requires the following 3 ONNX files.
      - shape: [length]
      - dtype: bool
      - values are True or False
- decode.onnx
- predict_spectrogram.onnx
  - inputs
    - f0
      - shape: [length, 1]
@@ -173,14 +172,28 @@ Building VOICEVOX requires the following 3 ONNX files.
    - speaker_id
      - shape: [1]
      - dtype: int
  - outputs
    - spec
      - shape: [length, feats]
      - dtype: float
      - a mel spectrogram with frequency resolution feats
- vocoder.onnx
  - inputs
    - f0
      - shape: [length, 1]
      - dtype: float
    - spec
      - shape: [length, feats]
      - dtype: float
      - a mel spectrogram with frequency resolution feats
  - outputs
    - wave
      - shape: [outlength]
      - dtype: float
      - an audio waveform with values in [-1.0, 1.0]
      - the sampling frequency is 24kHz
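
A minimal sketch (not part of the repository) of chaining the two halves with onnxruntime, following the I/O spec above; the one-hot phoneme layout, the sizes, and the model paths are illustrative assumptions:

```
import numpy as np
import onnxruntime

length, phoneme_size = 96, 45  # illustrative sizes (assumptions)

f0 = np.full((length, 1), 5.0, dtype=np.float32)              # per-frame pitch
phoneme = np.zeros((length, phoneme_size), dtype=np.float32)  # dummy one-hot phonemes
phoneme[:, 0] = 1.0
speaker_id = np.array([5], dtype=np.int64)

spec_session = onnxruntime.InferenceSession(
    "model/spectrogram.onnx", providers=["CPUExecutionProvider"])
vocoder_session = onnxruntime.InferenceSession(
    "model/vocoder.onnx", providers=["CPUExecutionProvider"])

# predict_spectrogram: (f0, phoneme, speaker_id) -> spec [length, feats]
spec = spec_session.run(["spec"], {"f0": f0, "phoneme": phoneme, "speaker_id": speaker_id})[0]
# vocoder: (spec, f0) -> wave [outlength], 24kHz samples in [-1.0, 1.0]
wave = vocoder_session.run(["wave"], {"spec": spec, "f0": f0})[0]
```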

Phoneme ids depend on the dictionary. The outputs of predict_duration.onnx and predict_intonation.onnx are transformed by the core into the inputs of decode.onnx. If the core is left unchanged, f0 and phoneme are converted, based on phoneme_length, to a 93.75 (=24k/256) Hz frame rate.
Phoneme ids depend on the dictionary. The outputs of predict_duration.onnx and predict_intonation.onnx are transformed by the core into the inputs of predict_spectrogram.onnx and vocoder.onnx. If the core is left unchanged, f0, phoneme, and spec are converted, based on phoneme_length, to a 93.75 (=24k/256) Hz frame rate.
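
A hedged sketch of that conversion (the core's actual resampling may differ in detail): each phoneme's features are repeated for its duration at 93.75 frames per second.

```
import numpy as np

FRAME_RATE = 24000 / 256  # 93.75 Hz

def expand_to_frames(phoneme_length, values):
    """Repeat values[i] for round(phoneme_length[i] * FRAME_RATE) frames (assumed scheme)."""
    counts = np.round(np.asarray(phoneme_length) * FRAME_RATE).astype(np.int64)
    return np.repeat(np.asarray(values), counts, axis=0)
```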

## Adding and updating packages

82 changes: 31 additions & 51 deletions convert.py
@@ -204,7 +204,7 @@ def convert_spectrogram(model_dir: Path, device: str, offset: int, working_dir:
        dynamic_axes={
            "f0": {0: "length"},
            "phoneme": {0: "length"},
            "spec": {0: "row", 1: "col"}
            "spec": {0: "length", 1: "feats"}
        }
    )
    return outpath, size
@@ -221,7 +221,7 @@ def convert_vocoder(model_dir: Path, device: str, working_dir: Path, sample_inpu
to_tensor(sample_input["spec"], device=device),
to_tensor(sample_input["f0"], device=device),
)
outpath = working_dir.joinpath(f"vocoder.onnx")
outpath = working_dir.joinpath(f"vocoder_unopt.onnx")
torch.onnx.export(
wrapper,
args,
@@ -231,7 +231,7 @@ def convert_vocoder(model_dir: Path, device: str, working_dir: Path, sample_inpu
input_names=["spec", "f0"],
output_names=["wave"],
dynamic_axes={
"spec": {0: "row", 1: "col"},
"spec": {0: "length", 1: "feats"},
"f0": {0: "length"},
}
)
@@ -431,59 +431,36 @@ def rename(graph, prefix: str, freeze_names: List[str]):
logger.info(f"saved {output_onnx_path}")
return output_onnx_path


def fuse(onnx1: Path, onnx2: Path):
    """Connects two ONNX models in series. Used to connect the spectrogram model and the vocoder."""
    # you can use onnx.compose.merge_models
    # https://github.com/onnx/onnx/blob/main/docs/PythonAPIOverview.md#onnx-compose
    logger = logging.getLogger("fuse")
    model1 = onnx.load(onnx1)
    model2 = onnx.load(onnx2)
    opset = model1.opset_import[0].version
    logger.info("opset: %d" % opset)

    merged_graph = onnx.GraphProto()
    merged_graph.node.extend(model1.graph.node)
    merged_graph.node.extend(model2.graph.node)
    # model1's output "spec" is connected directly to model2's input "spec"
    merged_graph.input.extend(model1.graph.input)
    merged_graph.output.extend(model2.graph.output)

    init1 = set([i.name for i in model1.graph.initializer])
    init2 = set([i.name for i in model2.graph.initializer])
    assert len(init1 & init2) == 0
    merged_graph.initializer.extend(model1.graph.initializer)
    merged_graph.initializer.extend(model2.graph.initializer)

    spinit1 = set([i.name for i in model1.graph.sparse_initializer])
    spinit2 = set([i.name for i in model2.graph.sparse_initializer])
    assert len(spinit1 & spinit2) == 0
    merged_graph.sparse_initializer.extend(model1.graph.sparse_initializer)
    merged_graph.sparse_initializer.extend(model2.graph.sparse_initializer)

    info1 = set([i.name for i in model1.graph.value_info])
    info2 = set([i.name for i in model2.graph.value_info])
    assert len(info1 & info2) == 0
    merged_graph.value_info.extend(model1.graph.value_info)
    merged_graph.value_info.extend(model2.graph.value_info)

    merged_graph.name = "decoder"

    merged = onnx.helper.make_model(merged_graph, opset_imports=[onnx.helper.make_operatorsetid("", opset)])
    logger.info(f"fused {onnx1} and {onnx2}")
    output_onnx_path = onnx1.parent / "decode_unopt.onnx"
    onnx.checker.check_model(merged)
    onnx.save(merged, output_onnx_path)
    logger.info(f"saved {output_onnx_path}")
    return output_onnx_path
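
For reference, the comment at the top of fuse() points at onnx.compose; a hedged sketch of the equivalent call, wiring model1's spec output into model2's spec input:

```
# Sketch of fuse() via the official helper (model1/model2 as loaded above).
merged = onnx.compose.merge_models(model1, model2, io_map=[("spec", "spec")])
```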

def optim(path: Path, output_path: Path):
    """Uses the optimization pass that runs when an ONNX Runtime session is created"""
    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
    sess_options.optimized_model_filepath = str(output_path)
    session = onnxruntime.InferenceSession(str(path), sess_options)

def repair_vocoder(path: Path, output_path: Path, hifigan_input):
    """Depending on the model config, the f0 input may be missing from the vocoder ONNX, so restore it"""
    model = onnx.load(path)
    opset = model.opset_import[0].version
    input_nodes = list(model.graph.input)
    input_names = [n.name for n in input_nodes]
    if "f0" not in input_names:
        sample_f0_shape = list(hifigan_input["f0"].shape)
        sample_f0_shape[0] = None  # axis-0 is dynamic
        arg_f0 = onnx.helper.make_tensor_value_info("f0", onnx.TensorProto.FLOAT, sample_f0_shape)
        input_nodes.append(arg_f0)

    repaired_graph = onnx.helper.make_graph(
        nodes=list(model.graph.node),
        name="vocoder",
        inputs=input_nodes,
        outputs=list(model.graph.output),
        initializer=list(model.graph.initializer)
    )
    repaired_model = onnx.helper.make_model(repaired_graph, opset_imports=[onnx.helper.make_operatorsetid("", opset)])
    onnx.checker.check_model(repaired_model)
    onnx.save(repaired_model, output_path)
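
Context for this repair step (an observation, not stated in the diff): torch.onnx.export omits graph inputs that the traced graph never consumes, so a HiFi-GAN config that ignores f0 can yield a vocoder ONNX lacking that input; repair_vocoder re-adds it so every exported vocoder exposes the same (spec, f0) interface. A hedged inspection snippet:

```
# Check which inputs survived export (path as produced by run() below).
import onnx

model = onnx.load("model/vocoder_maybef0.onnx")
print([i.name for i in model.graph.input])  # e.g. ["spec"] or ["spec", "f0"]
```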

def run(
    yukarin_s_model_dir: List[Path],
    yukarin_sa_model_dir: List[Path],
@@ -570,13 +547,16 @@ def run(
    if len(contour_onnx_list) > 0:
        contour_merged_onnx = concat(contour_onnx_list, offsets)
    spectrogram_merged_onnx = concat(spectrogram_onnx_list, offsets)
    decoder_onnx = fuse(spectrogram_merged_onnx, vocoder_onnx)
    logger.info("--- optimization ---")
    optim(duration_merged_onnx, working_dir / "duration.onnx")
    optim(intonation_merged_onnx, working_dir / "intonation.onnx")
    if len(contour_onnx_list) > 0:
        optim(contour_merged_onnx, working_dir / "contour.onnx")
    optim(decoder_onnx, working_dir / "decode.onnx")
    optim(spectrogram_merged_onnx, working_dir / "spectrogram.onnx")
    optim(vocoder_onnx, working_dir / "vocoder_maybef0.onnx")
    logger.info("--- vocoder repair ---")
    repair_vocoder(working_dir / "vocoder_maybef0.onnx", working_dir / "vocoder.onnx", sample_inputs["hifigan_input"])

    logger.info("--- DONE! ---")


Expand Down
14 changes: 11 additions & 3 deletions vv_core_inference/onnx_decode_forwarder.py
@@ -9,8 +9,12 @@ def make_decode_forwarder(yukarin_sosoa_model_dir: Path, hifigan_model_dir: Path
    providers = ['CPUExecutionProvider']
    if device == "cuda":
        providers.insert(0, 'CUDAExecutionProvider')
    session = onnxruntime.InferenceSession(
        str(yukarin_sosoa_model_dir.joinpath("decode.onnx")),
    spectrogram_session = onnxruntime.InferenceSession(
        str(yukarin_sosoa_model_dir.joinpath("spectrogram.onnx")),
        providers=providers
    )
    vocoder_session = onnxruntime.InferenceSession(
        str(hifigan_model_dir.joinpath("vocoder.onnx")),
        providers=providers
    )

@@ -26,10 +30,14 @@ def _dispatcher(
        if speaker_id is not None:
            speaker_id = np.asarray(speaker_id)
            speaker_id = speaker_id.reshape((1,)).astype(np.int64)
        wave = session.run(["wave"], {
        spec = spectrogram_session.run(["spec"], {
            "f0": f0,
            "phoneme": phoneme,
            "speaker_id": speaker_id,
        })[0]
        wave = vocoder_session.run(["wave"], {
            "spec": spec,
            "f0": f0,
        })[0]
        return None, wave
    return _dispatcher
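
A hedged usage sketch of the split forwarder; the dispatcher's keyword names and the device argument are assumptions based on the visible diff:

```
from pathlib import Path

forwarder = make_decode_forwarder(Path("model"), Path("model"), device="cpu")
# f0: float32 [length, 1]; phoneme: float32 one-hot [length, phoneme_size] (assumed)
_, wave = forwarder(f0=f0, phoneme=phoneme, speaker_id=5)
```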
