Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

split decoder into sosoa and vocoder #28

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,8 @@ python run.py \
```

## モデルをonnxに変換
* `python convert.py --yukarin_s_model_dir "model/yukarin_s" --yukarin_sa_model_dir "model/yukarin_sa" --yukarin_sosoa_model_dir "model/yukarin_sosoa" --hifigan_model_dir "model/hifigan"` でonnxへの変換が可能。modelフォルダ内のyukarin_s, yukarin_sa, yukarin_sosoaフォルダにonnxが保存される
* `python convert.py --yukarin_s_model_dir "model/yukarin_s" --yukarin_sa_model_dir "model/yukarin_sa" --yukarin_sosoa_model_dir "model/yukarin_sosoa" --hifigan_model_dir "model/hifigan"` でonnxへの変換が可能。`--working_dir`で指定したフォルダ(デフォルトはmodel)にonnxが保存される
- `speaker_ids`オプションに指定する数値は自由。どの数値を指定しても生成されるonnxモデルは全ての`speaker_id`に対応しており、値を変えて実行しなおしたり、複数のidを指定したりする必要は無い。
- yukarin_sosoaフォルダにはhifi_ganと合わせた`decode.onnx`が保存される
- yukarin_sosfはオプショナルで、追加する場合は`--yukarin_sosf_model_dir "model/yukarin_sosf"`などを指定する

* onnxで実行したい場合は`run.py`を`--method=onnx`で実行する。例: `python run.py --yukarin_s_model_dir "model" --yukarin_sa_model_dir "model" --yukarin_sosoa_model_dir "model" --hifigan_model_dir "model" --speaker_ids 5 --method=onnx`
Expand Down Expand Up @@ -86,7 +85,7 @@ python run.py \

## 自分で学習したモデルの onnx を作りたい場合

VOICEVOX をビルドするには以下の 3 つの onnx が必要です。
VOICEVOX をビルドするには以下の 4 つの onnx が必要です。
(predict_contourはオプショナルです。)

- predict_duration.onnx
Expand Down Expand Up @@ -161,7 +160,7 @@ VOICEVOX をビルドするには以下の 3 つの onnx が必要です。
- shape: [length]
- dtype: bool
- 値は True か False
- decode.onnx
- predict_spectrogram.onnx
- 入力
- f0
- shape: [length, 1]
Expand All @@ -173,14 +172,28 @@ VOICEVOX をビルドするには以下の 3 つの onnx が必要です。
- speaker_id
- shape: [1]
- dtype: int
- 出力
- spec
- shape: [length, feats]
- dtype: float
- 周波数分解能が feats のメルスペクトログラム
- vocoder.onnx
- 入力
- f0
- shape: [length, 1]
- dtype: float
- spec
- shape: [length, feats]
- dtype: float
- 周波数分解能が feats のメルスペクトログラム
- 出力
- wave
- shape: [outlength]
- dtype: float
- 値は [-1.0, 1.0] の音声波形
- サンプリング周波数は 24kHz

音素 id は辞書に依存します。また predict_duration.onnx や predict_intonation.onnx の出力はコアによって変換されて decode.onnx の入力になります。コアを変更しない場合は phoneme_length を元に f0 と phoneme が 93.75(=24k/256)Hz になるように変換されます。
音素 id は辞書に依存します。また predict_duration.onnx や predict_intonation.onnx の出力はコアによって変換されて predict_spectrogram.onnx や vocoder.onnx の入力になります。コアを変更しない場合は phoneme_length を元に f0 と phoneme と spec が 93.75(=24k/256)Hz になるように変換されます。

## パッケージの追加・更新

Expand Down
57 changes: 6 additions & 51 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def convert_spectrogram(model_dir: Path, device: str, offset: int, working_dir:
dynamic_axes={
"f0": {0: "length"},
"phoneme": {0: "length"},
"spec": {0: "row", 1: "col"}
"spec": {0: "length", 1: "feats"}
}
)
return outpath, size
Expand All @@ -221,7 +221,7 @@ def convert_vocoder(model_dir: Path, device: str, working_dir: Path, sample_inpu
to_tensor(sample_input["spec"], device=device),
to_tensor(sample_input["f0"], device=device),
)
outpath = working_dir.joinpath(f"vocoder.onnx")
outpath = working_dir.joinpath(f"vocoder_unopt.onnx")
torch.onnx.export(
wrapper,
args,
Expand All @@ -231,7 +231,7 @@ def convert_vocoder(model_dir: Path, device: str, working_dir: Path, sample_inpu
input_names=["spec", "f0"],
output_names=["wave"],
dynamic_axes={
"spec": {0: "row", 1: "col"},
"spec": {0: "length", 1: "feats"},
"f0": {0: "length"},
}
)
Expand Down Expand Up @@ -431,52 +431,6 @@ def rename(graph, prefix: str, freeze_names: List[str]):
logger.info(f"saved {output_onnx_path}")
return output_onnx_path


def fuse(onnx1: Path, onnx2: Path):
    """Connect two ONNX models in series and save the merged model.

    Used to chain the spectrogram model into the vocoder model. The nodes of
    both graphs are concatenated; the merged graph takes its inputs from
    ``onnx1`` and its outputs from ``onnx2``. No explicit wiring is done:
    a tensor produced by the first model feeds the second model only
    because both graphs use the same tensor name for it (e.g. ``spec``).

    Args:
        onnx1: Path to the upstream model. Its opset version and graph
            inputs are used for the merged model.
        onnx2: Path to the downstream model. Its graph outputs become the
            outputs of the merged model.

    Returns:
        Path of the saved model: ``decode_unopt.onnx`` in the directory
        that contains ``onnx1``.

    Raises:
        AssertionError: If the two graphs share a name among their
            initializers, sparse initializers, or value_info entries.
    """
    # NOTE: onnx.compose.merge_models could be used instead of this
    # hand-written merge:
    # https://github.com/onnx/onnx/blob/main/docs/PythonAPIOverview.md#onnx-compose
    logger = logging.getLogger("fuse")
    model1 = onnx.load(onnx1)
    model2 = onnx.load(onnx2)
    opset = model1.opset_import[0].version
    logger.info("opset: %d", opset)

    merged_graph = onnx.GraphProto()
    merged_graph.node.extend(model1.graph.node)
    merged_graph.node.extend(model2.graph.node)
    # The "spec" output of model1 is consumed directly as the "spec" input
    # of model2 (matched by tensor name), so only model1's inputs and
    # model2's outputs are exposed on the merged graph.
    merged_graph.input.extend(model1.graph.input)
    merged_graph.output.extend(model2.graph.output)

    # (graph field, how to read an entry's name). SparseTensorProto has no
    # ``name`` field of its own; its name is stored on ``values``.
    named_fields = (
        ("initializer", lambda entry: entry.name),
        ("sparse_initializer", lambda entry: entry.values.name),
        ("value_info", lambda entry: entry.name),
    )
    for field, name_of in named_fields:
        entries1 = getattr(model1.graph, field)
        entries2 = getattr(model2.graph, field)
        # Both graphs end up in one namespace, so shared names would clash.
        shared = {name_of(e) for e in entries1} & {name_of(e) for e in entries2}
        assert not shared, f"duplicate {field} names: {sorted(shared)}"
        merged_field = getattr(merged_graph, field)
        merged_field.extend(entries1)
        merged_field.extend(entries2)

    merged_graph.name = "decoder"

    merged = onnx.helper.make_model(
        merged_graph,
        opset_imports=[onnx.helper.make_operatorsetid("", opset)],
    )
    logger.info(f"fused {onnx1} and {onnx2}")
    output_onnx_path = onnx1.parent / "decode_unopt.onnx"
    # Validate before writing so that an invalid merge never reaches disk.
    onnx.checker.check_model(merged)
    onnx.save(merged, output_onnx_path)
    logger.info(f"saved {output_onnx_path}")
    return output_onnx_path

def optim(path: Path, output_path: Path):
"""ONNX Runtime sessionを作るときに走る最適化を利用する"""
sess_options = onnxruntime.SessionOptions()
Expand Down Expand Up @@ -570,13 +524,14 @@ def run(
if len(contour_onnx_list) > 0:
contour_merged_onnx = concat(contour_onnx_list, offsets)
spectrogram_merged_onnx = concat(spectrogram_onnx_list, offsets)
decoder_onnx = fuse(spectrogram_merged_onnx, vocoder_onnx)
logger.info("--- optimization ---")
optim(duration_merged_onnx, working_dir / "duration.onnx")
optim(intonation_merged_onnx, working_dir / "intonation.onnx")
if len(contour_onnx_list) > 0:
optim(contour_merged_onnx, working_dir / "contour.onnx")
optim(decoder_onnx, working_dir / "decode.onnx")
optim(spectrogram_merged_onnx, working_dir / "spectrogram.onnx")
optim(vocoder_onnx, working_dir / "vocoder.onnx")

logger.info("--- DONE! ---")


Expand Down
2 changes: 2 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
from itertools import product
import numpy as np
from pathlib import Path
from typing import List, Optional

Expand All @@ -19,6 +20,7 @@ def run(
speaker_ids: List[int],
method: str,
):
np.random.seed(0)
device = "cuda" if use_gpu else "cpu"
if method == "torch":
from vv_core_inference.make_decode_forwarder import make_decode_forwarder
Expand Down
14 changes: 11 additions & 3 deletions vv_core_inference/onnx_decode_forwarder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@ def make_decode_forwarder(yukarin_sosoa_model_dir: Path, hifigan_model_dir: Path
providers = ['CPUExecutionProvider']
if device == "cuda":
providers.insert(0, 'CUDAExecutionProvider')
session = onnxruntime.InferenceSession(
str(yukarin_sosoa_model_dir.joinpath("decode.onnx")),
spectrogram_session = onnxruntime.InferenceSession(
str(yukarin_sosoa_model_dir.joinpath("spectrogram.onnx")),
providers=providers
)
vocoder_session = onnxruntime.InferenceSession(
str(hifigan_model_dir.joinpath("vocoder.onnx")),
providers=providers
)

Expand All @@ -26,10 +30,14 @@ def _dispatcher(
if speaker_id is not None:
speaker_id = np.asarray(speaker_id)
speaker_id = speaker_id.reshape((1,)).astype(np.int64)
wave = session.run(["wave"], {
spec = spectrogram_session.run(["spec"], {
"f0": f0,
"phoneme": phoneme,
"speaker_id": speaker_id,
})[0]
wave = vocoder_session.run(["wave"], {
"spec": spec,
"f0": f0,
})[0]
return None, wave
return _dispatcher
Loading