VOICEVOX · qryxip · Jan 5, 2025 · Jan 3, 2025 · Jan 4, 2025
diff --git a/crates/voicevox_core/src/infer/domains.rs b/crates/voicevox_core/src/infer/domains.rs
@@ -1,70 +1,87 @@
+pub(crate) mod experimental_talk;
 mod frame_decode;
 mod singing_teacher;
-mod talk;
+pub(crate) mod talk;
 
 use educe::Educe;
 use serde::{Deserialize, Deserializer};
 
 pub(crate) use self::{
+    experimental_talk::{
+        ExperimentalTalkDomain, ExperimentalTalkOperation, GenerateFullIntermediateInput,
+        GenerateFullIntermediateOutput, RenderAudioSegmentInput, RenderAudioSegmentOutput,
+    },
     frame_decode::{FrameDecodeDomain, FrameDecodeOperation, SfDecodeInput, SfDecodeOutput},
     singing_teacher::{
         PredictSingConsonantLengthInput, PredictSingConsonantLengthOutput, PredictSingF0Input,
         PredictSingF0Output, PredictSingVolumeInput, PredictSingVolumeOutput, SingingTeacherDomain,
         SingingTeacherOperation,
     },
-    talk::{
-        GenerateFullIntermediateInput, GenerateFullIntermediateOutput, PredictDurationInput,
-        PredictDurationOutput, PredictIntonationInput, PredictIntonationOutput,
-        RenderAudioSegmentInput, RenderAudioSegmentOutput, TalkDomain, TalkOperation,
-    },
+    talk::{DecodeInput, DecodeOutput, TalkDomain, TalkOperation},
 };
 
 #[derive(Educe)]
 // TODO: `bounds`に`V: ?Sized`も入れようとすると、よくわからない理由で弾かれる。最新版のeduce
 // でもそうなのか？また最新版でも駄目だとしたら、弾いている理由は何なのか？
 #[educe(Clone(
-    bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
+    bound = "V: InferenceDomainMapValues, V::Talk: Clone, V::ExperimentalTalk: Clone, V::SingingTeacher: Clone, V::FrameDecode: Clone"
 ))]
 pub(crate) struct InferenceDomainMap<V: InferenceDomainMapValues + ?Sized> {
     pub(crate) talk: V::Talk,
+    pub(crate) experimental_talk: V::ExperimentalTalk, // slot for the experimental talk domain; its type is chosen by `V::ExperimentalTalk` and must be `Clone` for the map to be `Clone`
     pub(crate) singing_teacher: V::SingingTeacher,
     pub(crate) frame_decode: V::FrameDecode,
 }
 
-impl<T, S, F> InferenceDomainMap<(T, S, F)> {
-    pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &S, &F)> {
+impl<T, X, S, F> InferenceDomainMap<(T, X, S, F)> { // tuple positions: talk, experimental_talk, singing_teacher, frame_decode
+    pub(crate) fn each_ref(&self) -> InferenceDomainMap<(&T, &X, &S, &F)> { // returns a map holding a shared reference to each field of `self`
         let talk = &self.talk;
+        let experimental_talk = &self.experimental_talk;
         let singing_teacher = &self.singing_teacher;
         let frame_decode = &self.frame_decode;
         InferenceDomainMap {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         }
     }
 
-    pub(crate) fn map<T2, S2, F2, Ft: FnOnce(T) -> T2, Fs: FnOnce(S) -> S2, Ff: FnOnce(F) -> F2>(
+    pub(crate) fn map< // consumes `self` and `fs`, applying each closure in `fs` to the field of the same name
+        T2,
+        X2,
+        S2,
+        F2,
+        Ft: FnOnce(T) -> T2,
+        Fx: FnOnce(X) -> X2, // transformer for the `experimental_talk` field
+        Fs: FnOnce(S) -> S2,
+        Ff: FnOnce(F) -> F2,
+    >(
         self,
-        fs: InferenceDomainMap<(Ft, Fs, Ff)>,
-    ) -> InferenceDomainMap<(T2, S2, F2)> {
+        fs: InferenceDomainMap<(Ft, Fx, Fs, Ff)>,
+    ) -> InferenceDomainMap<(T2, X2, S2, F2)> {
         let talk = (fs.talk)(self.talk);
+        let experimental_talk = (fs.experimental_talk)(self.experimental_talk); // closures are invoked in field order, each exactly once
         let singing_teacher = (fs.singing_teacher)(self.singing_teacher);
         let frame_decode = (fs.frame_decode)(self.frame_decode);
         InferenceDomainMap {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         }
     }
 }
 
-impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)> {
-    pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, S, F)>, E> {
+impl<T, X, S, F, E> InferenceDomainMap<(Result<T, E>, Result<X, E>, Result<S, E>, Result<F, E>)> {
+    pub(crate) fn collect(self) -> Result<InferenceDomainMap<(T, X, S, F)>, E> {
         let talk = self.talk?;
+        let experimental_talk = self.experimental_talk?;
         let singing_teacher = self.singing_teacher?;
         let frame_decode = self.frame_decode?;
         Ok(InferenceDomainMap {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         })
@@ -74,6 +91,7 @@ impl<T, S, F, E> InferenceDomainMap<(Result<T, E>, Result<S, E>, Result<F, E>)>
 impl<'de, V: InferenceDomainMapValues + ?Sized> Deserialize<'de> for InferenceDomainMap<V>
 where
     V::Talk: Deserialize<'de>,
+    V::ExperimentalTalk: Deserialize<'de>,
     V::SingingTeacher: Deserialize<'de>,
     V::FrameDecode: Deserialize<'de>,
 {
@@ -83,18 +101,21 @@ where
     {
         let Repr {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         } = Repr::deserialize(deserializer)?;
         return Ok(Self {
             talk,
+            experimental_talk,
             singing_teacher,
             frame_decode,
         });
 
         #[derive(Deserialize)]
-        struct Repr<T, S, F> {
+        struct Repr<T, X, S, F> { // field-for-field mirror of `InferenceDomainMap` over plain type parameters; `X` is the experimental-talk slot, as in the tuple impls
             talk: T,
+            experimental_talk: X,
             singing_teacher: S,
             frame_decode: F,
         }
@@ -103,12 +124,14 @@ where
 
 pub(crate) trait InferenceDomainMapValues {
     type Talk;
+    type ExperimentalTalk; // type stored in `InferenceDomainMap::experimental_talk`
     type SingingTeacher;
     type FrameDecode;
 }
 
-impl<T, S, F> InferenceDomainMapValues for (T, S, F) {
+impl<T, X, S, F> InferenceDomainMapValues for (T, X, S, F) { // a 4-tuple supplies the four value types positionally
     type Talk = T;
+    type ExperimentalTalk = X; // second tuple position
     type SingingTeacher = S;
     type FrameDecode = F;
 }
@@ -120,6 +143,10 @@ macro_rules! inference_domain_map_values {
                 $body
                 where $arg = crate::infer::domains::TalkDomain as crate::infer::InferenceDomain
             ),
+            ::macros::substitute_type!(
+                $body
+                where $arg = crate::infer::domains::ExperimentalTalkDomain as crate::infer::InferenceDomain
+            ),
             ::macros::substitute_type!(
                 $body
                 where $arg = crate::infer::domains::SingingTeacherDomain as crate::infer::InferenceDomain

diff --git a/crates/voicevox_core/src/infer/domains/experimental_talk.rs b/crates/voicevox_core/src/infer/domains/experimental_talk.rs
@@ -0,0 +1,116 @@
+use std::{collections::BTreeSet, sync::LazyLock};
+
+use enum_map::Enum;
+use macros::{InferenceInputSignature, InferenceOperation, InferenceOutputSignature};
+use ndarray::{Array0, Array1, Array2};
+
+use crate::{manifest::ExperimentalTalkManifest, StyleType};
+
+use super::super::{
+    InferenceDomain, InferenceInputSignature as _, InferenceOutputSignature as _, OutputTensor,
+};
+
+pub(crate) enum ExperimentalTalkDomain {} // variant-less enum: no value can exist, so it serves purely as a type-level marker for the `InferenceDomain` impl
+
+impl InferenceDomain for ExperimentalTalkDomain { // ties the domain to its operation enum and its manifest type
+    type Operation = ExperimentalTalkOperation;
+    type Manifest = ExperimentalTalkManifest;
+
+    fn style_types() -> &'static BTreeSet<StyleType> { // returns a set containing only `StyleType::Talk`
+        static STYLE_TYPES: LazyLock<BTreeSet<StyleType>> = // initialized on first access, then reused for every later call
+            LazyLock::new(|| [StyleType::Talk].into());
+        &STYLE_TYPES
+    }
+}
+
+#[derive(Clone, Copy, Enum, InferenceOperation)]
+#[inference_operation(
+    type Domain = ExperimentalTalkDomain;
+)]
+pub(crate) enum ExperimentalTalkOperation { // operations of `ExperimentalTalkDomain`; each variant's attribute names its input and output types
+    #[inference_operation(
+        type Input = PredictDurationInput;
+        type Output = PredictDurationOutput;
+    )]
+    PredictDuration, // PredictDurationInput -> PredictDurationOutput
+
+    #[inference_operation(
+        type Input = PredictIntonationInput;
+        type Output = PredictIntonationOutput;
+    )]
+    PredictIntonation, // PredictIntonationInput -> PredictIntonationOutput
+
+    #[inference_operation(
+        type Input = GenerateFullIntermediateInput;
+        type Output = GenerateFullIntermediateOutput;
+    )]
+    GenerateFullIntermediate, // GenerateFullIntermediateInput -> GenerateFullIntermediateOutput
+
+    #[inference_operation(
+        type Input = RenderAudioSegmentInput;
+        type Output = RenderAudioSegmentOutput;
+    )]
+    RenderAudioSegment, // RenderAudioSegmentInput -> RenderAudioSegmentOutput
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = PredictDuration;
+)]
+pub(crate) struct PredictDurationInput { // input of `ExperimentalTalkOperation::PredictDuration`
+    pub(crate) phoneme_list: Array1<i64>,
+    pub(crate) speaker_id: Array1<i64>, // NOTE(review): an array rather than a scalar; presumably one element — confirm against the caller
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct PredictDurationOutput { // output of `ExperimentalTalkOperation::PredictDuration`
+    pub(crate) phoneme_length: Array1<f32>, // NOTE(review): presumably one entry per element of `phoneme_list` — confirm
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = PredictIntonation;
+)]
+pub(crate) struct PredictIntonationInput { // input of `ExperimentalTalkOperation::PredictIntonation`
+    pub(crate) length: Array0<i64>, // zero-dimensional array, i.e. a single i64; NOTE(review): presumably the shared length of the list fields below — confirm
+    pub(crate) vowel_phoneme_list: Array1<i64>,
+    pub(crate) consonant_phoneme_list: Array1<i64>,
+    pub(crate) start_accent_list: Array1<i64>,
+    pub(crate) end_accent_list: Array1<i64>,
+    pub(crate) start_accent_phrase_list: Array1<i64>,
+    pub(crate) end_accent_phrase_list: Array1<i64>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct PredictIntonationOutput { // output of `ExperimentalTalkOperation::PredictIntonation`
+    pub(crate) f0_list: Array1<f32>,
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = GenerateFullIntermediate;
+)]
+pub(crate) struct GenerateFullIntermediateInput { // input of `ExperimentalTalkOperation::GenerateFullIntermediate`
+    pub(crate) f0: Array2<f32>,
+    pub(crate) phoneme: Array2<f32>,
+    pub(crate) speaker_id: Array1<i64>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct GenerateFullIntermediateOutput { // output of `ExperimentalTalkOperation::GenerateFullIntermediate`
+    pub(crate) spec: Array2<f32>, // same name and type as `RenderAudioSegmentInput::spec`; NOTE(review): presumably handed on to RenderAudioSegment — confirm at the call site
+}
+
+#[derive(InferenceInputSignature)]
+#[inference_input_signature(
+    type Signature = RenderAudioSegment;
+)]
+pub(crate) struct RenderAudioSegmentInput { // input of `ExperimentalTalkOperation::RenderAudioSegment`
+    pub(crate) spec: Array2<f32>,
+}
+
+#[derive(InferenceOutputSignature)]
+pub(crate) struct RenderAudioSegmentOutput { // output of `ExperimentalTalkOperation::RenderAudioSegment`
+    pub(crate) wave: Array1<f32>,
+}
diff --git a/crates/voicevox_core/src/infer/domains/talk.rs b/crates/voicevox_core/src/infer/domains/talk.rs
@@ -41,16 +41,10 @@ pub(crate) enum TalkOperation {
     PredictIntonation,
 
     #[inference_operation(
-        type Input = GenerateFullIntermediateInput;
-        type Output = GenerateFullIntermediateOutput;
+        type Input = DecodeInput;
+        type Output = DecodeOutput;
     )]
-    GenerateFullIntermediate,
-
-    #[inference_operation(
-        type Input = RenderAudioSegmentInput;
-        type Output = RenderAudioSegmentOutput;
-    )]
-    RenderAudioSegment,
+    Decode,
 }
 
 #[derive(InferenceInputSignature)]
@@ -89,28 +83,15 @@ pub(crate) struct PredictIntonationOutput {
 
 #[derive(InferenceInputSignature)]
 #[inference_input_signature(
-    type Signature = GenerateFullIntermediate;
+    type Signature = Decode;
 )]
-pub(crate) struct GenerateFullIntermediateInput {
+pub(crate) struct DecodeInput { // input of `TalkOperation::Decode`
     pub(crate) f0: Array2<f32>,
     pub(crate) phoneme: Array2<f32>,
     pub(crate) speaker_id: Array1<i64>,
 }
 
 #[derive(InferenceOutputSignature)]
-pub(crate) struct GenerateFullIntermediateOutput {
-    pub(crate) spec: Array2<f32>,
-}
-
-#[derive(InferenceInputSignature)]
-#[inference_input_signature(
-    type Signature = RenderAudioSegment;
-)]
-pub(crate) struct RenderAudioSegmentInput {
-    pub(crate) spec: Array2<f32>,
-}
-
-#[derive(InferenceOutputSignature)]
-pub(crate) struct RenderAudioSegmentOutput {
+pub(crate) struct DecodeOutput { // output of `TalkOperation::Decode`; carries only the `wave` array
     pub(crate) wave: Array1<f32>,
 }