fx speed

audeering · Nov 9, 2024 · 0b026b5 · 0b026b5
1 parent dd490e0
commit 0b026b5
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -25,27 +25,24 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python dem
 
 ## API
 
-Start Flask `api.py` on a `tmux-session`
+Run the Flask server `api.py` in a `tmux` session
 
 ```
 CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
 ```
 
 ## Inference
 
-If `api.py` runs on a different machine, copy [here](https://github.com/audeering/shift/blob/main/tts.py#L85) the IP shown in the terminal of `api.py`.
+The examples below require `api.py` to be running already. If `api.py` runs on a different machine, set the IP shown in its terminal [in `tts.py`](https://github.com/audeering/shift/blob/main/tts.py#L85).
 
 
 **Text To Speech**
 
 
 
 ```python
-# Basic TTS - See Available Voices
+# Basic TTS - See Available Voices Above - saves .wav in ./out
 python tts.py --text assets/LLM_description.txt --voice "en_US/m-ailabs_low#mary_ann"
-
-# voice cloning
-python tts.py --text assets/LLM_description.txt --native assets/native_voice.wav
 ```
 
 [Listen to Various Generations](https://huggingface.co/dkounadis/artificial-styletts2/discussions/4)

diff --git a/api.py b/api.py
@@ -30,7 +30,8 @@
 
 def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
-                       voice=None):
+                       voice=None,
+                       speed=None):
     '''24 kHZ tts'''
 
     if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
@@ -58,7 +59,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
 
     x = msinference.foreign(text=text,
                             lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
-                            speed=.87)
+                            speed=speed)
 
     return x
 
@@ -113,18 +114,19 @@ def serve_wav():
     #                      object-into-a-representation-suitable-for-mongodb
     r = request.form.to_dict(flat=False)
 
-    # Physically Save Client Files
+    # Physically Save Client Files - DELETE '/' of name ?
     for f, obj in request.files.items():
 
         obj.save(f'flask_cache/{f[-6:]}')
 
     args = SimpleNamespace(
-        text      = None if r.get('text')  is None else 'flask_cache/' + r.get('text' )[0][-6:],  # ['sample.txt']
+        text      = None if r.get('text')  is None else 'flask_cache/' + r.get('text' )[0][-6:],
         video     = None if r.get('video') is None else 'flask_cache/' + r.get('video')[0][-6:],
-        image     = None if r.get('image') is None else 'flask_cache/' + r.get('image')[0][-6:], #flask_cache/' + request.data.get("image"),
-        voice     = r.get('voice')[0],
-        native    = None if r.get('native') is None else 'flask_cache/' + r.get('native')[0],
-        affective = r.get('affective')[0]
+        image     = None if r.get('image') is None else 'flask_cache/' + r.get('image')[0][-6:],
+        native    = None if r.get('native') is None else 'flask_cache/' + r.get('native')[0][-6:],
+        affective =       r.get('affective')[0],
+        voice     =       r.get('voice')[0],
+        speed     = float(r.get('speed')[0])  # For Non-English MMS TTS
                           )  # alpha_num('/folder1/folder2/file.txt')
     # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
 
@@ -189,7 +191,9 @@ def serve_wav():
                     '#', '_').replace(
                     'cmu-arctic', 'cmu_arctic').replace(
                     '_low', '') + '.wav')
-    print('\n  STYLE VECTOR \n', precomputed_style_vector.shape)
+
+    # NOTE: style vector is None for FOREIGN LANGS
+
     # ====SILENT VIDEO====
 
     if args.video is not None:
@@ -317,7 +321,8 @@ def inpaint_banner(get_frame, t):
                 x = tts_multi_sentence(
                     text=_text_,
                     precomputed_style_vector=precomputed_style_vector,
-                    voice=args.voice)
+                    voice=args.voice,
+                    speed=args.speed)
 
                 # PAUSES BETWEEN SUBTITLE SEGMENTS
 
@@ -359,7 +364,7 @@ def inpaint_banner(get_frame, t):
             OUT_FILE = './flask_cache/tmp.mp4' #args.out_file + '_video_from_txt.mp4'
             x = tts_multi_sentence(text=text,
                                precomputed_style_vector=precomputed_style_vector,
-                               voice=args.voice)
+                               voice=args.voice,                            speed=args.speed)
             soundfile.write(AUDIO_TRACK, x, 24000)
 
     # IMAGE 2 SPEECH
@@ -376,7 +381,7 @@ def inpaint_banner(get_frame, t):
 
         x = tts_multi_sentence(text=text,
                                precomputed_style_vector=precomputed_style_vector,
-                               voice=args.voice
+                               voice=args.voice,                            speed=args.speed
                                )
         soundfile.write(AUDIO_TRACK, x, 24000)
 
@@ -404,7 +409,8 @@ def inpaint_banner(get_frame, t):
         # Fallback: No image nor video provided - do only tts
         x = tts_multi_sentence(text=text,
                             precomputed_style_vector=precomputed_style_vector, 
-                            voice=args.voice)
+                            voice=args.voice,
+                            speed=args.speed)
 
         OUT_FILE = './flask_cache/tmp.wav' #args.out_file + '.wav'
         soundfile.write(OUT_FILE, x, 24000)

diff --git a/msinference.py b/msinference.py
@@ -381,7 +381,7 @@ def preprocess_char(self, text, lang=None):
         return text
 
 
-def foreign(text=None, lang='romanian', speed=1.64):
+def foreign(text=None, lang='romanian', speed=None):
     # TTS for non english languages supported by 
     # https://huggingface.co/spaces/mms-meta/MMS
 
@@ -402,7 +402,7 @@ def foreign(text=None, lang='romanian', speed=1.64):
     elif 'rom' in lang.lower():
 
         lang_code = 'ron'
-        speed=1.24
+        speed = 1.24 if speed is None else speed
 
     else:
         lang_code = lang.split()[0].strip()

diff --git a/tts.py b/tts.py
@@ -74,10 +74,10 @@ def command_line_args():
         default=None
     )
     parser.add_argument(
-        '--scene',
-        help='Sound scene description.',
+        '--speed',
+        help='speec of TTS (only used in Non English voices).',
         type=str,
-        default=None, #'calm background sounds of a castle'
+        default=1.24,
     )
     return parser
 
@@ -91,7 +91,7 @@ def send_to_server(args):
         'text': args.text,
         'image': args.image,
         'video': args.video,
-        'scene': args.scene,
+        'speed': args.speed,
         # 'out_file': args.out_file   # let serve save as temp
     }