Skip to content

Commit

Permalink
fx speed
Browse files Browse the repository at this point in the history
  • Loading branch information
dkounadis committed Nov 9, 2024
1 parent dd490e0 commit 0b026b5
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 25 deletions.
9 changes: 3 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,24 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python dem

## API

Start Flask `api.py` on a `tmux-session`
Flask `api.py` on a `tmux-session`

```
CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
```

## Inference

If `api.py` runs on a different machine, copy [here](https://github.com/audeering/shift/blob/main/tts.py#L85) the IP shown in the terminal of `api.py`.
Examples below need `api.py` to be already running. If `api.py` runs on a different machine, [use the IP shown in the terminal](https://github.com/audeering/shift/blob/main/tts.py#L85) of `api.py`.


**Text To Speech**



```python
# Basic TTS - See Available Voices
# Basic TTS - See Available Voices Above - saves .wav in ./out
python tts.py --text assets/LLM_description.txt --voice "en_US/m-ailabs_low#mary_ann"

# voice cloning
python tts.py --text assets/LLM_description.txt --native assets/native_voice.wav
```

[Listen to Various Generations](https://huggingface.co/dkounadis/artificial-styletts2/discussions/4)
Expand Down
32 changes: 19 additions & 13 deletions api.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@

def tts_multi_sentence(precomputed_style_vector=None,
text=None,
voice=None):
voice=None,
speed=None):
'''24 kHz TTS'''

if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
Expand Down Expand Up @@ -58,7 +59,7 @@ def tts_multi_sentence(precomputed_style_vector=None,

x = msinference.foreign(text=text,
lang=voice, # voice = 'romanian', 'serbian' 'hungarian'
speed=.87)
speed=speed)

return x

Expand Down Expand Up @@ -113,18 +114,19 @@ def serve_wav():
# object-into-a-representation-suitable-for-mongodb
r = request.form.to_dict(flat=False)

# Physically Save Client Files
# Physically Save Client Files - DELETE / of name ?
for f, obj in request.files.items():

obj.save(f'flask_cache/{f[-6:]}')

args = SimpleNamespace(
text = None if r.get('text') is None else 'flask_cache/' + r.get('text' )[0][-6:], # ['sample.txt']
text = None if r.get('text') is None else 'flask_cache/' + r.get('text' )[0][-6:],
video = None if r.get('video') is None else 'flask_cache/' + r.get('video')[0][-6:],
image = None if r.get('image') is None else 'flask_cache/' + r.get('image')[0][-6:], #flask_cache/' + request.data.get("image"),
voice = r.get('voice')[0],
native = None if r.get('native') is None else 'flask_cache/' + r.get('native')[0],
affective = r.get('affective')[0]
image = None if r.get('image') is None else 'flask_cache/' + r.get('image')[0][-6:],
native = None if r.get('native') is None else 'flask_cache/' + r.get('native')[0][-6:],
affective = r.get('affective')[0],
voice = r.get('voice')[0],
speed = float(r.get('speed')[0]) # For Non-English MMS TTS
) # alpha_num('/folder1/folder2/file.txt')
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')

Expand Down Expand Up @@ -189,7 +191,9 @@ def serve_wav():
'#', '_').replace(
'cmu-arctic', 'cmu_arctic').replace(
'_low', '') + '.wav')
print('\n STYLE VECTOR \n', precomputed_style_vector.shape)

# NOTE: style vector is None for FOREIGN LANGS

# ====SILENT VIDEO====

if args.video is not None:
Expand Down Expand Up @@ -317,7 +321,8 @@ def inpaint_banner(get_frame, t):
x = tts_multi_sentence(
text=_text_,
precomputed_style_vector=precomputed_style_vector,
voice=args.voice)
voice=args.voice,
speed=args.speed)

# PAUSES BETWEEN SUBTITLE SEGMENTS

Expand Down Expand Up @@ -359,7 +364,7 @@ def inpaint_banner(get_frame, t):
OUT_FILE = './flask_cache/tmp.mp4' #args.out_file + '_video_from_txt.mp4'
x = tts_multi_sentence(text=text,
precomputed_style_vector=precomputed_style_vector,
voice=args.voice)
voice=args.voice, speed=args.speed)
soundfile.write(AUDIO_TRACK, x, 24000)

# IMAGE 2 SPEECH
Expand All @@ -376,7 +381,7 @@ def inpaint_banner(get_frame, t):

x = tts_multi_sentence(text=text,
precomputed_style_vector=precomputed_style_vector,
voice=args.voice
voice=args.voice, speed=args.speed
)
soundfile.write(AUDIO_TRACK, x, 24000)

Expand Down Expand Up @@ -404,7 +409,8 @@ def inpaint_banner(get_frame, t):
# Fallback: Neither image nor video provided - do only TTS
x = tts_multi_sentence(text=text,
precomputed_style_vector=precomputed_style_vector,
voice=args.voice)
voice=args.voice,
speed=args.speed)

OUT_FILE = './flask_cache/tmp.wav' #args.out_file + '.wav'
soundfile.write(OUT_FILE, x, 24000)
Expand Down
4 changes: 2 additions & 2 deletions msinference.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ def preprocess_char(self, text, lang=None):
return text


def foreign(text=None, lang='romanian', speed=1.64):
def foreign(text=None, lang='romanian', speed=None):
# TTS for non english languages supported by
# https://huggingface.co/spaces/mms-meta/MMS

Expand All @@ -402,7 +402,7 @@ def foreign(text=None, lang='romanian', speed=1.64):
elif 'rom' in lang.lower():

lang_code = 'ron'
speed=1.24
speed = 1.24 if speed is None else speed

else:
lang_code = lang.split()[0].strip()
Expand Down
8 changes: 4 additions & 4 deletions tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,10 @@ def command_line_args():
default=None
)
parser.add_argument(
'--scene',
help='Sound scene description.',
'--speed',
help='speec of TTS (only used in Non English voices).',
type=str,
default=None, #'calm background sounds of a castle'
default=1.24,
)
return parser

Expand All @@ -91,7 +91,7 @@ def send_to_server(args):
'text': args.text,
'image': args.image,
'video': args.video,
'scene': args.scene,
'speed': args.speed,
# 'out_file': args.out_file # let serve save as temp
}

Expand Down

0 comments on commit 0b026b5

Please sign in to comment.