From 9c522b48438b728fd3bc9b1615abb3c30d386b98 Mon Sep 17 00:00:00 2001
From: Jai Dhingra
Date: Tue, 29 Oct 2024 18:19:50 +0530
Subject: [PATCH 1/2] Added speech recognition in text-to-image generator program

---
 .../speech_textToImageGenerator.py            | 101 +++++++++++++++++++
 .../ImageGenerators/textToImageGenerator.py   |  51 ----------
 2 files changed, 101 insertions(+), 51 deletions(-)
 create mode 100644 src/apps/pages/programs/ImageGenerators/speech_textToImageGenerator.py
 delete mode 100644 src/apps/pages/programs/ImageGenerators/textToImageGenerator.py

diff --git a/src/apps/pages/programs/ImageGenerators/speech_textToImageGenerator.py b/src/apps/pages/programs/ImageGenerators/speech_textToImageGenerator.py
new file mode 100644
index 00000000..bb9c8bc2
--- /dev/null
+++ b/src/apps/pages/programs/ImageGenerators/speech_textToImageGenerator.py
@@ -0,0 +1,101 @@
+from pathlib import Path
+
+import streamlit as st
+from PIL import Image
+import torch
+from diffusers import StableDiffusionPipeline
+import speech_recognition as sr
+
+# Generated images are stored next to this module, independent of the working directory
+OUTPUT_IMAGE_PATH = Path(__file__).resolve().parent / "generated_image.png"
+
+# Function to generate image
+@st.cache_resource
+def load_pipeline():
+    """Load the Stable Diffusion pipeline once and cache it across reruns."""
+    model_id = "CompVis/stable-diffusion-v1-4"
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Load the pipeline without float16 if using CPU
+    if device == "cpu":
+        pipe = StableDiffusionPipeline.from_pretrained(model_id)
+    else:
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
+
+    pipe.to(device)
+    return pipe
+
+# Function to recognize speech
+def recognize_speech():
+    """Record one phrase from the default microphone and transcribe it.
+
+    Returns the recognized text, or None when the microphone is unavailable,
+    no speech starts in time, or recognition fails.
+    """
+    recognizer = sr.Recognizer()
+    try:
+        with sr.Microphone() as source:
+            st.info("Listening for speech...")
+            recognizer.adjust_for_ambient_noise(source, duration=1)
+            # Bound the wait so a silent microphone cannot block the app forever
+            audio = recognizer.listen(source, timeout=10, phrase_time_limit=30)
+    except sr.WaitTimeoutError:
+        st.error("No speech detected. Please try again.")
+        return None
+    except (OSError, AttributeError) as e:
+        # No input device, or PyAudio is missing
+        st.error(f"Microphone is not available: {e}")
+        return None
+
+    try:
+        st.info("Recognizing speech...")
+        text = recognizer.recognize_google(audio)
+        st.success(f"Recognized: {text}")
+        return text
+    except sr.UnknownValueError:
+        st.error("Could not understand audio")
+    except sr.RequestError as e:
+        st.error(f"Could not request results; {e}")
+    return None
+
+# Define the main function
+def speech_textToImageGenerator():
+    """Render the prompt form (typed or spoken) and show the generated image."""
+    # Button to use speech recognition outside the form
+    if st.button("Use Speech Recognition"):
+        recognized_text = recognize_speech()
+        if recognized_text:
+            st.session_state.input_text = recognized_text
+
+    # Text input form
+    with st.form("Text_input_form"):
+        prompt = st.text_input("Enter a prompt to generate Image:", st.session_state.get("input_text", ""))
+        submit_button = st.form_submit_button("Generate Image")
+
+    if not submit_button:
+        return
+
+    # Whitespace-only input counts as an empty prompt
+    prompt = prompt.strip()
+    if not prompt:
+        st.warning("Please enter a prompt first.")
+        return
+
+    try:
+        pipe = load_pipeline()
+
+        # Generate the image
+        if torch.cuda.is_available():
+            with torch.autocast("cuda"):
+                output = pipe(prompt + " 4k, High Resolution", guidance_scale=8.5)
+        else:
+            output = pipe(prompt + " 4k, High Resolution", guidance_scale=8.5)
+
+        image = output.images[0]
+        st.image(image, caption="Generated Image", use_column_width=True)
+
+        # Optionally, save the image
+        image.save(OUTPUT_IMAGE_PATH)
+        st.success("Image generated successfully!")
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
diff --git a/src/apps/pages/programs/ImageGenerators/textToImageGenerator.py b/src/apps/pages/programs/ImageGenerators/textToImageGenerator.py
deleted file mode 100644
index cc782885..00000000
--- a/src/apps/pages/programs/ImageGenerators/textToImageGenerator.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import streamlit as st
-from PIL import Image
-import torch
-from diffusers import StableDiffusionPipeline
-
-# Function to generate image
-@st.cache_resource
-def load_pipeline():
-    model_id = "CompVis/stable-diffusion-v1-4"
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    # Load the pipeline without float16 if using CPU
-    if device == "cpu":
-        pipe = StableDiffusionPipeline.from_pretrained(model_id)
-    else:
-        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
-
-    pipe.to(device)
-    return pipe
-
-# Define the main function
-def textToImageGenerator():
-    with st.form("Text_input_form"):
-        st.session_state.input_text = st.text_input("Enter a prompt to generate Image:")
-
-        start_button = st.form_submit_button("Generate Image")
-        if start_button:
-            st.session_state.running = True
-
-            if st.session_state.input_text:
-                try:
-                    pipe = load_pipeline()
-
-                    if torch.cuda.is_available():
-                        with torch.autocast("cuda"):
-                            output = pipe(st.session_state.input_text + " 4k, High Resolution", guidance_scale=8.5)
-                    else:
-                        output = pipe(st.session_state.input_text + " 4k, High Resolution", guidance_scale=8.5)
-
-                    image = output.images[0]
-                    st.image(image, caption="Generated Image", use_column_width=True)
-
-                    # Optionally, save the image
-                    image.save('src/apps/pages/programs/ImageGenerators/generated_image.png')
-                    st.success("Image generated successfully!")
-                except Exception as e:
-                    st.error(f"An error occurred: {str(e)}")
-            else:
-                st.warning("Please enter a prompt first.")
-
-

From 13a065ad5a7c5af77f0d1f9c2fe46375d37c5918 Mon Sep 17 00:00:00 2001
From: Jai Dhingra <117927011+jaidh01@users.noreply.github.com>
Date: Tue, 29 Oct 2024 21:09:35 +0530
Subject: [PATCH 2/2] Updated requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 8ee2b256..3621e915 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -40,3 +40,4 @@ torch==2.3.0
 ultralytics==8.3.3
 diffusers==0.30.3
 transformers==4.45.2
+PyAudio==0.2.14