Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added speech recognition in text-to-image generator program #211

Merged
merged 2 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,4 @@ torch==2.3.0
ultralytics==8.3.3
diffusers==0.30.3
transformers==4.45.2
PyAudio==0.2.14
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from contextlib import nullcontext
from pathlib import Path

import speech_recognition as sr
import streamlit as st
import torch
from diffusers import StableDiffusionPipeline
from PIL import Image

# Function to load (and cache) the Stable Diffusion pipeline
@st.cache_resource
def load_pipeline():
    """Load the Stable Diffusion v1.4 pipeline on the best available device.

    The function is wrapped in ``st.cache_resource`` so the model is loaded
    once and the same pipeline object is reused on later calls.

    Returns:
        The ``StableDiffusionPipeline`` moved to ``"cuda"`` when CUDA is
        available, otherwise to ``"cpu"``.
    """
    model_id = "CompVis/stable-diffusion-v1-4"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if device == "cpu":
        # Keep the default (float32) weights on CPU; float16 is only
        # requested for the GPU path below.
        pipe = StableDiffusionPipeline.from_pretrained(model_id)
    else:
        # Half-precision weights are selected with ``variant="fp16"``;
        # selecting them through ``revision="fp16"`` is deprecated in
        # diffusers and emits a FutureWarning.
        pipe = StableDiffusionPipeline.from_pretrained(
            model_id,
            variant="fp16",
            torch_dtype=torch.float16,
        )

    pipe.to(device)
    return pipe

# Function to recognize speech
def recognize_speech(timeout=10, phrase_time_limit=30):
    """Record one phrase from the default microphone and transcribe it.

    Progress and failures are reported to the user through Streamlit
    status messages rather than raised to the caller.

    Args:
        timeout: Seconds to wait for speech to start before giving up.
            ``None`` waits indefinitely.
        phrase_time_limit: Maximum seconds of speech to record once it has
            started. ``None`` records until a pause is detected.

    Returns:
        The recognized text, or ``None`` when the microphone is unavailable,
        no speech was detected in time, the audio could not be understood,
        or the recognition request failed.
    """
    recognizer = sr.Recognizer()

    try:
        with sr.Microphone() as source:
            st.info("Listening for speech...")
            recognizer.adjust_for_ambient_noise(source, duration=1)
            # Bound the wait so a silent (or constantly noisy) room cannot
            # block the Streamlit script run forever.
            audio = recognizer.listen(
                source,
                timeout=timeout,
                phrase_time_limit=phrase_time_limit,
            )
    except sr.WaitTimeoutError:
        st.error("No speech detected; please try again")
        return None
    except (OSError, AttributeError) as e:
        # OSError: no usable input device; AttributeError: PyAudio missing.
        st.error(f"Could not access the microphone; {e}")
        return None

    try:
        st.info("Recognizing speech...")
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        st.error("Could not understand audio")
        return None
    except sr.RequestError as e:
        st.error(f"Could not request results; {e}")
        return None

    st.success(f"Recognized: {text}")
    return text

# Define the main function
def speech_textToImageGenerator():
    """Render the speech-enabled text-to-image page.

    Shows a button that fills the prompt from speech, a form with the prompt
    text box, and, once the form is submitted with a non-blank prompt,
    generates an image, displays it and saves it to disk.

    Errors are reported through Streamlit messages; nothing is returned.
    """
    # Button to use speech recognition outside the form
    if st.button("Use Speech Recognition"):
        recognized_text = recognize_speech()
        if recognized_text:
            # Persist across reruns so the text box below is pre-filled.
            st.session_state.input_text = recognized_text

    # Text input form
    with st.form("Text_input_form"):
        prompt = st.text_input(
            "Enter a prompt to generate Image:",
            st.session_state.get("input_text", ""),
        )
        submit_button = st.form_submit_button("Generate Image")

    if not submit_button:
        return

    # Treat a whitespace-only prompt as empty instead of generating an image
    # from the quality suffix alone.
    prompt = (prompt or "").strip()
    if not prompt:
        st.warning("Please enter a prompt first.")
        return

    try:
        pipe = load_pipeline()

        # Autocast only applies on CUDA; on CPU run the pipeline as-is.
        autocast = (
            torch.autocast("cuda") if torch.cuda.is_available() else nullcontext()
        )
        with autocast:
            output = pipe(prompt + " 4k, High Resolution", guidance_scale=8.5)

        image = output.images[0]
        st.image(image, caption="Generated Image", use_column_width=True)
    except Exception as e:
        # UI boundary: surface any generation failure to the user.
        st.error(f"An error occurred: {str(e)}")
        return

    # Optionally, save the image. A failure here must not be reported as a
    # generation failure, because the image is already on screen.
    output_path = Path('src/apps/pages/programs/ImageGenerators/generated_image.png')
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        image.save(output_path)
    except OSError as e:
        st.warning(f"Image generated, but it could not be saved: {e}")

    st.success("Image generated successfully!")

51 changes: 0 additions & 51 deletions src/apps/pages/programs/ImageGenerators/textToImageGenerator.py

This file was deleted.