Merge pull request #3 from GabrielSCabrera/dev

Merge changes from dev
GabrielSCabrera · Dec 28, 2023 · 7059ca6 · 7059ca6
2 parents 80dadee + a36c8f6
commit 7059ca6
Show file tree

Hide file tree

Showing 9 changed files with 113 additions and 1,200 deletions.
diff --git a/Pipfile b/Pipfile
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/banterbot/characters/therapist.py b/banterbot/characters/therapist.py
@@ -12,12 +12,12 @@ def run() -> None:
     # Optional system prompt to set up a custom character prior to initializing BanterBot.
     system = (
         "You are Grendel the Therapy Troll, an emotionally unstable and bottom of the barrel moronic Troll therapist."
-        " You have a far more eloquent brother named Grondle the Quiz Troll, who is a game show host, and you have a "
-        " mischievious pet possum named Boggle. You are quick to anger amongst other emotions, and are extremely stupid"
-        " and bad at your job, and have a tenuous grasp at English. You will do your best to help the patient with"
-        " their problems. Your speaking patterns match the following examples: `Me Grendel`, `You got hurt in brain,"
-        " huh?`, `How about: instead of do big thing, uh... do little thing, every day.`, `No! Grendel not mean"
-        " that!`. Be sure everything you say is in a format suitable for dictation, rather than reading."
+        " You have a far more eloquent brother named Grondle the Quiz Troll, who is a game show host. You are quick to"
+        " anger amongst other emotions, and are extremely stupid and bad at your job, and have a tenuous grasp at"
+        " English. You will do your best to help the patient with their problems. Your speaking patterns should match"
+        " the following examples, but don't overuse these specific phrases: `Me Grendel`, `You got hurt in brain,"
+        " huh?`, `No! Grendel not mean that!`. Be sure everything you say is in a format suitable for dictation, rather"
+        " than reading."
     )
 
     interface = TKInterface(

diff --git a/banterbot/config.py b/banterbot/config.py
@@ -28,5 +28,5 @@
 # Define the punctuation marks that can be used to split sentences into phrases for prosody selection.
 PHRASE_DELIM = [",", ".", "?", "!", ":", ";", '"', "`", "|", "\n", "\t", "\r\n"]
 
-# The amount of time that should be added to a "soft interruption" as defined in class `SpeechToText`.
-INTERRUPTION_DELAY: datetime.timedelta = datetime.timedelta(seconds=1.0)
+# The amount of time that should be added to a "soft interruption" as defined in class `SpeechRecognitionService`.
+INTERRUPTION_DELAY: datetime.timedelta = datetime.timedelta(seconds=0.5)
diff --git a/banterbot/extensions/interface.py b/banterbot/extensions/interface.py
@@ -93,28 +93,6 @@ def __init__(
         # Initialize the subclass GUI
         self._init_gui()
 
-    @property
-    def listening(self) -> bool:
-        """
-        If the current instance of `SpeechSynthesisService` is in the process of listening, returns True. Otherwise,
-        returns False.
-
-        Args:
-            bool: The listening state of the current instance.
-        """
-        return self._speech_recognition_service._listening
-
-    @property
-    def speaking(self) -> bool:
-        """
-        If the current instance of `SpeechRecognitionService` is in the process of speaking, returns True. Otherwise,
-        returns False.
-
-        Args:
-            bool: The speaking state of the current instance.
-        """
-        return self._speech_synthesis_service.speaking
-
     def interrupt(self, shutdown_time: Optional[int] = None) -> None:
         """
         Interrupts all speech-to-text recognition, text-to-speech synthesis, and OpenAI API streams.
@@ -127,8 +105,8 @@ def interrupt(self, shutdown_time: Optional[int] = None) -> None:
         self._interrupt = time.perf_counter_ns() if not shutdown_time else shutdown_time
         self._openai_service.interrupt(kill=True)
         self._openai_service_tone.interrupt(kill=True)
-        self._speech_recognition_service.interrupt(kill=False)
-        self._speech_synthesis_service.interrupt(kill=True)
+        self._speech_recognition_service.interrupt()
+        self._speech_synthesis_service.interrupt()
 
     def listener_activate(self, name: Optional[str] = None) -> None:
         """
@@ -279,8 +257,8 @@ def respond(self, init_time: int) -> None:
                 raise FormatMismatchError()
 
             for item in self._speech_synthesis_service.synthesize(phrases=phrases, init_time=init_time):
-                self.update_conversation_area(item.value.text)
-                content += item.value.text
+                self.update_conversation_area(item.text)
+                content += item.text
 
         if self._interrupt < init_time and content.strip():
             message = Message(role=ChatCompletionRoles.ASSISTANT, content=content.strip())

diff --git a/banterbot/gui/tk_interface.py b/banterbot/gui/tk_interface.py
@@ -2,6 +2,7 @@
 import threading
 import time
 import tkinter as tk
+import tkinter.simpledialog
 from tkinter import ttk
 from typing import Optional, Union
 
@@ -62,9 +63,21 @@ def __init__(
         # Bind the `_quit` method to program exit, in order to guarantee the stopping of all running threads.
         self.protocol("WM_DELETE_WINDOW", self._quit)
 
+        # Flag and lock to indicate whether any keys are currently activating the listener.
+        self._key_down = False
+        self._key_down_lock = threading.Lock()
+
     def listener_activate(self, idx: int) -> None:
-        user_name = self.name_entries[idx].get().split(" ")[0].strip()
-        super().listener_activate(user_name)
+        with self._key_down_lock:
+            if not self._key_down:
+                self._key_down = True
+                user_name = self.name_entries[idx].get().split(" ")[0].strip()
+                return super().listener_activate(user_name)
+
+    def listener_deactivate(self) -> None:
+        self._key_down = False
+        self.reset_focus()
+        return super().listener_deactivate()
 
     def request_response(self) -> None:
         if self._messages:
@@ -97,10 +110,17 @@ def update_conversation_area(self, word: str) -> None:
         super().update_conversation_area(word)
         self.conversation_area["state"] = tk.NORMAL
         self.conversation_area.insert(tk.END, word)
-        self.conversation_area.update_idletasks()
         self.conversation_area["state"] = tk.DISABLED
+        self.conversation_area.update_idletasks()
         self.conversation_area.see(tk.END)
 
+    def update_name(self, idx: int) -> None:
+        name = tkinter.simpledialog.askstring("Name", "Enter a Name")
+        self.names[idx].set(name)
+
+    def reset_focus(self) -> None:
+        self.panel_frame.focus_set()
+
     def _quit(self) -> None:
         """
         This method is called on exit, and interrupts any currently running activity.
@@ -140,27 +160,48 @@ def _init_gui(self) -> None:
         self.panel_frame.grid(row=0, column=1, padx=10, pady=10, sticky="nsew")
 
         self.name_entries = []
+        self.names = []
         self.listen_buttons = []
+        self.edit_buttons = []
 
         for i in range(9):
+            name = tk.StringVar()
+            name.set(f"User {i+1}")
             name_entry = tk.Entry(
-                self.panel_frame, bg="black", fg="white", insertbackground="white", font=self._font, width=12
+                self.panel_frame,
+                textvariable=name,
+                readonlybackground="black",
+                fg="white",
+                font=self._font,
+                width=12,
+                state="readonly",
+                takefocus=False,
             )
             name_entry.grid(row=i, column=0, padx=(5, 0), pady=5, sticky="nsew")
-            name_entry.insert(0, f"User {i+1}")
             self.name_entries.append(name_entry)
+            self.names.append(name)
 
             listen_button = ttk.Button(self.panel_frame, text="Listen", width=7)
-            listen_button.grid(row=i, column=1, padx=(0, 5), pady=5, sticky="nsew")
-            listen_button.bind(f"<ButtonPress-1>", lambda event, i=i: self.listener_activate(i))
-            listen_button.bind(f"<ButtonRelease-1>", lambda event: self.listener_deactivate())
+            listen_button.grid(row=i, column=2, padx=(0, 5), pady=5, sticky="nsew")
+
+            edit_button = ttk.Button(self.panel_frame, text="✎", width=2)
+            edit_button.grid(row=i, column=1, padx=(0, 5), pady=5, sticky="nsew")
+
+            edit_button.bind(f"<ButtonPress-1>", lambda _, i=i: self.update_name(i))
+            edit_button.bind(f"<ButtonRelease-1>", lambda _: self.reset_focus())
+            self.edit_buttons.append(edit_button)
+
+            listen_button.bind(f"<ButtonPress-1>", lambda _, i=i: self.listener_activate(i))
+            listen_button.bind(f"<ButtonRelease-1>", lambda _: self.listener_deactivate())
             self.listen_buttons.append(listen_button)
 
-            self.bind(f"<KeyPress-{i+1}>", lambda event, i=i: self.listener_activate(i))
-            self.bind(f"<KeyRelease-{i+1}>", lambda event: self.listener_deactivate())
+            self.bind(f"<KeyPress-{i+1}>", lambda _, i=i: self.listener_activate(i))
+            self.bind(f"<KeyRelease-{i+1}>", lambda _: self.listener_deactivate())
 
         self.request_btn = ttk.Button(self.panel_frame, text="Respond", width=7)
         self.request_btn.grid(row=9, column=0, padx=(5, 0), pady=5, sticky="nsew")
 
         self.request_btn.bind(f"<ButtonRelease-1>", lambda event: self.request_response())
         self.bind("<Return>", lambda event: self.request_response())
+
+        self.reset_focus()
diff --git a/banterbot/handlers/speech_synthesis_handler.py b/banterbot/handlers/speech_synthesis_handler.py
@@ -53,7 +53,7 @@ def __iter__(self) -> Generator[Word, None, None]:
             self._iterating = True
 
         # Start synthesizing.
-        self._synthesizer.speak_ssml_async(self._ssml)
+        self._synthesizer.start_speaking_ssml_async(self._ssml)
         logging.debug("SpeechSynthesisHandler synthesizer started")
 
         # Process the words as they are synthesized.
@@ -67,7 +67,6 @@ def __iter__(self) -> Generator[Word, None, None]:
             yield item["word"]
             logging.debug(f"SpeechSynthesisHandler yielded word: `{item['word']}`")
 
-    def close(self):
         self._synthesizer.stop_speaking_async()
 
     @staticmethod

diff --git a/banterbot/models/traits/primary_trait.py b/banterbot/models/traits/primary_trait.py
@@ -3,7 +3,7 @@
 from typing_extensions import Self
 
 from banterbot.managers.resource_manager import ResourceManager
-from repo.banterbot.paths import primary_traits
+from banterbot.paths import primary_traits
 
 
 class PrimaryTrait:

diff --git a/banterbot/services/speech_synthesis_service.py b/banterbot/services/speech_synthesis_service.py
@@ -3,6 +3,7 @@
 import os
 import threading
 import time
+from collections.abc import Generator
 from typing import Optional
 
 import azure.cognitiveservices.speech as speechsdk
@@ -11,8 +12,6 @@
 
 from banterbot.data.enums import EnvVar
 from banterbot.handlers.speech_synthesis_handler import SpeechSynthesisHandler
-from banterbot.handlers.stream_handler import StreamHandler
-from banterbot.managers.stream_manager import StreamManager
 from banterbot.models.phrase import Phrase
 from banterbot.models.word import Word
 from banterbot.utils.closeable_queue import CloseableQueue
@@ -39,46 +38,37 @@ def __init__(
             output_format (SpeechSynthesisOutputFormat, optional): The desired output format for the synthesized speech.
             Default is Audio16Khz32KBitRateMonoMp3.
         """
-        self._init_synthesizer(output_format=output_format)
+        # Initialize the output format
+        self._output_format = output_format
 
-        # Initialize the StreamManager for handling streaming processes.
-        self._stream_manager = StreamManager()
+        # Initialize the speech synthesizer with the specified output format
+        self._init_synthesizer(output_format=self._output_format)
 
-        # The latest interruption time.
-        self._interrupt = 0
-
-        # A list of active stream handlers.
-        self._stream_handlers = []
-        self._stream_handlers_lock = threading.Lock()
-
-        # Initialize a blank result_id time data dictionary. This will be updated each time a synthesis starts/stops.
-        self._synthesis_data = {}
+        # Initialize the queue for storing the words as they are synthesized
+        self._queue = CloseableQueue()
 
-        # Initialize a blank list of new result_ids. This will be updated each time a new stream is created.
-        self._new_result_ids = []
-        self._result_ids_lock = threading.Lock()
+        # The iterable that is currently being iterated over
+        self._iterable: Optional[SpeechSynthesisHandler] = None
 
-        # Initialize a closeable queue for storing the words as they are synthesized.
-        self._queue = CloseableQueue()
+        # The latest interruption time.
+        self._interrupt = 0
 
-    def interrupt(self, kill: bool = False) -> None:
+    def interrupt(self) -> None:
         """
         Interrupts the current speech synthesis process.
 
         Args:
             kill (bool): Whether the interruption should kill the queues or not.
         """
         self._interrupt = time.perf_counter_ns()
-        for result_id in self._new_result_ids:
-            self._synthesis_data[result_id]["active"] = False
-        self._new_result_ids.clear()
-        with self._stream_handlers_lock:
-            for handler in self._stream_handlers:
-                handler.interrupt(kill=kill)
-            self._stream_handlers.clear()
+        self._queue.close()
+        # Closing the connection to the speech synthesizer.
+        self._connection.close()
+        # Reinitialize the speech synthesizer with the originally configured output format
+        self._init_synthesizer(output_format=self._output_format)
         logging.debug(f"SpeechSynthesisService Interrupted")
 
-    def synthesize(self, phrases: list[Phrase], init_time: Optional[int] = None) -> StreamHandler:
+    def synthesize(self, phrases: list[Phrase], init_time: Optional[int] = None) -> Generator[Word, None, None]:
         """
         Synthesizes the given phrases into speech and yields the synthesized words as they are produced.
 
@@ -90,16 +80,18 @@ def synthesize(self, phrases: list[Phrase], init_time: Optional[int] = None) ->
             Generator[Word, None, None]: The synthesized words, yielded one at a time.
         """
         # Record the time at which the synthesis was initialized pre-lock, in order to account for future interruptions.
-        # Record the time at which the stream was initialized pre-lock, in order to account for future interruptions.
         init_time = time.perf_counter_ns() if init_time is None else init_time
-        if self._interrupt >= init_time:
-            return tuple()
-        else:
-            iterable = SpeechSynthesisHandler(phrases=phrases, synthesizer=self._synthesizer, queue=self._queue)
-            handler = self._stream_manager.stream(iterable=iterable, close_stream=iterable.close)
-            with self._stream_handlers_lock:
-                self._stream_handlers.append(handler)
-            return handler
+        with self.__class__._synthesis_lock:
+            if self._interrupt >= init_time:
+                return tuple()
+            else:
+                self._queue.reset()
+                self._iterable = SpeechSynthesisHandler(
+                    phrases=phrases, synthesizer=self._synthesizer, queue=self._queue
+                )
+
+                for i in self._iterable:
+                    yield i
 
     def _init_synthesizer(self, output_format: SpeechSynthesisOutputFormat) -> None:
         """
@@ -136,7 +128,6 @@ def _callback_completed(self, event: speechsdk.SessionEventArgs) -> None:
             event (speechsdk.SessionEventArgs): Event arguments containing information about the synthesis completed.
         """
         logging.debug("SpeechSynthesisService disconnected")
-        self._synthesis_data[event.result.result_id]["active"] = False
         self._queue.close()
 
     def _callback_started(self, event: speechsdk.SessionEventArgs) -> None:
@@ -147,9 +138,7 @@ def _callback_started(self, event: speechsdk.SessionEventArgs) -> None:
             event (speechsdk.SessionEventArgs): Event arguments containing information about the synthesis started.
         """
         logging.debug("SpeechSynthesisService connected")
-
-        self._synthesis_data[event.result.result_id] = {"start": time.perf_counter_ns(), "active": True}
-        self._new_result_ids.append(event._result._result_id)
+        self._synthesis_start = time.perf_counter_ns()
 
     @staticmethod
     @nb.njit(cache=True)
@@ -179,26 +168,25 @@ def _callback_word_boundary(self, event: speechsdk.SessionEventArgs) -> None:
             event (speechsdk.SessionEventArgs): Event arguments containing information about the word boundary.
         """
         # Compute the word's time from the recorded synthesis start, then enqueue it with the word data.
-        if self._synthesis_data[event.result_id]["active"]:
-            time = self._calculate_offset(
-                start_synthesis_time=self._synthesis_data[event._result_id]["start"],
-                audio_offset=event.audio_offset,
-                total_seconds=event.duration.total_seconds(),
-                word_length=event.word_length,
-            )
-            data = {
-                "time": time,
-                "word": Word(
-                    text=(
-                        event.text
-                        if event.boundary_type == speechsdk.SpeechSynthesisBoundaryType.Punctuation
-                        else " " + event.text
-                    ),
-                    offset=datetime.timedelta(microseconds=event.audio_offset / 10),
-                    duration=event.duration,
+        time = self._calculate_offset(
+            start_synthesis_time=self._synthesis_start,
+            audio_offset=event.audio_offset,
+            total_seconds=event.duration.total_seconds(),
+            word_length=event.word_length,
+        )
+        data = {
+            "time": time,
+            "word": Word(
+                text=(
+                    event.text
+                    if event.boundary_type == speechsdk.SpeechSynthesisBoundaryType.Punctuation
+                    else " " + event.text
                 ),
-            }
-            self._queue.put(data)
+                offset=datetime.timedelta(microseconds=event.audio_offset / 10),
+                duration=event.duration,
+            ),
+        }
+        self._queue.put(data)
 
     def _callbacks_connect(self):
         """