Various updates to PsiStudio, Audio and Sigma (#310)
### __Changes to Sigma__
* Fixed issues with moving to the previous step.
* Added support for using specific synthesis voices via the Sigma client configuration (see the sketch after this list).
* Updated the Sigma app to use caching for speech synthesis.
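
As a rough illustration, the voice selection above boils down to the new `SpeechSynthesizerVoiceName` setting on the Sigma client configuration (see the `SigmaAppConfiguration` change further down in this diff); the helper that supplies the configuration instance below is a hypothetical placeholder, not part of this commit.

```csharp
using Sigma;

// Sketch: selecting a specific synthesis voice on the Sigma client configuration.
// Only the SpeechSynthesizerVoiceName property (default "en-US-JennyNeural") comes from
// this commit; LoadClientConfiguration() is a hypothetical placeholder.
SigmaAppConfiguration configuration = LoadClientConfiguration();
configuration.SpeechSynthesizerVoiceName = "en-US-AriaNeural"; // any prebuilt Azure neural voice
```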

### __Changes to Components__
* Added `SpeechSynthesisCache` component to `Microsoft.Psi.Speech` to implement a cache for generated speech synthesis content.
* The Azure-based `SpeechSynthesizer` component from `Microsoft.Psi.CognitiveServices.Speech` can now cache the generated utterances for future use (identical future requests do not need to go to the cloud, which speeds up synthesis). The developer can also select, via configuration, whether the audio buffers are streamed paced to real time or as soon as they arrive, and how far ahead of real time the generated audio buffers are streamed. These options allow better control over playback and help avoid audio drops.
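
A minimal sketch of how the cache and the Azure synthesizer fit together, mirroring the `SpeechSynthesizerConfiguration` fields visible in the `SigmaApp.cs` diff below; the `SpeechSynthesizer(pipeline, configuration)` constructor shape, the cache file name, and the key/region values are illustrative assumptions.

```csharp
using System.IO;
using Microsoft.Psi;
using Microsoft.Psi.CognitiveServices.Speech;
using Microsoft.Psi.Speech;

using var pipeline = Pipeline.Create();

// Reload a previously saved cache if one exists; otherwise start a fresh cache for the chosen voice.
SpeechSynthesisCache cache;
try
{
    using var cacheStream = File.OpenRead("SpeechSynthesisCache.dat");
    cache = new SpeechSynthesisCache(cacheStream);
}
catch
{
    cache = new SpeechSynthesisCache("en-US-JennyNeural");
}

// Configure the Azure-based synthesizer to reuse cached utterances instead of re-querying the cloud.
// Only the configuration fields below appear in the diff; the constructor shape is an assumption.
var synthesizer = new SpeechSynthesizer(pipeline, new SpeechSynthesizerConfiguration()
{
    SubscriptionKey = "<azure-speech-key>", // placeholder
    Region = "westus",
    VoiceName = "en-US-JennyNeural",
    Cache = cache,
});
```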

### __Changes to Audio__
* Added a `Streamline` operator for audio streams that normalizes gaps and overlaps in the audio buffer stream using one of several methods (Concatenate, Pleat, Unpleat); see the sketch after this list.
* Added a batch processing task for exporting audio streams to wav files.
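
A hedged usage sketch for the `Streamline` operator; the release note only names the operator and its methods, so the `AudioStreamlineMethod` enum name and the single-argument call below are assumptions rather than the confirmed signature.

```csharp
using Microsoft.Psi;
using Microsoft.Psi.Audio;

using var pipeline = Pipeline.Create();

// Capture audio from the default device.
var audio = new AudioCapture(pipeline, new AudioCaptureConfiguration());

// Normalize gaps and overlaps between consecutive audio buffers so downstream consumers see a
// contiguous stream. The enum below mirrors the methods named above (Concatenate, Pleat, Unpleat)
// but is an assumption about the actual API.
var streamlined = audio.Streamline(AudioStreamlineMethod.Pleat);
```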

### __Changes to PsiStudio and Visualization__
* Improved robustness of audio playback and addressed a number of issues that created drift between audio and visual playback.
* The menu for running batch processing tasks is now split into submenus, grouped by batch processing task namespace.
* Added ability to export audio streams (or selections thereof) to a wav file.

### __Changes to Microsoft.Psi.Data__
* Added a `SessionImporter.OpenStream` overload that allows for providing a `[PartitionName]:StreamName` stream specification. This simplifies the configuration of batch processing tasks where input streams need to be specified, eliminating the need to specify the partition separately. The existing batch tasks were adjusted to leverage this feature.
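
A sketch of what the new overload enables inside a batch processing task; the generic `OpenStream<T>` shape follows the existing `SessionImporter` API, and the partition and stream names are placeholders.

```csharp
using Microsoft.Psi.Audio;
using Microsoft.Psi.Data;

public static class BatchTaskInputSketch
{
    // Open an input stream from a specific partition using the combined
    // "[PartitionName]:StreamName" specification, rather than passing the
    // partition name as a separate configuration value.
    public static void OpenInputs(SessionImporter sessionImporter)
    {
        // Placeholder partition and stream names.
        var audio = sessionImporter.OpenStream<AudioBuffer>("[HoloLens]:Audio");

        // ... wire the rest of the batch task against the audio stream ...
    }
}
```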

### __Changes to Runtime__
* Introduced a `Merge` interval operator that computes a non-overlapping set of intervals covering a given set of (potentially overlapping) intervals.
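
The note above doesn't spell out the operator's exact signature, so rather than guess at the API, here is a small standalone sketch of the computation it describes: sort the intervals by start time, then coalesce any that overlap.

```csharp
using System;
using System.Collections.Generic;
using System.Linq;

// Standalone illustration of the computation (not the library API): given potentially
// overlapping [start, end] intervals, produce a non-overlapping set that covers them.
static List<(DateTime Start, DateTime End)> MergeIntervals(IEnumerable<(DateTime Start, DateTime End)> intervals)
{
    var merged = new List<(DateTime Start, DateTime End)>();
    foreach (var interval in intervals.OrderBy(i => i.Start))
    {
        if (merged.Count > 0 && interval.Start <= merged[^1].End)
        {
            // Overlaps (or touches) the last merged interval: extend its end if needed.
            merged[^1] = (merged[^1].Start, interval.End > merged[^1].End ? interval.End : merged[^1].End);
        }
        else
        {
            merged.Add(interval);
        }
    }

    return merged;
}
```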

* Update per CR
danbohus authored Apr 3, 2024
1 parent a6709c7 commit 14fe106
Showing 33 changed files with 1,467 additions and 534 deletions.
@@ -207,6 +207,7 @@ void StopPipeline()
state = State.StoppingPipeline;
Task.Run(() =>
{
this.OnStoppingPipeline();
try
{
pipeline?.Dispose();
@@ -822,7 +823,7 @@ public virtual void PopulateConfigurationDefaults()
}

/// <summary>
/// Populates the UI at the waiting to start point.
/// Virtual method that populates the UI at the waiting to start point.
/// </summary>
public virtual void OnWaitingForStart()
{
@@ -873,6 +874,13 @@ public virtual void OnWaitingForStart()
}
}

/// <summary>
/// Virtual method that performs any relevant tasks just prior to stopping (disposing) the pipeline.
/// </summary>
public virtual void OnStoppingPipeline()
{
}

/// <summary>
/// Gets the extra types to be used for serialization.
/// </summary>
@@ -127,6 +127,10 @@
<Project>{d55700d9-6050-44ac-abc0-ac1fbf9dfd3f}</Project>
<Name>Microsoft.Psi.Spatial.Euclidean</Name>
</ProjectReference>
<ProjectReference Include="..\..\..\Sources\Speech\Microsoft.Psi.Speech\Microsoft.Psi.Speech.csproj">
<Project>{3889F11A-B537-47F9-8819-146DB7E66B0C}</Project>
<Name>Microsoft.Psi.Speech</Name>
</ProjectReference>
<ProjectReference Include="..\..\Microsoft.Psi.MixedReality.Applications.UniversalWindows\Microsoft.Psi.MixedReality.Applications.UniversalWindows.csproj">
<Project>{1afbbd50-ce3a-4792-be84-15e897d281dd}</Project>
<Name>Microsoft.Psi.MixedReality.Applications.UniversalWindows</Name>
53 changes: 52 additions & 1 deletion Applications/Sigma/Sigma.UniversalWindows/SigmaApp.cs
@@ -5,6 +5,7 @@ namespace Sigma
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using MathNet.Spatial.Euclidean;
using Microsoft.Psi;
@@ -14,6 +15,7 @@ namespace Sigma
using Microsoft.Psi.MixedReality.Applications;
using Microsoft.Psi.MixedReality.ResearchMode;
using Microsoft.Psi.MixedReality.StereoKit;
using Microsoft.Psi.Speech;
using StereoKit;
using Windows.Storage;
using GazeSensor = Microsoft.Psi.MixedReality.WinRT.GazeSensor;
@@ -29,6 +31,7 @@ public class SigmaApp : StereoKitClientApp<SigmaAppConfiguration>
private readonly Type[] sigmaAppConfigurationTypes;
private UserStateConstructor userStateConstructor;
private ISigmaUserInterface sigmaUserInterface;
private SpeechSynthesisCache speechSynthesisCache;

private Dictionary<string, bool> selectedOutputPreviewStream = default;

@@ -75,6 +78,30 @@ public override void OnWaitingForStart()
}
}

/// <inheritdoc/>
public override async void OnStoppingPipeline()
{
base.OnStoppingPipeline();

// Write the speech synthesis cache to a file
if (this.speechSynthesisCache != null)
{
try
{
using var stream = await KnownFolders.DocumentsLibrary.OpenStreamForWriteAsync("SpeechSynthesisCache.dat", CreationCollisionOption.ReplaceExisting);
this.speechSynthesisCache.Write(stream);
}
catch
{
var cacheFile = KnownFolders.DocumentsLibrary.GetFileAsync("SpeechSynthesisCache.dat").AsTask().GetAwaiter().GetResult();
if (cacheFile != null)
{
await cacheFile.DeleteAsync();
}
}
}
}

/// <inheritdoc/>
public override HoloLensStreams GetHoloLensStreams(Pipeline pipeline, out DepthCamera depthCamera)
=> LiveHoloLensStreams.Create(pipeline, out depthCamera, this.SelectedConfiguration, this.selectedOutputPreviewStream[this.SelectedConfiguration.Name]);
@@ -112,11 +139,35 @@ public override IClientServerCommunicationStreams CreateUserInterfacePipeline(Pi
// Construct the speech synthesis components
var speechSynthesisKey = this.ReadFileAsync(KnownFolders.DocumentsLibrary, "CognitiveServicesSpeechKey.txt").GetAwaiter().GetResult();

// Setup the speech synthesis
// Open the speech synthesis cache file if it exists
var speechSynthesisCacheFileStream = default(Stream);
try
{
speechSynthesisCacheFileStream = KnownFolders.DocumentsLibrary.OpenStreamForReadAsync("SpeechSynthesisCache.dat").GetAwaiter().GetResult();
this.speechSynthesisCache = new SpeechSynthesisCache(speechSynthesisCacheFileStream);
}
catch
{
this.speechSynthesisCache = new SpeechSynthesisCache(this.SelectedConfiguration.SpeechSynthesizerVoiceName);
}
finally
{
speechSynthesisCacheFileStream?.Dispose();
}

// If the cache is of a different voice, re-initialize
if (this.speechSynthesisCache.SpeechSynthesisVoiceName != this.SelectedConfiguration.SpeechSynthesizerVoiceName)
{
this.speechSynthesisCache = new SpeechSynthesisCache(this.SelectedConfiguration.SpeechSynthesizerVoiceName);
}

// Construct the speech synthesizer configuration
var config = new SpeechSynthesizerConfiguration()
{
SubscriptionKey = speechSynthesisKey,
Region = "westus",
VoiceName = this.SelectedConfiguration.SpeechSynthesizerVoiceName,
Cache = this.speechSynthesisCache,
};

// Construct the speech synthesizer
@@ -64,6 +64,12 @@ public abstract class SigmaAppConfiguration : ClientAppConfiguration
/// </summary>
public int AudioReframeSizeMs { get; set; } = 100;

/// <summary>
/// Gets or sets the voice to use for the speech synthesizer. For possible options see
/// https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts#prebuilt-neural-voices.
/// </summary>
public string SpeechSynthesizerVoiceName { get; set; } = "en-US-JennyNeural";

/// <summary>
/// Creates a Sigma user interface.
/// </summary>
@@ -22,7 +22,7 @@ namespace Sigma
/// Batch task for exporting Sigma data.
/// </summary>
[BatchProcessingTask(
"Sigma - Export Captured Data",
"Export Captured Data",
Description = "This task exports the data collected by the Sigma app to a set of text files.")]
public class ExportDataTask : BatchProcessingTask<ExportDataTaskConfiguration>
{
@@ -17,7 +17,7 @@ namespace Sigma
/// Batch task for exporting Sigma data.
/// </summary>
[BatchProcessingTask(
"Sigma - Export User Notes",
"Export User Notes",
Description = "This task exports notes taken by the user to a tab-delimited file.")]
public class ExportNotesTask : BatchProcessingTask<ExportNotesTask.ExportNotesTaskConfiguration>
{
@@ -17,7 +17,7 @@ namespace Sigma
/// Batch task that runs the object 2D detection based tracker.
/// </summary>
[BatchProcessingTask(
"Sigma - Run the object 2D detection based tracker",
"Run the object 2D detection based tracker",
Description = "This task generates a new partition with results from object 2D detection based tracking.")]
public class Object2DDetectionBasedObjectTrackerTask : BatchProcessingTask<Object2DDetectionBasedObjectTrackerTaskConfiguration>
{
@@ -36,7 +36,7 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
interactionModel.InteractionState.TaskName = interactionModel.Configuration.AutoStartTaskName;
if (interactionModel.TryGetKnownTask())
{
yield return DialogAction.Speak($"Today I'm here to help you {interactionModel.Configuration.AutoStartTaskName}.");
yield return DialogAction.Speak($"Today I'm here to help you {interactionModel.Configuration.AutoStartTaskName.ToLower().TrimEnd('.')}.");

// Move to a glanceable position
yield return DialogAction.Execute(interactionModel.MoveToGlanceablePosition);
@@ -119,7 +119,7 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input

if (isKnownTask)
{
yield return DialogAction.Speak($"Sure. I can help you {interactionModel.InteractionState.TaskName.ToLower()}.");
yield return DialogAction.Speak($"Sure. I can help you {interactionModel.InteractionState.TaskName.ToLower().TrimEnd('.')}.");
}
else
{
@@ -158,6 +158,24 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
/// </summary>
public class ExecuteStep : DialogState<DiamondInteractionModel>
{
private readonly bool usePreamble = true;

/// <summary>
/// Initializes a new instance of the <see cref="ExecuteStep"/> class.
/// </summary>
public ExecuteStep()
{
}

/// <summary>
/// Initializes a new instance of the <see cref="ExecuteStep"/> class.
/// </summary>
/// <param name="usePreamble">Indicates whether to use a preamble for the step (e.g., first, next, finally...)</param>
public ExecuteStep(bool usePreamble)
{
this.usePreamble = usePreamble;
}

/// <inheritdoc/>
public override void OnEnter(DiamondInteractionModel interactionModel)
{
@@ -182,8 +200,15 @@ public override (string SystemPrompt, string[] UserResponseSet) GetSystemPromptA
(interactionModel.InteractionState.TryGetSelectedStepOfType<DoStep>(out var doStep) && doStep.Label == "1");

var systemPrompt = isFirstStep ?
$"The first step is to {interactionModel.InteractionState.SelectedStep.GetSpokenInstructions().ToLower()}" :
$"{interactionModel.InteractionState.SelectedStep.GetSpokenInstructions()}";
$"{interactionModel.InteractionState.SelectedStep.GetSpokenInstructions().Trim('.')}." :
$"{interactionModel.InteractionState.SelectedStep.GetSpokenInstructions().Trim('.')}.";

if (this.usePreamble)
{
systemPrompt = isFirstStep ?
$"The first step is to {systemPrompt.ToLower()}" :
$"Next, {systemPrompt.ToLower()}";
}

var userResponseSet = new List<string>()
{
@@ -228,18 +253,25 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
yield return DialogAction.ContinueWith<ExecuteStep>(noSpeechSynthesis: true);
}
}
else if (speechRecognition.Contains("previous"))
else if (speechRecognition.Contains("previous") && speechRecognition.Contains("step"))
{
if (interactionModel.InteractionState.SelectedStepIndex > 0)
{
interactionModel.InteractionState.GemState = GemState.AtUserInterface();
interactionModel.InteractionState.SelectedStepIndex--;
yield return DialogAction.Execute(interactionModel.ContinueWithSelectedStep);
if (interactionModel.InteractionState.TryGetSelectedStepOfType<ComplexStep>(out var complexStep))
{
interactionModel.InteractionState.SelectedSubStepIndex = complexStep.SubSteps.Count - 1;
yield return DialogAction.ContinueWith(new ExecuteSubStep(usePreamble: false));
}
else
{
yield return DialogAction.ContinueWith(new ExecuteStep(usePreamble: false));
}
}
else
{
yield return DialogAction.Speak("This is the first step of the task.");
yield return DialogAction.ContinueWith<ExecuteStep>(noSpeechSynthesis: true);
yield return DialogAction.Speak("We are already at the first step of the task.");
yield return DialogAction.ContinueWith<ExecuteSubStep>(noSpeechSynthesis: true);
}
}
else if (speechRecognition.Contains("start") && speechRecognition.Contains("timer") && interactionModel.InteractionState.SelectedStep is DoStep doStep)
@@ -436,6 +468,24 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
/// </summary>
public class ExecuteSubStep : DialogState<DiamondInteractionModel>
{
private readonly bool usePreamble = true;

/// <summary>
/// Initializes a new instance of the <see cref="ExecuteSubStep"/> class.
/// </summary>
public ExecuteSubStep()
{
}

/// <summary>
/// Initializes a new instance of the <see cref="ExecuteSubStep"/> class.
/// </summary>
/// <param name="usePreamble">Indicates whether to use a preamble for the step (e.g., first, next, finally...)</param>
public ExecuteSubStep(bool usePreamble)
{
this.usePreamble = usePreamble;
}

/// <inheritdoc/>
public override void OnEnter(DiamondInteractionModel interactionModel)
{
@@ -446,28 +496,37 @@ public override void OnEnter(DiamondInteractionModel interactionModel)
/// <inheritdoc/>
public override (string SystemPrompt, string[] UserResponseSet) GetSystemPromptAndUserResponseSet(DiamondInteractionModel interactionModel)
{
string systemPrompt;
if (interactionModel.InteractionState.SelectedSubStepIndex == 0)
{
systemPrompt = $"First, {(interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps[interactionModel.InteractionState.SelectedSubStepIndex.Value].Description.ToLower()}.";
}
else if (interactionModel.InteractionState.SelectedSubStepIndex > 1 && interactionModel.InteractionState.SelectedSubStepIndex == (interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps.Count - 1)
string systemPrompt = (interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps[interactionModel.InteractionState.SelectedSubStepIndex.Value].Description.TrimEnd('.');

if (this.usePreamble)
{
systemPrompt = $"Finally, {(interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps[interactionModel.InteractionState.SelectedSubStepIndex.Value].Description.ToLower()}.";
if (interactionModel.InteractionState.SelectedSubStepIndex == 0)
{
systemPrompt = $"First, {systemPrompt.ToLower()}.";
}
else if (interactionModel.InteractionState.SelectedSubStepIndex > 1 && interactionModel.InteractionState.SelectedSubStepIndex == (interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps.Count - 1)
{
systemPrompt = $"Finally, {systemPrompt.ToLower()}.";
}
else
{
systemPrompt = $"Next, {systemPrompt.ToLower()}.";
}
}
else
{
systemPrompt = $"Next, {(interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps[interactionModel.InteractionState.SelectedSubStepIndex.Value].Description.ToLower()}.";
systemPrompt = $"{systemPrompt.Capitalize()}.";
}

var userRespose = new string[]
var userResponseSet = new List<string>() { "Next step." };
if ((interactionModel.InteractionState.SelectedSubStepIndex > 0) || (interactionModel.InteractionState.SelectedStepIndex > 0))
{
"Next step.",
"Go to the previous step.",
"Let's abandon this task.",
};
userResponseSet.Add("Previous step.");
}

userResponseSet.Add("Let's abandon this task.");

return (systemPrompt, userRespose);
return (systemPrompt, userResponseSet.ToArray());
}

/// <inheritdoc/>
Expand All @@ -488,6 +547,7 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
{
yield return DialogAction.Speak("Great.");
interactionModel.InteractionState.SelectedStepIndex++;
interactionModel.InteractionState.SelectedSubStepIndex = null;
yield return DialogAction.Execute(interactionModel.ContinueWithSelectedStep);
}

@@ -508,6 +568,35 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
}
}
}
else if (speechRecognitionResult != null &&
speechRecognitionResult.Contains("previous") &&
speechRecognitionResult.Contains("step"))
{
if (interactionModel.InteractionState.SelectedSubStepIndex > 0)
{
interactionModel.InteractionState.SelectedSubStepIndex--;
yield return DialogAction.ContinueWith(new ExecuteSubStep(usePreamble: false));
}
else if (interactionModel.InteractionState.SelectedStepIndex > 0)
{
interactionModel.InteractionState.SelectedStepIndex--;
if (interactionModel.InteractionState.TryGetSelectedStepOfType<ComplexStep>(out var complexStep))
{
interactionModel.InteractionState.SelectedSubStepIndex = complexStep.SubSteps.Count - 1;
yield return DialogAction.ContinueWith(new ExecuteSubStep(usePreamble: false));
}
else
{
interactionModel.InteractionState.SelectedSubStepIndex = null;
yield return DialogAction.ContinueWith(new ExecuteStep(usePreamble: false));
}
}
else
{
yield return DialogAction.Speak("We are already at the first step of the task.");
yield return DialogAction.ContinueWith<ExecuteSubStep>(noSpeechSynthesis: true);
}
}
else if (speechRecognitionResult != null && speechRecognitionResult.ContainsOneOf("freeze", "pause"))
{
yield return DialogAction.ContinueWith(new PauseInteraction(this));