Various updates to PsiStudio, Audio and Sigma (#310)
### __Changes to Sigma__
* Fixed issues with moving to the previous step.
* Added support for using specific synthesis voices via the Sigma client configuration (see the sketch after this list).
* Updated the Sigma app to use caching for speech synthesis.
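
As a rough illustration, the voice selection above boils down to the new `SpeechSynthesizerVoiceName` setting on the Sigma client configuration (see the `SigmaAppConfiguration` change further down in this diff); the helper that supplies the configuration instance below is a hypothetical placeholder, not part of this commit.

```csharp
using Sigma;

// Sketch: selecting a specific synthesis voice on the Sigma client configuration.
// Only the SpeechSynthesizerVoiceName property (default "en-US-JennyNeural") comes from
// this commit; LoadClientConfiguration() is a hypothetical placeholder.
SigmaAppConfiguration configuration = LoadClientConfiguration();
configuration.SpeechSynthesizerVoiceName = "en-US-AriaNeural"; // any prebuilt Azure neural voice
```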

### __Changes to Components__
* Added `SpeechSynthesisCache` component to `Microsoft.Psi.Speech` to implement a cache for generated speech synthesis content.
* The Azure-based `SpeechSynthesizer` component from `Microsoft.Psi.CognitiveServices.Speech` can now cache the generated utterances for future use (identical future requests do not need to go to the cloud, which speeds up synthesis). The developer can also select, via configuration, whether the audio buffers are streamed paced to real time or as soon as they arrive, and how far ahead of real time the generated audio buffers are streamed. These options allow better control over playback and help avoid audio drops.
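
A minimal sketch of how the cache and the Azure synthesizer fit together, mirroring the `SpeechSynthesizerConfiguration` fields visible in the `SigmaApp.cs` diff below; the `SpeechSynthesizer(pipeline, configuration)` constructor shape, the cache file name, and the key/region values are illustrative assumptions.

```csharp
using System.IO;
using Microsoft.Psi;
using Microsoft.Psi.CognitiveServices.Speech;
using Microsoft.Psi.Speech;

using var pipeline = Pipeline.Create();

// Reload a previously saved cache if one exists; otherwise start a fresh cache for the chosen voice.
SpeechSynthesisCache cache;
try
{
    using var cacheStream = File.OpenRead("SpeechSynthesisCache.dat");
    cache = new SpeechSynthesisCache(cacheStream);
}
catch
{
    cache = new SpeechSynthesisCache("en-US-JennyNeural");
}

// Configure the Azure-based synthesizer to reuse cached utterances instead of re-querying the cloud.
// Only the configuration fields below appear in the diff; the constructor shape is an assumption.
var synthesizer = new SpeechSynthesizer(pipeline, new SpeechSynthesizerConfiguration()
{
    SubscriptionKey = "<azure-speech-key>", // placeholder
    Region = "westus",
    VoiceName = "en-US-JennyNeural",
    Cache = cache,
});
```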

### __Changes to Audio__
* Added a `Streamline` operator for audio streams that normalizes gaps and overlaps in the audio buffer stream using one of several methods (Concatenate, Pleat, Unpleat); see the sketch after this list.
* Added a batch processing task for exporting audio streams to wav files.
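
A hedged usage sketch for the `Streamline` operator; the release note only names the operator and its methods, so the `AudioStreamlineMethod` enum name and the single-argument call below are assumptions rather than the confirmed signature.

```csharp
using Microsoft.Psi;
using Microsoft.Psi.Audio;

using var pipeline = Pipeline.Create();

// Capture audio from the default device.
var audio = new AudioCapture(pipeline, new AudioCaptureConfiguration());

// Normalize gaps and overlaps between consecutive audio buffers so downstream consumers see a
// contiguous stream. The enum below mirrors the methods named above (Concatenate, Pleat, Unpleat)
// but is an assumption about the actual API.
var streamlined = audio.Streamline(AudioStreamlineMethod.Pleat);
```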

### __Changes to PsiStudio and Visualization__
* Improved robustness of audio playback and addressed a number of issues that created drift between audio and visual playback.
* The menu for running batch processing tasks is now split into submenus, grouped by batch processing task namespace.
* Added ability to export audio streams (or selections thereof) to a wav file.

### __Changes to Microsoft.Psi.Data__
* Added a `SessionImporter.OpenStream` overload that allows for providing a `[PartitionName]:StreamName` stream specification. This simplifies the configuration of batch processing tasks where input streams need to be specified, eliminating the need to specify the partition separately. The existing batch tasks were adjusted to leverage this feature.
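
A sketch of what the new overload enables inside a batch processing task; the generic `OpenStream<T>` shape follows the existing `SessionImporter` API, and the partition and stream names are placeholders.

```csharp
using Microsoft.Psi.Audio;
using Microsoft.Psi.Data;

public static class BatchTaskInputSketch
{
    // Open an input stream from a specific partition using the combined
    // "[PartitionName]:StreamName" specification, rather than passing the
    // partition name as a separate configuration value.
    public static void OpenInputs(SessionImporter sessionImporter)
    {
        // Placeholder partition and stream names.
        var audio = sessionImporter.OpenStream<AudioBuffer>("[HoloLens]:Audio");

        // ... wire the rest of the batch task against the audio stream ...
    }
}
```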

### __Changes to Runtime__
* Introduced a `Merge` interval operator that computes a non-overlapping set of intervals covering a given set of (potentially overlapping) intervals.
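
The note above doesn't spell out the operator's exact signature, so rather than guess at the API, here is a small standalone sketch of the computation it describes: sort the intervals by start time, then coalesce any that overlap.

```csharp
using System;
using System.Collections.Generic;
using System.Linq;

// Standalone illustration of the computation (not the library API): given potentially
// overlapping [start, end] intervals, produce a non-overlapping set that covers them.
static List<(DateTime Start, DateTime End)> MergeIntervals(IEnumerable<(DateTime Start, DateTime End)> intervals)
{
    var merged = new List<(DateTime Start, DateTime End)>();
    foreach (var interval in intervals.OrderBy(i => i.Start))
    {
        if (merged.Count > 0 && interval.Start <= merged[^1].End)
        {
            // Overlaps (or touches) the last merged interval: extend its end if needed.
            merged[^1] = (merged[^1].Start, interval.End > merged[^1].End ? interval.End : merged[^1].End);
        }
        else
        {
            merged.Add(interval);
        }
    }

    return merged;
}
```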

* Update per CR
danbohus authored Apr 3, 2024
1 parent a6709c7 commit 14fe106
Showing 33 changed files with 1,467 additions and 534 deletions.
@@ -207,6 +207,7 @@ void StopPipeline()
state = State.StoppingPipeline;
Task.Run(() =>
{
this.OnStoppingPipeline();
try
{
pipeline?.Dispose();
@@ -822,7 +823,7 @@ public virtual void PopulateConfigurationDefaults()
}

/// <summary>
/// Populates the UI at the waiting to start point.
/// Virtual method that populates the UI at the waiting to start point.
/// </summary>
public virtual void OnWaitingForStart()
{
@@ -873,6 +874,13 @@ public virtual void OnWaitingForStart()
}
}

/// <summary>
/// Virtual method that performs any relevant tasks just prior to stopping (disposing) the pipeline.
/// </summary>
public virtual void OnStoppingPipeline()
{
}

/// <summary>
/// Gets the extra types to be used for serialization.
/// </summary>
@@ -127,6 +127,10 @@
<Project>{d55700d9-6050-44ac-abc0-ac1fbf9dfd3f}</Project>
<Name>Microsoft.Psi.Spatial.Euclidean</Name>
</ProjectReference>
<ProjectReference Include="..\..\..\Sources\Speech\Microsoft.Psi.Speech\Microsoft.Psi.Speech.csproj">
<Project>{3889F11A-B537-47F9-8819-146DB7E66B0C}</Project>
<Name>Microsoft.Psi.Speech</Name>
</ProjectReference>
<ProjectReference Include="..\..\Microsoft.Psi.MixedReality.Applications.UniversalWindows\Microsoft.Psi.MixedReality.Applications.UniversalWindows.csproj">
<Project>{1afbbd50-ce3a-4792-be84-15e897d281dd}</Project>
<Name>Microsoft.Psi.MixedReality.Applications.UniversalWindows</Name>
53 changes: 52 additions & 1 deletion Applications/Sigma/Sigma.UniversalWindows/SigmaApp.cs
@@ -5,6 +5,7 @@ namespace Sigma
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using MathNet.Spatial.Euclidean;
using Microsoft.Psi;
@@ -14,6 +15,7 @@ namespace Sigma
using Microsoft.Psi.MixedReality.Applications;
using Microsoft.Psi.MixedReality.ResearchMode;
using Microsoft.Psi.MixedReality.StereoKit;
using Microsoft.Psi.Speech;
using StereoKit;
using Windows.Storage;
using GazeSensor = Microsoft.Psi.MixedReality.WinRT.GazeSensor;
@@ -29,6 +31,7 @@ public class SigmaApp : StereoKitClientApp<SigmaAppConfiguration>
private readonly Type[] sigmaAppConfigurationTypes;
private UserStateConstructor userStateConstructor;
private ISigmaUserInterface sigmaUserInterface;
private SpeechSynthesisCache speechSynthesisCache;

private Dictionary<string, bool> selectedOutputPreviewStream = default;

@@ -75,6 +78,30 @@ public override void OnWaitingForStart()
}
}

/// <inheritdoc/>
public override async void OnStoppingPipeline()
{
base.OnStoppingPipeline();

// Write the speech synthesis cache to a file
if (this.speechSynthesisCache != null)
{
try
{
using var stream = await KnownFolders.DocumentsLibrary.OpenStreamForWriteAsync("SpeechSynthesisCache.dat", CreationCollisionOption.ReplaceExisting);
this.speechSynthesisCache.Write(stream);
}
catch
{
var cacheFile = KnownFolders.DocumentsLibrary.GetFileAsync("SpeechSynthesisCache.dat").AsTask().GetAwaiter().GetResult();
if (cacheFile != null)
{
await cacheFile.DeleteAsync();
}
}
}
}

/// <inheritdoc/>
public override HoloLensStreams GetHoloLensStreams(Pipeline pipeline, out DepthCamera depthCamera)
=> LiveHoloLensStreams.Create(pipeline, out depthCamera, this.SelectedConfiguration, this.selectedOutputPreviewStream[this.SelectedConfiguration.Name]);
@@ -112,11 +139,35 @@ public override IClientServerCommunicationStreams CreateUserInterfacePipeline(Pi
// Construct the speech synthesis components
var speechSynthesisKey = this.ReadFileAsync(KnownFolders.DocumentsLibrary, "CognitiveServicesSpeechKey.txt").GetAwaiter().GetResult();

// Setup the speech synthesis
// Open the speech synthesis cache file if it exists
var speechSynthesisCacheFileStream = default(Stream);
try
{
speechSynthesisCacheFileStream = KnownFolders.DocumentsLibrary.OpenStreamForReadAsync("SpeechSynthesisCache.dat").GetAwaiter().GetResult();
this.speechSynthesisCache = new SpeechSynthesisCache(speechSynthesisCacheFileStream);
}
catch
{
this.speechSynthesisCache = new SpeechSynthesisCache(this.SelectedConfiguration.SpeechSynthesizerVoiceName);
}
finally
{
speechSynthesisCacheFileStream?.Dispose();
}

// If the cache is of a different voice, re-initialize
if (this.speechSynthesisCache.SpeechSynthesisVoiceName != this.SelectedConfiguration.SpeechSynthesizerVoiceName)
{
this.speechSynthesisCache = new SpeechSynthesisCache(this.SelectedConfiguration.SpeechSynthesizerVoiceName);
}

// Construct the speech synthesizer configuration
var config = new SpeechSynthesizerConfiguration()
{
SubscriptionKey = speechSynthesisKey,
Region = "westus",
VoiceName = this.SelectedConfiguration.SpeechSynthesizerVoiceName,
Cache = this.speechSynthesisCache,
};

// Construct the speech synthesizer
@@ -64,6 +64,12 @@ public abstract class SigmaAppConfiguration : ClientAppConfiguration
/// </summary>
public int AudioReframeSizeMs { get; set; } = 100;

/// <summary>
/// Gets or sets the voice to use for the speech synthesizer. For possible options see
/// https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts#prebuilt-neural-voices.
/// </summary>
public string SpeechSynthesizerVoiceName { get; set; } = "en-US-JennyNeural";

/// <summary>
/// Creates a Sigma user interface.
/// </summary>
@@ -22,7 +22,7 @@ namespace Sigma
/// Batch task for exporting Sigma data.
/// </summary>
[BatchProcessingTask(
"Sigma - Export Captured Data",
"Export Captured Data",
Description = "This task exports the data collected by the Sigma app to a set of text files.")]
public class ExportDataTask : BatchProcessingTask<ExportDataTaskConfiguration>
{
@@ -17,7 +17,7 @@ namespace Sigma
/// Batch task for exporting Sigma data.
/// </summary>
[BatchProcessingTask(
"Sigma - Export User Notes",
"Export User Notes",
Description = "This task exports notes taken by the user to a tab-delimited file.")]
public class ExportNotesTask : BatchProcessingTask<ExportNotesTask.ExportNotesTaskConfiguration>
{
@@ -17,7 +17,7 @@ namespace Sigma
/// Batch task that runs the object 2D detection based tracker.
/// </summary>
[BatchProcessingTask(
"Sigma - Run the object 2D detection based tracker",
"Run the object 2D detection based tracker",
Description = "This task generates a new partition with results from object 2D detection based tracking.")]
public class Object2DDetectionBasedObjectTrackerTask : BatchProcessingTask<Object2DDetectionBasedObjectTrackerTaskConfiguration>
{
@@ -36,7 +36,7 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
interactionModel.InteractionState.TaskName = interactionModel.Configuration.AutoStartTaskName;
if (interactionModel.TryGetKnownTask())
{
yield return DialogAction.Speak($"Today I'm here to help you {interactionModel.Configuration.AutoStartTaskName}.");
yield return DialogAction.Speak($"Today I'm here to help you {interactionModel.Configuration.AutoStartTaskName.ToLower().TrimEnd('.')}.");

// Move to a glanceable position
yield return DialogAction.Execute(interactionModel.MoveToGlanceablePosition);
@@ -119,7 +119,7 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input

if (isKnownTask)
{
yield return DialogAction.Speak($"Sure. I can help you {interactionModel.InteractionState.TaskName.ToLower()}.");
yield return DialogAction.Speak($"Sure. I can help you {interactionModel.InteractionState.TaskName.ToLower().TrimEnd('.')}.");
}
else
{
@@ -158,6 +158,24 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
/// </summary>
public class ExecuteStep : DialogState<DiamondInteractionModel>
{
private readonly bool usePreamble = true;

/// <summary>
/// Initializes a new instance of the <see cref="ExecuteStep"/> class.
/// </summary>
public ExecuteStep()
{
}

/// <summary>
/// Initializes a new instance of the <see cref="ExecuteStep"/> class.
/// </summary>
/// <param name="usePreamble">Indicates whether to use a preamble for the step (e.g., first, next, finally...)</param>
public ExecuteStep(bool usePreamble)
{
this.usePreamble = usePreamble;
}

/// <inheritdoc/>
public override void OnEnter(DiamondInteractionModel interactionModel)
{
@@ -182,8 +200,15 @@ public override (string SystemPrompt, string[] UserResponseSet) GetSystemPromptA
(interactionModel.InteractionState.TryGetSelectedStepOfType<DoStep>(out var doStep) && doStep.Label == "1");

var systemPrompt = isFirstStep ?
$"The first step is to {interactionModel.InteractionState.SelectedStep.GetSpokenInstructions().ToLower()}" :
$"{interactionModel.InteractionState.SelectedStep.GetSpokenInstructions()}";
$"{interactionModel.InteractionState.SelectedStep.GetSpokenInstructions().Trim('.')}." :
$"{interactionModel.InteractionState.SelectedStep.GetSpokenInstructions().Trim('.')}.";

if (this.usePreamble)
{
systemPrompt = isFirstStep ?
$"The first step is to {systemPrompt.ToLower()}" :
$"Next, {systemPrompt.ToLower()}";
}

var userResponseSet = new List<string>()
{
@@ -228,18 +253,25 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
yield return DialogAction.ContinueWith<ExecuteStep>(noSpeechSynthesis: true);
}
}
else if (speechRecognition.Contains("previous"))
else if (speechRecognition.Contains("previous") && speechRecognition.Contains("step"))
{
if (interactionModel.InteractionState.SelectedStepIndex > 0)
{
interactionModel.InteractionState.GemState = GemState.AtUserInterface();
interactionModel.InteractionState.SelectedStepIndex--;
yield return DialogAction.Execute(interactionModel.ContinueWithSelectedStep);
if (interactionModel.InteractionState.TryGetSelectedStepOfType<ComplexStep>(out var complexStep))
{
interactionModel.InteractionState.SelectedSubStepIndex = complexStep.SubSteps.Count - 1;
yield return DialogAction.ContinueWith(new ExecuteSubStep(usePreamble: false));
}
else
{
yield return DialogAction.ContinueWith(new ExecuteStep(usePreamble: false));
}
}
else
{
yield return DialogAction.Speak("This is the first step of the task.");
yield return DialogAction.ContinueWith<ExecuteStep>(noSpeechSynthesis: true);
yield return DialogAction.Speak("We are already at the first step of the task.");
yield return DialogAction.ContinueWith<ExecuteSubStep>(noSpeechSynthesis: true);
}
}
else if (speechRecognition.Contains("start") && speechRecognition.Contains("timer") && interactionModel.InteractionState.SelectedStep is DoStep doStep)
@@ -436,6 +468,24 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
/// </summary>
public class ExecuteSubStep : DialogState<DiamondInteractionModel>
{
private readonly bool usePreamble = true;

/// <summary>
/// Initializes a new instance of the <see cref="ExecuteSubStep"/> class.
/// </summary>
public ExecuteSubStep()
{
}

/// <summary>
/// Initializes a new instance of the <see cref="ExecuteSubStep"/> class.
/// </summary>
/// <param name="usePreamble">Indicates whether to use a preamble for the step (e.g., first, next, finally...)</param>
public ExecuteSubStep(bool usePreamble)
{
this.usePreamble = usePreamble;
}

/// <inheritdoc/>
public override void OnEnter(DiamondInteractionModel interactionModel)
{
@@ -446,28 +496,37 @@ public override void OnEnter(DiamondInteractionModel interactionModel)
/// <inheritdoc/>
public override (string SystemPrompt, string[] UserResponseSet) GetSystemPromptAndUserResponseSet(DiamondInteractionModel interactionModel)
{
string systemPrompt;
if (interactionModel.InteractionState.SelectedSubStepIndex == 0)
{
systemPrompt = $"First, {(interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps[interactionModel.InteractionState.SelectedSubStepIndex.Value].Description.ToLower()}.";
}
else if (interactionModel.InteractionState.SelectedSubStepIndex > 1 && interactionModel.InteractionState.SelectedSubStepIndex == (interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps.Count - 1)
string systemPrompt = (interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps[interactionModel.InteractionState.SelectedSubStepIndex.Value].Description.TrimEnd('.');

if (this.usePreamble)
{
systemPrompt = $"Finally, {(interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps[interactionModel.InteractionState.SelectedSubStepIndex.Value].Description.ToLower()}.";
if (interactionModel.InteractionState.SelectedSubStepIndex == 0)
{
systemPrompt = $"First, {systemPrompt.ToLower()}.";
}
else if (interactionModel.InteractionState.SelectedSubStepIndex > 1 && interactionModel.InteractionState.SelectedSubStepIndex == (interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps.Count - 1)
{
systemPrompt = $"Finally, {systemPrompt.ToLower()}.";
}
else
{
systemPrompt = $"Next, {systemPrompt.ToLower()}.";
}
}
else
{
systemPrompt = $"Next, {(interactionModel.InteractionState.SelectedStep as ComplexStep).SubSteps[interactionModel.InteractionState.SelectedSubStepIndex.Value].Description.ToLower()}.";
systemPrompt = $"{systemPrompt.Capitalize()}.";
}

var userRespose = new string[]
var userResponseSet = new List<string>() { "Next step." };
if ((interactionModel.InteractionState.SelectedSubStepIndex > 0) || (interactionModel.InteractionState.SelectedStepIndex > 0))
{
"Next step.",
"Go to the previous step.",
"Let's abandon this task.",
};
userResponseSet.Add("Previous step.");
}

userResponseSet.Add("Let's abandon this task.");

return (systemPrompt, userRespose);
return (systemPrompt, userResponseSet.ToArray());
}

/// <inheritdoc/>
Expand All @@ -488,6 +547,7 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
{
yield return DialogAction.Speak("Great.");
interactionModel.InteractionState.SelectedStepIndex++;
interactionModel.InteractionState.SelectedSubStepIndex = null;
yield return DialogAction.Execute(interactionModel.ContinueWithSelectedStep);
}

@@ -508,6 +568,35 @@ public override IEnumerable<DialogAction> GetNextDialogActions(IInputEvent input
}
}
}
else if (speechRecognitionResult != null &&
speechRecognitionResult.Contains("previous") &&
speechRecognitionResult.Contains("step"))
{
if (interactionModel.InteractionState.SelectedSubStepIndex > 0)
{
interactionModel.InteractionState.SelectedSubStepIndex--;
yield return DialogAction.ContinueWith(new ExecuteSubStep(usePreamble: false));
}
else if (interactionModel.InteractionState.SelectedStepIndex > 0)
{
interactionModel.InteractionState.SelectedStepIndex--;
if (interactionModel.InteractionState.TryGetSelectedStepOfType<ComplexStep>(out var complexStep))
{
interactionModel.InteractionState.SelectedSubStepIndex = complexStep.SubSteps.Count - 1;
yield return DialogAction.ContinueWith(new ExecuteSubStep(usePreamble: false));
}
else
{
interactionModel.InteractionState.SelectedSubStepIndex = null;
yield return DialogAction.ContinueWith(new ExecuteStep(usePreamble: false));
}
}
else
{
yield return DialogAction.Speak("We are already at the first step of the task.");
yield return DialogAction.ContinueWith<ExecuteSubStep>(noSpeechSynthesis: true);
}
}
else if (speechRecognitionResult != null && speechRecognitionResult.ContainsOneOf("freeze", "pause"))
{
yield return DialogAction.ContinueWith(new PauseInteraction(this));