From e0a75d7f1db6a933ca58bf183bcf38f67b24b7c7 Mon Sep 17 00:00:00 2001 From: Aaron Clauson Date: Sun, 22 Dec 2024 21:10:02 +0000 Subject: [PATCH] ChatGPT sample connects as long as OPUS offered (#1261) * wip: demo program. * Added initial attempt at OPUS codec wiring. * Updated comment. --- .../WebRTCExamples/WebRTCChatGPT/Program.cs | 37 ++++++----- src/SIPSorcery.csproj | 1 + src/app/Media/Codecs/AudioEncoder.cs | 61 +++++++++++++++++++ 3 files changed, 84 insertions(+), 15 deletions(-) diff --git a/examples/WebRTCExamples/WebRTCChatGPT/Program.cs b/examples/WebRTCExamples/WebRTCChatGPT/Program.cs index 074926e00..c6343a1fa 100755 --- a/examples/WebRTCExamples/WebRTCChatGPT/Program.cs +++ b/examples/WebRTCExamples/WebRTCChatGPT/Program.cs @@ -4,16 +4,17 @@ // Description: An example WebRTC application that can be used to interact with // ChatGPT's real-time API https://platform.openai.com/docs/guides/realtime-webrtc. // -// NOTE: As of 19 Dec 2024 this examle has never worked. It can establish the WebRTC connection -// but does not get a response on the data channel. Testing with a js version in Chrome -// was only able to randomly get one successful data channel response from 20 or 30 -// attempts so seems the OpenAI end may be having capacity issues. +// NOTE: As of 22 Dec 2024 this example does work to establish an RTP flow but the +// OPUS encoder is not currently working and we need to track down why the data channel +// messages aren't being received. The issues could be related. // // Remarks: // To get the ephemeral secret you first need an API key from OpenAI at // https://platform.openai.com/settings/organization/api-keys. // -// The API key can then be used to create an ephemeral secret using the curl comamnd below, +// If you don't want to pass your OpenAI API key to this app an alternative approach is +// to create an ephemeral secret using the curl command below and then hard code it into +// the application. 
// NOTE each epehmeral key seems like it can ONLY be used once: // curl -v https://api.openai.com/v1/realtime/sessions ^ // --header "Authorization: Bearer %OPENAPI_TOKEN%" ^ @@ -134,23 +135,24 @@ static async Task Main(string[] args) openApiDataChannel.send(JsonSerializer.Serialize(responseCreate)); }; + openApiDataChannel.onclose += () => logger.LogDebug($"OpenAPI data channel {openApiDataChannel.label} closed."); + openApiDataChannel.onmessage += (datachan, type, data) => { - logger.LogInformation($"Data channel {datachan.label} message {type} received: {Encoding.UTF8.GetString(data)}."); + logger.LogInformation($"OpenAPI data channel {datachan.label} message {type} received: {Encoding.UTF8.GetString(data)}."); }; // Plumbing code to facilitate a graceful exit. - CancellationTokenSource exitCts = new CancellationTokenSource(); // Cancellation token to stop the SIP transport and RTP stream. ManualResetEvent exitMre = new ManualResetEvent(false); - // Ctrl-c will gracefully exit the call at any point. + // Ctrl-c will gracefully exit the app at any point. Console.CancelKeyPress += delegate (object sender, ConsoleCancelEventArgs e) { e.Cancel = true; exitMre.Set(); }; - // Wait for a signal saying the call failed, was cancelled with ctrl-c or completed. + // Wait for a signal saying the attempt failed or was cancelled with ctrl-c. exitMre.WaitOne(); } @@ -166,14 +168,19 @@ private static async Task CreatePeerConnection() // Sink (speaker) only audio end point. WindowsAudioEndPoint windowsAudioEP = new WindowsAudioEndPoint(new AudioEncoder(), -1, -1, false, false); + windowsAudioEP.RestrictFormats(x => x.FormatName == "OPUS"); windowsAudioEP.OnAudioSinkError += err => logger.LogWarning($"Audio sink error. 
{err}."); + windowsAudioEP.OnAudioSourceEncodedSample += peerConnection.SendAudio; - //var audioFormats = new List { new AudioFormat(SDPWellKnownMediaFormatsEnum.PCMU) }; - //MediaStreamTrack audioTrack = new MediaStreamTrack(audioFormats, MediaStreamStatusEnum.SendRecv); - MediaStreamTrack audioTrack = new MediaStreamTrack(windowsAudioEP.GetAudioSourceFormats(), MediaStreamStatusEnum.RecvOnly); + MediaStreamTrack audioTrack = new MediaStreamTrack(windowsAudioEP.GetAudioSourceFormats(), MediaStreamStatusEnum.SendRecv); peerConnection.addTrack(audioTrack); - peerConnection.OnAudioFormatsNegotiated += (audioFormats) => windowsAudioEP.SetAudioSinkFormat(audioFormats.First()); + peerConnection.OnAudioFormatsNegotiated += (audioFormats) => + { + logger.LogDebug($"Audio format negotiated {audioFormats.First().FormatName}."); + windowsAudioEP.SetAudioSinkFormat(audioFormats.First()); + windowsAudioEP.SetAudioSourceFormat(audioFormats.First()); + }; peerConnection.OnReceiveReport += RtpSession_OnReceiveReport; peerConnection.OnSendReport += RtpSession_OnSendReport; peerConnection.OnTimeout += (mediaType) => logger.LogDebug($"Timeout on media {mediaType}."); @@ -197,11 +204,11 @@ private static async Task CreatePeerConnection() peerConnection.OnRtpPacketReceived += (IPEndPoint rep, SDPMediaTypesEnum media, RTPPacket rtpPkt) => { - logger.LogDebug($"RTP {media} pkt received, SSRC {rtpPkt.Header.SyncSource}."); + //logger.LogDebug($"RTP {media} pkt received, SSRC {rtpPkt.Header.SyncSource}."); if (media == SDPMediaTypesEnum.audio) { - windowsAudioEP.GotAudioRtp(rep, rtpPkt.Header.SyncSource, rtpPkt.Header.SequenceNumber, rtpPkt.Header.Timestamp, rtpPkt.Header.PayloadType, rtpPkt.Header.MarkerBit == 1, rtpPkt.Payload); + //windowsAudioEP.GotAudioRtp(rep, rtpPkt.Header.SyncSource, rtpPkt.Header.SequenceNumber, rtpPkt.Header.Timestamp, rtpPkt.Header.PayloadType, rtpPkt.Header.MarkerBit == 1, rtpPkt.Payload); } }; diff --git a/src/SIPSorcery.csproj b/src/SIPSorcery.csproj index 
57fd7ea39..c34cdbcba 100755 --- a/src/SIPSorcery.csproj +++ b/src/SIPSorcery.csproj @@ -17,6 +17,7 @@ + diff --git a/src/app/Media/Codecs/AudioEncoder.cs b/src/app/Media/Codecs/AudioEncoder.cs index e0cc13cba..b4ed76b2f 100755 --- a/src/app/Media/Codecs/AudioEncoder.cs +++ b/src/app/Media/Codecs/AudioEncoder.cs @@ -13,16 +13,20 @@ // BSD 3-Clause "New" or "Revised" License, see included LICENSE.md file. //----------------------------------------------------------------------------- +using Concentus; using System; using System.Collections.Generic; using System.Linq; using SIPSorceryMedia.Abstractions; +using Concentus.Enums; namespace SIPSorcery.Media { public class AudioEncoder : IAudioEncoder { private const int G722_BIT_RATE = 64000; // G722 sampling rate is 16KHz with bits per sample of 16. + private const int OPUS_SAMPLE_RATE = 48000; // Opus codec sampling rate, 48KHz. + private const int OPUS_CHANNELS = 1; // Opus codec number of channels. private G722Codec _g722Codec; private G722CodecState _g722CodecState; @@ -32,6 +36,9 @@ public class AudioEncoder : IAudioEncoder private G729Encoder _g729Encoder; private G729Decoder _g729Decoder; + private IOpusDecoder _opusDecoder; + private IOpusEncoder _opusEncoder; + private List _linearFormats = new List { new AudioFormat(AudioCodecsEnum.L16, 117, 16000), @@ -47,6 +54,7 @@ public class AudioEncoder : IAudioEncoder new AudioFormat(SDPWellKnownMediaFormatsEnum.PCMA), new AudioFormat(SDPWellKnownMediaFormatsEnum.G722), new AudioFormat(SDPWellKnownMediaFormatsEnum.G729), + new AudioFormat(111, "OPUS", OPUS_SAMPLE_RATE, OPUS_CHANNELS, "useinbandfec=1") }; public List SupportedFormats @@ -87,7 +95,9 @@ public byte[] EncodeAudio(short[] pcm, AudioFormat format) else if (format.Codec == AudioCodecsEnum.G729) { if (_g729Encoder == null) + { _g729Encoder = new G729Encoder(); + } byte[] pcmBytes = new byte[pcm.Length * sizeof(short)]; Buffer.BlockCopy(pcm, 0, pcmBytes, 0, pcmBytes.Length); @@ -114,6 +124,24 @@ public byte[] 
EncodeAudio(short[] pcm, AudioFormat format) // Put on the wire as little endian. return pcm.SelectMany(x => new byte[] { (byte)(x), (byte)(x >> 8) }).ToArray(); } + else if (format.Codec == AudioCodecsEnum.OPUS) + { + if (_opusEncoder == null) + { + _opusEncoder = OpusCodecFactory.CreateEncoder(OPUS_SAMPLE_RATE, OPUS_CHANNELS, OpusApplication.OPUS_APPLICATION_VOIP); + } + + // Opus expects PCM data in float format [-1.0, 1.0]. + float[] pcmFloat = new float[pcm.Length]; + for (int i = 0; i < pcm.Length; i++) + { + pcmFloat[i] = pcm[i] / 32768f; // Convert to float range [-1.0, 1.0] + } + + byte[] encodedSample = new byte[pcm.Length]; + int encodedLength = _opusEncoder.Encode(pcmFloat, pcmFloat.Length / OPUS_CHANNELS, encodedSample, encodedSample.Length); + return encodedSample.Take(encodedLength).ToArray(); + } else { throw new ApplicationException($"Audio format {format.Codec} cannot be encoded."); @@ -171,6 +199,26 @@ public short[] DecodeAudio(byte[] encodedSample, AudioFormat format) // arrive from somewhere like the SkypeBot SDK they will be in little endian format). 
return encodedSample.Where((x, i) => i % 2 == 0).Select((y, i) => (short)(encodedSample[i * 2 + 1] << 8 | encodedSample[i * 2])).ToArray(); } + else if (format.Codec == AudioCodecsEnum.OPUS) + { + if (_opusDecoder == null) + { + _opusDecoder = OpusCodecFactory.CreateDecoder(OPUS_SAMPLE_RATE, OPUS_CHANNELS); + } + + float[] decodedPcmFloat = new float[encodedSample.Length * 2]; + int decodedLength = _opusDecoder.Decode(encodedSample, decodedPcmFloat, decodedPcmFloat.Length, true); + + // Convert float PCM to short PCM + short[] decodedPcm = new short[decodedLength]; + for (int i = 0; i < decodedLength; i++) + { + // Clamp the value to the valid range of -32768 to 32767 + decodedPcm[i] = ClampToShort(decodedPcmFloat[i] * 32767); + } + + return decodedPcm; + } else { throw new ApplicationException($"Audio format {format.Codec} cannot be decoded."); @@ -182,5 +230,18 @@ public short[] Resample(short[] pcm, int inRate, int outRate) { return PcmResampler.Resample(pcm, inRate, outRate); } + + private short ClampToShort(float value) + { + if (value > short.MaxValue) + { + return short.MaxValue; + } + if (value < short.MinValue) + { + return short.MinValue; + } + return (short)value; + } } }