ChatGPT sample connects as long as OPUS offered (sipsorcery-org#1261)

* wip: demo program. * Added initial attempt at OPUS codec wiring. * Updated comment.
ha-ves · Dec 22, 2024 · e0a75d7 · e0a75d7
1 parent f3573ed
commit e0a75d7
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 15 deletions.
diff --git a/examples/WebRTCExamples/WebRTCChatGPT/Program.cs b/examples/WebRTCExamples/WebRTCChatGPT/Program.cs
@@ -4,16 +4,17 @@
 // Description: An example WebRTC application that can be used to interact with
 // ChatGPT's real-time API https://platform.openai.com/docs/guides/realtime-webrtc.
 //
-// NOTE: As of 19 Dec 2024 this examle has never worked. It can establish the WebRTC connection
-// but does not get a response on the data channel. Testing with a js version in Chrome
-// was only able to randomly get one successful data channel response from 20 or 30
-// attempts so seems the OpenAI end may be having capacity issues.
+// NOTE: As of 22 Dec 2024 this example does work to establish an RTP flow, but the
+// OPUS encoder is not currently working and we still need to track down why the data
+// channel messages aren't being received. The two issues could be related.
 //
 // Remarks:
 // To get the ephemeral secret you first need an API key from OpenAI at
 // https://platform.openai.com/settings/organization/api-keys.
 //
-// The API key can then be used to create an ephemeral secret using the curl comamnd below,
+// If you don't want to pass your OpenAI API key to this app, an alternative approach is
+// to create an ephemeral secret using the curl command below and then hard-code it into
+// the application.
 // NOTE each epehmeral key seems like it can ONLY be used once:
 // curl -v https://api.openai.com/v1/realtime/sessions ^
 //  --header "Authorization: Bearer %OPENAPI_TOKEN%" ^
@@ -134,23 +135,24 @@ static async Task Main(string[] args)
                 openApiDataChannel.send(JsonSerializer.Serialize(responseCreate));
             };
 
+            openApiDataChannel.onclose += () => logger.LogDebug($"OpenAPI data channel {openApiDataChannel.label} closed.");
+
             openApiDataChannel.onmessage += (datachan, type, data) =>
             {
-                logger.LogInformation($"Data channel {datachan.label} message {type} received: {Encoding.UTF8.GetString(data)}.");
+                logger.LogInformation($"OpenAPI data channel {datachan.label} message {type} received: {Encoding.UTF8.GetString(data)}.");
             };
 
             // Plumbing code to facilitate a graceful exit.
-            CancellationTokenSource exitCts = new CancellationTokenSource(); // Cancellation token to stop the SIP transport and RTP stream.
             ManualResetEvent exitMre = new ManualResetEvent(false);
 
-            // Ctrl-c will gracefully exit the call at any point.
+            // Ctrl-c will gracefully exit the app at any point.
             Console.CancelKeyPress += delegate (object sender, ConsoleCancelEventArgs e)
             {
                 e.Cancel = true;
                 exitMre.Set();
             };
 
-            // Wait for a signal saying the call failed, was cancelled with ctrl-c or completed.
+            // Wait for a signal saying the attempt failed or was cancelled with ctrl-c.
             exitMre.WaitOne();
         }
 
@@ -166,14 +168,19 @@ private static async Task<RTCPeerConnection> CreatePeerConnection()
 
             // Sink (speaker) only audio end point.
             WindowsAudioEndPoint windowsAudioEP = new WindowsAudioEndPoint(new AudioEncoder(), -1, -1, false, false);
+            windowsAudioEP.RestrictFormats(x => x.FormatName == "OPUS");
             windowsAudioEP.OnAudioSinkError += err => logger.LogWarning($"Audio sink error. {err}.");
+            windowsAudioEP.OnAudioSourceEncodedSample +=  peerConnection.SendAudio;
 
-            //var audioFormats = new List<AudioFormat> { new AudioFormat(SDPWellKnownMediaFormatsEnum.PCMU) };
-            //MediaStreamTrack audioTrack = new MediaStreamTrack(audioFormats, MediaStreamStatusEnum.SendRecv);
-            MediaStreamTrack audioTrack = new MediaStreamTrack(windowsAudioEP.GetAudioSourceFormats(), MediaStreamStatusEnum.RecvOnly);
+            MediaStreamTrack audioTrack = new MediaStreamTrack(windowsAudioEP.GetAudioSourceFormats(), MediaStreamStatusEnum.SendRecv);
             peerConnection.addTrack(audioTrack);
 
-            peerConnection.OnAudioFormatsNegotiated += (audioFormats) => windowsAudioEP.SetAudioSinkFormat(audioFormats.First());
+            peerConnection.OnAudioFormatsNegotiated += (audioFormats) =>
+            {
+                logger.LogDebug($"Audio format negotiated {audioFormats.First().FormatName}.");
+                windowsAudioEP.SetAudioSinkFormat(audioFormats.First());
+                windowsAudioEP.SetAudioSourceFormat(audioFormats.First());
+            };
             peerConnection.OnReceiveReport += RtpSession_OnReceiveReport;
             peerConnection.OnSendReport += RtpSession_OnSendReport;
             peerConnection.OnTimeout += (mediaType) => logger.LogDebug($"Timeout on media {mediaType}.");
@@ -197,11 +204,11 @@ private static async Task<RTCPeerConnection> CreatePeerConnection()
 
             peerConnection.OnRtpPacketReceived += (IPEndPoint rep, SDPMediaTypesEnum media, RTPPacket rtpPkt) =>
             {
-                logger.LogDebug($"RTP {media} pkt received, SSRC {rtpPkt.Header.SyncSource}.");
+                //logger.LogDebug($"RTP {media} pkt received, SSRC {rtpPkt.Header.SyncSource}.");
 
                 if (media == SDPMediaTypesEnum.audio)
                 {
-                    windowsAudioEP.GotAudioRtp(rep, rtpPkt.Header.SyncSource, rtpPkt.Header.SequenceNumber, rtpPkt.Header.Timestamp, rtpPkt.Header.PayloadType, rtpPkt.Header.MarkerBit == 1, rtpPkt.Payload);
+                    //windowsAudioEP.GotAudioRtp(rep, rtpPkt.Header.SyncSource, rtpPkt.Header.SequenceNumber, rtpPkt.Header.Timestamp, rtpPkt.Header.PayloadType, rtpPkt.Header.MarkerBit == 1, rtpPkt.Payload);
                 }
             };
 

diff --git a/src/SIPSorcery.csproj b/src/SIPSorcery.csproj
@@ -17,6 +17,7 @@
   </ItemGroup>
 
   <ItemGroup>
+    <PackageReference Include="Concentus" Version="2.2.2" />
     <PackageReference Include="Portable.BouncyCastle" Version="1.9.0" />
     <PackageReference Include="DnsClient" Version="1.8.0" />
     <PackageReference Include="SIPSorcery.WebSocketSharp" Version="0.0.1" />

diff --git a/src/app/Media/Codecs/AudioEncoder.cs b/src/app/Media/Codecs/AudioEncoder.cs
@@ -13,16 +13,20 @@
 // BSD 3-Clause "New" or "Revised" License, see included LICENSE.md file.
 //-----------------------------------------------------------------------------
 
+using Concentus;
 using System;
 using System.Collections.Generic;
 using System.Linq;
 using SIPSorceryMedia.Abstractions;
+using Concentus.Enums;
 
 namespace SIPSorcery.Media
 {
     public class AudioEncoder : IAudioEncoder
     {
         private const int G722_BIT_RATE = 64000;              // G722 sampling rate is 16KHz with bits per sample of 16.
+        private const int OPUS_SAMPLE_RATE = 48000;           // Opus codec sampling rate, 48KHz.
+        private const int OPUS_CHANNELS = 1;                  // Opus codec number of channels.
 
         private G722Codec _g722Codec;
         private G722CodecState _g722CodecState;
@@ -32,6 +36,9 @@ public class AudioEncoder : IAudioEncoder
         private G729Encoder _g729Encoder;
         private G729Decoder _g729Decoder;
 
+        private IOpusDecoder _opusDecoder;
+        private IOpusEncoder _opusEncoder;
+
         private List<AudioFormat> _linearFormats = new List<AudioFormat>
         {
             new AudioFormat(AudioCodecsEnum.L16, 117, 16000),
@@ -47,6 +54,7 @@ public class AudioEncoder : IAudioEncoder
             new AudioFormat(SDPWellKnownMediaFormatsEnum.PCMA),
             new AudioFormat(SDPWellKnownMediaFormatsEnum.G722),
             new AudioFormat(SDPWellKnownMediaFormatsEnum.G729),
+            new AudioFormat(111, "OPUS", OPUS_SAMPLE_RATE, OPUS_CHANNELS, "useinbandfec=1")
         };
 
         public List<AudioFormat> SupportedFormats
@@ -87,7 +95,9 @@ public byte[] EncodeAudio(short[] pcm, AudioFormat format)
             else if (format.Codec == AudioCodecsEnum.G729)
             {
                 if (_g729Encoder == null)
+                {
                     _g729Encoder = new G729Encoder();
+                }
 
                 byte[] pcmBytes = new byte[pcm.Length * sizeof(short)];
                 Buffer.BlockCopy(pcm, 0, pcmBytes, 0, pcmBytes.Length);
@@ -114,6 +124,24 @@ public byte[] EncodeAudio(short[] pcm, AudioFormat format)
                 // Put on the wire as little endian.
                 return pcm.SelectMany(x => new byte[] { (byte)(x), (byte)(x >> 8) }).ToArray();
             }
+            else if (format.Codec == AudioCodecsEnum.OPUS)
+            {
+                if (_opusEncoder == null)
+                {
+                    _opusEncoder = OpusCodecFactory.CreateEncoder(OPUS_SAMPLE_RATE, OPUS_CHANNELS, OpusApplication.OPUS_APPLICATION_VOIP);
+                }
+
+                // Opus expects PCM data in float format [-1.0, 1.0].
+                float[] pcmFloat = new float[pcm.Length];
+                for (int i = 0; i < pcm.Length; i++)
+                {
+                    pcmFloat[i] = pcm[i] / 32768f; // Convert to float range [-1.0, 1.0]
+                }
+
+                byte[] encodedSample = new byte[pcm.Length];
+                int encodedLength = _opusEncoder.Encode(pcmFloat, pcmFloat.Length / OPUS_CHANNELS, encodedSample, encodedSample.Length);
+                return encodedSample.Take(encodedLength).ToArray();
+            }
             else
             {
                 throw new ApplicationException($"Audio format {format.Codec} cannot be encoded.");
@@ -171,6 +199,26 @@ public short[] DecodeAudio(byte[] encodedSample, AudioFormat format)
                 // arrive from somewhere like the SkypeBot SDK they will be in little endian format).
                 return encodedSample.Where((x, i) => i % 2 == 0).Select((y, i) => (short)(encodedSample[i * 2 + 1] << 8 | encodedSample[i * 2])).ToArray();
             }
+            else if (format.Codec == AudioCodecsEnum.OPUS)
+            {
+                if (_opusDecoder == null)
+                {
+                    _opusDecoder = OpusCodecFactory.CreateDecoder(OPUS_SAMPLE_RATE, OPUS_CHANNELS);
+                }
+
+                float[] decodedPcmFloat = new float[encodedSample.Length * 2];
+                int decodedLength = _opusDecoder.Decode(encodedSample, decodedPcmFloat, decodedPcmFloat.Length, true);
+
+                // Convert float PCM to short PCM
+                short[] decodedPcm = new short[decodedLength];
+                for (int i = 0; i < decodedLength; i++)
+                {
+                    // Clamp the value to the valid range of -32768 to 32767
+                    decodedPcm[i] = ClampToShort(decodedPcmFloat[i] * 32767);
+                }
+
+                return decodedPcm;
+            }
             else
             {
                 throw new ApplicationException($"Audio format {format.Codec} cannot be decoded.");
@@ -182,5 +230,18 @@ public short[] Resample(short[] pcm, int inRate, int outRate)
         {
             return PcmResampler.Resample(pcm, inRate, outRate);
         }
+
+        private short ClampToShort(float value) // Saturating float -> 16-bit PCM sample conversion.
+        {
+            if (value > short.MaxValue) // Above the largest short: pin to short.MaxValue.
+            {
+                return short.MaxValue;
+            }
+            if (value < short.MinValue) // Below the smallest short: pin to short.MinValue.
+            {
+                return short.MinValue;
+            }
+            return (short)value; // In range (NaN also lands here, both comparisons being false): the cast drops the fractional part.
+        }
     }
 }