Skip to content

Commit

Permalink
ChatGPT sample connects as long as OPUS offered (sipsorcery-org#1261)
Browse files Browse the repository at this point in the history
* wip: demo program.

* Added initial attempt at OPUS codec wiring.

* Updated comment.
  • Loading branch information
sipsorcery authored Dec 22, 2024
1 parent f3573ed commit e0a75d7
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 15 deletions.
37 changes: 22 additions & 15 deletions examples/WebRTCExamples/WebRTCChatGPT/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
// Description: An example WebRTC application that can be used to interact with
// ChatGPT's real-time API https://platform.openai.com/docs/guides/realtime-webrtc.
//
// NOTE: As of 19 Dec 2024 this example has never worked. It can establish the WebRTC connection
// but does not get a response on the data channel. Testing with a js version in Chrome
// was only able to randomly get one successful data channel response from 20 or 30
// attempts so seems the OpenAI end may be having capacity issues.
// NOTE: As of 22 Dec 2024 this example does work to establish an RTP flow but the
// OPUS encoder is not currently working and we still need to track down why the data channel
// messages aren't being received. The issues could be related.
//
// Remarks:
// To get the ephemeral secret you first need an API key from OpenAI at
// https://platform.openai.com/settings/organization/api-keys.
//
// The API key can then be used to create an ephemeral secret using the curl command below,
// If you don't want to pass your OpenAI API key to this app an alternative approach is
// to create an ephemeral secret using the curl command below and then hard code it into
// the application.
// NOTE each ephemeral key seems like it can ONLY be used once:
// curl -v https://api.openai.com/v1/realtime/sessions ^
// --header "Authorization: Bearer %OPENAPI_TOKEN%" ^
Expand Down Expand Up @@ -134,23 +135,24 @@ static async Task Main(string[] args)
openApiDataChannel.send(JsonSerializer.Serialize(responseCreate));
};

openApiDataChannel.onclose += () => logger.LogDebug($"OpenAPI data channel {openApiDataChannel.label} closed.");

openApiDataChannel.onmessage += (datachan, type, data) =>
{
logger.LogInformation($"Data channel {datachan.label} message {type} received: {Encoding.UTF8.GetString(data)}.");
logger.LogInformation($"OpenAPI data channel {datachan.label} message {type} received: {Encoding.UTF8.GetString(data)}.");
};

// Plumbing code to facilitate a graceful exit.
CancellationTokenSource exitCts = new CancellationTokenSource(); // Cancellation token to stop the SIP transport and RTP stream.
ManualResetEvent exitMre = new ManualResetEvent(false);

// Ctrl-c will gracefully exit the call at any point.
// Ctrl-c will gracefully exit the app at any point.
Console.CancelKeyPress += delegate (object sender, ConsoleCancelEventArgs e)
{
e.Cancel = true;
exitMre.Set();
};

// Wait for a signal saying the call failed, was cancelled with ctrl-c or completed.
// Wait for a signal saying the attempt failed or was cancelled with ctrl-c.
exitMre.WaitOne();
}

Expand All @@ -166,14 +168,19 @@ private static async Task<RTCPeerConnection> CreatePeerConnection()

// Sink (speaker) only audio end point.
WindowsAudioEndPoint windowsAudioEP = new WindowsAudioEndPoint(new AudioEncoder(), -1, -1, false, false);
windowsAudioEP.RestrictFormats(x => x.FormatName == "OPUS");
windowsAudioEP.OnAudioSinkError += err => logger.LogWarning($"Audio sink error. {err}.");
windowsAudioEP.OnAudioSourceEncodedSample += peerConnection.SendAudio;

//var audioFormats = new List<AudioFormat> { new AudioFormat(SDPWellKnownMediaFormatsEnum.PCMU) };
//MediaStreamTrack audioTrack = new MediaStreamTrack(audioFormats, MediaStreamStatusEnum.SendRecv);
MediaStreamTrack audioTrack = new MediaStreamTrack(windowsAudioEP.GetAudioSourceFormats(), MediaStreamStatusEnum.RecvOnly);
MediaStreamTrack audioTrack = new MediaStreamTrack(windowsAudioEP.GetAudioSourceFormats(), MediaStreamStatusEnum.SendRecv);
peerConnection.addTrack(audioTrack);

peerConnection.OnAudioFormatsNegotiated += (audioFormats) => windowsAudioEP.SetAudioSinkFormat(audioFormats.First());
peerConnection.OnAudioFormatsNegotiated += (audioFormats) =>
{
logger.LogDebug($"Audio format negotiated {audioFormats.First().FormatName}.");
windowsAudioEP.SetAudioSinkFormat(audioFormats.First());
windowsAudioEP.SetAudioSourceFormat(audioFormats.First());
};
peerConnection.OnReceiveReport += RtpSession_OnReceiveReport;
peerConnection.OnSendReport += RtpSession_OnSendReport;
peerConnection.OnTimeout += (mediaType) => logger.LogDebug($"Timeout on media {mediaType}.");
Expand All @@ -197,11 +204,11 @@ private static async Task<RTCPeerConnection> CreatePeerConnection()

peerConnection.OnRtpPacketReceived += (IPEndPoint rep, SDPMediaTypesEnum media, RTPPacket rtpPkt) =>
{
logger.LogDebug($"RTP {media} pkt received, SSRC {rtpPkt.Header.SyncSource}.");
//logger.LogDebug($"RTP {media} pkt received, SSRC {rtpPkt.Header.SyncSource}.");

if (media == SDPMediaTypesEnum.audio)
{
windowsAudioEP.GotAudioRtp(rep, rtpPkt.Header.SyncSource, rtpPkt.Header.SequenceNumber, rtpPkt.Header.Timestamp, rtpPkt.Header.PayloadType, rtpPkt.Header.MarkerBit == 1, rtpPkt.Payload);
//windowsAudioEP.GotAudioRtp(rep, rtpPkt.Header.SyncSource, rtpPkt.Header.SequenceNumber, rtpPkt.Header.Timestamp, rtpPkt.Header.PayloadType, rtpPkt.Header.MarkerBit == 1, rtpPkt.Payload);
}
};

Expand Down
1 change: 1 addition & 0 deletions src/SIPSorcery.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
</ItemGroup>

<ItemGroup>
<PackageReference Include="Concentus" Version="2.2.2" />
<PackageReference Include="Portable.BouncyCastle" Version="1.9.0" />
<PackageReference Include="DnsClient" Version="1.8.0" />
<PackageReference Include="SIPSorcery.WebSocketSharp" Version="0.0.1" />
Expand Down
61 changes: 61 additions & 0 deletions src/app/Media/Codecs/AudioEncoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,20 @@
// BSD 3-Clause "New" or "Revised" License, see included LICENSE.md file.
//-----------------------------------------------------------------------------

using Concentus;
using System;
using System.Collections.Generic;
using System.Linq;
using SIPSorceryMedia.Abstractions;
using Concentus.Enums;

namespace SIPSorcery.Media
{
public class AudioEncoder : IAudioEncoder
{
private const int G722_BIT_RATE = 64000; // G722 sampling rate is 16KHz with bits per sample of 16.
private const int OPUS_SAMPLE_RATE = 48000; // Opus codec sampling rate, 48KHz.
private const int OPUS_CHANNELS = 1; // Opus codec number of channels.

private G722Codec _g722Codec;
private G722CodecState _g722CodecState;
Expand All @@ -32,6 +36,9 @@ public class AudioEncoder : IAudioEncoder
private G729Encoder _g729Encoder;
private G729Decoder _g729Decoder;

private IOpusDecoder _opusDecoder;
private IOpusEncoder _opusEncoder;

private List<AudioFormat> _linearFormats = new List<AudioFormat>
{
new AudioFormat(AudioCodecsEnum.L16, 117, 16000),
Expand All @@ -47,6 +54,7 @@ public class AudioEncoder : IAudioEncoder
new AudioFormat(SDPWellKnownMediaFormatsEnum.PCMA),
new AudioFormat(SDPWellKnownMediaFormatsEnum.G722),
new AudioFormat(SDPWellKnownMediaFormatsEnum.G729),
new AudioFormat(111, "OPUS", OPUS_SAMPLE_RATE, OPUS_CHANNELS, "useinbandfec=1")
};

public List<AudioFormat> SupportedFormats
Expand Down Expand Up @@ -87,7 +95,9 @@ public byte[] EncodeAudio(short[] pcm, AudioFormat format)
else if (format.Codec == AudioCodecsEnum.G729)
{
if (_g729Encoder == null)
{
_g729Encoder = new G729Encoder();
}

byte[] pcmBytes = new byte[pcm.Length * sizeof(short)];
Buffer.BlockCopy(pcm, 0, pcmBytes, 0, pcmBytes.Length);
Expand All @@ -114,6 +124,24 @@ public byte[] EncodeAudio(short[] pcm, AudioFormat format)
// Put on the wire as little endian.
return pcm.SelectMany(x => new byte[] { (byte)(x), (byte)(x >> 8) }).ToArray();
}
else if (format.Codec == AudioCodecsEnum.OPUS)
{
if (_opusEncoder == null)
{
_opusEncoder = OpusCodecFactory.CreateEncoder(OPUS_SAMPLE_RATE, OPUS_CHANNELS, OpusApplication.OPUS_APPLICATION_VOIP);
}

// Opus expects PCM data in float format [-1.0, 1.0].
float[] pcmFloat = new float[pcm.Length];
for (int i = 0; i < pcm.Length; i++)
{
pcmFloat[i] = pcm[i] / 32768f; // Convert to float range [-1.0, 1.0]
}

byte[] encodedSample = new byte[pcm.Length];
int encodedLength = _opusEncoder.Encode(pcmFloat, pcmFloat.Length / OPUS_CHANNELS, encodedSample, encodedSample.Length);
return encodedSample.Take(encodedLength).ToArray();
}
else
{
throw new ApplicationException($"Audio format {format.Codec} cannot be encoded.");
Expand Down Expand Up @@ -171,6 +199,26 @@ public short[] DecodeAudio(byte[] encodedSample, AudioFormat format)
// arrive from somewhere like the SkypeBot SDK they will be in little endian format).
return encodedSample.Where((x, i) => i % 2 == 0).Select((y, i) => (short)(encodedSample[i * 2 + 1] << 8 | encodedSample[i * 2])).ToArray();
}
else if (format.Codec == AudioCodecsEnum.OPUS)
{
if (_opusDecoder == null)
{
_opusDecoder = OpusCodecFactory.CreateDecoder(OPUS_SAMPLE_RATE, OPUS_CHANNELS);
}

float[] decodedPcmFloat = new float[encodedSample.Length * 2];
int decodedLength = _opusDecoder.Decode(encodedSample, decodedPcmFloat, decodedPcmFloat.Length, true);

// Convert float PCM to short PCM
short[] decodedPcm = new short[decodedLength];
for (int i = 0; i < decodedLength; i++)
{
// Clamp the value to the valid range of -32768 to 32767
decodedPcm[i] = ClampToShort(decodedPcmFloat[i] * 32767);
}

return decodedPcm;
}
else
{
throw new ApplicationException($"Audio format {format.Codec} cannot be decoded.");
Expand All @@ -182,5 +230,18 @@ public short[] Resample(short[] pcm, int inRate, int outRate)
{
return PcmResampler.Resample(pcm, inRate, outRate);
}

/// <summary>
/// Converts a floating point sample to a 16 bit signed sample, saturating at the limits
/// of <see cref="short"/> rather than wrapping around.
/// </summary>
/// <param name="value">The sample value, expected to already be scaled to the 16 bit range.</param>
/// <returns>
/// <see cref="short.MaxValue"/> if the value is above the 16 bit range,
/// <see cref="short.MinValue"/> if it is below, 0 if it is NaN, otherwise the value
/// truncated towards zero.
/// </returns>
private static short ClampToShort(float value)
{
    // NaN fails both range comparisons below, and converting NaN to an integral type in an
    // unchecked context yields an unspecified value, so map it explicitly to silence.
    if (float.IsNaN(value))
    {
        return 0;
    }

    // Infinities are caught by the same comparisons as ordinary out of range values.
    if (value > short.MaxValue)
    {
        return short.MaxValue;
    }

    if (value < short.MinValue)
    {
        return short.MinValue;
    }

    // In range: the cast truncates the fractional part towards zero.
    return (short)value;
}
}
}

0 comments on commit e0a75d7

Please sign in to comment.