Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QEMU Audio support in VNC Viewer (currently Windows only) #1478

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion common/rfb/CConnection.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ static LogWriter vlog("CConnection");
CConnection::CConnection()
: csecurity(0),
supportsLocalCursor(false), supportsCursorPosition(false),
supportsDesktopResize(false), supportsLEDState(false),
supportsDesktopResize(false), supportsLEDState(false), supportsAudio(false),
is(0), os(0), reader_(0), writer_(0),
shared(false),
state_(RFBSTATE_UNINITIALISED), serverName(strDup("")),
Expand Down Expand Up @@ -524,6 +524,18 @@ void CConnection::framebufferUpdateEnd()

firstUpdate = false;
}

if (server.awaitsQEMUAudioFormatMsg) {
if (supportsAudio) {
rdr::U8 sampleFormat, channels;
rdr::U32 samplingFreq;
if (audioInitAndGetFormat(&sampleFormat, &channels, &samplingFreq)) {
writer()->writeQemuAudioSetFormat(sampleFormat, channels, samplingFreq);
writer()->writeQemuAudioEnableOrDisable(true /* enable */);
}
}
server.awaitsQEMUAudioFormatMsg = false;
}
}

bool CConnection::dataRect(const Rect& r, int encoding)
Expand Down Expand Up @@ -608,6 +620,13 @@ void CConnection::handleClipboardProvide(rdr::U32 flags,
handleClipboardData(serverClipboard);
}

bool CConnection::audioInitAndGetFormat(rdr::U8* sampleFormat,
rdr::U8* channels,
rdr::U32* samplingFreq)
{
return false;
}

void CConnection::authSuccess()
{
}
Expand Down Expand Up @@ -828,6 +847,9 @@ void CConnection::updateEncodings()
encodings.push_back(pseudoEncodingLEDState);
encodings.push_back(pseudoEncodingVMwareLEDState);
}
if (supportsAudio) {
encodings.push_back(pseudoEncodingQEMUAudio);
}

encodings.push_back(pseudoEncodingDesktopName);
encodings.push_back(pseudoEncodingLastRect);
Expand Down
4 changes: 4 additions & 0 deletions common/rfb/CConnection.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ namespace rfb {
const size_t* lengths,
const rdr::U8* const* data);

virtual bool audioInitAndGetFormat(rdr::U8* sampleFormat,
rdr::U8* channels,
rdr::U32* samplingFreq);

// Methods to be overridden in a derived class

Expand Down Expand Up @@ -242,6 +245,7 @@ namespace rfb {
bool supportsCursorPosition;
bool supportsDesktopResize;
bool supportsLEDState;
bool supportsAudio;

private:
// This is a default implementation of fences that automatically
Expand Down
27 changes: 27 additions & 0 deletions common/rfb/CMsgHandler.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ void CMsgHandler::supportsQEMUKeyEvent()
server.supportsQEMUKeyEvent = true;
}

void CMsgHandler::supportsQEMUAudioAndAwaitsFormatMsgOnce()
{
if (!server.supportsQEMUAudio) {
server.supportsQEMUAudio = true;
server.awaitsQEMUAudioFormatMsg = true;
}
}

void CMsgHandler::serverInit(int width, int height,
const PixelFormat& pf,
const char* name)
Expand Down Expand Up @@ -167,3 +175,22 @@ void CMsgHandler::handleClipboardProvide(rdr::U32 flags,
const rdr::U8* const* data)
{
}

size_t CMsgHandler::audioSampleSize()
{
return 1;
}

void CMsgHandler::audioNotifyStreamingStartStop(bool isStart)
{
}

size_t CMsgHandler::audioAddSamples(const rdr::U8* data, size_t size)
{
return size;
}

bool CMsgHandler::audioSubmitSamples()
{
return false;
}
6 changes: 6 additions & 0 deletions common/rfb/CMsgHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ namespace rfb {
virtual void fence(rdr::U32 flags, unsigned len, const char data[]);
virtual void endOfContinuousUpdates();
virtual void supportsQEMUKeyEvent();
virtual void supportsQEMUAudioAndAwaitsFormatMsgOnce();
virtual void serverInit(int width, int height,
const PixelFormat& pf,
const char* name) = 0;
Expand Down Expand Up @@ -85,6 +86,11 @@ namespace rfb {
const size_t* lengths,
const rdr::U8* const* data);

virtual size_t audioSampleSize();
virtual void audioNotifyStreamingStartStop(bool isStart);
virtual size_t audioAddSamples(const rdr::U8* data, size_t size);
virtual bool audioSubmitSamples();

ServerParams server;
};
}
Expand Down
79 changes: 78 additions & 1 deletion common/rfb/CMsgReader.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <rdr/ZlibInStream.h>

#include <rfb/msgTypes.h>
#include <rfb/qemuTypes.h>
#include <rfb/clipboardTypes.h>
#include <rfb/Exception.h>
#include <rfb/LogWriter.h>
Expand All @@ -43,7 +44,7 @@ using namespace rfb;

CMsgReader::CMsgReader(CMsgHandler* handler_, rdr::InStream* is_)
: imageBufIdealSize(0), handler(handler_), is(is_),
state(MSGSTATE_IDLE), cursorEncoding(-1)
state(MSGSTATE_IDLE), cursorEncoding(-1), nAudioBytesLeft(0)
{
}

Expand Down Expand Up @@ -81,6 +82,13 @@ bool CMsgReader::readServerInit()

bool CMsgReader::readMsg()
{
if (state == MSGSTATE_AUDIO_DATA) {
if (readAudioData())
state = MSGSTATE_IDLE;
else
return false;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This complexity is something that doesn't really fit well in CMsgReader. How large are these audio chunks? Can't we let the input stream handle the buffering?

Copy link
Author

@mkupchik mkupchik Jun 13, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Audio chunks may be large, there are no protocol limits besides UINT32_MAX. Bad-behaved VNC server may outgrow MAX_BUF_SIZE in common/rdr/BufferedInStream.cxx. That's why I prefer not to leave audio samples in the input buffer of BufferedInStream, and move them to circular playback buffer of Win32AudioOutput as soon as they arrive. Calls to memcpy happen in Win32AudioOutput::addSamples(), usually once, or twice if wraparound in the target circular buffer happens, and caller of that interface must provide a pointer and length of continuous chunk of source data. This continuous chunk of source data is owned by BufferedInStream.

When audio frame is fully received, Win32AudioOutput::submitSamples() makes one or two heap allocations of struct Win32AudioOutput::HdrInSlist, then calls waveOutPrepareHeader() WinAPI function (which boils down to allocation of struct OVERLAPPED on heap, creation of kernel event object and some pointer bookkeeping), then it calls waveOutWrite() WinAPI function (which boils down to submitting asyncronous I/O request to the NT kernel device driver, the audio mixer). There's no need to move bytes around in this context, asyncronous NT I/O request in flight refers to the audio samples in circular buffer owned by Win32AudioOutput which were added there earlier, by Win32AudioOutput::addSamples(). So while audio playback I/O request is still in flight, original socket buffer owned by BufferedInStream may contain different data, may be reallocated, may be empty etc.

When kernel completes asyncronous I/O request, it signals via kernel event object and awakes internal worker thread started by winmm.dll/wdmaud.drv. This thread invokes our Win32AudioOutput::waveOutCallback(). There we link struct Win32AudioOutput::HdrInSlist into slist to dispose it later, during the next call to Win32AudioOutput::submitSamples() or in the destructor of Win32AudioOutput. Before disposing Win32AudioOutput::HdrInSlist we call waveOutUnprepareHeader(), which boils down to deallocating struct OVERLAPPED on heap and destruction of kernel event object.

If a bad-behaved VNC server sends unreasonably large audio frame, then we clamp it at Win32AudioOutput::addSamples(): samples which do not fit into the circular buffer owned by Win32AudioOutput are de facto discarded, but reported as consumed to the caller, so the caller can release this space in BufferedInStream, and continue to do so until Win32AudioOutput::submitSamples() is called at the end of audio frame.

Summarizing all of the above, I think two buffers (socket I/O buffer and async playback buffer) are necessary.

So, what we need from BufferedInStream in this context of CMsgReader is an API to fetch some fresh data from socket and provide raw pointer to the continuous chunk of memory along with number of available bytes. There's an upper limit on the number of bytes we need here (nAudioBytesLeft), and there's no lower limit. We're happy to consume less than nAudioBytesLeft bytes here (especially if nAudioBytesLeft is large, i.e. this is a large audio frame). If there's partial audio sample/frame (e.g. less than 4 bytes for default audio format, 16-bit stereo), then up to 3 bytes will be left unconsumed in the BufferedInStream, for later use when more data arrives.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The double buffering sounds fine, given the varying lifetime.

But I'd still like to get rid of the complexity. Since there is a limit anyway (which seems to be about 1 MiB if I read the code correctly), then this extra complexity doesn't seem worth it.

We don't need to support the theoretical extremes. Supporting what is actually used out there is quite sufficient.

Copy link
Author

@mkupchik mkupchik Jun 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this complexity can be reduced by adding a new method to rdr::InStream, which takes a number of bytes wanted by caller, similar to hasData(), but unlike hasData() does not require all of that bytes to be available. Instead it just returns a pointer and length of what is available. Length may be less than requested. Also this new method must call rdr::InStream::overrun() for actual I/O on socket to happen. Then code in CMsgReader will be shorter. What do you think?

Copy link
Author

@mkupchik mkupchik Jun 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I think I need to provide justification for MSGSTATE_AUDIO_DATA. It's true that entire 1 MiB of audio samples can fit in socket I/O buffer and be submitted for playback in one go. However consider ill-behaved VNC server which sends 1 GiB of audio data, so 1 MiB of these data is going to be submitted for playback and the rest (1023 MiB) is going to be discarded. How we can model this logic in the terms of rdr::InStream: "keep N bytes at the beginning of the buffer, but discard everything above N?" I think we cannot do this, rdr::InStream is designed for FIFO-style consumption of incoming data, we can't cut away samples/bytes/whatever in the middle of that buffer. Hence the compexity in CMsgReader, the new state of the FSM, and the double buffering. It's just a coincidence that double buffering fits nicely with async nature of audio I/O on Windows.

}

if (state == MSGSTATE_IDLE) {
if (!is->hasData(1))
return false;
Expand Down Expand Up @@ -111,6 +119,9 @@ bool CMsgReader::readMsg()
case msgTypeEndOfContinuousUpdates:
ret = readEndOfContinuousUpdates();
break;
case msgTypeQEMUServerMessage:
ret = readQemuServerMessage();
break;
default:
throw Exception("Unknown message type %d", currentMsgType);
}
Expand Down Expand Up @@ -195,6 +206,10 @@ bool CMsgReader::readMsg()
handler->supportsQEMUKeyEvent();
ret = true;
break;
case pseudoEncodingQEMUAudio:
handler->supportsQEMUAudioAndAwaitsFormatMsgOnce();
ret = true;
break;
default:
ret = readRect(dataRect, rectEncoding);
break;
Expand Down Expand Up @@ -443,6 +458,68 @@ bool CMsgReader::readEndOfContinuousUpdates()
return true;
}

bool CMsgReader::readQemuServerMessage()
{
if (!is->hasData(1 + 2))
return false;

is->setRestorePoint();
rdr::U8 subMsgType = is->readU8();
rdr::U16 operation = is->readU16();

if (subMsgType != qemuAudio) {
is->clearRestorePoint();
throw Exception("Invalid QEMU submessage type");
}

switch (operation) {
case msgFromQemuAudioBegin:
is->clearRestorePoint();
handler->audioNotifyStreamingStartStop(true /* isStart */);
return true;

case msgFromQemuAudioEnd:
is->clearRestorePoint();
handler->audioNotifyStreamingStartStop(false /* isStart */);
return true;

case msgFromQemuAudioData:
if (!is->hasDataOrRestore(4))
return false;
is->clearRestorePoint();
nAudioBytesLeft = is->readU32();
if (nAudioBytesLeft == 0)
return true;
if ((nAudioBytesLeft % handler->audioSampleSize()) != 0)
throw Exception("QEMU audio protocol error: sample torn apart");
if (readAudioData())
return true;
state = MSGSTATE_AUDIO_DATA;
return false;

default:
is->clearRestorePoint();
throw Exception("Invalid QEMU audio operation");
}
}

bool CMsgReader::readAudioData()
{
while (nAudioBytesLeft != 0) {
is->hasData(__rfbmin(maxBufferedAudioBytes, nAudioBytesLeft)); // request as much as possible
size_t available = __rfbmin(is->avail(), nAudioBytesLeft); // see how many we've got
if (available == 0)
return false;
size_t consumed = handler->audioAddSamples(is->getptr(available), available);
if (consumed == 0)
return false;
is->skip(consumed);
nAudioBytesLeft -= consumed;
}
handler->audioSubmitSamples();
return true;
}

bool CMsgReader::readFramebufferUpdate()
{
if (!is->hasData(1 + 2))
Expand Down
6 changes: 6 additions & 0 deletions common/rfb/CMsgReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ namespace rfb {
bool readExtendedClipboard(rdr::S32 len);
bool readFence();
bool readEndOfContinuousUpdates();
bool readQemuServerMessage();
bool readAudioData();

bool readFramebufferUpdate();

Expand All @@ -79,6 +81,7 @@ namespace rfb {
MSGSTATE_MESSAGE,
MSGSTATE_RECT_HEADER,
MSGSTATE_RECT_DATA,
MSGSTATE_AUDIO_DATA,
};

stateEnum state;
Expand All @@ -90,7 +93,10 @@ namespace rfb {

int cursorEncoding;

size_t nAudioBytesLeft;

static const int maxCursorSize = 256;
static const size_t maxBufferedAudioBytes = 32768;
};
}
#endif
19 changes: 19 additions & 0 deletions common/rfb/CMsgWriter.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,25 @@ void CMsgWriter::writeClientCutText(const char* str)
endMsg();
}

void CMsgWriter::writeQemuAudioEnableOrDisable(bool enable)
{
startMsg(msgTypeQEMUClientMessage);
os->writeU8(qemuAudio);
os->writeU16(enable ? msgToQemuEnableAudio : msgToQemuDisableAudio);
endMsg();
}

void CMsgWriter::writeQemuAudioSetFormat(rdr::U8 fmt, rdr::U8 channels, rdr::U32 frequency)
{
startMsg(msgTypeQEMUClientMessage);
os->writeU8(qemuAudio);
os->writeU16(msgToQemuSetAudioFormat);
os->writeU8(fmt);
os->writeU8(channels);
os->writeU32(frequency);
endMsg();
}

void CMsgWriter::writeClipboardCaps(rdr::U32 caps,
const rdr::U32* lengths)
{
Expand Down
3 changes: 3 additions & 0 deletions common/rfb/CMsgWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ namespace rfb {

void writeClientCutText(const char* str);

void writeQemuAudioEnableOrDisable(bool enable);
void writeQemuAudioSetFormat(rdr::U8 fmt, rdr::U8 channels, rdr::U32 frequency);

void writeClipboardCaps(rdr::U32 caps, const rdr::U32* lengths);
void writeClipboardRequest(rdr::U32 flags);
void writeClipboardPeek(rdr::U32 flags);
Expand Down
4 changes: 2 additions & 2 deletions common/rfb/ServerParams.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ using namespace rfb;

ServerParams::ServerParams()
: majorVersion(0), minorVersion(0),
supportsQEMUKeyEvent(false),
supportsQEMUKeyEvent(false), supportsQEMUAudio(false),
supportsSetDesktopSize(false), supportsFence(false),
supportsContinuousUpdates(false),
supportsContinuousUpdates(false), awaitsQEMUAudioFormatMsg(false),
width_(0), height_(0), name_(0),
ledState_(ledUnknown)
{
Expand Down
3 changes: 3 additions & 0 deletions common/rfb/ServerParams.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,13 @@ namespace rfb {
void setClipboardCaps(rdr::U32 flags, const rdr::U32* lengths);

bool supportsQEMUKeyEvent;
bool supportsQEMUAudio;
bool supportsSetDesktopSize;
bool supportsFence;
bool supportsContinuousUpdates;

bool awaitsQEMUAudioFormatMsg;

private:

int width_;
Expand Down
1 change: 1 addition & 0 deletions common/rfb/encodings.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ namespace rfb {
const int pseudoEncodingContinuousUpdates = -313;
const int pseudoEncodingCursorWithAlpha = -314;
const int pseudoEncodingQEMUKeyEvent = -258;
const int pseudoEncodingQEMUAudio = -259;

// TightVNC-specific
const int pseudoEncodingLastRect = -224;
Expand Down
2 changes: 2 additions & 0 deletions common/rfb/msgTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ namespace rfb {

const int msgTypeServerFence = 248;

const int msgTypeQEMUServerMessage = 255;

// client to server

const int msgTypeSetPixelFormat = 0;
Expand Down
10 changes: 10 additions & 0 deletions common/rfb/qemuTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,15 @@
namespace rfb {
const int qemuExtendedKeyEvent = 0;
const int qemuAudio = 1;

// VNC client -> QEMU server audio message IDs
const int msgToQemuEnableAudio = 0;
const int msgToQemuDisableAudio = 1;
const int msgToQemuSetAudioFormat = 2;

// QEMU server -> VNC client audio message IDs
const int msgFromQemuAudioEnd = 0;
const int msgFromQemuAudioBegin = 1;
const int msgFromQemuAudioData = 2;
}
#endif
Loading