Feature(audio): forward client PCM to web viewers with continuous playback

2026-06-02 01:56:10 +02:00
parent da024fb3fb
commit 9aca587654
4 changed files with 505 additions and 5 deletions
--- a/client/ScreenManager.cpp
+++ b/client/ScreenManager.cpp
@@ -2609,7 +2609,8 @@ DWORD WINAPI CScreenManager::AudioThreadProc(LPVOID lpParam)
                }
 #endif
            }
-
+            if (pThis->m_pCaptureClient == nullptr)
+                break;
            pThis->m_pCaptureClient->ReleaseBuffer(numFramesAvailable);

            hr = pThis->m_pCaptureClient->GetNextPacketSize(&packetLength);
--- a/server/2015Remote/ScreenSpyDlg.cpp
+++ b/server/2015Remote/ScreenSpyDlg.cpp
@@ -18,6 +18,7 @@
 #include <md5.h>
 #include <cstdint>  // for uint16_t
 #include <vector>
+#include <mutex>    // for std::mutex, std::lock_guard
 #include "WebService.h"

 // 文件接收消息数据结构
@@ -3494,9 +3495,53 @@ void CScreenSpyDlg::StopAudioPlayback()
 #endif
    m_nAudioCompression = 0;

+    // 重置网页端音频格式标志（线程安全的清理）
+    {
+        std::lock_guard<std::mutex> lock(m_AudioWebMutex);
+        m_bAudioFormatSent = FALSE;
+        memset(&m_AudioFormatWeb, 0, sizeof(m_AudioFormatWeb));
+    }
+
    Mprintf("[ScreenSpy] 音频播放已停止\n");
 }

+// Turn audio off in response to a web command. Does nothing when audio is
+// already disabled.
+void CScreenSpyDlg::DisableAudio()
+{
+    if (!m_Settings.AudioEnabled)
+        return;
+
+    // Same steps as the IDM_AUDIO_TOGGLE path, restricted to "disable".
+    m_Settings.AudioEnabled = FALSE;
+    SendAudioCtrl(CYCLEAUDIO_DISABLE, 1);
+    StopAudioPlayback();
+
+    // Clear the web-side format state while holding the mutex.
+    {
+        std::lock_guard<std::mutex> webGuard(m_AudioWebMutex);
+        m_bAudioFormatSent = FALSE;
+        memset(&m_AudioFormatWeb, 0, sizeof(m_AudioFormatWeb));
+    }
+
+    Mprintf("[Audio Web] 禁用音频（来自 web 命令）\n");
+}
+
+// Turn audio on in response to a web command. Does nothing when audio is
+// already enabled.
+void CScreenSpyDlg::EnableAudio()
+{
+    if (m_Settings.AudioEnabled)
+        return;
+
+    // Same steps as the IDM_AUDIO_TOGGLE path, restricted to "enable".
+    m_Settings.AudioEnabled = TRUE;
+    SendAudioCtrl(CYCLEAUDIO_ENABLE, 1);
+
+    // Drop the cached web format so the format header is sent again.
+    {
+        std::lock_guard<std::mutex> webGuard(m_AudioWebMutex);
+        m_bAudioFormatSent = FALSE;
+        memset(&m_AudioFormatWeb, 0, sizeof(m_AudioFormatWeb));
+    }
+
+    Mprintf("[Audio Web] 启用音频（来自 web 命令）\n");
+}
+
 void CScreenSpyDlg::OnAudioData(BYTE* pData, UINT32 len)
 {
    if (len < 1) return;
@@ -3535,12 +3580,20 @@ void CScreenSpyDlg::OnAudioData(BYTE* pData, UINT32 len)
    UINT32 audioLen = len - offset;
    if (audioLen == 0) return;

+    // 保存"上线格式"字节（Opus 模式下是原始压缩包，PCM 模式下是原始 PCM）。
+    // 这就是要透传给 web 的数据 —— web 端用 MSE+WebM 直接播 Opus，
+    // 不需要服务器解码后再发 PCM。本地 waveOut 仍然需要 PCM，因此下面
+    // 还是会解码一遍。
+    BYTE* pWireData = pAudioData;
+    UINT32 wireLen = audioLen;
+    BYTE wireCompression = (BYTE)m_nAudioCompression;
+
    // 帧对齐参数
    DWORD blockAlign = m_AudioFormat.nBlockAlign;
    if (blockAlign == 0) blockAlign = 4;  // 默认 stereo 16-bit

 #if USING_OPUS
-    // Opus 解码
+    // Opus 解码（仅供本地 waveOut 使用；web 仍会收到原始压缩包）
    if (m_nAudioCompression == AUDIO_COMPRESS_OPUS && m_pOpusDecoder && m_pOpusDecodeBuffer) {
        COpusDecoder* pDecoder = (COpusDecoder*)m_pOpusDecoder;
        int decodedSamples = pDecoder->Decode(pAudioData, audioLen, m_pOpusDecodeBuffer, 960 * 2);
@@ -3583,10 +3636,104 @@ void CScreenSpyDlg::OnAudioData(BYTE* pData, UINT32 len)
        Mprintf("[Audio] 预缓冲完成，开始播放 (缓冲: %u bytes)\n", m_nRingDataLen);
    }

+    // 发送上线格式（Opus 压缩包 / 或原始 PCM）到网页
+    SendAudioToWeb(pWireData, wireLen, &m_AudioFormat, wireCompression);
+
    // 填充可用的 waveOut 缓冲区
    FeedAudioBuffers();
 }

+// Wrap one chunk of wire-format audio in a web frame and broadcast it to the
+// web clients.
+//
+// Frame layout (multi-byte fields little-endian):
+//   [DeviceID:4][FrameType:1 = 96][DataLen:4]
+//   [hasFormat:1]([AudioFormat][pad:1])?[audio bytes]
+// The AudioFormat + pad pair is present only when hasFormat == 1: on the first
+// chunk after the web format state was reset, and whenever channels, sample
+// rate, bit depth or compression differ from what was last sent.
+//
+// pAudioData/len : chunk to forward; nothing is sent when null or empty.
+// pFormat        : current stream format. Required whenever a format header is
+//                  due; the chunk is dropped if it is missing or out of range.
+// compression    : value stored in AudioFormat::compression.
+void CScreenSpyDlg::SendAudioToWeb(const BYTE* pAudioData, UINT32 len, const WAVEFORMATEX* pFormat, BYTE compression)
+{
+    if (!WebService().IsRunning()) return;
+    if (!pAudioData || len == 0) return;
+    if (!m_ContextObject) return;
+    if (!m_Settings.AudioEnabled) return;
+
+    AudioFormat fmt = {};
+    bool sendFormat = false;
+    {
+        std::lock_guard<std::mutex> lock(m_AudioWebMutex);
+
+        sendFormat = !m_bAudioFormatSent ||
+            (pFormat && (
+                pFormat->nChannels != m_AudioFormatWeb.channels ||
+                pFormat->nSamplesPerSec != m_AudioFormatWeb.sampleRate ||
+                pFormat->wBitsPerSample != m_AudioFormatWeb.bitsPerSample ||
+                compression != m_AudioFormatWeb.compression));
+
+        if (sendFormat) {
+            // A header is due but there is nothing to describe the stream with.
+            // Emitting hasFormat=1 without the format bytes would make the web
+            // parser read audio bytes as a header, so drop the chunk instead.
+            if (!pFormat) return;
+
+            if (pFormat->nChannels < 1 || pFormat->nChannels > 8 ||
+                pFormat->nSamplesPerSec < 8000 || pFormat->nSamplesPerSec > 48000 ||
+                pFormat->wBitsPerSample != 16) {
+                Mprintf("[Audio Web] Invalid format: ch=%d, sr=%d, bps=%d\n",
+                      (int)pFormat->nChannels, (int)pFormat->nSamplesPerSec, (int)pFormat->wBitsPerSample);
+                return;
+            }
+
+            // AudioFormat structure (commands.h); the web parser reads it as 12 bytes.
+            fmt.channels = (WORD)pFormat->nChannels;
+            fmt.sampleRate = (DWORD)pFormat->nSamplesPerSec;
+            fmt.bitsPerSample = (WORD)pFormat->wBitsPerSample;
+            // blockAlign is informational for Opus (variable-length packets);
+            // fill it in with the PCM-derived value.
+            fmt.blockAlign = (WORD)(fmt.channels * fmt.bitsPerSample / 8);
+            fmt.compression = compression;
+            fmt.reserved = 0;
+
+            m_AudioFormatWeb = fmt;
+            m_bAudioFormatSent = TRUE;
+
+            Mprintf("[Audio Web] Format sent: ch=%d, sr=%d Hz, compression=%d\n",
+                  (int)fmt.channels, (int)fmt.sampleRate, (int)fmt.compression);
+        }
+    }  // mutex released: everything needed from shared state is now in locals
+
+    const uint8_t frameType = 96;  // TOKEN_SCREEN_AUDIO, identifies audio on the web side
+    const uint64_t deviceID = GetClientID();
+    const size_t formatLen = sendFormat ? sizeof(fmt) + 1 : 0;  // struct + padding byte
+    const uint32_t payloadLen = (uint32_t)(1 + formatLen + len);
+
+    // Build the whole frame in a single buffer sized up front.
+    std::vector<BYTE> frame;
+    frame.reserve(4 + 1 + 4 + (size_t)payloadLen);
+
+    auto appendLE32 = [&frame](uint32_t value) {
+        for (int shift = 0; shift < 32; shift += 8)
+            frame.push_back((BYTE)((value >> shift) & 0xFF));
+    };
+
+    appendLE32((uint32_t)(deviceID & 0xFFFFFFFFu));  // [DeviceID:4] low 32 bits only
+    frame.push_back(frameType);                      // [FrameType:1]
+    appendLE32(payloadLen);                          // [DataLen:4]
+
+    // Payload byte 0: whether a format header follows.
+    frame.push_back(sendFormat ? 1 : 0);
+    if (sendFormat) {
+        const BYTE* pFmt = (const BYTE*)&fmt;
+        frame.insert(frame.end(), pFmt, pFmt + sizeof(fmt));
+        // Padding byte: keeps the audio data on an even offset (the web side
+        // needs Int16 alignment for PCM; irrelevant for Opus but kept for
+        // compatibility with the existing web parser).
+        frame.push_back(0);
+    }
+    frame.insert(frame.end(), pAudioData, pAudioData + len);
+
+    // Broadcast to all web clients.
+    WebService().BroadcastH264Frame(deviceID, frame.data(), frame.size());
+}
+
 void CScreenSpyDlg::FeedAudioBuffers()
 {
    if (!m_bAudioPlaying || !m_hWaveOut || !m_pRingBuf) return;
--- a/server/2015Remote/ScreenSpyDlg.h
+++ b/server/2015Remote/ScreenSpyDlg.h
@@ -9,6 +9,7 @@
 #include "2015RemoteDlg.h"

 #include "common/config.h"
+#include "common/commands.h"  // 包含 AudioFormat 定义

 extern "C"
 {
@@ -349,11 +350,22 @@ public:
    short*      m_pOpusDecodeBuffer = nullptr; // Opus 解码输出缓冲区
 #endif

+    // 网页端音频发送状态
+    BOOL        m_bAudioFormatSent = FALSE;  // 是否已发送格式信息到网页
+    AudioFormat m_AudioFormatWeb = {};       // 上次发送给网页的格式
+
+    // 音频到网页的多线程同步
+    std::mutex  m_AudioWebMutex;             // 保护音频发送状态的互斥锁
+    // 注意：m_Settings.AudioEnabled 是全局的音频启用/禁用状态
+
    void OnAudioData(BYTE* pData, UINT32 len);   // 处理音频数据
    BOOL InitAudioPlayback(const AudioFormat* fmt);  // 初始化音频播放
    void StopAudioPlayback();                    // 停止音频播放
+    void DisableAudio();                         // 禁用音频（从网页命令）
+    void EnableAudio();                          // 启用音频（从网页命令）
    void SendAudioCtrl(BYTE enable, BYTE persist); // 发送音频控制命令
    void FeedAudioBuffers();                     // 填充音频缓冲区
+    void SendAudioToWeb(const BYTE* pAudioData, UINT32 len, const WAVEFORMATEX* pFormat, BYTE compression);  // 发送音频到网页 (compression=AudioCompression)

    int  GetClientRTT();                     // 获取客户端RTT(ms)
    void EvaluateQuality();                  // 评估并调整质量
--- a/server/web/index.html
+++ b/server/web/index.html
@@ -1283,12 +1283,74 @@
    <script src="/static/xterm.js"></script>
    <script src="/static/xterm-fit.js"></script>

+    <!-- Opus codec for audio decompression -->
+    <script src="https://cdn.jsdelivr.net/npm/opus.js@0.5.0/dist/opus.js"></script>
+
    <script>
        let ws = null, token = null, decoder = null, devices = [], currentDevice = null;
        let frameCount = 0, lastFrameTime = 0, fps = 0, pingInterval = null;
        const canvas = document.getElementById('screen-canvas');
        const ctx = canvas.getContext('2d');

+        // ====== Audio & Video Implementation ======
+        //
+        // - Video: H.264 / AV1 → VideoDecoder Web API → canvas
+        // - Audio: client encodes PCM → Opus, server forwards raw Opus packets
+        //          to web, web wraps each packet in a WebM SimpleBlock and
+        //          feeds it to MediaSource → <audio> element (browser decodes
+        //          Opus natively, plays via standard media-element pipeline).
+        //
+        // WS binary frame layout (matches C++ ScreenSpyDlg.cpp):
+        //   Video : [deviceID:4][frameType:1][dataLen:4][videoData:N]
+        //   Audio : [deviceID:4][frameType=96:1][dataLen:4]
+        //           [hasFormat:1][AudioFormat:12][padding:1]?[opusPacket:N]
+        //   Term  : [magic:4='TRM1'][terminalData:N]
+        //
+        // AudioFormat (12 bytes, commands.h, pack(1)):
+        //   channels:2  sampleRate:4  bitsPerSample:2  blockAlign:2
+        //   compression:1 (0=PCM unsupported by web, 1=Opus)  reserved:1
+
+        // MSE + WebM/Opus playback. Raw Opus packets arrive over WS; we wrap
+        // each one in a minimal WebM container in JS and feed it to a
+        // SourceBuffer attached to a hidden <audio> element. The browser
+        // decodes Opus natively. Tested on desktop Chrome; mobile playback
+        // is a known follow-up (see commit notes).
+        let audioFormat = null;          // { compression, channels, sampleRate, bitsPerSample, blockAlign }
+        let audioEnabled = true;         // Audio on/off flag (set by UI)
+        let syncDrift = 0;               // A/V sync monitoring (milliseconds)
+        let _audioElement = null;        // hidden <audio> sink
+        let _mediaSource  = null;        // MediaSource attached to _audioElement
+        let _sourceBuffer = null;        // SourceBuffer (Opus in WebM)
+        const _sourceBufferQueue = [];   // appendBuffer queue (one in-flight at a time)
+        let _sourceBufferBusy = false;
+        let _initSegmentSent = false;    // first init segment appended for current format
+        let _opusTimestampMs = 0;        // running absolute cluster timestamp (ms)
+        const OPUS_FRAME_MS = 20;        // 960 samples @ 48k — matches client encoder
+        const _pendingOpusPackets = [];  // packets received before SourceBuffer is ready
+
+        // Browser autoplay policies require an HTMLAudioElement to be created
+        // and .play()'d synchronously inside a user-gesture event handler.
+        // We hook the first click/keydown to spin up the element + MediaSource.
+        // Subsequent activity (e.g. tab regaining focus) re-issues play().
+        function installAudioGestureUnlock() {
+            const onGesture = () => {
+                if (!_audioElement) {
+                    try {
+                        _setupAudioElementAndMediaSource();
+                        console.log('[MSE] <audio> + MediaSource set up by gesture');
+                    } catch (e) {
+                        console.error('[MSE] setup failed:', e && e.message);
+                    }
+                } else if (_audioElement.paused) {
+                    _audioElement.play().catch(() => {});
+                }
+            };
+            const opts = { passive: true, capture: true };
+            window.addEventListener('click',   onGesture, opts);
+            window.addEventListener('keydown', onGesture, opts);
+        }
+        installAudioGestureUnlock();
+
        // Pagination and filter state
        let currentPage = 1;
        let viewMode = 'grid';  // 'grid' or 'list'
@@ -1409,7 +1471,7 @@
                    }
                }
            };
-            ws.onclose = () => { stopPingInterval(); updateWsStatus('disconnected'); scheduleReconnect(); };
+            ws.onclose = () => { stopPingInterval(); updateWsStatus('disconnected'); stopAllAudio(); audioFormat = null; scheduleReconnect(); };
            ws.onerror = (e) => console.error('WS error:', e);
            ws.onmessage = (event) => {
                if (typeof event.data === 'string') handleSignaling(JSON.parse(event.data));
@@ -1649,16 +1711,294 @@
            return videoBytes[0] === 0x00 ? 'avc' : 'av1';
        }

+        // ============================================================
+        // Minimal WebM-Opus muxer: wraps each Opus packet in a one-block
+        // Cluster so it can be fed to a SourceBuffer of type
+        // 'audio/webm; codecs="opus"'. The init segment (EBML header +
+        // Segment header + Tracks with OpusHead) is built once when the
+        // format is known and appended before any media clusters.
+        // ============================================================
+        // Exposes two builders, both returning a Uint8Array:
+        //   buildInitSegment(sampleRate, channels) - EBML header + open Segment + Info + Tracks
+        //   buildCluster(opusBytes, absMs)         - one Cluster holding a single SimpleBlock
+        // All intermediate values are plain arrays of byte values.
+        const WebMMuxer = (function () {
+            // Variable-length integer (EBML VINT). Marker bit selects byte count.
+            // The comparisons are strict, so the largest value of each width
+            // (0x7F, 0x3FFF, ...) is pushed to the next wider encoding; values
+            // from 0x0FFFFFFF upward always use the 8-byte form.
+            function vint(value) {
+                if (value < 0x7F)             return [0x80 |  value];
+                if (value < 0x3FFF)           return [0x40 | (value >> 8), value & 0xFF];
+                if (value < 0x1FFFFF)         return [0x20 | (value >> 16), (value >> 8) & 0xFF, value & 0xFF];
+                if (value < 0x0FFFFFFF)       return [0x10 | (value >> 24), (value >> 16) & 0xFF, (value >> 8) & 0xFF, value & 0xFF];
+                // 8-byte VINT for larger values (we don't usually need this)
+                const out = [0x01];
+                for (let i = 6; i >= 0; i--) out.push(Math.floor(value / Math.pow(2, i * 8)) & 0xFF);
+                return out;
+            }
+            // Unsigned int big-endian, n bytes
+            // (only the low n bytes of value are kept; the caller picks n large enough)
+            function uintBE(value, n) {
+                const out = new Array(n);
+                for (let i = n - 1; i >= 0; i--) { out[i] = value & 0xFF; value = Math.floor(value / 256); }
+                return out;
+            }
+            // 64-bit float big-endian
+            function f64BE(value) {
+                const buf = new ArrayBuffer(8);
+                new DataView(buf).setFloat64(0, value, false);
+                return Array.from(new Uint8Array(buf));
+            }
+            // EBML element = ID + size(VINT) + payload
+            // idBytes and payload are byte arrays; the result is a new byte array.
+            function elem(idBytes, payload) {
+                const sz = vint(payload.length);
+                const out = new Array(idBytes.length + sz.length + payload.length);
+                let i = 0;
+                for (const b of idBytes) out[i++] = b;
+                for (const b of sz)       out[i++] = b;
+                for (const b of payload)  out[i++] = b;
+                return out;
+            }
+            // OpusHead codec-private structure (19 bytes). Per WebM/Opus spec,
+            // the authoritative encoder delay is CodecDelay (in ns) in the
+            // TrackEntry; pre-skip here is left at 0 to avoid double-skipping.
+            // The sample rate is written little-endian, unlike the EBML fields.
+            function opusHead(sampleRate, channels) {
+                return [
+                    0x4F, 0x70, 0x75, 0x73, 0x48, 0x65, 0x61, 0x64,  // "OpusHead"
+                    0x01,                                             // version
+                    channels & 0xFF,                                  // channel count
+                    0x00, 0x00,                                       // pre-skip (use CodecDelay instead)
+                    sampleRate & 0xFF, (sampleRate >> 8) & 0xFF,
+                    (sampleRate >> 16) & 0xFF, (sampleRate >> 24) & 0xFF,
+                    0x00, 0x00,                                       // output gain (LE)
+                    0x00                                              // channel mapping family
+                ];
+            }
+            // Build the bytes appended once per SourceBuffer, before any cluster:
+            // EBML header, Segment header (unknown size), Info, and a Tracks
+            // element with a single Opus audio track (track number 1).
+            function buildInitSegment(sampleRate, channels) {
+                const ebml = elem([0x1A, 0x45, 0xDF, 0xA3], [].concat(
+                    elem([0x42, 0x86], [0x01]),                          // EBMLVersion
+                    elem([0x42, 0xF7], [0x01]),                          // EBMLReadVersion
+                    elem([0x42, 0xF2], [0x04]),                          // EBMLMaxIDLength
+                    elem([0x42, 0xF3], [0x08]),                          // EBMLMaxSizeLength
+                    elem([0x42, 0x82], [0x77, 0x65, 0x62, 0x6D]),        // DocType "webm"
+                    elem([0x42, 0x87], [0x04]),                          // DocTypeVersion
+                    elem([0x42, 0x85], [0x02])                           // DocTypeReadVersion
+                ));
+                const info = elem([0x15, 0x49, 0xA9, 0x66], [].concat(
+                    elem([0x2A, 0xD7, 0xB1], uintBE(1000000, 3)),        // TimecodeScale 1ms
+                    elem([0x4D, 0x80], [0x59, 0x61, 0x6D, 0x61]),        // MuxingApp "Yama"
+                    elem([0x57, 0x41], [0x59, 0x61, 0x6D, 0x61])         // WritingApp "Yama"
+                ));
+                const trackEntry = [].concat(
+                    elem([0xD7], [0x01]),                                // TrackNumber 1
+                    elem([0x73, 0xC5], uintBE(1, 1)),                    // TrackUID 1
+                    elem([0x83], [0x02]),                                // TrackType 2 (audio)
+                    elem([0xB9], [0x01]),                                // FlagEnabled
+                    elem([0x88], [0x01]),                                // FlagDefault
+                    elem([0x9C], [0x00]),                                // FlagLacing 0
+                    elem([0x86], [0x41, 0x5F, 0x4F, 0x50, 0x55, 0x53]),  // CodecID "A_OPUS"
+                    elem([0x63, 0xA2], opusHead(sampleRate, channels)),  // CodecPrivate
+                    elem([0x56, 0xAA], uintBE(6500000, 3)),              // CodecDelay 6.5ms (ns)
+                    elem([0x56, 0xBB], uintBE(80000000, 4)),             // SeekPreRoll 80ms (ns)
+                    elem([0xE1], [].concat(                              // Audio
+                        elem([0xB5], f64BE(sampleRate)),                 //   SamplingFrequency
+                        elem([0x9F], [channels & 0xFF])                  //   Channels
+                    ))
+                );
+                const tracks = elem([0x16, 0x54, 0xAE, 0x6B], elem([0xAE], trackEntry));
+                // Segment uses unknown-size signal so we can stream clusters indefinitely
+                const segmentOpen = [0x18, 0x53, 0x80, 0x67,
+                                     0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF];
+                return new Uint8Array([].concat(ebml, segmentOpen, info, tracks));
+            }
+            // Build one Cluster containing a single SimpleBlock for track 1.
+            // The block's relative timestamp is fixed at 0, so the cluster
+            // Timestamp (absMs, written as 4 bytes) is the block's time in ms.
+            function buildCluster(opusBytes, absMs) {
+                const simpleBlock = elem([0xA3], [].concat(
+                    [0x81, 0x00, 0x00, 0x80],   // TrackNumber=1, ts=0, flags=keyframe
+                    Array.from(opusBytes)
+                ));
+                const cluster = elem([0x1F, 0x43, 0xB6, 0x75], [].concat(
+                    elem([0xE7], uintBE(absMs, 4)),     // Timestamp (absolute, ms)
+                    simpleBlock
+                ));
+                return new Uint8Array(cluster);
+            }
+            return { buildInitSegment, buildCluster };
+        })();
+
+        // Create the hidden <audio> + MediaSource pair INSIDE a user-gesture
+        // call stack. Must complete .play() synchronously before any await.
+        function _setupAudioElementAndMediaSource() {
+            _audioElement = document.createElement('audio');
+            _audioElement.autoplay = true;
+            _audioElement.volume = 1.0;
+            _audioElement.style.display = 'none';
+            document.body.appendChild(_audioElement);
+            _mediaSource = new MediaSource();
+            _mediaSource.addEventListener('sourceopen', _onSourceOpen);
+            _audioElement.src = URL.createObjectURL(_mediaSource);
+            _audioElement.play().then(
+                () => console.log('[MSE] audio.play() ok'),
+                e  => console.error('[MSE] audio.play() rejected:', e && e.message)
+            );
+        }
+
+        function _onSourceOpen() {
+            console.log('[MSE] sourceopen, readyState=' + (_mediaSource && _mediaSource.readyState));
+            if (audioFormat && audioFormat.compression === 1) {
+                _addSourceBufferAndInit();
+            }
+        }
+
+        function _addSourceBufferAndInit() {
+            if (!_mediaSource || _mediaSource.readyState !== 'open' || _sourceBuffer) return;
+            const mime = 'audio/webm; codecs="opus"';
+            if (!window.MediaSource || !MediaSource.isTypeSupported(mime)) {
+                console.error('[MSE] ' + mime + ' not supported by this browser');
+                return;
+            }
+            try {
+                _sourceBuffer = _mediaSource.addSourceBuffer(mime);
+            } catch (e) {
+                console.error('[MSE] addSourceBuffer failed:', e && e.message);
+                return;
+            }
+            _sourceBuffer.addEventListener('updateend', () => {
+                _sourceBufferBusy = false;
+                _flushSourceBufferQueue();
+            });
+            _sourceBuffer.addEventListener('error', e => console.error('[MSE] sourceBuffer error', e));
+            // Init segment first
+            _enqueueAppend(WebMMuxer.buildInitSegment(audioFormat.sampleRate, audioFormat.channels));
+            _initSegmentSent = true;
+            _opusTimestampMs = 0;
+            // Flush packets that arrived before SourceBuffer was ready
+            while (_pendingOpusPackets.length > 0) {
+                const pkt = _pendingOpusPackets.shift();
+                _enqueueAppend(WebMMuxer.buildCluster(pkt, _opusTimestampMs));
+                _opusTimestampMs += OPUS_FRAME_MS;
+            }
+            console.log('[MSE] SourceBuffer ready, init segment + ' +
+                        (_opusTimestampMs / OPUS_FRAME_MS) + ' queued packets appended');
+        }
+
+        function _enqueueAppend(data) {
+            _sourceBufferQueue.push(data);
+            _flushSourceBufferQueue();
+        }
+        function _flushSourceBufferQueue() {
+            if (!_sourceBuffer || _sourceBufferBusy) return;
+            if (_sourceBufferQueue.length === 0) return;
+            const next = _sourceBufferQueue.shift();
+            _sourceBufferBusy = true;
+            try {
+                _sourceBuffer.appendBuffer(next);
+            } catch (e) {
+                console.error('[MSE] appendBuffer threw:', e && e.message);
+                _sourceBufferBusy = false;
+            }
+        }
+
+        function pushOpusPacket(opusBytes) {
+            if (!audioFormat || audioFormat.compression !== 1) return;
+            if (_sourceBuffer && _initSegmentSent) {
+                _enqueueAppend(WebMMuxer.buildCluster(opusBytes, _opusTimestampMs));
+                _opusTimestampMs += OPUS_FRAME_MS;
+            } else {
+                // Stash until SourceBuffer is ready. Cap at ~3s of audio.
+                const maxQueued = Math.ceil(3000 / OPUS_FRAME_MS);
+                while (_pendingOpusPackets.length >= maxQueued) _pendingOpusPackets.shift();
+                _pendingOpusPackets.push(new Uint8Array(opusBytes));
+            }
+        }
+
+        // Remove the SourceBuffer (so a new format/codec can be set up) but
+        // KEEP the same MediaSource and <audio> element. They hold our
+        // gesture-acquired play() permission — recreating either would
+        // require a fresh user tap on iOS. Never call endOfStream(), that
+        // transitions MediaSource to 'ended' which forbids future
+        // addSourceBuffer().
+        function stopAllAudio() {
+            if (_sourceBuffer && _mediaSource && _mediaSource.readyState === 'open') {
+                try { _mediaSource.removeSourceBuffer(_sourceBuffer); } catch (e) {}
+            }
+            _sourceBuffer = null;
+            _sourceBufferQueue.length = 0;
+            _sourceBufferBusy = false;
+            _initSegmentSent = false;
+            _opusTimestampMs = 0;
+            _pendingOpusPackets.length = 0;
+        }
+
+        function handleAudioFrame(data) {
+            if (!audioEnabled) return;
+
+            const u8 = new Uint8Array(data);
+            if (u8.length < 1) return;
+
+            let offset = 0;
+            const hasFormat = u8[offset++];
+
+            if (hasFormat) {
+                if (u8.length < offset + 12) {
+                    console.warn('[Audio] truncated format header');
+                    return;
+                }
+                // AudioFormat (12 bytes, commands.h, pack(1))
+                const view = new DataView(data, offset, 12);
+                const channels      = view.getUint16(0, true);
+                const sampleRate    = view.getUint32(2, true);
+                const bitsPerSample = view.getUint16(6, true);
+                const blockAlign    = view.getUint16(8, true);
+                const compression   = view.getUint8(10);
+                offset += 12;
+                offset += 1;  // padding byte
+
+                if (channels === 0 || channels > 8) { console.error('[Audio] bad channels:', channels); return; }
+                if (sampleRate < 8000 || sampleRate > 48000) { console.error('[Audio] bad sampleRate:', sampleRate); return; }
+
+                const fmt = { compression, channels, sampleRate, bitsPerSample, blockAlign };
+                const needReinit = !audioFormat ||
+                    audioFormat.sampleRate !== fmt.sampleRate ||
+                    audioFormat.channels   !== fmt.channels   ||
+                    audioFormat.compression !== fmt.compression;
+                audioFormat = fmt;
+
+                if (needReinit) {
+                    if (fmt.compression !== 1) {
+                        console.error('[Audio] PCM payload not supported by web; set USING_OPUS=1 on client');
+                        stopAllAudio();
+                        return;
+                    }
+                    stopAllAudio();
+                    if (_mediaSource && _mediaSource.readyState === 'open') {
+                        _addSourceBufferAndInit();
+                    }
+                    // else: sourceopen handler will pick up audioFormat when it fires
+                    console.log('[Audio] Format → ch=' + fmt.channels +
+                                ' sr=' + fmt.sampleRate + ' compression=' + fmt.compression);
+                }
+            }
+
+            if (!audioFormat || audioFormat.compression !== 1) return;
+            if (u8.length <= offset) return;
+
+            // The remaining bytes are one Opus packet (variable length).
+            const opusBytes = new Uint8Array(data, offset);
+            pushOpusPacket(opusBytes);
+        }
+
        function handleBinaryFrame(data) {
            // 终端输出帧：4 字节 magic 'TRM1' (0x54 0x52 0x4D 0x31) → 转发到 xterm。
-            // 视频帧首 4 字节是 deviceID (uint32 LE)，撞这个具体值的概率极低；4 字节 magic
-            // 比单字节前缀安全得多，无需额外的状态校验。
            const u8 = new Uint8Array(data);
            if (u8.length >= 4 &&
                u8[0] === 0x54 && u8[1] === 0x52 && u8[2] === 0x4D && u8[3] === 0x31) {
                if (termState && termState.term) termState.term.write(u8.subarray(4));
                return;
            }
+
+            // Audio frame: frameType byte at offset 4 indicates audio (96 = TOKEN_SCREEN_AUDIO)
+            // Full frame format: [deviceID:4][frameType:1][dataLen:4][hasFormat:1]([AudioFormat:12][padding:1])?[audio_data...]
+            if (u8.length > 4 && u8[4] === 96) {
+                // Skip frame header (9 bytes) and pass audio payload to handler
+                const audioPayload = data.slice(9);
+                handleAudioFrame(audioPayload);
+                return;
+            }
+
+            // Video frame: [deviceID:4][frameType:1][dataLen:4][videoData...]
            const view = new DataView(data);
            const deviceId = view.getUint32(0, true);
            const frameType = view.getUint8(4);