Feature(audio): forward client PCM to web viewers with continuous playback

2026-06-02 01:56:10 +02:00
parent da024fb3fb
commit 9aca587654
4 changed files with 505 additions and 5 deletions
--- a/client/ScreenManager.cpp
+++ b/client/ScreenManager.cpp
@@ -2609,7 +2609,8 @@ DWORD WINAPI CScreenManager::AudioThreadProc(LPVOID lpParam)
                }
 #endif
            }
-
+            if (pThis->m_pCaptureClient == nullptr)
                break;
            pThis->m_pCaptureClient->ReleaseBuffer(numFramesAvailable);
            hr = pThis->m_pCaptureClient->GetNextPacketSize(&packetLength);
--- a/server/2015Remote/ScreenSpyDlg.cpp
+++ b/server/2015Remote/ScreenSpyDlg.cpp
@@ -18,6 +18,7 @@
 #include <md5.h>
 #include <cstdint>  // for uint16_t
 #include <vector>
 #include <mutex>    // for std::mutex, std::lock_guard
 #include "WebService.h"
 // 文件接收消息数据结构
@@ -3494,9 +3495,53 @@ void CScreenSpyDlg::StopAudioPlayback()
 #endif
    m_nAudioCompression = 0;
    // 重置网页端音频格式标志（线程安全的清理）
    {
        std::lock_guard<std::mutex> lock(m_AudioWebMutex);
        m_bAudioFormatSent = FALSE;
        memset(&m_AudioFormatWeb, 0, sizeof(m_AudioFormatWeb));
    }
    Mprintf("[ScreenSpy] 音频播放已停止\n");
 }
 // Turns audio off in response to a web command.
 // Follows the IDM_AUDIO_TOGGLE logic but only ever disables; it is a no-op
 // when audio is already disabled.
 void CScreenSpyDlg::DisableAudio()
 {
    if (!m_Settings.AudioEnabled) {
        return;  // already off
    }

    m_Settings.AudioEnabled = FALSE;
    SendAudioCtrl(CYCLEAUDIO_DISABLE, 1);
    StopAudioPlayback();

    // Forget which format the web viewers were last told about, so the next
    // stream starts by sending a fresh format header. Done under the mutex
    // that guards the web-audio state.
    {
        std::lock_guard<std::mutex> webStateGuard(m_AudioWebMutex);
        memset(&m_AudioFormatWeb, 0, sizeof(m_AudioFormatWeb));
        m_bAudioFormatSent = FALSE;
    }

    Mprintf("[Audio Web] 禁用音频（来自 web 命令）\n");
 }
 // Turns audio on in response to a web command.
 // Follows the IDM_AUDIO_TOGGLE logic but only ever enables; it is a no-op
 // when audio is already enabled.
 void CScreenSpyDlg::EnableAudio()
 {
    if (m_Settings.AudioEnabled) {
        return;  // already on
    }

    m_Settings.AudioEnabled = TRUE;
    SendAudioCtrl(CYCLEAUDIO_ENABLE, 1);

    // Drop the cached web format so the format header is sent again with
    // the next audio packet.
    {
        std::lock_guard<std::mutex> webStateGuard(m_AudioWebMutex);
        memset(&m_AudioFormatWeb, 0, sizeof(m_AudioFormatWeb));
        m_bAudioFormatSent = FALSE;
    }

    Mprintf("[Audio Web] 启用音频（来自 web 命令）\n");
 }
 void CScreenSpyDlg::OnAudioData(BYTE* pData, UINT32 len)
 {
    if (len < 1) return;
@@ -3535,12 +3580,20 @@ void CScreenSpyDlg::OnAudioData(BYTE* pData, UINT32 len)
    UINT32 audioLen = len - offset;
    if (audioLen == 0) return;
    // 保存"上线格式"字节（Opus 模式下是原始压缩包，PCM 模式下是原始 PCM）。
    // 这就是要透传给 web 的数据 —— web 端用 MSE+WebM 直接播 Opus，
    // 不需要服务器解码后再发 PCM。本地 waveOut 仍然需要 PCM，因此下面
    // 还是会解码一遍。
    BYTE* pWireData = pAudioData;
    UINT32 wireLen = audioLen;
    BYTE wireCompression = (BYTE)m_nAudioCompression;
    // 帧对齐参数
    DWORD blockAlign = m_AudioFormat.nBlockAlign;
    if (blockAlign == 0) blockAlign = 4;  // 默认 stereo 16-bit
 #if USING_OPUS
-    // Opus 解码
+    // Opus 解码（仅供本地 waveOut 使用；web 仍会收到原始压缩包）
    if (m_nAudioCompression == AUDIO_COMPRESS_OPUS && m_pOpusDecoder && m_pOpusDecodeBuffer) {
        COpusDecoder* pDecoder = (COpusDecoder*)m_pOpusDecoder;
        int decodedSamples = pDecoder->Decode(pAudioData, audioLen, m_pOpusDecodeBuffer, 960 * 2);
@@ -3583,10 +3636,104 @@ void CScreenSpyDlg::OnAudioData(BYTE* pData, UINT32 len)
        Mprintf("[Audio] 预缓冲完成，开始播放 (缓冲: %u bytes)\n", m_nRingDataLen);
    }
    // 发送上线格式（Opus 压缩包 / 或原始 PCM）到网页
    SendAudioToWeb(pWireData, wireLen, &m_AudioFormat, wireCompression);
    // 填充可用的 waveOut 缓冲区
    FeedAudioBuffers();
 }
 void CScreenSpyDlg::SendAudioToWeb(const BYTE* pAudioData, UINT32 len, const WAVEFORMATEX* pFormat, BYTE compression)
 {
    if (!WebService().IsRunning()) return;
    if (!pAudioData || len == 0) return;
    if (!m_ContextObject) return;
    if (!m_Settings.AudioEnabled) return;
    std::vector<BYTE> packet;
    BOOL formatChanged = FALSE;
    {
        std::lock_guard<std::mutex> lock(m_AudioWebMutex);
        if (!m_bAudioFormatSent) {
            formatChanged = TRUE;
        } else if (pFormat && (
            pFormat->nChannels != m_AudioFormatWeb.channels ||
            pFormat->nSamplesPerSec != m_AudioFormatWeb.sampleRate ||
            pFormat->wBitsPerSample != m_AudioFormatWeb.bitsPerSample ||
            compression != m_AudioFormatWeb.compression)) {
            formatChanged = TRUE;
        }
        // 第1字节：是否包含格式信息
        packet.push_back(formatChanged ? 1 : 0);
        if (formatChanged && pFormat) {
            if (pFormat->nChannels < 1 || pFormat->nChannels > 8 ||
                pFormat->nSamplesPerSec < 8000 || pFormat->nSamplesPerSec > 48000 ||
                pFormat->wBitsPerSample != 16) {
                Mprintf("[Audio Web] Invalid format: ch=%d, sr=%d, bps=%d\n",
                      pFormat->nChannels, pFormat->nSamplesPerSec, pFormat->wBitsPerSample);
                return;
            }
            // 12-byte AudioFormat 结构（commands.h, pack(1)）
            AudioFormat fmt;
            fmt.channels = (WORD)pFormat->nChannels;
            fmt.sampleRate = (DWORD)pFormat->nSamplesPerSec;
            fmt.bitsPerSample = (WORD)pFormat->wBitsPerSample;
            // blockAlign 对 Opus 是 informational 的（包是变长压缩），按 PCM 推算填上即可。
            fmt.blockAlign = (WORD)(fmt.channels * fmt.bitsPerSample / 8);
            fmt.compression = compression;
            fmt.reserved = 0;
            BYTE* pFmt = (BYTE*)&fmt;
            packet.insert(packet.end(), pFmt, pFmt + sizeof(fmt));
            // padding byte: 保持后续音频数据落在偶数偏移上（PCM 模式下 web 端
            // 需要 Int16 对齐；Opus 模式无所谓但保留兼容旧 web 解析）
            packet.push_back(0);
            m_AudioFormatWeb = fmt;
            m_bAudioFormatSent = TRUE;
            Mprintf("[Audio Web] Format sent: ch=%d, sr=%d Hz, compression=%d\n",
                  fmt.channels, fmt.sampleRate, fmt.compression);
        }
    }  // 释放 mutex
    // 添加音频数据（此操作不需要 mutex，因为我们已经复制了所有需要的共享状态）
    packet.insert(packet.end(), pAudioData, pAudioData + len);
    // 构造完整帧：[DeviceID:4][FrameType:1][DataLen:4][audio payload...]
    // FrameType: 96 = TOKEN_SCREEN_AUDIO，用于在网页端识别音频
    std::vector<BYTE> frame;
    uint64_t deviceID = GetClientID();
    uint32_t audioDataLen = (uint32_t)packet.size();
    uint8_t frameType = 96;  // TOKEN_SCREEN_AUDIO
    // [DeviceID:4] little-endian
    frame.push_back((BYTE)(deviceID & 0xFF));
    frame.push_back((BYTE)((deviceID >> 8) & 0xFF));
    frame.push_back((BYTE)((deviceID >> 16) & 0xFF));
    frame.push_back((BYTE)((deviceID >> 24) & 0xFF));
    // [FrameType:1]
    frame.push_back(frameType);
    // [DataLen:4] little-endian
    frame.push_back((BYTE)(audioDataLen & 0xFF));
    frame.push_back((BYTE)((audioDataLen >> 8) & 0xFF));
    frame.push_back((BYTE)((audioDataLen >> 16) & 0xFF));
    frame.push_back((BYTE)((audioDataLen >> 24) & 0xFF));
    // [audio payload]
    frame.insert(frame.end(), packet.begin(), packet.end());
    // 广播到所有网页客户端
    WebService().BroadcastH264Frame(deviceID, frame.data(), frame.size());
 }
 void CScreenSpyDlg::FeedAudioBuffers()
 {
    if (!m_bAudioPlaying || !m_hWaveOut || !m_pRingBuf) return;
--- a/server/2015Remote/ScreenSpyDlg.h
+++ b/server/2015Remote/ScreenSpyDlg.h
@@ -9,6 +9,7 @@
 #include "2015RemoteDlg.h"
 #include "common/config.h"
 #include "common/commands.h"  // 包含 AudioFormat 定义
 extern "C"
 {
@@ -349,11 +350,22 @@ public:
    short*      m_pOpusDecodeBuffer = nullptr; // Opus 解码输出缓冲区
 #endif
    // State for forwarding audio to web viewers
    BOOL        m_bAudioFormatSent = FALSE;  // whether the format header has already been sent to the web side
    AudioFormat m_AudioFormatWeb = {};       // format most recently sent to the web side
    // Synchronization for the audio-to-web path
    std::mutex  m_AudioWebMutex;             // guards m_bAudioFormatSent and m_AudioFormatWeb
    // Note: m_Settings.AudioEnabled is the global audio enabled/disabled state
    void OnAudioData(BYTE* pData, UINT32 len);   // handle an incoming audio packet
    BOOL InitAudioPlayback(const AudioFormat* fmt);  // initialize audio playback
    void StopAudioPlayback();                    // stop audio playback
    void DisableAudio();                         // disable audio (triggered by a web command)
    void EnableAudio();                          // enable audio (triggered by a web command)
    void SendAudioCtrl(BYTE enable, BYTE persist); // send an audio control command
    void FeedAudioBuffers();                     // fill the audio output buffers
    void SendAudioToWeb(const BYTE* pAudioData, UINT32 len, const WAVEFORMATEX* pFormat, BYTE compression);  // forward audio to web viewers (compression = AudioCompression value)
    int  GetClientRTT();                     // 获取客户端RTT(ms)
    void EvaluateQuality();                  // 评估并调整质量
--- a/server/web/index.html
+++ b/server/web/index.html
@@ -1283,12 +1283,74 @@
    <script src="/static/xterm.js"></script>
    <script src="/static/xterm-fit.js"></script>
    <!-- Opus codec for audio decompression -->
    <script src="https://cdn.jsdelivr.net/npm/opus.js@0.5.0/dist/opus.js"></script>
    <script>
        let ws = null, token = null, decoder = null, devices = [], currentDevice = null;
        let frameCount = 0, lastFrameTime = 0, fps = 0, pingInterval = null;
        const canvas = document.getElementById('screen-canvas');
        const ctx = canvas.getContext('2d');
        // ====== Audio & Video Implementation ======
        //
        // - Video: H.264 / AV1 → VideoDecoder Web API → canvas
        // - Audio: client encodes PCM → Opus, server forwards raw Opus packets
        //          to web, web wraps each packet in a WebM SimpleBlock and
        //          feeds it to MediaSource → <audio> element (browser decodes
        //          Opus natively, plays via standard media-element pipeline).
        //
        // WS binary frame layout (matches C++ ScreenSpyDlg.cpp):
        //   Video : [deviceID:4][frameType:1][dataLen:4][videoData:N]
        //   Audio : [deviceID:4][frameType=96:1][dataLen:4]
        //           [hasFormat:1][AudioFormat:12][padding:1]?[opusPacket:N]
        //   Term  : [magic:4='TRM1'][terminalData:N]
        //
        // AudioFormat (12 bytes, commands.h, pack(1)):
        //   channels:2  sampleRate:4  bitsPerSample:2  blockAlign:2
        //   compression:1 (0=PCM unsupported by web, 1=Opus)  reserved:1
        // MSE + WebM/Opus playback. Raw Opus packets arrive over WS; we wrap
        // each one in a minimal WebM container in JS and feed it to a
        // SourceBuffer attached to a hidden <audio> element. The browser
        // decodes Opus natively. Tested on desktop Chrome; mobile playback
        // is a known follow-up (see commit notes).
        // ---- Audio playback state (module-level, shared by the MSE helpers) ----
        let audioFormat = null;          // last format received: { compression, channels, sampleRate, bitsPerSample, blockAlign }
        let audioEnabled = true;         // Audio on/off flag (set by UI); handleAudioFrame ignores frames while false
        let syncDrift = 0;               // A/V sync monitoring (milliseconds) — NOTE(review): not referenced by the audio code visible here
        let _audioElement = null;        // hidden <audio> sink, created on the first user gesture
        let _mediaSource  = null;        // MediaSource attached to _audioElement
        let _sourceBuffer = null;        // SourceBuffer (Opus in WebM); null until a format is known and the MediaSource is open
        const _sourceBufferQueue = [];   // appendBuffer queue (one in-flight at a time)
        // true from an appendBuffer() call until its 'updateend' event
        let _sourceBufferBusy = false;
        let _initSegmentSent = false;    // first init segment appended for current format
        let _opusTimestampMs = 0;        // running absolute cluster timestamp (ms), advanced by OPUS_FRAME_MS per packet
        const OPUS_FRAME_MS = 20;        // 960 samples @ 48k — matches client encoder
        const _pendingOpusPackets = [];  // packets received before SourceBuffer is ready (capped in pushOpusPacket)
        // Autoplay policies require the HTMLAudioElement to be created and
        // .play()'d synchronously inside a user-gesture handler. This hooks
        // click and keydown on window (capture phase):
        //   - first gesture: build the <audio> element + MediaSource;
        //   - later gestures: call play() again if the element is paused.
        function installAudioGestureUnlock() {
            function handleGesture() {
                if (_audioElement) {
                    if (_audioElement.paused) {
                        _audioElement.play().catch(() => {});
                    }
                    return;
                }
                try {
                    _setupAudioElementAndMediaSource();
                    console.log('[MSE] <audio> + MediaSource set up by gesture');
                } catch (err) {
                    console.error('[MSE] setup failed:', err && err.message);
                }
            }

            const listenerOptions = { passive: true, capture: true };
            for (const eventName of ['click', 'keydown']) {
                window.addEventListener(eventName, handleGesture, listenerOptions);
            }
        }
        installAudioGestureUnlock();
        // Pagination and filter state
        let currentPage = 1;
        let viewMode = 'grid';  // 'grid' or 'list'
@@ -1409,7 +1471,7 @@
                    }
                }
            };
-            ws.onclose = () => { stopPingInterval(); updateWsStatus('disconnected'); scheduleReconnect(); };
+            ws.onclose = () => { stopPingInterval(); updateWsStatus('disconnected'); stopAllAudio(); audioFormat = null; scheduleReconnect(); };
            ws.onerror = (e) => console.error('WS error:', e);
            ws.onmessage = (event) => {
                if (typeof event.data === 'string') handleSignaling(JSON.parse(event.data));
@@ -1649,16 +1711,294 @@
            return videoBytes[0] === 0x00 ? 'avc' : 'av1';
        }
        // ============================================================
        // Minimal WebM/Opus muxer for a SourceBuffer of type
        // 'audio/webm; codecs="opus"'.
        //   buildInitSegment(sampleRate, channels) -> Uint8Array
        //       EBML header + unknown-size Segment header + Info + Tracks
        //       (one audio track carrying an OpusHead). Append once per
        //       format, before any cluster.
        //   buildCluster(opusBytes, absMs) -> Uint8Array
        //       One Cluster holding one SimpleBlock with one Opus packet,
        //       timestamped absMs (TimecodeScale is 1 ms).
        // ============================================================
        const WebMMuxer = (function () {
            // ASCII string -> array of byte values
            const ascii = text => Array.from(text, ch => ch.charCodeAt(0));

            // Unsigned integer, big-endian, exactly n bytes
            function uintBE(value, n) {
                const bytes = [];
                let rest = value;
                for (let k = 0; k < n; k++) {
                    bytes.unshift(rest & 0xFF);
                    rest = Math.floor(rest / 256);
                }
                return bytes;
            }

            // EBML variable-length integer. The position of the leading
            // marker bit encodes how many bytes follow.
            function vint(value) {
                // [exclusive upper bound, encoded width in bytes]
                const forms = [[0x7F, 1], [0x3FFF, 2], [0x1FFFFF, 3], [0x0FFFFFFF, 4]];
                for (const [limit, width] of forms) {
                    if (value < limit) {
                        const topShift = 8 * (width - 1);
                        const encoded = [(0x80 >> (width - 1)) | (value >> topShift)];
                        for (let shift = topShift - 8; shift >= 0; shift -= 8) {
                            encoded.push((value >> shift) & 0xFF);
                        }
                        return encoded;
                    }
                }
                // Anything larger: 8-byte form (marker byte + 7 value bytes)
                const wide = [0x01];
                for (let i = 6; i >= 0; i--) {
                    wide.push(Math.floor(value / Math.pow(2, i * 8)) & 0xFF);
                }
                return wide;
            }

            // IEEE-754 double, big-endian
            function f64BE(value) {
                const view = new DataView(new ArrayBuffer(8));
                view.setFloat64(0, value, false);
                return Array.from({ length: 8 }, (_, i) => view.getUint8(i));
            }

            // EBML element: ID bytes, then payload size as a VINT, then payload
            function elem(idBytes, payload) {
                return [...idBytes, ...vint(payload.length), ...payload];
            }

            // 19-byte OpusHead used as CodecPrivate. Pre-skip stays 0; the
            // encoder delay is carried by CodecDelay in the TrackEntry, so
            // it is not skipped twice.
            function opusHead(sampleRate, channels) {
                return [
                    ...ascii('OpusHead'),
                    0x01,                                   // version
                    channels & 0xFF,                        // channel count
                    0x00, 0x00,                             // pre-skip (see CodecDelay)
                    ...[0, 8, 16, 24].map(shift => (sampleRate >> shift) & 0xFF),  // input sample rate (LE)
                    0x00, 0x00,                             // output gain (LE)
                    0x00                                    // channel mapping family
                ];
            }

            function buildInitSegment(sampleRate, channels) {
                const ebmlHeader = elem([0x1A, 0x45, 0xDF, 0xA3], [
                    ...elem([0x42, 0x86], [0x01]),           // EBMLVersion
                    ...elem([0x42, 0xF7], [0x01]),           // EBMLReadVersion
                    ...elem([0x42, 0xF2], [0x04]),           // EBMLMaxIDLength
                    ...elem([0x42, 0xF3], [0x08]),           // EBMLMaxSizeLength
                    ...elem([0x42, 0x82], ascii('webm')),    // DocType
                    ...elem([0x42, 0x87], [0x04]),           // DocTypeVersion
                    ...elem([0x42, 0x85], [0x02])            // DocTypeReadVersion
                ]);

                const segmentInfo = elem([0x15, 0x49, 0xA9, 0x66], [
                    ...elem([0x2A, 0xD7, 0xB1], uintBE(1000000, 3)),  // TimecodeScale: 1 ms
                    ...elem([0x4D, 0x80], ascii('Yama')),             // MuxingApp
                    ...elem([0x57, 0x41], ascii('Yama'))              // WritingApp
                ]);

                const audioSettings = elem([0xE1], [
                    ...elem([0xB5], f64BE(sampleRate)),      // SamplingFrequency
                    ...elem([0x9F], [channels & 0xFF])       // Channels
                ]);

                const trackEntryBody = [
                    ...elem([0xD7], [0x01]),                                // TrackNumber 1
                    ...elem([0x73, 0xC5], uintBE(1, 1)),                    // TrackUID 1
                    ...elem([0x83], [0x02]),                                // TrackType 2 (audio)
                    ...elem([0xB9], [0x01]),                                // FlagEnabled
                    ...elem([0x88], [0x01]),                                // FlagDefault
                    ...elem([0x9C], [0x00]),                                // FlagLacing 0
                    ...elem([0x86], ascii('A_OPUS')),                       // CodecID
                    ...elem([0x63, 0xA2], opusHead(sampleRate, channels)),  // CodecPrivate
                    ...elem([0x56, 0xAA], uintBE(6500000, 3)),              // CodecDelay 6.5 ms (ns)
                    ...elem([0x56, 0xBB], uintBE(80000000, 4)),             // SeekPreRoll 80 ms (ns)
                    ...audioSettings
                ];
                const tracks = elem([0x16, 0x54, 0xAE, 0x6B], elem([0xAE], trackEntryBody));

                // Segment ID followed by the "unknown size" marker, so
                // clusters can be streamed indefinitely.
                const segmentOpen = [0x18, 0x53, 0x80, 0x67,
                                     0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF];

                return new Uint8Array([...ebmlHeader, ...segmentOpen, ...segmentInfo, ...tracks]);
            }

            function buildCluster(opusBytes, absMs) {
                // SimpleBlock header: TrackNumber=1, relative ts=0, flags=keyframe
                const simpleBlock = elem([0xA3], [0x81, 0x00, 0x00, 0x80, ...opusBytes]);
                const clusterTimestamp = elem([0xE7], uintBE(absMs, 4));  // absolute, ms
                return new Uint8Array(
                    elem([0x1F, 0x43, 0xB6, 0x75], [...clusterTimestamp, ...simpleBlock])
                );
            }

            return { buildInitSegment, buildCluster };
        })();
        // Builds the hidden <audio> element and its MediaSource. Must run
        // inside a user-gesture call stack, and play() has to be issued
        // synchronously (before any await) for the gesture to count.
        function _setupAudioElementAndMediaSource() {
            const sink = document.createElement('audio');
            _audioElement = sink;
            Object.assign(sink, { autoplay: true, volume: 1.0 });
            sink.style.display = 'none';
            document.body.appendChild(sink);

            const source = new MediaSource();
            _mediaSource = source;
            source.addEventListener('sourceopen', _onSourceOpen);
            sink.src = URL.createObjectURL(source);

            const onPlaying  = () => console.log('[MSE] audio.play() ok');
            const onRejected = e => console.error('[MSE] audio.play() rejected:', e && e.message);
            sink.play().then(onPlaying, onRejected);
        }
        // 'sourceopen' handler: once the MediaSource is usable, create the
        // SourceBuffer if an Opus format has already been received.
        function _onSourceOpen() {
            console.log(`[MSE] sourceopen, readyState=${_mediaSource && _mediaSource.readyState}`);
            const opusFormatKnown = Boolean(audioFormat) && audioFormat.compression === 1;
            if (!opusFormatKnown) return;
            _addSourceBufferAndInit();
        }
        // Creates the WebM/Opus SourceBuffer on the open MediaSource, appends
        // the init segment for the current audioFormat, then replays any
        // packets that were stashed while no SourceBuffer existed.
        // No-op when the MediaSource is not open, a SourceBuffer already
        // exists, or no format has been received yet.
        //
        // NOTE(review): the cluster timeline restarts at 0 here while the
        // reused <audio> element keeps its currentTime — confirm playback
        // resumes promptly when this runs after a format change.
        function _addSourceBufferAndInit() {
            if (!_mediaSource || _mediaSource.readyState !== 'open' || _sourceBuffer) return;
            if (!audioFormat) return;
            const mime = 'audio/webm; codecs="opus"';
            if (!window.MediaSource || !MediaSource.isTypeSupported(mime)) {
                console.error('[MSE] ' + mime + ' not supported by this browser');
                return;
            }
            let sb;
            try {
                sb = _mediaSource.addSourceBuffer(mime);
            } catch (e) {
                console.error('[MSE] addSourceBuffer failed:', e && e.message);
                return;
            }
            _sourceBuffer = sb;
            sb.addEventListener('updateend', () => {
                // removeSourceBuffer() on an updating buffer queues a late
                // 'updateend' for it. Ignore events from a buffer that has
                // since been replaced, otherwise they would clear the busy
                // flag of the current buffer mid-append and the next
                // appendBuffer() would throw.
                if (sb !== _sourceBuffer) return;
                _sourceBufferBusy = false;
                _flushSourceBufferQueue();
            });
            sb.addEventListener('error', e => console.error('[MSE] sourceBuffer error', e));
            // Init segment first
            _enqueueAppend(WebMMuxer.buildInitSegment(audioFormat.sampleRate, audioFormat.channels));
            _initSegmentSent = true;
            _opusTimestampMs = 0;
            // Flush packets that arrived before SourceBuffer was ready
            while (_pendingOpusPackets.length > 0) {
                const pkt = _pendingOpusPackets.shift();
                _enqueueAppend(WebMMuxer.buildCluster(pkt, _opusTimestampMs));
                _opusTimestampMs += OPUS_FRAME_MS;
            }
            console.log('[MSE] SourceBuffer ready, init segment + ' +
                        (_opusTimestampMs / OPUS_FRAME_MS) + ' queued packets appended');
        }
        // Queues one chunk (init segment or cluster) for the SourceBuffer
        // and starts appending right away if nothing is in flight.
        function _enqueueAppend(data) {
            _sourceBufferQueue[_sourceBufferQueue.length] = data;
            _flushSourceBufferQueue();
        }
        // Appends the next queued chunk to the SourceBuffer, one append in
        // flight at a time. Called after every enqueue and from the
        // SourceBuffer's 'updateend' handler, which drains the rest.
        function _flushSourceBufferQueue() {
            if (!_sourceBuffer || _sourceBufferBusy) return;
            // The busy flag is our own bookkeeping; the SourceBuffer's
            // `updating` attribute is authoritative. If the two ever disagree,
            // leave the chunk queued — the pending 'updateend' will retry —
            // rather than letting appendBuffer() throw and lose it.
            if (_sourceBuffer.updating) return;
            if (_sourceBufferQueue.length === 0) return;
            const next = _sourceBufferQueue.shift();
            _sourceBufferBusy = true;
            try {
                _sourceBuffer.appendBuffer(next);
            } catch (e) {
                // This chunk is dropped; later chunks go out on the next call.
                console.error('[MSE] appendBuffer threw:', e && e.message);
                _sourceBufferBusy = false;
            }
        }
        // Feeds one Opus packet into playback. While no SourceBuffer is
        // ready the packet is copied into a bounded backlog (~3 s, oldest
        // dropped first); otherwise it is wrapped in a cluster and queued.
        function pushOpusPacket(opusBytes) {
            const isOpusStream = audioFormat && audioFormat.compression === 1;
            if (!isOpusStream) return;

            const bufferReady = _sourceBuffer && _initSegmentSent;
            if (!bufferReady) {
                const backlogLimit = Math.ceil(3000 / OPUS_FRAME_MS);
                const overflow = _pendingOpusPackets.length - backlogLimit + 1;
                if (overflow > 0) _pendingOpusPackets.splice(0, overflow);
                _pendingOpusPackets.push(new Uint8Array(opusBytes));
                return;
            }

            _enqueueAppend(WebMMuxer.buildCluster(opusBytes, _opusTimestampMs));
            _opusTimestampMs += OPUS_FRAME_MS;
        }
        // Tears down the SourceBuffer so a new format/codec can be set up,
        // and resets all queue/timestamp state. The MediaSource and the
        // <audio> element are deliberately kept: they hold the play()
        // permission obtained from the user gesture, and recreating either
        // would need a fresh tap on iOS. endOfStream() is never called,
        // because it moves the MediaSource to 'ended', after which
        // addSourceBuffer() is no longer allowed.
        function stopAllAudio() {
            const canDetach = _sourceBuffer && _mediaSource && _mediaSource.readyState === 'open';
            if (canDetach) {
                try {
                    _mediaSource.removeSourceBuffer(_sourceBuffer);
                } catch (ignored) {}
            }
            _sourceBuffer = null;
            _sourceBufferBusy = false;
            _initSegmentSent = false;
            _opusTimestampMs = 0;
            _sourceBufferQueue.length = 0;
            _pendingOpusPackets.length = 0;
        }
        // Handles the payload of one audio WS frame (the bytes after the
        // 9-byte frame header):
        //   [hasFormat:1]([AudioFormat:12][padding:1])?[opusPacket:N]
        // A format header that changes sample rate, channels or compression
        // resets the playback pipeline. Only compression === 1 (Opus) is
        // played; anything else is rejected.
        function handleAudioFrame(data) {
            if (!audioEnabled) return;
            const bytes = new Uint8Array(data);
            if (bytes.length < 1) return;

            const FORMAT_BYTES = 12;   // AudioFormat (commands.h, pack(1))
            let cursor = 1;            // just past the hasFormat flag

            if (bytes[0]) {
                if (bytes.length < cursor + FORMAT_BYTES) {
                    console.warn('[Audio] truncated format header');
                    return;
                }
                const header = new DataView(data, cursor, FORMAT_BYTES);
                const fmt = {
                    compression:   header.getUint8(10),
                    channels:      header.getUint16(0, true),
                    sampleRate:    header.getUint32(2, true),
                    bitsPerSample: header.getUint16(6, true),
                    blockAlign:    header.getUint16(8, true)
                };
                cursor += FORMAT_BYTES + 1;  // header plus the padding byte

                if (fmt.channels === 0 || fmt.channels > 8) {
                    console.error('[Audio] bad channels:', fmt.channels);
                    return;
                }
                if (fmt.sampleRate < 8000 || fmt.sampleRate > 48000) {
                    console.error('[Audio] bad sampleRate:', fmt.sampleRate);
                    return;
                }

                const previous = audioFormat;
                const formatChanged = !previous ||
                    previous.sampleRate  !== fmt.sampleRate ||
                    previous.channels    !== fmt.channels   ||
                    previous.compression !== fmt.compression;
                audioFormat = fmt;

                if (formatChanged && fmt.compression !== 1) {
                    console.error('[Audio] PCM payload not supported by web; set USING_OPUS=1 on client');
                    stopAllAudio();
                    return;
                }
                if (formatChanged) {
                    stopAllAudio();
                    // If the MediaSource is not open yet, its sourceopen
                    // handler picks up audioFormat once it fires.
                    if (_mediaSource && _mediaSource.readyState === 'open') {
                        _addSourceBufferAndInit();
                    }
                    console.log('[Audio] Format → ch=' + fmt.channels +
                                ' sr=' + fmt.sampleRate + ' compression=' + fmt.compression);
                }
            }

            const playable = audioFormat && audioFormat.compression === 1;
            if (!playable) return;
            if (bytes.length <= cursor) return;

            // Whatever is left is a single variable-length Opus packet.
            pushOpusPacket(new Uint8Array(data, cursor));
        }
        function handleBinaryFrame(data) {
            // 终端输出帧：4 字节 magic 'TRM1' (0x54 0x52 0x4D 0x31) → 转发到 xterm。
            // 视频帧首 4 字节是 deviceID (uint32 LE)，撞这个具体值的概率极低；4 字节 magic
            // 比单字节前缀安全得多，无需额外的状态校验。
            const u8 = new Uint8Array(data);
            if (u8.length >= 4 &&
                u8[0] === 0x54 && u8[1] === 0x52 && u8[2] === 0x4D && u8[3] === 0x31) {
                if (termState && termState.term) termState.term.write(u8.subarray(4));
                return;
            }
            // Audio frame: frameType byte at offset 4 indicates audio (96 = TOKEN_SCREEN_AUDIO)
            // Full frame format: [deviceID:4][frameType:1][dataLen:4][hasFormat:1][AudioFormat?][audio_data...]
            if (u8.length > 4 && u8[4] === 96) {
                // Skip frame header (9 bytes) and pass audio payload to handler
                const audioPayload = data.slice(9);
                handleAudioFrame(audioPayload);
                return;
            }
            // Video frame: [deviceID:4][frameType:1][dataLen:4][videoData...]
            const view = new DataView(data);
            const deviceId = view.getUint32(0, true);
            const frameType = view.getUint8(4);