Feature(audio): forward client PCM to web viewers with continuous playback

This commit is contained in:
yuanyuanxiang
2026-06-02 01:56:10 +02:00
parent da024fb3fb
commit 9aca587654
4 changed files with 505 additions and 5 deletions

View File

@@ -1283,12 +1283,74 @@
<script src="/static/xterm.js"></script>
<script src="/static/xterm-fit.js"></script>
<!-- Opus codec for audio decompression -->
<script src="https://cdn.jsdelivr.net/npm/opus.js@0.5.0/dist/opus.js"></script>
<script>
// Page-level mutable state shared by the handlers in this script.
// `ws` holds the connection object whose onclose/onerror/onmessage handlers
// are attached further down; the remaining bindings are assigned elsewhere
// in the file (NOTE(review): their exact roles are not visible here).
let ws = null, token = null, decoder = null, devices = [], currentDevice = null;
// Frame counters / FPS figure and the ping timer handle
// (presumably cleared by stopPingInterval() — confirm).
let frameCount = 0, lastFrameTime = 0, fps = 0, pingInterval = null;
// Canvas element with id 'screen-canvas' and its 2D drawing context.
const canvas = document.getElementById('screen-canvas');
const ctx = canvas.getContext('2d');
// ====== Audio & Video Implementation ======
//
// - Video: H.264 / AV1 → VideoDecoder Web API → canvas
// - Audio: client encodes PCM → Opus, server forwards raw Opus packets
// to web, web wraps each packet in a WebM SimpleBlock and
// feeds it to MediaSource → <audio> element (browser decodes
// Opus natively, plays via standard media-element pipeline).
//
// WS binary frame layout (matches C++ ScreenSpyDlg.cpp):
// Video : [deviceID:4][frameType:1][dataLen:4][videoData:N]
// Audio : [deviceID:4][frameType=96:1][dataLen:4]
// [hasFormat:1][AudioFormat:12][padding:1]?[opusPacket:N]
// Term : [magic:4='TRM1'][terminalData:N]
//
// AudioFormat (12 bytes, commands.h, pack(1)):
// channels:2 sampleRate:4 bitsPerSample:2 blockAlign:2
// compression:1 (0=PCM unsupported by web, 1=Opus) reserved:1
// MSE + WebM/Opus playback. Raw Opus packets arrive over WS; we wrap
// each one in a minimal WebM container in JS and feed it to a
// SourceBuffer attached to a hidden <audio> element. The browser
// decodes Opus natively. Tested on desktop Chrome; mobile playback
// is a known follow-up (see commit notes).
// ---- Audio playback state (shared by the MSE helpers below) ----
// Last AudioFormat header parsed by handleAudioFrame(); null until one arrives
// and reset to null when the WebSocket closes.
let audioFormat = null; // { compression, channels, sampleRate, bitsPerSample, blockAlign }
let audioEnabled = true; // Audio on/off flag (set by UI)
let syncDrift = 0; // A/V sync monitoring (milliseconds)
let _audioElement = null; // hidden <audio> sink
let _mediaSource = null; // MediaSource attached to _audioElement
let _sourceBuffer = null; // SourceBuffer (Opus in WebM)
// Chunks waiting for appendBuffer(); drained by _flushSourceBufferQueue().
const _sourceBufferQueue = []; // appendBuffer queue (one in-flight at a time)
// True from an appendBuffer() call until the matching 'updateend' event.
let _sourceBufferBusy = false;
let _initSegmentSent = false; // first init segment appended for current format
// Advanced by OPUS_FRAME_MS for every packet that is wrapped into a cluster.
let _opusTimestampMs = 0; // running absolute cluster timestamp (ms)
const OPUS_FRAME_MS = 20; // 960 samples @ 48k — matches client encoder
// Bounded backlog (see pushOpusPacket) replayed once the SourceBuffer exists.
const _pendingOpusPackets = []; // packets received before SourceBuffer is ready
// Autoplay policies only let an HTMLAudioElement start playing when it is
// created and .play()'d synchronously inside a user-gesture event handler.
// The first click/keydown therefore builds the element + MediaSource; any
// later click/keydown simply resumes the element if it has been paused.
function installAudioGestureUnlock() {
    // Shared handler for both gesture events.
    function handleUserGesture() {
        if (_audioElement) {
            // Already set up: only nudge playback when it is paused.
            if (_audioElement.paused) {
                _audioElement.play().catch(() => {});
            }
            return;
        }
        try {
            _setupAudioElementAndMediaSource();
            console.log('[MSE] <audio> + MediaSource set up by gesture');
        } catch (err) {
            console.error('[MSE] setup failed:', err && err.message);
        }
    }

    // Capture phase so the handler runs even if a descendant stops propagation.
    const listenerOptions = { passive: true, capture: true };
    for (const eventName of ['click', 'keydown']) {
        window.addEventListener(eventName, handleUserGesture, listenerOptions);
    }
}
installAudioGestureUnlock();
// Pagination and filter state
let currentPage = 1;
let viewMode = 'grid'; // 'grid' or 'list'
@@ -1409,7 +1471,7 @@
}
}
};
ws.onclose = () => { stopPingInterval(); updateWsStatus('disconnected'); scheduleReconnect(); };
ws.onclose = () => { stopPingInterval(); updateWsStatus('disconnected'); stopAllAudio(); audioFormat = null; scheduleReconnect(); };
ws.onerror = (e) => console.error('WS error:', e);
ws.onmessage = (event) => {
if (typeof event.data === 'string') handleSignaling(JSON.parse(event.data));
@@ -1649,16 +1711,294 @@
return videoBytes[0] === 0x00 ? 'avc' : 'av1';
}
// ============================================================
// Minimal WebM-Opus muxer.
//   buildInitSegment(sampleRate, channels) -> Uint8Array
//       EBML header + unknown-size Segment header + Info + Tracks
//       (one audio track carrying an OpusHead CodecPrivate).
//   buildCluster(opusBytes, absMs) -> Uint8Array
//       One Cluster holding a single SimpleBlock with one Opus packet.
// Both outputs are meant for a SourceBuffer of type
// 'audio/webm; codecs="opus"'; the init segment goes first.
// ============================================================
const WebMMuxer = (function () {
    // Big-endian bytes of an unsigned integer, `width` bytes wide.
    function bigEndian(value, width) {
        const bytes = [];
        for (let k = 0; k < width; k++) {
            bytes.unshift(value & 0xFF);
            value = Math.floor(value / 256);
        }
        return bytes;
    }

    // EBML variable-length size. Widths 1..4 are tried in turn; the all-ones
    // value of each width is skipped because EBML reserves it. Anything
    // larger falls back to the 8-byte form.
    function encodeSize(n) {
        for (let width = 1; width <= 4; width++) {
            const reserved = Math.pow(2, 7 * width) - 1;
            if (n < reserved) {
                const bytes = bigEndian(n, width);
                bytes[0] |= 0x80 >> (width - 1); // length marker bit
                return bytes;
            }
        }
        return [0x01].concat(bigEndian(n, 7));
    }

    // Big-endian IEEE-754 double.
    function float64(value) {
        const view = new DataView(new ArrayBuffer(8));
        view.setFloat64(0, value); // DataView writes big-endian by default
        return Array.from({ length: 8 }, (_, i) => view.getUint8(i));
    }

    // ASCII string -> byte array.
    function ascii(text) {
        return Array.from(text, (ch) => ch.charCodeAt(0));
    }

    // Leaf element: ID + size + payload bytes.
    function element(id, payload) {
        return [...id, ...encodeSize(payload.length), ...payload];
    }

    // Master element: ID + size + concatenated child elements.
    function master(id, ...children) {
        return element(id, children.reduce((acc, child) => acc.concat(child), []));
    }

    // OpusHead codec-private structure (19 bytes). Pre-skip stays 0 because
    // the encoder delay is carried by CodecDelay in the TrackEntry instead.
    function opusHead(sampleRate, channels) {
        const rateLE = [0, 8, 16, 24].map((shift) => (sampleRate >> shift) & 0xFF);
        return [
            ...ascii('OpusHead'),
            0x01,            // version
            channels & 0xFF, // channel count
            0x00, 0x00,      // pre-skip (use CodecDelay instead)
            ...rateLE,       // input sample rate (LE)
            0x00, 0x00,      // output gain (LE)
            0x00,            // channel mapping family
        ];
    }

    function buildInitSegment(sampleRate, channels) {
        const ebmlHeader = master([0x1A, 0x45, 0xDF, 0xA3],
            element([0x42, 0x86], [0x01]),          // EBMLVersion
            element([0x42, 0xF7], [0x01]),          // EBMLReadVersion
            element([0x42, 0xF2], [0x04]),          // EBMLMaxIDLength
            element([0x42, 0xF3], [0x08]),          // EBMLMaxSizeLength
            element([0x42, 0x82], ascii('webm')),   // DocType
            element([0x42, 0x87], [0x04]),          // DocTypeVersion
            element([0x42, 0x85], [0x02])           // DocTypeReadVersion
        );
        // Segment header with the unknown-size marker so clusters can be
        // streamed indefinitely.
        const segmentHeader = [
            0x18, 0x53, 0x80, 0x67,
            0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        ];
        const segmentInfo = master([0x15, 0x49, 0xA9, 0x66],
            element([0x2A, 0xD7, 0xB1], bigEndian(1000000, 3)), // TimecodeScale 1ms
            element([0x4D, 0x80], ascii('Yama')),               // MuxingApp
            element([0x57, 0x41], ascii('Yama'))                // WritingApp
        );
        const audioSettings = master([0xE1],
            element([0xB5], float64(sampleRate)),   // SamplingFrequency
            element([0x9F], [channels & 0xFF])      // Channels
        );
        const trackEntry = master([0xAE],
            element([0xD7], [0x01]),                               // TrackNumber 1
            element([0x73, 0xC5], bigEndian(1, 1)),                // TrackUID 1
            element([0x83], [0x02]),                               // TrackType 2 (audio)
            element([0xB9], [0x01]),                               // FlagEnabled
            element([0x88], [0x01]),                               // FlagDefault
            element([0x9C], [0x00]),                               // FlagLacing 0
            element([0x86], ascii('A_OPUS')),                      // CodecID
            element([0x63, 0xA2], opusHead(sampleRate, channels)), // CodecPrivate
            element([0x56, 0xAA], bigEndian(6500000, 3)),          // CodecDelay 6.5ms (ns)
            element([0x56, 0xBB], bigEndian(80000000, 4)),         // SeekPreRoll 80ms (ns)
            audioSettings
        );
        const tracks = master([0x16, 0x54, 0xAE, 0x6B], trackEntry);
        return new Uint8Array([...ebmlHeader, ...segmentHeader, ...segmentInfo, ...tracks]);
    }

    function buildCluster(opusBytes, absMs) {
        // SimpleBlock header: TrackNumber=1, relative ts=0, flags=keyframe.
        const blockHeader = [0x81, 0x00, 0x00, 0x80];
        const block = element([0xA3], [...blockHeader, ...Array.from(opusBytes)]);
        const timestamp = element([0xE7], bigEndian(absMs, 4)); // absolute, ms
        return new Uint8Array(master([0x1F, 0x43, 0xB6, 0x75], timestamp, block));
    }

    return { buildInitSegment, buildCluster };
})();
// Create the hidden <audio> + MediaSource pair INSIDE a user-gesture
// call stack. Must reach .play() synchronously, before any await.
//
// Everything that can throw (MediaSource construction, object-URL creation)
// runs BEFORE the DOM or the shared _audioElement/_mediaSource bindings are
// touched. A failure therefore leaves no orphan <audio> element behind and
// keeps _audioElement null, so the gesture hook can attempt setup again
// instead of calling play() on a source-less element forever.
//
// Throws whatever `new MediaSource()` / `URL.createObjectURL()` throw
// (e.g. when the browser has no MediaSource support).
function _setupAudioElementAndMediaSource() {
    const mediaSource = new MediaSource();
    const objectUrl = URL.createObjectURL(mediaSource);
    mediaSource.addEventListener('sourceopen', _onSourceOpen);

    const audioEl = document.createElement('audio');
    audioEl.autoplay = true;
    audioEl.volume = 1.0;
    audioEl.style.display = 'none';
    document.body.appendChild(audioEl);
    audioEl.src = objectUrl;

    // Publish only once the pair is fully wired. 'sourceopen' is dispatched
    // asynchronously, so _onSourceOpen always sees the bindings below.
    _audioElement = audioEl;
    _mediaSource = mediaSource;

    audioEl.play().then(
        () => console.log('[MSE] audio.play() ok'),
        e => console.error('[MSE] audio.play() rejected:', e && e.message)
    );
}
// 'sourceopen' listener for the MediaSource. Logs the ready state and, when
// an Opus format (compression === 1) has already been received, creates the
// SourceBuffer right away; otherwise nothing happens here.
function _onSourceOpen() {
    const state = _mediaSource && _mediaSource.readyState;
    console.log('[MSE] sourceopen, readyState=' + state);

    const opusFormatKnown = Boolean(audioFormat) && audioFormat.compression === 1;
    if (!opusFormatKnown) return;
    _addSourceBufferAndInit();
}
// Create the Opus/WebM SourceBuffer on the open MediaSource, append the init
// segment for the current audioFormat, then replay any packets that were
// stashed in _pendingOpusPackets while no SourceBuffer existed.
//
// No-op when: no format is known yet, the MediaSource is missing or not
// 'open', a SourceBuffer already exists, or the MIME type is unsupported.
function _addSourceBufferAndInit() {
    // Without a format there is nothing to build an init segment from.
    if (!audioFormat) return;
    if (!_mediaSource || _mediaSource.readyState !== 'open' || _sourceBuffer) return;
    const mime = 'audio/webm; codecs="opus"';
    if (!window.MediaSource || !MediaSource.isTypeSupported(mime)) {
        console.error('[MSE] ' + mime + ' not supported by this browser');
        return;
    }
    let sb;
    try {
        sb = _mediaSource.addSourceBuffer(mime);
    } catch (e) {
        console.error('[MSE] addSourceBuffer failed:', e && e.message);
        return;
    }
    _sourceBuffer = sb;
    sb.addEventListener('updateend', () => {
        // A SourceBuffer removed while an append was in flight still fires
        // a late 'updateend'. Ignore events from anything but the current
        // buffer; otherwise the busy flag would be cleared while the NEW
        // buffer is still updating and the next append would throw.
        if (_sourceBuffer !== sb) return;
        _sourceBufferBusy = false;
        _flushSourceBufferQueue();
    });
    sb.addEventListener('error', e => console.error('[MSE] sourceBuffer error', e));
    // Init segment first
    _enqueueAppend(WebMMuxer.buildInitSegment(audioFormat.sampleRate, audioFormat.channels));
    _initSegmentSent = true;
    _opusTimestampMs = 0;
    // Flush packets that arrived before SourceBuffer was ready
    while (_pendingOpusPackets.length > 0) {
        const pkt = _pendingOpusPackets.shift();
        _enqueueAppend(WebMMuxer.buildCluster(pkt, _opusTimestampMs));
        _opusTimestampMs += OPUS_FRAME_MS;
    }
    console.log('[MSE] SourceBuffer ready, init segment + ' +
        (_opusTimestampMs / OPUS_FRAME_MS) + ' queued packets appended');
}
// Queue one chunk (init segment or cluster) for the SourceBuffer and try to
// start appending immediately; the flush is a no-op while an append is
// already in flight.
function _enqueueAppend(data) {
    const queue = _sourceBufferQueue;
    queue[queue.length] = data;
    _flushSourceBufferQueue();
}
// Start the next queued append, keeping at most one appendBuffer() in flight.
//
// - Does nothing without a SourceBuffer or while an append is pending.
// - If the SourceBuffer reports `updating` although our busy flag is clear
//   (the two got out of sync), the queue is left intact and the flag is
//   re-armed; the buffer's 'updateend' will trigger the next flush.
// - A chunk whose appendBuffer() throws is logged and dropped, and the next
//   chunk is tried right away so the rest of the queue does not sit idle
//   until another packet happens to be enqueued.
function _flushSourceBufferQueue() {
    if (!_sourceBuffer || _sourceBufferBusy) return;
    if (_sourceBuffer.updating) {
        _sourceBufferBusy = true;
        return;
    }
    while (_sourceBufferQueue.length > 0) {
        const next = _sourceBufferQueue.shift();
        _sourceBufferBusy = true;
        try {
            _sourceBuffer.appendBuffer(next);
            return; // in flight; 'updateend' continues the drain
        } catch (e) {
            console.error('[MSE] appendBuffer threw:', e && e.message);
            _sourceBufferBusy = false;
        }
    }
}
// Route one Opus packet towards playback.
// - Ignored unless the current format is Opus (compression === 1).
// - Until the SourceBuffer exists and its init segment has been queued, a
//   copy of the packet is stashed in _pendingOpusPackets, keeping roughly
//   the newest 3 seconds and dropping the oldest entries first.
// - Otherwise the packet is wrapped in a cluster stamped with the running
//   timestamp, queued for appending, and the timestamp advances one frame.
function pushOpusPacket(opusBytes) {
    const isOpusStream = Boolean(audioFormat) && audioFormat.compression === 1;
    if (!isOpusStream) return;

    const bufferReady = Boolean(_sourceBuffer) && _initSegmentSent;
    if (!bufferReady) {
        const limit = Math.ceil(3000 / OPUS_FRAME_MS);
        // Make room so the backlog never exceeds `limit` after the push.
        const overflow = _pendingOpusPackets.length - limit + 1;
        if (overflow > 0) _pendingOpusPackets.splice(0, overflow);
        _pendingOpusPackets.push(new Uint8Array(opusBytes));
        return;
    }

    const cluster = WebMMuxer.buildCluster(opusBytes, _opusTimestampMs);
    _enqueueAppend(cluster);
    _opusTimestampMs += OPUS_FRAME_MS;
}
// Remove the SourceBuffer (so a new format/codec can be set up) but
// KEEP the same MediaSource and <audio> element. They hold our
// gesture-acquired play() permission — recreating either would
// require a fresh user tap on iOS. Never call endOfStream(), that
// transitions MediaSource to 'ended' which forbids future
// addSourceBuffer().
//
// Also clears the append queue, the pending-packet backlog, the busy and
// init-segment flags, and rewinds the running timestamp to 0. Removal stays
// best-effort: a failure is logged rather than thrown so the state reset
// below always runs.
// NOTE(review): the <audio> element's playback position is not touched here
// while cluster timestamps restart at 0 — confirm playback resumes as
// expected after a re-init.
function stopAllAudio() {
    if (_sourceBuffer && _mediaSource && _mediaSource.readyState === 'open') {
        try {
            _mediaSource.removeSourceBuffer(_sourceBuffer);
        } catch (e) {
            console.warn('[MSE] removeSourceBuffer failed:', e && e.message);
        }
    }
    _sourceBuffer = null;
    _sourceBufferQueue.length = 0;
    _sourceBufferBusy = false;
    _initSegmentSent = false;
    _opusTimestampMs = 0;
    _pendingOpusPackets.length = 0;
}
// Parse one audio payload (the bytes after the 9-byte frame header):
//   [hasFormat:1] ( [AudioFormat:12] [padding:1] )? [opusPacket:N]
// `data` must be an ArrayBuffer. Nothing happens while audio is disabled.
// A format header replaces audioFormat; when sample rate, channel count or
// compression differ from the previous format the playback path is torn
// down and, for Opus, rebuilt. Trailing bytes are forwarded as one Opus
// packet, but only while the current format is Opus.
function handleAudioFrame(data) {
    if (!audioEnabled) return;

    const bytes = new Uint8Array(data);
    if (bytes.length === 0) return;

    const FORMAT_BYTES = 12; // AudioFormat (commands.h, pack(1))
    let payloadStart = 1;    // first byte is the hasFormat flag

    if (bytes[0]) {
        if (bytes.length < payloadStart + FORMAT_BYTES) {
            console.warn('[Audio] truncated format header');
            return;
        }
        const header = new DataView(data, payloadStart, FORMAT_BYTES);
        const fmt = {
            compression: header.getUint8(10),
            channels: header.getUint16(0, true),
            sampleRate: header.getUint32(2, true),
            bitsPerSample: header.getUint16(6, true),
            blockAlign: header.getUint16(8, true),
        };
        payloadStart += FORMAT_BYTES + 1; // header plus one padding byte

        if (fmt.channels === 0 || fmt.channels > 8) {
            console.error('[Audio] bad channels:', fmt.channels);
            return;
        }
        if (fmt.sampleRate < 8000 || fmt.sampleRate > 48000) {
            console.error('[Audio] bad sampleRate:', fmt.sampleRate);
            return;
        }

        const previous = audioFormat;
        const unchanged = Boolean(previous) &&
            previous.sampleRate === fmt.sampleRate &&
            previous.channels === fmt.channels &&
            previous.compression === fmt.compression;
        audioFormat = fmt;

        if (!unchanged) {
            if (fmt.compression !== 1) {
                console.error('[Audio] PCM payload not supported by web; set USING_OPUS=1 on client');
                stopAllAudio();
                return;
            }
            stopAllAudio();
            // If the MediaSource is not open yet, its sourceopen handler
            // picks up audioFormat when it fires.
            if (_mediaSource && _mediaSource.readyState === 'open') {
                _addSourceBufferAndInit();
            }
            console.log('[Audio] Format → ch=' + fmt.channels +
                ' sr=' + fmt.sampleRate + ' compression=' + fmt.compression);
        }
    }

    const opusActive = Boolean(audioFormat) && audioFormat.compression === 1;
    if (!opusActive) return;
    if (bytes.length <= payloadStart) return;

    // Everything that remains is a single variable-length Opus packet.
    pushOpusPacket(new Uint8Array(data, payloadStart));
}
function handleBinaryFrame(data) {
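// Terminal output frame: 4-byte magic 'TRM1' (0x54 0x52 0x4D 0x31) → forwarded to xterm.
// The first 4 bytes of a video frame are the deviceID (uint32 LE); the chance of it
// colliding with this exact value is extremely low. A 4-byte magic is far safer than
// a single-byte prefix, so no extra state validation is needed.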
const u8 = new Uint8Array(data);
if (u8.length >= 4 &&
u8[0] === 0x54 && u8[1] === 0x52 && u8[2] === 0x4D && u8[3] === 0x31) {
if (termState && termState.term) termState.term.write(u8.subarray(4));
return;
}
// Audio frame: frameType byte at offset 4 indicates audio (96 = TOKEN_SCREEN_AUDIO)
// Full frame format: [deviceID:4][frameType:1][dataLen:4][hasFormat:1][AudioFormat?][audio_data...]
if (u8.length > 4 && u8[4] === 96) {
// Skip frame header (9 bytes) and pass audio payload to handler
const audioPayload = data.slice(9);
handleAudioFrame(audioPayload);
return;
}
// Video frame: [deviceID:4][frameType:1][dataLen:4][videoData...]
const view = new DataView(data);
const deviceId = view.getUint32(0, true);
const frameType = view.getUint8(4);