Files
SimpleRemoter/server/go/protocol/commands.go
yuanyuanxiang 8c7f612449 Feature: Implement H.264 and AV1 hardware encoding for remote control
Remark: The FFmpeg static libraries need to be updated for this to take effect
2026-05-30 00:12:38 +02:00

576 lines
21 KiB
Go

package protocol
import (
"bytes"
"crypto/hmac"
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"strconv"
"strings"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
)
// GbkToUTF8 decodes a NUL-terminated GBK buffer (the usual shape of strings
// sent by Windows clients) into a UTF-8 string. Everything from the first NUL
// byte onwards is dropped before decoding, and the decoded text is passed
// through cleanString. If the decoder reports an error, the raw bytes are
// cleaned and returned without conversion instead.
func GbkToUTF8(data []byte) string {
	// Keep only the bytes in front of the terminator, if there is one.
	if end := bytes.IndexByte(data, 0); end >= 0 {
		data = data[:end]
	}
	if len(data) == 0 {
		return ""
	}

	var decoded bytes.Buffer
	gbk := transform.NewReader(bytes.NewReader(data), simplifiedchinese.GBK.NewDecoder())
	if _, err := decoded.ReadFrom(gbk); err != nil {
		// Decoding failed: fall back to interpreting the bytes directly.
		return cleanString(string(data))
	}
	return cleanString(decoded.String())
}
// Utf8CleanString is the UTF-8 counterpart of GbkToUTF8, meant for clients
// that advertise the CLIENT_CAP_UTF8 capability bit: it cuts the buffer at
// the first NUL byte and runs the remainder through cleanString without any
// codec conversion. Pushing such data through the GBK decoder would mangle
// its multi-byte sequences (the "double-encoding" problem called out in the
// C++ comment at WebService.cpp:1530).
func Utf8CleanString(data []byte) string {
	text, _, _ := bytes.Cut(data, []byte{0})
	if len(text) == 0 {
		return ""
	}
	return cleanString(string(text))
}
// cleanString drops control characters from s and trims surrounding white
// space from the result. Tab, line feed and carriage return are the only
// control characters kept; every other C0 control (U+0000–U+001F), DEL
// (U+007F) and the C1 controls (U+0080–U+009F) are removed. All other runes
// are copied unchanged.
func cleanString(s string) string {
	var kept strings.Builder
	kept.Grow(len(s)) // capacity hint only; the builder still grows on demand
	for _, r := range s {
		switch {
		case r == '\t', r == '\n', r == '\r':
			// Common whitespace is preserved even though it is a control code.
			kept.WriteRune(r)
		case r < 0x20, r >= 0x7F && r <= 0x9F:
			// C0 controls, DEL and C1 controls are not printable: skip them.
		default:
			kept.WriteRune(r)
		}
	}
	return strings.TrimSpace(kept.String())
}
// Client capability bits, mirroring the CLIENT_CAP_* values in
// common/commands.h. A client reports them as the hex tail of
// LOGIN_INFOR.moduleVersion (the part after the '-'); SupportsCap tests a
// single bit against that hex string.
const (
ClientCapV2 uint32 = 0x0001 // CLIENT_CAP_V2 — V2 file transfer
ClientCapUTF8 uint32 = 0x0002 // CLIENT_CAP_UTF8 — protocol strings (activeWindow, key-log titles, ...) are UTF-8 rather than GBK
ClientCapScreenPreview uint32 = 0x0004 // CLIENT_CAP_SCREEN_PREVIEW — presumably screen-preview support; confirm against commands.h
)
// SupportsCap reports whether bit is set in capability, the hexadecimal
// capability mask a client reported. Surrounding white space is ignored. An
// empty string, or one that does not parse as a 32-bit hex number, is treated
// as "no capabilities" — the convention for legacy GBK Windows clients — and
// yields false.
func SupportsCap(capability string, bit uint32) bool {
	if len(capability) == 0 {
		return false
	}
	mask, err := strconv.ParseUint(strings.TrimSpace(capability), 16, 32)
	return err == nil && uint32(mask)&bit != 0
}
// DecodeClientString decodes a fixed-length, NUL-padded buffer the client
// sent as part of a binary protocol field (typically ActiveWnd). If the
// client signals UTF-8 capability or is known to ship UTF-8 by default
// (Linux / macOS), the bytes are treated as UTF-8; otherwise they're
// decoded from GBK (CP936 — the legacy Windows default).
//
// clientType comes from LOGIN_INFOR reserved field 0 (RES_CLIENT_TYPE) and
// capability from the hex tail of moduleVersion. Both can be empty.
func DecodeClientString(data []byte, capability, clientType string) string {
if SupportsCap(capability, ClientCapUTF8) || clientType == "LNX" || clientType == "MAC" {
return Utf8CleanString(data)
}
return GbkToUTF8(data)
}
// Command tokens, mirroring the C++ definitions in common/commands.h. The
// Build* helpers in this file place the relevant token in the first byte of
// the packet they produce.
const (
// Server -> Client commands
CommandActived byte = 0 // COMMAND_ACTIVED
CommandScreenSpy byte = 16 // COMMAND_SCREEN_SPY - start screen capture
CommandScreenControl byte = 20 // COMMAND_SCREEN_CONTROL - mouse/keyboard input (MSG64 batches)
CommandNext byte = 30 // COMMAND_NEXT - "control-side dialog is open, you may stream"
CommandShell byte = 40 // COMMAND_SHELL - ask device to open a shell sub-connection
CommandTerminalRsize byte = 81 // CMD_TERMINAL_RESIZE - [cmd:1][cols:2 LE][rows:2 LE]
CmdRestoreConsole byte = 82 // CMD_RESTORE_CONSOLE - RDP session "return to console": switch back to the console session and restart capture
CommandBye byte = 204 // COMMAND_BYE - disconnect
CommandHeartbeat byte = 216 // CMD_HEARTBEAT_ACK
// Client -> Server tokens
TokenAuth byte = 100 // TOKEN_AUTH - authorization required
TokenHeartbeat byte = 101 // TOKEN_HEARTBEAT
TokenLogin byte = 102 // TOKEN_LOGIN - login packet
TokenBitmapInfo byte = 115 // TOKEN_BITMAPINFO - screen sub-connection header
TokenFirstScreen byte = 116 // TOKEN_FIRSTSCREEN - raw BGRA baseline frame (NOT H264)
TokenNextScreen byte = 117 // TOKEN_NEXTSCREEN - non-keyframe H264 (P-frame)
TokenShellStart byte = 128 // TOKEN_SHELL_START - legacy cmd-pipe shell sub-conn open
TokenKeyframe byte = 134 // TOKEN_KEYFRAME - H264 IDR (sent on GOP boundary)
TokenTerminalStart byte = 232 // TOKEN_TERMINAL_START - modern PTY shell sub-conn open
TokenTerminalClose byte = 233 // TOKEN_TERMINAL_CLOSE - shell exited / close ack
TokenConnAuth byte = 246 // TOKEN_CONN_AUTH - sub-connection identity handshake
// NOTE(review): listed with the client -> server tokens although it carries
// a CMD_ name — confirm the direction against common/commands.h.
CmdCursorImage byte = 93 // CMD_CURSOR_IMAGE - custom cursor bitmap (Phase 5+ feature)
)
// Sub-connection authentication (matches the ConnAuth* structs in
// common/commands.h). Each newly opened sub-connection first sends a 512-byte
// ConnAuthPacket and then waits for a 256-byte ConnAuthAck; no other command
// is meaningful before that exchange has completed.
const (
ConnAuthPacketSize = 512
ConnAuthAckSize = 256
// ConnAuthPacket field offsets within the inbound 512-byte buffer.
// Layout (from common/commands.h::ConnAuthPacket), 1+8+8+16+64+415 = 512:
// [token:1][clientID:8 LE][timestamp:8 LE][nonce:16][signature:64][reserved:415]
ConnAuthOffClientID = 1 // uint64 LE — pinned to the sub-conn so that later
// 1-byte tokens (TOKEN_TERMINAL_START etc.) can resolve the parent
// device.
// ConnAuthAck field offsets within the outbound 256-byte buffer.
ConnAuthAckOffStatus = 1 // uint8
ConnAuthAckOffServerTime = 2 // uint64 LE
// Status codes.
ConnAuthStatusOK byte = 0
)
// CMD_MASTERSETTING is the server's reply to a fresh client login. The client
// uses the Signature field to check that this server holds the shared secret;
// without a valid signature the client's private FileUpload init aborts the
// process. The struct layout matches MasterSettings in common/commands.h
// (pragma pack 4, 1000 bytes in total).
const (
CmdMasterSetting byte = 215
MasterSettingsSize = 1000 // size of the whole reply structure, in bytes
MasterSettingsOffReportInterval = 0 // int32, seconds
MasterSettingsOffSignature = 508 // Signature[64]
MasterSettingsSignatureLen = 64 // length of the Signature field, in bytes
// DefaultReportIntervalSec matches the C++ default. Sending 0 makes the
// client disable its active-window heartbeat field, which breaks the RTT and
// ActiveWindow live updates in the web UI.
DefaultReportIntervalSec = 5
)
// SignMessage returns HMAC-SHA256(password, msg) as a 64-character lowercase
// hexadecimal string. It is used to sign CMD_MASTERSETTING replies so that a
// client can check the response came from a legitimate server.
//
// password is the deployment-time shared secret. It is meant to be supplied
// out-of-band (the YAMA_SIGN_PASSWORD environment variable) so the binary
// does not carry the literal in cleartext, and it must never be committed.
// This function does not read the environment itself.
func SignMessage(password string, msg []byte) string {
	signer := hmac.New(sha256.New, []byte(password))
	signer.Write(msg)
	digest := signer.Sum(make([]byte, 0, sha256.Size))
	return hex.EncodeToString(digest)
}
// Screen-spy parameters, kept in step with the C++ ScreenSpy implementation.
const (
AlgorithmH264 byte = 2 // ALGORITHM_H264 — H.264 encoding (the algorithm the web UI uses)
)
// Windows message constants carried in MSG64.message. The client dispatches
// on these values verbatim (CScreenManager::ProcessCommand at
// client/ScreenManager.cpp:1617), so they MUST stay bit-identical to the
// WinUser.h definitions even though this Go server is cross-platform.
const (
WMKeyDown uint64 = 0x0100 // WM_KEYDOWN
WMKeyUp uint64 = 0x0101 // WM_KEYUP
WMSysKeyDown uint64 = 0x0104 // WM_SYSKEYDOWN
WMSysKeyUp uint64 = 0x0105 // WM_SYSKEYUP
WMMouseMove uint64 = 0x0200 // WM_MOUSEMOVE
WMLButtonDown uint64 = 0x0201 // WM_LBUTTONDOWN
WMLButtonUp uint64 = 0x0202 // WM_LBUTTONUP
WMLButtonDblClk uint64 = 0x0203 // WM_LBUTTONDBLCLK
WMRButtonDown uint64 = 0x0204 // WM_RBUTTONDOWN
WMRButtonUp uint64 = 0x0205 // WM_RBUTTONUP
WMRButtonDblClk uint64 = 0x0206 // WM_RBUTTONDBLCLK
WMMButtonDown uint64 = 0x0207 // WM_MBUTTONDOWN
WMMButtonUp uint64 = 0x0208 // WM_MBUTTONUP
WMMouseWheel uint64 = 0x020A // WM_MOUSEWHEEL
)
// Virtual-key codes referenced by the input mapping. The numeric values are
// the same as the Win32 VK_* constants. IsExtendedKey uses the
// VKPrior..VKDown range and the individual keys below it.
const (
VKLWin = 0x5B // VK_LWIN — filtered: never forwarded
VKRWin = 0x5C // VK_RWIN — filtered: never forwarded
VKPrior = 0x21 // VK_PRIOR (Page Up) — extended-key range start
VKDown = 0x28 // VK_DOWN — extended-key range end
VKInsert = 0x2D // VK_INSERT
VKDelete = 0x2E // VK_DELETE
VKNumLock = 0x90 // VK_NUMLOCK
VKRControl = 0xA3 // VK_RCONTROL (right Ctrl)
VKRMenu = 0xA5 // VK_RMENU (right Alt)
VKApps = 0x5D // VK_APPS (context-menu key)
)
// MK_* bit flags carried in the wParam of mouse-button messages. Each flag
// marks one mouse button; the values are those of the Win32 MK_* constants.
const (
MKLButton uint64 = 0x0001 // MK_LBUTTON
MKRButton uint64 = 0x0002 // MK_RBUTTON
MKMButton uint64 = 0x0010 // MK_MBUTTON
)
// Msg64Size is the size in bytes of MSG64, the fixed 48-byte record the
// client expects inside a COMMAND_SCREEN_CONTROL packet (class MSG64 in
// common/commands.h):
//
// [hwnd:8][message:8][wParam:8][lParam:8][time:8][pt.x:4][pt.y:4]
//
// That is five 8-byte fields plus two 4-byte fields (5*8 + 2*4 = 48). All
// uint64 fields are little-endian; pt is two little-endian int32 values. The
// client's ProcessCommand validates `ulLength % 48 == 0` and treats each
// 48-byte block as one MSG64.
const Msg64Size = 48
// BuildScreenControlPacket returns a COMMAND_SCREEN_CONTROL packet holding a
// single MSG64 record, with the command byte in front.
//
// Wire layout (every multi-byte field little-endian):
//
//	[CMD:1][hwnd:8][message:8][wParam:8][lParam:8][time:8][pt.x:4][pt.y:4]
//
// hwnd is always sent as zero because the client recomputes the window via
// WindowFromPoint. timeMs is widened to 64 bits for the time field; callers
// pass a monotonic-ish millisecond value (ms since the Unix epoch trimmed to
// 32 bits) so the client's GetTickCount() comparisons stay reasonable. ptX
// and ptY are written as their 32-bit two's-complement bit patterns.
func BuildScreenControlPacket(message, wParam, lParam uint64, ptX, ptY int32, timeMs uint32) []byte {
	le := binary.LittleEndian
	pkt := make([]byte, 0, 1+Msg64Size)
	pkt = append(pkt, CommandScreenControl)
	pkt = le.AppendUint64(pkt, 0) // hwnd: left zero on purpose
	pkt = le.AppendUint64(pkt, message)
	pkt = le.AppendUint64(pkt, wParam)
	pkt = le.AppendUint64(pkt, lParam)
	pkt = le.AppendUint64(pkt, uint64(timeMs))
	pkt = le.AppendUint32(pkt, uint32(ptX))
	pkt = le.AppendUint32(pkt, uint32(ptY))
	return pkt
}
// TerminalBinaryMagic is the 4-byte prefix ("TRM1") the web UI uses to tell
// terminal output apart from screen frames on the single shared WebSocket. It
// matches the C++ side at server/2015Remote/WebService.cpp:2013. Screen
// frames start with a uint32 LE device ID instead, so a frame is only
// mistaken for terminal data if its device ID happens to equal these four
// bytes — astronomically rare in practice.
var TerminalBinaryMagic = [4]byte{'T', 'R', 'M', '1'}
// BuildTerminalResize encodes the 5-byte CMD_TERMINAL_RESIZE packet that the
// client's ConPTYManager/TerminalManager expects on the shell sub-connection:
//
//	[CMD_TERMINAL_RESIZE:1][cols:2 LE][rows:2 LE]
//
// cols and rows travel as signed 16-bit values (the C++ side casts them to
// `short`). Each dimension is therefore clamped to the range [0, 32767]
// before encoding, so an out-of-range request can neither wrap around to an
// unrelated size nor reach the client as a negative number.
func BuildTerminalResize(cols, rows int) []byte {
	const maxDim = 1<<15 - 1 // largest value a 16-bit signed integer can hold

	// clamp limits v to what fits in the signed 16-bit wire field.
	clamp := func(v int) uint16 {
		return uint16(min(max(v, 0), maxDim))
	}

	buf := make([]byte, 5)
	buf[0] = CommandTerminalRsize
	binary.LittleEndian.PutUint16(buf[1:3], clamp(cols))
	binary.LittleEndian.PutUint16(buf[3:5], clamp(rows))
	return buf
}
// MakeLParam mirrors the Windows MAKELPARAM macro, which the client expects
// in the lParam of mouse messages: the low 16 bits of x go into the low word
// and the low 16 bits of y into the high word. Bits above 31 of the result
// are always zero.
func MakeLParam(x, y int32) uint64 {
	lowWord := uint64(uint16(x))
	highWord := uint64(uint16(y))
	return highWord<<16 | lowWord
}
// IsExtendedKey reports whether the Win32 virtual-key code vk needs the
// extended-key bit (bit 24) set in a keyboard lParam. That is the case for
// every code from VKPrior through VKDown inclusive, and for Insert, Delete,
// NumLock, right Ctrl, right Alt and the Apps key. The rule follows the C++
// HandleKey logic (server/2015Remote/WebService.cpp:944).
func IsExtendedKey(vk int) bool {
	switch {
	case VKPrior <= vk && vk <= VKDown:
		return true
	case vk == VKInsert, vk == VKDelete, vk == VKNumLock,
		vk == VKRControl, vk == VKRMenu, vk == VKApps:
		return true
	default:
		return false
	}
}
// Indices of the reserved fields this server reads (see the RES_* enum in
// common/commands.h). LOGIN_INFOR.szReserved is a '|'-separated list; clients
// fill the slots they know and leave the others blank ("?"). The indices are
// meant to be passed to LoginInfo.GetReservedField.
const (
ResFieldClientType = 0 // RES_CLIENT_TYPE — client kind (ParseLoginInfo expects "Windows", "LNX" or "MAC")
ResFieldFilePath = 4 // RES_FILE_PATH — install path
ResFieldInstallTime = 6 // RES_INSTALL_TIME
ResFieldClientLoc = 10 // RES_CLIENT_LOC — geo string
ResFieldClientPubIP = 11 // RES_CLIENT_PUBIP — public IP
ResFieldResolution = 15 // RES_RESOLUTION — client-formatted screen geometry: "N:W*H"
ResFieldClientID = 16 // RES_CLIENT_ID — uint64 decimal, matches TOKEN_BITMAPINFO clientID
)
// ScreenFrameHeaderLen is the size of the small per-frame header the device
// puts in front of the H.264 NAL payload in every TOKEN_NEXTSCREEN buffer,
// not counting the leading TOKEN_* byte:
//
// [algorithm:1][cursorPos:8 (int32 x, int32 y)][cursorIdx:1] = 10 bytes
//
// The C++ side includes the token byte in its ulHeadLength (11); this
// constant is deliberately post-token so that a call site can write
// `skip := 1 + headerLen` without confusion. Builds with SCREENYSPY_IMPROVE
// enabled append a 4-byte frameID after the cursor index; per
// common/commands.h that option is off in production, so it is not counted
// here.
const ScreenFrameHeaderLen = 1 + 8 + 1
// IsH264Keyframe scans an Annex-B H.264 byte stream and reports whether it
// contains a NAL unit that marks a keyframe boundary: IDR slice (type 5),
// SPS (type 7) or PPS (type 8). It returns true at the first such unit and
// false when none is found or the buffer is too short to hold a start code
// plus a NAL header byte. The set of NAL types is intended to mirror the
// detection in the C++ ScreenSpy broadcast path so both server
// implementations classify frames the same way.
//
// A NAL unit whose header is the very last byte of the buffer is inspected
// as well.
func IsH264Keyframe(data []byte) bool {
	// A four-byte start code (00 00 00 01) always contains the three-byte
	// form (00 00 01) one position later, so looking for the short form alone
	// finds every NAL unit. i+3 is the NAL header byte and must be in range.
	for i := 0; i+3 < len(data); i++ {
		if data[i] != 0 || data[i+1] != 0 || data[i+2] != 1 {
			continue
		}
		// The low five bits of the NAL header carry nal_unit_type.
		switch data[i+3] & 0x1F {
		case 5, 7, 8:
			return true
		}
	}
	return false
}
// IsAnyKeyframe guesses the codec from the first byte of data and hands the
// buffer to the matching keyframe detector. An H.264 Annex-B stream always
// begins with 0x00 (the start-code prefix), whereas the first byte of an AV1
// OBU header carries a non-zero obu_type and so is never 0x00. Empty input
// is not a keyframe. This keeps the server codec-agnostic, so the browser
// can run H.264 and AV1 sessions side by side.
func IsAnyKeyframe(data []byte) bool {
	switch {
	case len(data) == 0:
		return false
	case data[0] == 0x00:
		return IsH264Keyframe(data)
	default:
		return IsAv1Keyframe(data)
	}
}
// IsAv1Keyframe walks the chain of OBUs in data and reports whether it meets
// an OBU_SEQUENCE_HEADER (type 1). FFmpeg's AV1 encoders put a sequence
// header in front of every IDR, so finding one is taken to mean "this packet
// contains a key frame". Mirrors the C++ IsAv1Keyframe helper in
// ScreenSpyDlg.cpp.
//
// OBU header byte, most significant bit first:
//
//	0 | type:4 | ext:1 | size:1 | reserved:1
//
// The walk stops with false when the data is truncated, when an OBU has no
// size field (it then runs to the end of the packet), or when a declared
// payload size reaches past the end of the buffer.
func IsAv1Keyframe(data []byte) bool {
	const (
		obuSequenceHeader = 1
		extensionFlag     = 0x04
		hasSizeFlag       = 0x02
	)

	end := len(data)
	for cur := 0; cur < end; {
		header := data[cur]
		cur++
		if (header>>3)&0x0F == obuSequenceHeader {
			return true
		}

		// Skip the optional one-byte extension header.
		if header&extensionFlag != 0 {
			if cur >= end {
				return false
			}
			cur++
		}
		if header&hasSizeFlag == 0 {
			return false // unsized OBU: nothing after it can be located
		}

		// Payload size is LEB128-encoded; at most eight bytes are read.
		var payload uint64
		for shift := uint(0); shift < 56; shift += 7 {
			if cur >= end {
				return false
			}
			b := data[cur]
			cur++
			payload |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		if uint64(cur)+payload > uint64(end) {
			return false
		}
		cur += int(payload)
	}
	return false
}
// Size and field offsets of the LOGIN_INFOR structure, matching the C++
// struct laid out with default alignment (4-byte alignment for the
// unsigned int / int members).
//
// NOTE(review): the offsets below end at 476 + 512 = 988 bytes, which does
// not agree with LoginInfoSize = 980 — confirm the real sizeof(LOGIN_INFOR)
// on the C++ side before relying on LoginInfoSize.
const (
LoginInfoSize = 980 // Total size of LOGIN_INFOR struct (with alignment padding)
// Field offsets (with alignment padding)
OffsetToken = 0 // 1 byte (unsigned char)
OffsetOsVerInfoEx = 1 // 156 bytes (char[156])
// 3 bytes of padding here align dwCPUMHz to a 4-byte boundary
OffsetCPUMHz = 160 // 4 bytes (unsigned int) - aligned to 4
OffsetModuleVersion = 164 // 24 bytes (char[24])
OffsetPCName = 188 // 240 bytes (char[240])
OffsetMasterID = 428 // 20 bytes (char[20])
OffsetWebCamExist = 448 // 4 bytes (int) - aligned to 4
OffsetSpeed = 452 // 4 bytes (unsigned int)
OffsetStartTime = 456 // 20 bytes (char[20])
OffsetReserved = 476 // 512 bytes (char[512])
)
// LoginInfo is the decoded form of a client's LOGIN_INFOR packet, as produced
// by ParseLoginInfo. Text fields hold cleaned UTF-8 strings; a field the
// packet was too short to contain keeps its zero value.
type LoginInfo struct {
Token byte // first byte of the packet
OsVerInfo string // OS version info
CPUMHz uint32 // little-endian uint32 read at OffsetCPUMHz
ModuleVersion string // "version-capabilityHex"; the hex tail carries the capability bits
PCName string // Computer name
MasterID string // text read at OffsetMasterID
WebCamExist bool // true when the 32-bit field at OffsetWebCamExist is non-zero
Speed uint32 // little-endian uint32 read at OffsetSpeed
StartTime string // text read at OffsetStartTime
Reserved string // Contains additional info separated by |
}
// ParseLoginInfo decodes a LOGIN_INFOR packet. It returns ErrInvalidData when
// data is shorter than 100 bytes; beyond that, any field the packet is too
// short to contain in full is left at its zero value (Reserved alone is also
// accepted when only partly present).
//
// Encoding: text fields are GBK on legacy Windows clients and UTF-8 on modern
// clients that set CLIENT_CAP_UTF8 (always the case for LNX / MAC). Choosing
// the wrong codec mangles non-ASCII text — a German location such as
// "Nürnberg" sent as UTF-8 (4E C3 BC 72 ...) and force-decoded as GBK turns
// into mojibake. The heartbeat path already goes through DecodeClientString
// (see cmd/main.go handleHeartbeat), and this function does the same.
//
// The two values needed to choose the codec live inside the packet itself:
// the capability mask is the tail of ModuleVersion (offset 164) and the
// client type is reserved field 0 (offset 476). Both are pure ASCII (hex
// digits, "Windows"/"LNX"/"MAC"), so they are read first without any codec,
// and the user-visible text fields are decoded afterwards.
func ParseLoginInfo(data []byte) (*LoginInfo, error) {
	if len(data) < 100 { // minimum size check
		return nil, ErrInvalidData
	}

	// fixed returns the size-byte field at off, or nil when the packet is too
	// short to hold the whole field.
	fixed := func(off, size int) []byte {
		if len(data) < off+size {
			return nil
		}
		return data[off : off+size]
	}
	// word reads a little-endian uint32 field, yielding 0 when it is absent.
	word := func(off int) uint32 {
		b := fixed(off, 4)
		if b == nil {
			return 0
		}
		return binary.LittleEndian.Uint32(b)
	}

	// Fixed-width binary fields do not depend on the text encoding.
	info := &LoginInfo{
		Token:       data[0],
		CPUMHz:      word(OffsetCPUMHz),
		WebCamExist: word(OffsetWebCamExist) != 0,
		Speed:       word(OffsetSpeed),
	}

	// ModuleVersion is "version-capabilityHex" (e.g. "Dec 19 2025-0006"),
	// plain ASCII, so it can be read before the codec is known.
	if ver := fixed(OffsetModuleVersion, 24); ver != nil {
		info.ModuleVersion = Utf8CleanString(ver)
	}
	var capability string
	if dash := strings.IndexByte(info.ModuleVersion, '-'); dash >= 0 {
		capability = info.ModuleVersion[dash+1:]
	}

	// The reserved area may be only partly present in a short packet.
	var reserved []byte
	if len(data) > OffsetReserved {
		reserved = data[OffsetReserved:min(OffsetReserved+512, len(data))]
	}
	// Reserved field 0 (RES_CLIENT_TYPE) runs up to the first '|' or NUL and
	// is ASCII, so the raw bytes can be used directly. LNX / MAC clients are
	// treated as UTF-8 even when no capability is reported.
	typeLen := 0
	for typeLen < len(reserved) && reserved[typeLen] != 0 && reserved[typeLen] != '|' {
		typeLen++
	}
	clientType := string(reserved[:typeLen])

	// Decode every user-text field with the codec this client actually uses.
	decode := func(b []byte) string { return DecodeClientString(b, capability, clientType) }
	textFields := []struct {
		off, size int
		dst       *string
	}{
		{OffsetOsVerInfoEx, 156, &info.OsVerInfo},
		{OffsetPCName, 240, &info.PCName},
		{OffsetMasterID, 20, &info.MasterID},
		{OffsetStartTime, 20, &info.StartTime},
	}
	for _, f := range textFields {
		if b := fixed(f.off, f.size); b != nil {
			*f.dst = decode(b)
		}
	}
	if len(reserved) > 0 {
		info.Reserved = decode(reserved)
	}
	return info, nil
}
// ParseReserved splits the '|'-separated Reserved string into its individual
// fields. It returns nil when Reserved is empty.
func (info *LoginInfo) ParseReserved() []string {
	if raw := info.Reserved; raw != "" {
		return strings.Split(raw, "|")
	}
	return nil
}
// GetReservedField returns the reserved field at position index, or "" when
// index is negative or beyond the fields actually present.
//
// Field positions: ClientType(0), SystemBits(1), CPU(2), Memory(3),
// FilePath(4), Reserved(5), InstallTime(6), InstallInfo(7), ProgramBits(8),
// ExpiredDate(9), ClientLoc(10), ClientPubIP(11), ExeVersion(12),
// Username(13), IsAdmin(14), Resolution(15), ClientID(16).
func (info *LoginInfo) GetReservedField(index int) string {
	fields := info.ParseReserved()
	if index < 0 || index >= len(fields) {
		return ""
	}
	return fields[index]
}
// Validation describes the payload sent with TOKEN_AUTH. The byte counts in
// the field comments are the sizes of the corresponding wire fields and add
// up to 160 bytes (20 + 20 + 100 + 2 + 2 + 16), the buffer size used by
// BuildValidation.
type Validation struct {
From string // Start date (20 bytes)
To string // End date (20 bytes)
Admin string // Admin address (100 bytes)
Port uint16 // Admin port (2 bytes)
MaxDepth uint16 // Max generation depth (2 bytes), 0=cannot generate sub-master
Checksum string // HMAC checksum field (16 bytes)
}
// BuildValidation returns the buffer for a validation response. It is
// currently a placeholder: the result is 160 zero bytes (the size of the
// Validation structure) with TokenAuth stored in the first byte, and days,
// admin, port and maxDepth are not used yet.
//
// Intended field layout, still to be filled in:
//
//	From:     20 bytes (offset 0)
//	To:       20 bytes (offset 20)
//	Admin:    100 bytes (offset 40)
//	Port:     2 bytes (offset 140)
//	MaxDepth: 2 bytes (offset 142)
//	Checksum: 16 bytes (offset 144)
//
// NOTE(review): the token byte is written at offset 0, which is also where
// the From field starts — confirm whether the packet should be one byte
// longer, with the structure following the token.
func BuildValidation(days float64, admin string, port int, maxDepth uint16) []byte {
	const validationSize = 160 // size of the Validation struct on the wire
	packet := make([]byte, validationSize)
	packet[0] = TokenAuth
	return packet
}