Perf: Optimize macOS screen capture with CGDisplayStream

Core optimization:
- Use CGDisplayStream instead of per-frame CGDisplayCreateImage
- Push model: CPU sleeps when screen is static (condition_variable wait)
- IOSurface capture avoids expensive image creation per frame
- ~47% CPU reduction during active remote desktop (45% → 24%)

Additional optimizations:
- vImageVerticalReflect (SIMD) replaces manual row-by-row flip
- Cache CGColorSpaceRef to avoid per-frame creation/release
- Cache tempBuffer to avoid per-frame memory allocation
- Throttle getCursorTypeIndex to 250ms (Accessibility API is expensive)

Bug fixes:
- Fix unreliable screen capture permission check (use actual capture test)
- Improve permission logging

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
yuanyuanxiang
2026-05-03 23:18:30 +02:00
parent b732f841d0
commit 92f3df8464
7 changed files with 483 additions and 43 deletions

View File

@@ -12,6 +12,7 @@
#import <CoreGraphics/CoreGraphics.h>
#import <ApplicationServices/ApplicationServices.h>
#import <mach/mach_time.h>
#import <Accelerate/Accelerate.h>
// Global client ID (calculated in main.mm)
extern uint64_t g_myClientID;
@@ -31,9 +32,17 @@ ScreenHandler::ScreenHandler(IOCPClient* client)
, m_qualityLevel(QUALITY_GOOD) // Use fixed QUALITY_GOOD (H264) for web compatibility
, m_h264Bitrate(3000000) // 3 Mbps (matches Windows QUALITY_GOOD)
, m_displayAssertionID(0)
, m_colorSpace(nullptr)
, m_displayStream(nullptr)
, m_streamQueue(nullptr)
, m_latestSurface(nullptr)
, m_hasNewFrame(false)
{
memset(&m_bmpHeader, 0, sizeof(m_bmpHeader));
// Cache color space (avoid per-frame creation)
m_colorSpace = CGColorSpaceCreateDeviceRGB();
// Initialize input handler for mouse/keyboard control
m_inputHandler = std::make_unique<InputHandler>();
if (m_inputHandler->init()) {
@@ -46,6 +55,13 @@ ScreenHandler::ScreenHandler(IOCPClient* client)
ScreenHandler::~ScreenHandler()
{
stop();
cleanupDisplayStream();
// Release cached color space
if (m_colorSpace) {
CGColorSpaceRelease(m_colorSpace);
m_colorSpace = nullptr;
}
}
bool ScreenHandler::init()
@@ -153,10 +169,191 @@ bool ScreenHandler::init()
NSLog(@"Display wake complete");
}
// Initialize CGDisplayStream for efficient capture
if (!initDisplayStream()) {
NSLog(@"Warning: CGDisplayStream init failed, falling back to legacy capture");
}
NSLog(@"ScreenHandler initialized: %dx%d", m_width, m_height);
return true;
}
bool ScreenHandler::initDisplayStream()
{
// Create dispatch queue for stream callbacks
m_streamQueue = dispatch_queue_create("com.ghost.screenstream", DISPATCH_QUEUE_SERIAL);
if (!m_streamQueue) {
NSLog(@"Failed to create dispatch queue for display stream");
return false;
}
// Stream properties
CFMutableDictionaryRef properties = CFDictionaryCreateMutable(
kCFAllocatorDefault, 0,
&kCFTypeDictionaryKeyCallBacks,
&kCFTypeDictionaryValueCallBacks
);
// Request minimum frame interval based on FPS (e.g., 15 FPS = 1/15 sec)
int fps = m_maxFPS.load();
if (fps <= 0) fps = 15;
double interval = 1.0 / (double)fps;
CFNumberRef intervalRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberDoubleType, &interval);
CFDictionarySetValue(properties, kCGDisplayStreamMinimumFrameTime, intervalRef);
CFRelease(intervalRef);
// Show cursor in stream
CFDictionarySetValue(properties, kCGDisplayStreamShowCursor, kCFBooleanFalse);
// Preserve aspect ratio
CFDictionarySetValue(properties, kCGDisplayStreamPreserveAspectRatio, kCFBooleanTrue);
// Create the display stream with BGRA format
__block ScreenHandler* handler = this;
m_displayStream = CGDisplayStreamCreateWithDispatchQueue(
m_displayID,
m_width,
m_height,
'BGRA', // Pixel format
properties,
m_streamQueue,
^(CGDisplayStreamFrameStatus status,
uint64_t displayTime,
IOSurfaceRef frameSurface,
CGDisplayStreamUpdateRef updateRef) {
(void)displayTime;
(void)updateRef;
if (status == kCGDisplayStreamFrameStatusFrameComplete && frameSurface) {
handler->processIOSurface(frameSurface);
} else if (status == kCGDisplayStreamFrameStatusFrameIdle) {
// Screen not changed, still notify for FPS timing
handler->m_hasNewFrame.store(true);
handler->m_surfaceCond.notify_one();
} else if (status == kCGDisplayStreamFrameStatusStopped) {
NSLog(@"CGDisplayStream stopped");
}
}
);
CFRelease(properties);
if (!m_displayStream) {
NSLog(@"Failed to create CGDisplayStream");
m_streamQueue = nullptr; // ARC manages dispatch objects
return false;
}
// Start the stream
CGError err = CGDisplayStreamStart(m_displayStream);
if (err != kCGErrorSuccess) {
NSLog(@"Failed to start CGDisplayStream: %d", err);
CFRelease(m_displayStream);
m_displayStream = nullptr;
m_streamQueue = nullptr; // ARC manages dispatch objects
return false;
}
NSLog(@"CGDisplayStream started: %dx%d @ %d FPS", m_width, m_height, fps);
return true;
}
void ScreenHandler::cleanupDisplayStream()
{
if (m_displayStream) {
CGDisplayStreamStop(m_displayStream);
CFRelease(m_displayStream);
m_displayStream = nullptr;
}
// ARC manages dispatch objects, just nil the pointer
m_streamQueue = nullptr;
std::lock_guard<std::mutex> lock(m_surfaceMutex);
if (m_latestSurface) {
CFRelease(m_latestSurface);
m_latestSurface = nullptr;
}
}
void ScreenHandler::processIOSurface(IOSurfaceRef surface)
{
// Retain the surface and store it
std::lock_guard<std::mutex> lock(m_surfaceMutex);
if (m_latestSurface) {
CFRelease(m_latestSurface);
}
m_latestSurface = (IOSurfaceRef)CFRetain(surface);
m_hasNewFrame.store(true);
m_surfaceCond.notify_one();
}
bool ScreenHandler::captureFromIOSurface(IOSurfaceRef surface, std::vector<uint8_t>& buffer)
{
if (!surface) return false;
// Lock the surface for CPU read
IOSurfaceLock(surface, kIOSurfaceLockReadOnly, nullptr);
size_t width = IOSurfaceGetWidth(surface);
size_t height = IOSurfaceGetHeight(surface);
size_t bytesPerRow = IOSurfaceGetBytesPerRow(surface);
void* baseAddr = IOSurfaceGetBaseAddress(surface);
if (!baseAddr || width != (size_t)m_width || height != (size_t)m_height) {
IOSurfaceUnlock(surface, kIOSurfaceLockReadOnly, nullptr);
return false;
}
// Ensure temp buffer is allocated
size_t requiredSize = m_width * 4 * m_height;
if (m_tempBuffer.size() != requiredSize) {
m_tempBuffer.resize(requiredSize);
}
// Copy from IOSurface to temp buffer (handle different bytesPerRow)
size_t dstBytesPerRow = m_width * 4;
if (bytesPerRow == dstBytesPerRow) {
memcpy(m_tempBuffer.data(), baseAddr, requiredSize);
} else {
// Row by row copy for different strides
uint8_t* src = (uint8_t*)baseAddr;
uint8_t* dst = m_tempBuffer.data();
for (size_t y = 0; y < height; y++) {
memcpy(dst + y * dstBytesPerRow, src + y * bytesPerRow, dstBytesPerRow);
}
}
IOSurfaceUnlock(surface, kIOSurfaceLockReadOnly, nullptr);
// Flip vertically using Accelerate framework (SIMD optimized)
vImage_Buffer src = {
.data = m_tempBuffer.data(),
.height = (vImagePixelCount)height,
.width = (vImagePixelCount)width,
.rowBytes = dstBytesPerRow
};
vImage_Buffer dst = {
.data = buffer.data(),
.height = (vImagePixelCount)height,
.width = (vImagePixelCount)width,
.rowBytes = dstBytesPerRow
};
vImage_Error err = vImageVerticalReflect_ARGB8888(&src, &dst, kvImageNoFlags);
if (err != kvImageNoError) {
// Fallback to manual flip
for (size_t y = 0; y < height; y++) {
memcpy(buffer.data() + (height - 1 - y) * dstBytesPerRow,
m_tempBuffer.data() + y * dstBytesPerRow,
dstBytesPerRow);
}
}
return true;
}
void ScreenHandler::start(IOCPClient* client, uint64_t clientID)
{
// If already running, just send TOKEN_BITMAPINFO again
@@ -190,6 +387,10 @@ void ScreenHandler::start(IOCPClient* client, uint64_t clientID)
void ScreenHandler::stop()
{
m_running = false;
// Wake up capture thread if waiting
m_surfaceCond.notify_all();
if (m_captureThread.joinable()) {
m_captureThread.join();
}
@@ -451,7 +652,27 @@ void ScreenHandler::applyQualityLevel(int8_t level, bool persist)
bool ScreenHandler::captureScreen(std::vector<uint8_t>& buffer)
{
// Create image from display
// Try to use IOSurface from display stream (more efficient)
if (m_displayStream) {
IOSurfaceRef surface = nullptr;
{
std::lock_guard<std::mutex> lock(m_surfaceMutex);
if (m_latestSurface) {
surface = (IOSurfaceRef)CFRetain(m_latestSurface);
}
}
if (surface) {
bool result = captureFromIOSurface(surface, buffer);
CFRelease(surface);
if (result) {
return true;
}
}
// Fall through to legacy method if IOSurface failed
}
// Legacy method: CGDisplayCreateImage (fallback)
CGImageRef image = CGDisplayCreateImage(m_displayID);
if (!image) {
NSLog(@"Failed to capture screen image");
@@ -462,49 +683,58 @@ bool ScreenHandler::captureScreen(std::vector<uint8_t>& buffer)
size_t height = CGImageGetHeight(image);
if (width != (size_t)m_width || height != (size_t)m_height) {
// Screen resolution changed, need to reinitialize
CGImageRelease(image);
NSLog(@"Screen resolution changed: %zux%zu", width, height);
return false;
}
// Create bitmap context to get raw pixel data
CGColorSpaceRef colorSpace = CGColorSpaceCreateDeviceRGB();
size_t bytesPerRow = width * 4;
// Temporary buffer for top-down BGRA
std::vector<uint8_t> tempBuffer(bytesPerRow * height);
size_t requiredSize = bytesPerRow * height;
if (m_tempBuffer.size() != requiredSize) {
m_tempBuffer.resize(requiredSize);
}
CGContextRef context = CGBitmapContextCreate(
tempBuffer.data(),
m_tempBuffer.data(),
width,
height,
8,
bytesPerRow,
colorSpace,
kCGImageAlphaPremultipliedFirst | kCGBitmapByteOrder32Little // BGRA
m_colorSpace,
kCGImageAlphaPremultipliedFirst | kCGBitmapByteOrder32Little
);
CGColorSpaceRelease(colorSpace);
if (!context) {
CGImageRelease(image);
NSLog(@"Failed to create bitmap context");
return false;
}
// Draw image into context
CGContextDrawImage(context, CGRectMake(0, 0, width, height), image);
CGContextRelease(context);
CGImageRelease(image);
// Flip vertically (BMP is bottom-up, CGImage is top-down)
for (size_t y = 0; y < height; y++) {
size_t srcRow = y;
size_t dstRow = height - 1 - y;
memcpy(buffer.data() + dstRow * bytesPerRow,
tempBuffer.data() + srcRow * bytesPerRow,
bytesPerRow);
// Flip vertically using Accelerate framework
vImage_Buffer src = {
.data = m_tempBuffer.data(),
.height = (vImagePixelCount)height,
.width = (vImagePixelCount)width,
.rowBytes = bytesPerRow
};
vImage_Buffer dst = {
.data = buffer.data(),
.height = (vImagePixelCount)height,
.width = (vImagePixelCount)width,
.rowBytes = bytesPerRow
};
vImage_Error err = vImageVerticalReflect_ARGB8888(&src, &dst, kvImageNoFlags);
if (err != kvImageNoError) {
for (size_t y = 0; y < height; y++) {
memcpy(buffer.data() + (height - 1 - y) * bytesPerRow,
m_tempBuffer.data() + y * bytesPerRow,
bytesPerRow);
}
}
return true;
@@ -766,10 +996,11 @@ uint8_t ScreenHandler::getCursorTypeIndex()
// Reuse cursor position from getCursorPosition (called before this)
CGPoint pos = s_cachedLogicalPos;
// Throttle: only check if cursor moved significantly or 100ms elapsed
// Throttle: only check if cursor moved significantly or 250ms elapsed
// (Accessibility API is expensive, cursor type is just a visual hint)
uint64_t now = getTickMs();
bool posChanged = (fabs(pos.x - lastPos.x) > 5 || fabs(pos.y - lastPos.y) > 5);
if (!posChanged && (now - lastCheckTime) < 100) {
bool posChanged = (fabs(pos.x - lastPos.x) > 10 || fabs(pos.y - lastPos.y) > 10);
if (!posChanged && (now - lastCheckTime) < 250) {
return cachedIndex;
}
lastCheckTime = now;
@@ -842,13 +1073,12 @@ uint8_t ScreenHandler::getCursorTypeIndex()
void ScreenHandler::captureLoop()
{
NSLog(@"ScreenHandler CaptureLoop started (%dx%d)", m_width, m_height);
NSLog(@"ScreenHandler CaptureLoop started (%dx%d)%s", m_width, m_height,
m_displayStream ? " [CGDisplayStream]" : " [Legacy]");
uint8_t currentAlgo = m_algorithm.load();
// Always send raw first frame (TOKEN_FIRSTSCREEN) to initialize server display
// This matches Windows client behavior: first frame is always raw bitmap,
// even in H264 mode. Server needs TOKEN_FIRSTSCREEN to set m_bIsFirst = FALSE.
sendFirstScreen();
// Small delay to ensure first frame is processed before H264 stream starts
@@ -857,6 +1087,23 @@ void ScreenHandler::captureLoop()
while (m_running) {
uint64_t start = getTickMs();
// Wait for new frame from display stream (push model)
// This is key optimization: CPU sleeps when screen is static
if (m_displayStream) {
std::unique_lock<std::mutex> lock(m_surfaceMutex);
int fps = m_maxFPS.load();
if (fps <= 0) fps = 15;
int waitMs = 1000 / fps;
// Wait for new frame or timeout (maintains FPS even if no change)
m_surfaceCond.wait_for(lock, std::chrono::milliseconds(waitMs), [this] {
return m_hasNewFrame.load() || !m_running;
});
m_hasNewFrame.store(false);
if (!m_running) break;
}
uint8_t algo = m_algorithm.load();
// Check if algorithm changed
@@ -864,18 +1111,14 @@ void ScreenHandler::captureLoop()
NSLog(@"Algorithm changed: %d -> %d", currentAlgo, algo);
currentAlgo = algo;
// If switching to/from H264, reset encoder
if (algo == ALGORITHM_H264) {
// Starting H264 - will be initialized in sendH264Frame
sendH264Frame(true); // First H264 frame is keyframe
} else if (m_h264Encoder) {
// Switching away from H264 - close encoder
m_h264Encoder->close();
m_h264Encoder.reset();
sendFirstScreen(); // Send full frame for DIFF modes
sendFirstScreen();
}
} else {
// Normal frame
if (algo == ALGORITHM_H264) {
sendH264Frame(false);
} else {
@@ -883,14 +1126,17 @@ void ScreenHandler::captureLoop()
}
}
int fps = m_maxFPS.load();
if (fps <= 0) fps = 10;
int sleepMs = 1000 / fps;
// Only use sleep-based FPS control for legacy mode
if (!m_displayStream) {
int fps = m_maxFPS.load();
if (fps <= 0) fps = 10;
int sleepMs = 1000 / fps;
int elapsed = (int)(getTickMs() - start);
int wait = sleepMs - elapsed;
if (wait > 0) {
usleep(wait * 1000);
int elapsed = (int)(getTickMs() - start);
int wait = sleepMs - elapsed;
if (wait > 0) {
usleep(wait * 1000);
}
}
}