Perf: Optimize macOS screen capture with CGDisplayStream
Core optimization: - Use CGDisplayStream instead of per-frame CGDisplayCreateImage - Push model: CPU sleeps when screen is static (condition_variable wait) - IOSurface capture avoids expensive image creation per frame - ~47% CPU reduction during active remote desktop (45% → 24%) Additional optimizations: - vImageVerticalReflect (SIMD) replaces manual row-by-row flip - Cache CGColorSpaceRef to avoid per-frame creation/release - Cache tempBuffer to avoid per-frame memory allocation - Throttle getCursorTypeIndex to 250ms (Accessibility API is expensive) Bug fixes: - Fix unreliable screen capture permission check (use actual capture test) - Improve permission logging Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@
|
||||
#import <CoreGraphics/CoreGraphics.h>
|
||||
#import <ApplicationServices/ApplicationServices.h>
|
||||
#import <mach/mach_time.h>
|
||||
#import <Accelerate/Accelerate.h>
|
||||
|
||||
// Global client ID (calculated in main.mm)
|
||||
extern uint64_t g_myClientID;
|
||||
@@ -31,9 +32,17 @@ ScreenHandler::ScreenHandler(IOCPClient* client)
|
||||
, m_qualityLevel(QUALITY_GOOD) // Use fixed QUALITY_GOOD (H264) for web compatibility
|
||||
, m_h264Bitrate(3000000) // 3 Mbps (matches Windows QUALITY_GOOD)
|
||||
, m_displayAssertionID(0)
|
||||
, m_colorSpace(nullptr)
|
||||
, m_displayStream(nullptr)
|
||||
, m_streamQueue(nullptr)
|
||||
, m_latestSurface(nullptr)
|
||||
, m_hasNewFrame(false)
|
||||
{
|
||||
memset(&m_bmpHeader, 0, sizeof(m_bmpHeader));
|
||||
|
||||
// Cache color space (avoid per-frame creation)
|
||||
m_colorSpace = CGColorSpaceCreateDeviceRGB();
|
||||
|
||||
// Initialize input handler for mouse/keyboard control
|
||||
m_inputHandler = std::make_unique<InputHandler>();
|
||||
if (m_inputHandler->init()) {
|
||||
@@ -46,6 +55,13 @@ ScreenHandler::ScreenHandler(IOCPClient* client)
|
||||
ScreenHandler::~ScreenHandler()
|
||||
{
|
||||
stop();
|
||||
cleanupDisplayStream();
|
||||
|
||||
// Release cached color space
|
||||
if (m_colorSpace) {
|
||||
CGColorSpaceRelease(m_colorSpace);
|
||||
m_colorSpace = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
bool ScreenHandler::init()
|
||||
@@ -153,10 +169,191 @@ bool ScreenHandler::init()
|
||||
NSLog(@"Display wake complete");
|
||||
}
|
||||
|
||||
// Initialize CGDisplayStream for efficient capture
|
||||
if (!initDisplayStream()) {
|
||||
NSLog(@"Warning: CGDisplayStream init failed, falling back to legacy capture");
|
||||
}
|
||||
|
||||
NSLog(@"ScreenHandler initialized: %dx%d", m_width, m_height);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ScreenHandler::initDisplayStream()
|
||||
{
|
||||
// Create dispatch queue for stream callbacks
|
||||
m_streamQueue = dispatch_queue_create("com.ghost.screenstream", DISPATCH_QUEUE_SERIAL);
|
||||
if (!m_streamQueue) {
|
||||
NSLog(@"Failed to create dispatch queue for display stream");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Stream properties
|
||||
CFMutableDictionaryRef properties = CFDictionaryCreateMutable(
|
||||
kCFAllocatorDefault, 0,
|
||||
&kCFTypeDictionaryKeyCallBacks,
|
||||
&kCFTypeDictionaryValueCallBacks
|
||||
);
|
||||
|
||||
// Request minimum frame interval based on FPS (e.g., 15 FPS = 1/15 sec)
|
||||
int fps = m_maxFPS.load();
|
||||
if (fps <= 0) fps = 15;
|
||||
double interval = 1.0 / (double)fps;
|
||||
CFNumberRef intervalRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberDoubleType, &interval);
|
||||
CFDictionarySetValue(properties, kCGDisplayStreamMinimumFrameTime, intervalRef);
|
||||
CFRelease(intervalRef);
|
||||
|
||||
// Show cursor in stream
|
||||
CFDictionarySetValue(properties, kCGDisplayStreamShowCursor, kCFBooleanFalse);
|
||||
|
||||
// Preserve aspect ratio
|
||||
CFDictionarySetValue(properties, kCGDisplayStreamPreserveAspectRatio, kCFBooleanTrue);
|
||||
|
||||
// Create the display stream with BGRA format
|
||||
__block ScreenHandler* handler = this;
|
||||
m_displayStream = CGDisplayStreamCreateWithDispatchQueue(
|
||||
m_displayID,
|
||||
m_width,
|
||||
m_height,
|
||||
'BGRA', // Pixel format
|
||||
properties,
|
||||
m_streamQueue,
|
||||
^(CGDisplayStreamFrameStatus status,
|
||||
uint64_t displayTime,
|
||||
IOSurfaceRef frameSurface,
|
||||
CGDisplayStreamUpdateRef updateRef) {
|
||||
(void)displayTime;
|
||||
(void)updateRef;
|
||||
|
||||
if (status == kCGDisplayStreamFrameStatusFrameComplete && frameSurface) {
|
||||
handler->processIOSurface(frameSurface);
|
||||
} else if (status == kCGDisplayStreamFrameStatusFrameIdle) {
|
||||
// Screen not changed, still notify for FPS timing
|
||||
handler->m_hasNewFrame.store(true);
|
||||
handler->m_surfaceCond.notify_one();
|
||||
} else if (status == kCGDisplayStreamFrameStatusStopped) {
|
||||
NSLog(@"CGDisplayStream stopped");
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
CFRelease(properties);
|
||||
|
||||
if (!m_displayStream) {
|
||||
NSLog(@"Failed to create CGDisplayStream");
|
||||
m_streamQueue = nullptr; // ARC manages dispatch objects
|
||||
return false;
|
||||
}
|
||||
|
||||
// Start the stream
|
||||
CGError err = CGDisplayStreamStart(m_displayStream);
|
||||
if (err != kCGErrorSuccess) {
|
||||
NSLog(@"Failed to start CGDisplayStream: %d", err);
|
||||
CFRelease(m_displayStream);
|
||||
m_displayStream = nullptr;
|
||||
m_streamQueue = nullptr; // ARC manages dispatch objects
|
||||
return false;
|
||||
}
|
||||
|
||||
NSLog(@"CGDisplayStream started: %dx%d @ %d FPS", m_width, m_height, fps);
|
||||
return true;
|
||||
}
|
||||
|
||||
void ScreenHandler::cleanupDisplayStream()
|
||||
{
|
||||
if (m_displayStream) {
|
||||
CGDisplayStreamStop(m_displayStream);
|
||||
CFRelease(m_displayStream);
|
||||
m_displayStream = nullptr;
|
||||
}
|
||||
|
||||
// ARC manages dispatch objects, just nil the pointer
|
||||
m_streamQueue = nullptr;
|
||||
|
||||
std::lock_guard<std::mutex> lock(m_surfaceMutex);
|
||||
if (m_latestSurface) {
|
||||
CFRelease(m_latestSurface);
|
||||
m_latestSurface = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void ScreenHandler::processIOSurface(IOSurfaceRef surface)
|
||||
{
|
||||
// Retain the surface and store it
|
||||
std::lock_guard<std::mutex> lock(m_surfaceMutex);
|
||||
|
||||
if (m_latestSurface) {
|
||||
CFRelease(m_latestSurface);
|
||||
}
|
||||
m_latestSurface = (IOSurfaceRef)CFRetain(surface);
|
||||
m_hasNewFrame.store(true);
|
||||
m_surfaceCond.notify_one();
|
||||
}
|
||||
|
||||
bool ScreenHandler::captureFromIOSurface(IOSurfaceRef surface, std::vector<uint8_t>& buffer)
|
||||
{
|
||||
if (!surface) return false;
|
||||
|
||||
// Lock the surface for CPU read
|
||||
IOSurfaceLock(surface, kIOSurfaceLockReadOnly, nullptr);
|
||||
|
||||
size_t width = IOSurfaceGetWidth(surface);
|
||||
size_t height = IOSurfaceGetHeight(surface);
|
||||
size_t bytesPerRow = IOSurfaceGetBytesPerRow(surface);
|
||||
void* baseAddr = IOSurfaceGetBaseAddress(surface);
|
||||
|
||||
if (!baseAddr || width != (size_t)m_width || height != (size_t)m_height) {
|
||||
IOSurfaceUnlock(surface, kIOSurfaceLockReadOnly, nullptr);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Ensure temp buffer is allocated
|
||||
size_t requiredSize = m_width * 4 * m_height;
|
||||
if (m_tempBuffer.size() != requiredSize) {
|
||||
m_tempBuffer.resize(requiredSize);
|
||||
}
|
||||
|
||||
// Copy from IOSurface to temp buffer (handle different bytesPerRow)
|
||||
size_t dstBytesPerRow = m_width * 4;
|
||||
if (bytesPerRow == dstBytesPerRow) {
|
||||
memcpy(m_tempBuffer.data(), baseAddr, requiredSize);
|
||||
} else {
|
||||
// Row by row copy for different strides
|
||||
uint8_t* src = (uint8_t*)baseAddr;
|
||||
uint8_t* dst = m_tempBuffer.data();
|
||||
for (size_t y = 0; y < height; y++) {
|
||||
memcpy(dst + y * dstBytesPerRow, src + y * bytesPerRow, dstBytesPerRow);
|
||||
}
|
||||
}
|
||||
|
||||
IOSurfaceUnlock(surface, kIOSurfaceLockReadOnly, nullptr);
|
||||
|
||||
// Flip vertically using Accelerate framework (SIMD optimized)
|
||||
vImage_Buffer src = {
|
||||
.data = m_tempBuffer.data(),
|
||||
.height = (vImagePixelCount)height,
|
||||
.width = (vImagePixelCount)width,
|
||||
.rowBytes = dstBytesPerRow
|
||||
};
|
||||
vImage_Buffer dst = {
|
||||
.data = buffer.data(),
|
||||
.height = (vImagePixelCount)height,
|
||||
.width = (vImagePixelCount)width,
|
||||
.rowBytes = dstBytesPerRow
|
||||
};
|
||||
|
||||
vImage_Error err = vImageVerticalReflect_ARGB8888(&src, &dst, kvImageNoFlags);
|
||||
if (err != kvImageNoError) {
|
||||
// Fallback to manual flip
|
||||
for (size_t y = 0; y < height; y++) {
|
||||
memcpy(buffer.data() + (height - 1 - y) * dstBytesPerRow,
|
||||
m_tempBuffer.data() + y * dstBytesPerRow,
|
||||
dstBytesPerRow);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void ScreenHandler::start(IOCPClient* client, uint64_t clientID)
|
||||
{
|
||||
// If already running, just send TOKEN_BITMAPINFO again
|
||||
@@ -190,6 +387,10 @@ void ScreenHandler::start(IOCPClient* client, uint64_t clientID)
|
||||
void ScreenHandler::stop()
|
||||
{
|
||||
m_running = false;
|
||||
|
||||
// Wake up capture thread if waiting
|
||||
m_surfaceCond.notify_all();
|
||||
|
||||
if (m_captureThread.joinable()) {
|
||||
m_captureThread.join();
|
||||
}
|
||||
@@ -451,7 +652,27 @@ void ScreenHandler::applyQualityLevel(int8_t level, bool persist)
|
||||
|
||||
bool ScreenHandler::captureScreen(std::vector<uint8_t>& buffer)
|
||||
{
|
||||
// Create image from display
|
||||
// Try to use IOSurface from display stream (more efficient)
|
||||
if (m_displayStream) {
|
||||
IOSurfaceRef surface = nullptr;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_surfaceMutex);
|
||||
if (m_latestSurface) {
|
||||
surface = (IOSurfaceRef)CFRetain(m_latestSurface);
|
||||
}
|
||||
}
|
||||
|
||||
if (surface) {
|
||||
bool result = captureFromIOSurface(surface, buffer);
|
||||
CFRelease(surface);
|
||||
if (result) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Fall through to legacy method if IOSurface failed
|
||||
}
|
||||
|
||||
// Legacy method: CGDisplayCreateImage (fallback)
|
||||
CGImageRef image = CGDisplayCreateImage(m_displayID);
|
||||
if (!image) {
|
||||
NSLog(@"Failed to capture screen image");
|
||||
@@ -462,49 +683,58 @@ bool ScreenHandler::captureScreen(std::vector<uint8_t>& buffer)
|
||||
size_t height = CGImageGetHeight(image);
|
||||
|
||||
if (width != (size_t)m_width || height != (size_t)m_height) {
|
||||
// Screen resolution changed, need to reinitialize
|
||||
CGImageRelease(image);
|
||||
NSLog(@"Screen resolution changed: %zux%zu", width, height);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create bitmap context to get raw pixel data
|
||||
CGColorSpaceRef colorSpace = CGColorSpaceCreateDeviceRGB();
|
||||
size_t bytesPerRow = width * 4;
|
||||
|
||||
// Temporary buffer for top-down BGRA
|
||||
std::vector<uint8_t> tempBuffer(bytesPerRow * height);
|
||||
size_t requiredSize = bytesPerRow * height;
|
||||
if (m_tempBuffer.size() != requiredSize) {
|
||||
m_tempBuffer.resize(requiredSize);
|
||||
}
|
||||
|
||||
CGContextRef context = CGBitmapContextCreate(
|
||||
tempBuffer.data(),
|
||||
m_tempBuffer.data(),
|
||||
width,
|
||||
height,
|
||||
8,
|
||||
bytesPerRow,
|
||||
colorSpace,
|
||||
kCGImageAlphaPremultipliedFirst | kCGBitmapByteOrder32Little // BGRA
|
||||
m_colorSpace,
|
||||
kCGImageAlphaPremultipliedFirst | kCGBitmapByteOrder32Little
|
||||
);
|
||||
|
||||
CGColorSpaceRelease(colorSpace);
|
||||
|
||||
if (!context) {
|
||||
CGImageRelease(image);
|
||||
NSLog(@"Failed to create bitmap context");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Draw image into context
|
||||
CGContextDrawImage(context, CGRectMake(0, 0, width, height), image);
|
||||
CGContextRelease(context);
|
||||
CGImageRelease(image);
|
||||
|
||||
// Flip vertically (BMP is bottom-up, CGImage is top-down)
|
||||
for (size_t y = 0; y < height; y++) {
|
||||
size_t srcRow = y;
|
||||
size_t dstRow = height - 1 - y;
|
||||
memcpy(buffer.data() + dstRow * bytesPerRow,
|
||||
tempBuffer.data() + srcRow * bytesPerRow,
|
||||
bytesPerRow);
|
||||
// Flip vertically using Accelerate framework
|
||||
vImage_Buffer src = {
|
||||
.data = m_tempBuffer.data(),
|
||||
.height = (vImagePixelCount)height,
|
||||
.width = (vImagePixelCount)width,
|
||||
.rowBytes = bytesPerRow
|
||||
};
|
||||
vImage_Buffer dst = {
|
||||
.data = buffer.data(),
|
||||
.height = (vImagePixelCount)height,
|
||||
.width = (vImagePixelCount)width,
|
||||
.rowBytes = bytesPerRow
|
||||
};
|
||||
|
||||
vImage_Error err = vImageVerticalReflect_ARGB8888(&src, &dst, kvImageNoFlags);
|
||||
if (err != kvImageNoError) {
|
||||
for (size_t y = 0; y < height; y++) {
|
||||
memcpy(buffer.data() + (height - 1 - y) * bytesPerRow,
|
||||
m_tempBuffer.data() + y * bytesPerRow,
|
||||
bytesPerRow);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -766,10 +996,11 @@ uint8_t ScreenHandler::getCursorTypeIndex()
|
||||
// Reuse cursor position from getCursorPosition (called before this)
|
||||
CGPoint pos = s_cachedLogicalPos;
|
||||
|
||||
// Throttle: only check if cursor moved significantly or 100ms elapsed
|
||||
// Throttle: only check if cursor moved significantly or 250ms elapsed
|
||||
// (Accessibility API is expensive, cursor type is just a visual hint)
|
||||
uint64_t now = getTickMs();
|
||||
bool posChanged = (fabs(pos.x - lastPos.x) > 5 || fabs(pos.y - lastPos.y) > 5);
|
||||
if (!posChanged && (now - lastCheckTime) < 100) {
|
||||
bool posChanged = (fabs(pos.x - lastPos.x) > 10 || fabs(pos.y - lastPos.y) > 10);
|
||||
if (!posChanged && (now - lastCheckTime) < 250) {
|
||||
return cachedIndex;
|
||||
}
|
||||
lastCheckTime = now;
|
||||
@@ -842,13 +1073,12 @@ uint8_t ScreenHandler::getCursorTypeIndex()
|
||||
|
||||
void ScreenHandler::captureLoop()
|
||||
{
|
||||
NSLog(@"ScreenHandler CaptureLoop started (%dx%d)", m_width, m_height);
|
||||
NSLog(@"ScreenHandler CaptureLoop started (%dx%d)%s", m_width, m_height,
|
||||
m_displayStream ? " [CGDisplayStream]" : " [Legacy]");
|
||||
|
||||
uint8_t currentAlgo = m_algorithm.load();
|
||||
|
||||
// Always send raw first frame (TOKEN_FIRSTSCREEN) to initialize server display
|
||||
// This matches Windows client behavior: first frame is always raw bitmap,
|
||||
// even in H264 mode. Server needs TOKEN_FIRSTSCREEN to set m_bIsFirst = FALSE.
|
||||
sendFirstScreen();
|
||||
|
||||
// Small delay to ensure first frame is processed before H264 stream starts
|
||||
@@ -857,6 +1087,23 @@ void ScreenHandler::captureLoop()
|
||||
while (m_running) {
|
||||
uint64_t start = getTickMs();
|
||||
|
||||
// Wait for new frame from display stream (push model)
|
||||
// This is key optimization: CPU sleeps when screen is static
|
||||
if (m_displayStream) {
|
||||
std::unique_lock<std::mutex> lock(m_surfaceMutex);
|
||||
int fps = m_maxFPS.load();
|
||||
if (fps <= 0) fps = 15;
|
||||
int waitMs = 1000 / fps;
|
||||
|
||||
// Wait for new frame or timeout (maintains FPS even if no change)
|
||||
m_surfaceCond.wait_for(lock, std::chrono::milliseconds(waitMs), [this] {
|
||||
return m_hasNewFrame.load() || !m_running;
|
||||
});
|
||||
m_hasNewFrame.store(false);
|
||||
|
||||
if (!m_running) break;
|
||||
}
|
||||
|
||||
uint8_t algo = m_algorithm.load();
|
||||
|
||||
// Check if algorithm changed
|
||||
@@ -864,18 +1111,14 @@ void ScreenHandler::captureLoop()
|
||||
NSLog(@"Algorithm changed: %d -> %d", currentAlgo, algo);
|
||||
currentAlgo = algo;
|
||||
|
||||
// If switching to/from H264, reset encoder
|
||||
if (algo == ALGORITHM_H264) {
|
||||
// Starting H264 - will be initialized in sendH264Frame
|
||||
sendH264Frame(true); // First H264 frame is keyframe
|
||||
} else if (m_h264Encoder) {
|
||||
// Switching away from H264 - close encoder
|
||||
m_h264Encoder->close();
|
||||
m_h264Encoder.reset();
|
||||
sendFirstScreen(); // Send full frame for DIFF modes
|
||||
sendFirstScreen();
|
||||
}
|
||||
} else {
|
||||
// Normal frame
|
||||
if (algo == ALGORITHM_H264) {
|
||||
sendH264Frame(false);
|
||||
} else {
|
||||
@@ -883,14 +1126,17 @@ void ScreenHandler::captureLoop()
|
||||
}
|
||||
}
|
||||
|
||||
int fps = m_maxFPS.load();
|
||||
if (fps <= 0) fps = 10;
|
||||
int sleepMs = 1000 / fps;
|
||||
// Only use sleep-based FPS control for legacy mode
|
||||
if (!m_displayStream) {
|
||||
int fps = m_maxFPS.load();
|
||||
if (fps <= 0) fps = 10;
|
||||
int sleepMs = 1000 / fps;
|
||||
|
||||
int elapsed = (int)(getTickMs() - start);
|
||||
int wait = sleepMs - elapsed;
|
||||
if (wait > 0) {
|
||||
usleep(wait * 1000);
|
||||
int elapsed = (int)(getTickMs() - start);
|
||||
int wait = sleepMs - elapsed;
|
||||
if (wait > 0) {
|
||||
usleep(wait * 1000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user