#import "H264Encoder.h"

// NOTE(review): the names of the four system headers below were missing from
// the copy of this file that was reviewed. They are reconstructed from the
// APIs used in this translation unit -- confirm against the original file.
#import <Foundation/Foundation.h>
#import <VideoToolbox/VideoToolbox.h>
#import <CoreMedia/CoreMedia.h>
#import <CoreVideo/CoreVideo.h>

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <mutex>
#include <vector>

namespace {

// Annex B start code written in front of every NAL unit.
const uint8_t kAnnexBStartCode[4] = {0x00, 0x00, 0x00, 0x01};

// Saturates an intermediate colour value to the 0..255 range.
inline uint8_t clampToByte(int value)
{
    if (value < 0) {
        return 0;
    }
    if (value > 255) {
        return 255;
    }
    return static_cast<uint8_t>(value);
}

// Stores a 32-bit integer under `key` in a CF dictionary.
void setInt32Value(CFMutableDictionaryRef dict, CFStringRef key, int32_t value)
{
    CFNumberRef number = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &value);
    if (!number) {
        return;
    }
    CFDictionarySetValue(dict, key, number);
    CFRelease(number);
}

// Sets a 32-bit integer session property. The result is ignored on purpose:
// these properties are applied on a best-effort basis.
void setInt32Property(VTCompressionSessionRef session, CFStringRef key, int32_t value)
{
    CFNumberRef number = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &value);
    if (!number) {
        return;
    }
    VTSessionSetProperty(session, key, number);
    CFRelease(number);
}

// Copies `rows` rows of `rowBytes` tightly packed bytes from `src` into one
// plane of a locked pixel buffer, honouring the plane's own row stride.
// Returns false when the plane has no base address or its rows are too short.
bool copyPlane(CVPixelBufferRef pixelBuffer, size_t planeIndex,
               const uint8_t* src, size_t rowBytes, size_t rows)
{
    uint8_t* dst = static_cast<uint8_t*>(
        CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, planeIndex));
    const size_t dstStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, planeIndex);
    if (!dst || dstStride < rowBytes) {
        return false;
    }
    for (size_t row = 0; row < rows; ++row) {
        memcpy(dst + row * dstStride, src + row * rowBytes, rowBytes);
    }
    return true;
}

// Appends one NAL unit to `out`, preceded by the Annex B start code.
template <typename Buffer>
void appendAnnexBNal(Buffer& out, const uint8_t* nal, size_t size)
{
    out.insert(out.end(), kAnnexBStartCode, kAnnexBStartCode + sizeof(kAnnexBStartCode));
    out.insert(out.end(), nal, nal + size);
}

}  // namespace

// Creates an encoder with no session; open() must succeed before encode().
H264Encoder::H264Encoder()
    : m_session(nullptr)
    , m_width(0)
    , m_height(0)
    , m_fps(30)
    , m_bitrate(0)
    , m_forceKeyframe(false)
    , m_frameCount(0)
{
    m_lastError[0] = '\0';
}

// Tears down the compression session and releases the buffers.
H264Encoder::~H264Encoder()
{
    close();
}

// Opens (or re-opens) the encoder.
//
// width/height: frame size in pixels; each is rounded down to an even value.
// fps:          expected frame rate; values <= 0 fall back to 30.
// bitrate:      average bitrate in bits per second; values <= 0 fall back to
//               roughly 3 bits per pixel.
//
// Returns true on success. On failure the reason is stored in m_lastError,
// logged, and the encoder is left closed.
bool H264Encoder::open(int width, int height, int fps, int bitrate)
{
    close();

    // Width and height must be even for H264
    const int evenWidth = width & ~1;
    const int evenHeight = height & ~1;
    if (evenWidth <= 0 || evenHeight <= 0) {
        // Without this check a zero or negative size would be converted to a
        // huge unsigned value when sizing the plane buffers below.
        snprintf(m_lastError, sizeof(m_lastError),
                 "Invalid frame size: %dx%d", width, height);
        NSLog(@"H264Encoder: %s", m_lastError);
        return false;
    }

    m_width = evenWidth;
    m_height = evenHeight;
    m_fps = fps > 0 ? fps : 30;

    // Sizes are computed in size_t so large frames cannot overflow int.
    const size_t lumaSize = static_cast<size_t>(m_width) * static_cast<size_t>(m_height);
    const size_t chromaSize =
        static_cast<size_t>(m_width / 2) * static_cast<size_t>(m_height / 2);

    if (bitrate > 0) {
        m_bitrate = bitrate;
    } else {
        // ~3 bits per pixel default, clamped to what a 32-bit int can hold
        const uint64_t defaultBitrate = static_cast<uint64_t>(lumaSize) * 3u;
        m_bitrate = defaultBitrate > static_cast<uint64_t>(INT32_MAX)
                        ? INT32_MAX
                        : static_cast<int>(defaultBitrate);
    }

    // Allocate YUV buffers
    m_yPlane.resize(lumaSize);
    m_uPlane.resize(chromaSize);
    m_vPlane.resize(chromaSize);

    // Reserve output buffer
    m_outputBuffer.reserve(lumaSize);

    // Encoder specification: prefer hardware encoder
    CFMutableDictionaryRef encoderSpec = CFDictionaryCreateMutable(
        kCFAllocatorDefault, 0,
        &kCFTypeDictionaryKeyCallBacks,
        &kCFTypeDictionaryValueCallBacks);
    CFDictionarySetValue(encoderSpec,
                         kVTVideoEncoderSpecification_EnableHardwareAcceleratedVideoEncoder,
                         kCFBooleanTrue);

    // Source image attributes: planar I420 at the encoder's frame size
    CFMutableDictionaryRef sourceAttrs = CFDictionaryCreateMutable(
        kCFAllocatorDefault, 0,
        &kCFTypeDictionaryKeyCallBacks,
        &kCFTypeDictionaryValueCallBacks);
    setInt32Value(sourceAttrs, kCVPixelBufferPixelFormatTypeKey,
                  static_cast<int32_t>(kCVPixelFormatType_420YpCbCr8Planar));
    setInt32Value(sourceAttrs, kCVPixelBufferWidthKey, static_cast<int32_t>(m_width));
    setInt32Value(sourceAttrs, kCVPixelBufferHeightKey, static_cast<int32_t>(m_height));

    // Create compression session
    OSStatus status = VTCompressionSessionCreate(
        kCFAllocatorDefault,
        m_width,
        m_height,
        kCMVideoCodecType_H264,
        encoderSpec,
        sourceAttrs,
        kCFAllocatorDefault,
        compressionCallback,
        this,
        &m_session);

    CFRelease(encoderSpec);
    CFRelease(sourceAttrs);

    if (status != noErr) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "VTCompressionSessionCreate failed: %d", (int)status);
        NSLog(@"H264Encoder: %s", m_lastError);
        // Release the plane buffers as well, matching the failure path of
        // VTCompressionSessionPrepareToEncodeFrames below.
        m_session = nullptr;
        close();
        return false;
    }

    // Configure session properties

    // Real-time encoding
    VTSessionSetProperty(m_session, kVTCompressionPropertyKey_RealTime, kCFBooleanTrue);

    // Profile: Baseline for compatibility
    VTSessionSetProperty(m_session, kVTCompressionPropertyKey_ProfileLevel,
                         kVTProfileLevel_H264_Baseline_AutoLevel);

    // Allow frame reordering: false for low latency
    VTSessionSetProperty(m_session, kVTCompressionPropertyKey_AllowFrameReordering,
                         kCFBooleanFalse);

    // Max keyframe interval (GOP size) - match Windows x264 setting (15 seconds)
    setInt32Property(m_session, kVTCompressionPropertyKey_MaxKeyFrameInterval,
                     static_cast<int32_t>(m_fps * 15));

    // Expected frame rate
    setInt32Property(m_session, kVTCompressionPropertyKey_ExpectedFrameRate,
                     static_cast<int32_t>(m_fps));

    // Average bitrate
    setInt32Property(m_session, kVTCompressionPropertyKey_AverageBitRate,
                     static_cast<int32_t>(m_bitrate));

    // Data rate limits (for more consistent bitrate)
    // [bytes per second, duration in seconds]
    int64_t dataRateLimit = m_bitrate / 8;
    double duration = 1.0;
    CFNumberRef bytesRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt64Type, &dataRateLimit);
    CFNumberRef durationRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberFloat64Type, &duration);
    CFTypeRef limits[2] = { bytesRef, durationRef };
    CFArrayRef limitsArray = CFArrayCreate(kCFAllocatorDefault, limits, 2, &kCFTypeArrayCallBacks);
    VTSessionSetProperty(m_session, kVTCompressionPropertyKey_DataRateLimits, limitsArray);
    CFRelease(bytesRef);
    CFRelease(durationRef);
    CFRelease(limitsArray);

    // Prepare to encode
    status = VTCompressionSessionPrepareToEncodeFrames(m_session);
    if (status != noErr) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "VTCompressionSessionPrepareToEncodeFrames failed: %d", (int)status);
        NSLog(@"H264Encoder: %s", m_lastError);
        close();
        return false;
    }

    m_frameCount = 0;
    m_forceKeyframe = true;  // First frame is always keyframe

    NSLog(@"H264Encoder opened: %dx%d @ %d fps, bitrate=%d",
          m_width, m_height, m_fps, m_bitrate);
    return true;
}

// Invalidates and releases the compression session (if any) and empties the
// plane and output buffers. Safe to call when the encoder is already closed.
void H264Encoder::close()
{
    if (m_session) {
        VTCompressionSessionInvalidate(m_session);
        CFRelease(m_session);
        m_session = nullptr;
    }

    m_yPlane.clear();
    m_uPlane.clear();
    m_vPlane.clear();
    m_outputBuffer.clear();
}

// Converts a BGRA image (4 bytes per pixel, rows `stride` bytes apart) into
// the m_yPlane / m_uPlane / m_vPlane I420 buffers.
//
// Integer colour conversion used below:
//   Y = ((  66*R + 129*G +  25*B + 128) >> 8) + 16
//   U = (( -38*R -  74*G + 112*B + 128) >> 8) + 128
//   V = (( 112*R -  94*G -  18*B + 128) >> 8) + 128
// Chroma is computed from the average of each 2x2 pixel block.
//
// The plane buffers are sized by open(); callers must pass the same even
// width/height the encoder was opened with (encode() checks this).
// flipVertical reads the source rows bottom-up.
void H264Encoder::convertBGRAtoI420(const uint8_t* bgra, uint32_t stride,
                                    uint32_t width, uint32_t height,
                                    bool flipVertical)
{
    uint8_t* const lumaPlane = m_yPlane.data();
    uint8_t* const cbPlane = m_uPlane.data();
    uint8_t* const crPlane = m_vPlane.data();

    // Chroma loops are bounded by these so they never write outside a
    // (width / 2) x (height / 2) plane, even for an odd width or height.
    const uint32_t chromaWidth = width / 2;
    const uint32_t chromaHeight = height / 2;

    for (uint32_t y = 0; y < height; ++y) {
        // Source row (handle vertical flip)
        const uint32_t srcY = flipVertical ? (height - 1 - y) : y;
        const uint8_t* const srcRow = bgra + static_cast<size_t>(srcY) * stride;

        // Y plane: one sample per pixel
        uint8_t* const lumaRow = lumaPlane + static_cast<size_t>(y) * width;
        for (uint32_t x = 0; x < width; ++x) {
            const uint8_t* const px = srcRow + static_cast<size_t>(x) * 4;
            const int b = px[0];
            const int g = px[1];
            const int r = px[2];
            lumaRow[x] = clampToByte(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);
        }

        // UV planes (subsampled 2x2): handled once per pair of output rows
        const uint32_t chromaY = y / 2;
        if ((y % 2) != 0 || chromaY >= chromaHeight) {
            continue;
        }

        // Second source row of the 2x2 block. Because y is even and
        // chromaY < chromaHeight, y + 1 <= height - 1, so both forms below
        // stay inside the image. Computed once per row, not once per pixel.
        const uint32_t pairY = flipVertical ? (height - 2 - y) : (y + 1);
        const uint8_t* const pairRow = bgra + static_cast<size_t>(pairY) * stride;

        uint8_t* const cbRow = cbPlane + static_cast<size_t>(chromaY) * chromaWidth;
        uint8_t* const crRow = crPlane + static_cast<size_t>(chromaY) * chromaWidth;

        for (uint32_t cx = 0; cx < chromaWidth; ++cx) {
            // Two horizontally adjacent BGRA pixels occupy 8 bytes.
            const uint8_t* const top = srcRow + static_cast<size_t>(cx) * 8;
            const uint8_t* const bottom = pairRow + static_cast<size_t>(cx) * 8;

            // Average 2x2 block
            const int b = (top[0] + top[4] + bottom[0] + bottom[4]) / 4;
            const int g = (top[1] + top[5] + bottom[1] + bottom[5]) / 4;
            const int r = (top[2] + top[6] + bottom[2] + bottom[6]) / 4;

            // U component
            cbRow[cx] = clampToByte(((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128);
            // V component
            crRow[cx] = clampToByte(((112 * r - 94 * g - 18 * b + 128) >> 8) + 128);
        }
    }
}

// Encodes one BGRA frame and returns the resulting Annex B byte stream.
//
// bgra:         source pixels, 4 bytes per pixel in B,G,R,A order.
// bpp:          not used by this implementation; 4 bytes per pixel is assumed.
// stride:       distance in bytes between source rows.
// width/height: must equal the size the encoder was opened with.
// outData:      receives a pointer into the encoder's internal output buffer;
//               the buffer is reused by the next encode() and emptied by close().
// outSize:      receives the number of valid bytes at *outData.
// flipVertical: read the source image bottom-up.
//
// Returns the number of encoded bytes, or 0 when nothing was produced. On
// failure *outData is nullptr and *outSize is 0; most failures also store a
// message in m_lastError.
int H264Encoder::encode(const uint8_t* bgra, uint8_t bpp, uint32_t stride,
                        uint32_t width, uint32_t height,
                        uint8_t** outData, uint32_t* outSize,
                        bool flipVertical)
{
    (void)bpp;

    if (!m_session) {
        snprintf(m_lastError, sizeof(m_lastError), "Encoder not initialized");
        return 0;
    }

    if (!bgra || !outData || !outSize) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "Invalid arguments: null input or output pointer");
        return 0;
    }

    // Make every failure path below leave the outputs in a defined state.
    *outData = nullptr;
    *outSize = 0;

    if (width != (uint32_t)m_width || height != (uint32_t)m_height) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "Frame size mismatch: expected %dx%d, got %dx%d",
                 m_width, m_height, (int)width, (int)height);
        return 0;
    }

    // Convert BGRA to I420
    convertBGRAtoI420(bgra, stride, width, height, flipVertical);

    // Create CVPixelBuffer
    CVPixelBufferRef pixelBuffer = nullptr;
    NSDictionary* options = @{
        (id)kCVPixelBufferIOSurfacePropertiesKey: @{}
    };

    CVReturn cvRet = CVPixelBufferCreate(
        kCFAllocatorDefault,
        m_width,
        m_height,
        kCVPixelFormatType_420YpCbCr8Planar,
        (__bridge CFDictionaryRef)options,
        &pixelBuffer);

    if (cvRet != kCVReturnSuccess) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "CVPixelBufferCreate failed: %d", (int)cvRet);
        return 0;
    }

    // Lock and copy YUV data
    cvRet = CVPixelBufferLockBaseAddress(pixelBuffer, 0);
    if (cvRet != kCVReturnSuccess) {
        CVPixelBufferRelease(pixelBuffer);
        snprintf(m_lastError, sizeof(m_lastError),
                 "CVPixelBufferLockBaseAddress failed: %d", (int)cvRet);
        return 0;
    }

    const size_t planeCount = CVPixelBufferGetPlaneCount(pixelBuffer);
    if (planeCount < 3) {
        CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);
        CVPixelBufferRelease(pixelBuffer);
        snprintf(m_lastError, sizeof(m_lastError),
                 "CVPixelBuffer has %zu planes, expected 3", planeCount);
        return 0;
    }

    const size_t lumaWidth = static_cast<size_t>(m_width);
    const size_t lumaHeight = static_cast<size_t>(m_height);
    const size_t chromaWidth = static_cast<size_t>(m_width / 2);
    const size_t chromaHeight = static_cast<size_t>(m_height / 2);

    // Y, U and V planes, in that order
    const bool planesCopied =
        copyPlane(pixelBuffer, 0, m_yPlane.data(), lumaWidth, lumaHeight) &&
        copyPlane(pixelBuffer, 1, m_uPlane.data(), chromaWidth, chromaHeight) &&
        copyPlane(pixelBuffer, 2, m_vPlane.data(), chromaWidth, chromaHeight);

    CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);

    if (!planesCopied) {
        CVPixelBufferRelease(pixelBuffer);
        snprintf(m_lastError, sizeof(m_lastError),
                 "CVPixelBuffer plane is missing or too small");
        return 0;
    }

    // Prepare frame properties: request a keyframe when one is pending
    CFMutableDictionaryRef frameProps = nullptr;
    if (m_forceKeyframe.exchange(false)) {
        frameProps = CFDictionaryCreateMutable(
            kCFAllocatorDefault, 1,
            &kCFTypeDictionaryKeyCallBacks,
            &kCFTypeDictionaryValueCallBacks);
        CFDictionarySetValue(frameProps, kVTEncodeFrameOptionKey_ForceKeyFrame, kCFBooleanTrue);
    }

    // Clear output buffer
    {
        // decltype keeps the guard independent of the exact mutex type
        // declared in the header.
        std::lock_guard<decltype(m_outputMutex)> lock(m_outputMutex);
        m_outputBuffer.clear();
    }

    // Presentation timestamp
    CMTime pts = CMTimeMake(m_frameCount++, m_fps);

    // Encode frame
    OSStatus status = VTCompressionSessionEncodeFrame(
        m_session,
        pixelBuffer,
        pts,
        kCMTimeInvalid,
        frameProps,
        nullptr,
        nullptr);

    if (frameProps) {
        CFRelease(frameProps);
    }
    CVPixelBufferRelease(pixelBuffer);

    if (status != noErr) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "VTCompressionSessionEncodeFrame failed: %d", (int)status);
        return 0;
    }

    // Wait for encoding to complete
    VTCompressionSessionCompleteFrames(m_session, kCMTimeInvalid);

    // Return encoded data
    std::lock_guard<decltype(m_outputMutex)> lock(m_outputMutex);
    if (m_outputBuffer.empty()) {
        return 0;
    }

    *outData = m_outputBuffer.data();
    *outSize = (uint32_t)m_outputBuffer.size();
    return (int)m_outputBuffer.size();
}

// Output callback registered with VTCompressionSessionCreate in open().
// outputCallbackRefCon is the H264Encoder that owns the session. Errors are
// logged and frames without a sample buffer are ignored.
void H264Encoder::compressionCallback(void* outputCallbackRefCon,
                                      void* sourceFrameRefCon,
                                      OSStatus status,
                                      VTEncodeInfoFlags infoFlags,
                                      CMSampleBufferRef sampleBuffer)
{
    (void)sourceFrameRefCon;
    (void)infoFlags;

    if (status != noErr) {
        NSLog(@"H264Encoder: Compression callback error: %d", (int)status);
        return;
    }

    if (!sampleBuffer) {
        return;
    }

    H264Encoder* encoder = static_cast<H264Encoder*>(outputCallbackRefCon);
    encoder->processSampleBuffer(sampleBuffer);
}

// Converts one encoded sample from AVCC (length-prefixed NAL units) to Annex B
// (start-code-prefixed NAL units) and stores it in m_outputBuffer, replacing
// the previous contents. SPS and PPS are written first for keyframes.
// m_outputBuffer is modified under m_outputMutex. If the sample's payload
// cannot be read, m_outputBuffer is left empty.
void H264Encoder::processSampleBuffer(CMSampleBufferRef sampleBuffer)
{
    // Check if keyframe: a sample is a sync sample unless NotSync is set
    bool isKeyframe = false;
    CFArrayRef attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, false);
    if (attachments && CFArrayGetCount(attachments) > 0) {
        CFDictionaryRef dict =
            static_cast<CFDictionaryRef>(CFArrayGetValueAtIndex(attachments, 0));
        CFBooleanRef notSync = static_cast<CFBooleanRef>(
            CFDictionaryGetValue(dict, kCMSampleAttachmentKey_NotSync));
        isKeyframe = (notSync == nullptr || !CFBooleanGetValue(notSync));
    }

    // decltype keeps the guard independent of the exact mutex type declared
    // in the header.
    std::lock_guard<decltype(m_outputMutex)> lock(m_outputMutex);
    m_outputBuffer.clear();

    // Resolve the encoded payload before writing anything, so that a read
    // failure does not leave a parameter-set-only buffer behind.
    CMBlockBufferRef blockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer);
    if (!blockBuffer) {
        return;
    }

    size_t totalLength = 0;
    size_t lengthAtOffset = 0;
    char* dataPointer = nullptr;
    OSStatus status = CMBlockBufferGetDataPointer(
        blockBuffer, 0, &lengthAtOffset, &totalLength, &dataPointer);
    if (status != noErr || !dataPointer) {
        return;
    }

    // CMBlockBufferGetDataPointer only guarantees `lengthAtOffset` contiguous
    // bytes at the returned pointer. When the block buffer is split into
    // several memory regions, gather the payload into a scratch buffer
    // instead of reading past the first region.
    const uint8_t* payload = reinterpret_cast<const uint8_t*>(dataPointer);
    std::vector<uint8_t> scratch;
    if (lengthAtOffset < totalLength) {
        scratch.resize(totalLength);
        status = CMBlockBufferCopyDataBytes(blockBuffer, 0, totalLength, scratch.data());
        if (status != noErr) {
            return;
        }
        payload = scratch.data();
    }

    // Get format description for SPS/PPS
    CMFormatDescriptionRef formatDesc = CMSampleBufferGetFormatDescription(sampleBuffer);

    // If keyframe, prepend SPS (parameter set 0) and PPS (parameter set 1)
    if (isKeyframe && formatDesc) {
        for (size_t index = 0; index < 2; ++index) {
            const uint8_t* parameterSet = nullptr;
            size_t parameterSetSize = 0;
            status = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(
                formatDesc, index, &parameterSet, &parameterSetSize, nullptr, nullptr);
            if (status == noErr && parameterSet && parameterSetSize > 0) {
                appendAnnexBNal(m_outputBuffer, parameterSet, parameterSetSize);
            }
        }
    }

    // Get NAL unit length size from format description (usually 4 bytes)
    size_t nalLengthSize = 4;
    if (formatDesc) {
        int reportedLengthSize = 0;
        status = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(
            formatDesc, 0, nullptr, nullptr, nullptr, &reportedLengthSize);
        if (status == noErr && reportedLengthSize > 0 && reportedLengthSize <= 4) {
            nalLengthSize = static_cast<size_t>(reportedLengthSize);
        }
    }

    // Convert AVCC format (length-prefixed) to Annex B (start code prefixed).
    // The loop condition guarantees that a complete length prefix is
    // available before it is read.
    size_t offset = 0;
    while (offset + nalLengthSize <= totalLength) {
        // Read NAL unit length (big-endian, variable size)
        uint32_t nalLength = 0;
        for (size_t i = 0; i < nalLengthSize; ++i) {
            nalLength = (nalLength << 8) | payload[offset + i];
        }
        offset += nalLengthSize;

        if (nalLength > totalLength - offset) {
            // Length prefix points past the end of the payload: stop here.
            break;
        }

        if (nalLength > 0) {
            appendAnnexBNal(m_outputBuffer, payload + offset, nalLength);
        }
        offset += nalLength;
    }
}