#import "H264Encoder.h"
#import <VideoToolbox/VideoToolbox.h>
#import <CoreMedia/CoreMedia.h>
#import <CoreVideo/CoreVideo.h>
#import <Cocoa/Cocoa.h>
#include <vector>

// Constructs an encoder with no compression session. Dimensions and bitrate
// start at zero, the frame rate defaults to 30, and no keyframe is pending.
// open() must succeed before encode() can produce output.
H264Encoder::H264Encoder() :
    m_session(nullptr),
    m_width(0),
    m_height(0),
    m_fps(30),
    m_bitrate(0),
    m_forceKeyframe(false),
    m_frameCount(0)
{
    // Begin with an empty error message.
    *m_lastError = '\0';
}

// Destroys the encoder, tearing down any active compression session and
// emptying the working buffers through close().
H264Encoder::~H264Encoder()
{
    this->close();
}

// Opens (or re-opens) the encoder for frames of the given size.
//
// Any existing session is closed first. Width and height are rounded down to
// the nearest even value. A non-positive fps falls back to 30, and a
// non-positive bitrate falls back to roughly 3 bits per pixel.
//
// Parameters:
//   width, height - source frame size in pixels (must be at least 2 each)
//   fps           - expected frame rate; also used as the PTS timescale in encode()
//   bitrate       - target average bitrate in bits per second
//
// Returns true on success. On failure returns false, leaves the encoder
// closed, and stores a description in m_lastError.
bool H264Encoder::open(int width, int height, int fps, int bitrate)
{
    close();

    // Width and height must be even for H264
    m_width = width & ~1;
    m_height = height & ~1;

    // Reject sizes that round down to zero or are negative: they would yield
    // zero-sized or (after unsigned conversion) enormous plane allocations.
    if (m_width <= 0 || m_height <= 0) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "Invalid frame size: %dx%d", width, height);
        NSLog(@"H264Encoder: %s", m_lastError);
        m_width = 0;
        m_height = 0;
        return false;
    }

    m_fps = fps > 0 ? fps : 30;

    if (bitrate > 0) {
        m_bitrate = bitrate;
    } else {
        // ~3 bits per pixel default, computed in 64 bits so that very large
        // frame sizes cannot overflow the 32-bit bitrate.
        int64_t defaultBitrate = (int64_t)m_width * (int64_t)m_height * 3;
        m_bitrate = defaultBitrate > INT32_MAX ? INT32_MAX : (int)defaultBitrate;
    }

    // Allocate YUV buffers (full-resolution Y, quarter-resolution U and V)
    const size_t ySize = (size_t)m_width * (size_t)m_height;
    const size_t uvSize = (size_t)(m_width / 2) * (size_t)(m_height / 2);
    m_yPlane.resize(ySize);
    m_uPlane.resize(uvSize);
    m_vPlane.resize(uvSize);

    // Reserve output buffer
    m_outputBuffer.reserve(ySize);

    // Encoder specification: prefer hardware encoder
    CFMutableDictionaryRef encoderSpec = CFDictionaryCreateMutable(
        kCFAllocatorDefault, 0,
        &kCFTypeDictionaryKeyCallBacks,
        &kCFTypeDictionaryValueCallBacks
    );
    CFDictionarySetValue(encoderSpec,
        kVTVideoEncoderSpecification_EnableHardwareAcceleratedVideoEncoder,
        kCFBooleanTrue);

    // Source image attributes: I420 pixel format at the (even) frame size
    CFMutableDictionaryRef sourceAttrs = CFDictionaryCreateMutable(
        kCFAllocatorDefault, 0,
        &kCFTypeDictionaryKeyCallBacks,
        &kCFTypeDictionaryValueCallBacks
    );

    int32_t pixelFormat = kCVPixelFormatType_420YpCbCr8Planar;  // I420
    CFNumberRef pixelFormatNum = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &pixelFormat);
    CFDictionarySetValue(sourceAttrs, kCVPixelBufferPixelFormatTypeKey, pixelFormatNum);
    CFRelease(pixelFormatNum);

    int32_t widthNum = m_width;
    int32_t heightNum = m_height;
    CFNumberRef widthRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &widthNum);
    CFNumberRef heightRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &heightNum);
    CFDictionarySetValue(sourceAttrs, kCVPixelBufferWidthKey, widthRef);
    CFDictionarySetValue(sourceAttrs, kCVPixelBufferHeightKey, heightRef);
    CFRelease(widthRef);
    CFRelease(heightRef);

    // Create compression session; encoded frames are delivered to
    // compressionCallback with `this` as the callback context.
    OSStatus status = VTCompressionSessionCreate(
        kCFAllocatorDefault,
        m_width,
        m_height,
        kCMVideoCodecType_H264,
        encoderSpec,
        sourceAttrs,
        kCFAllocatorDefault,
        compressionCallback,
        this,
        &m_session
    );

    CFRelease(encoderSpec);
    CFRelease(sourceAttrs);

    if (status != noErr) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "VTCompressionSessionCreate failed: %d", (int)status);
        NSLog(@"H264Encoder: %s", m_lastError);
        // No usable session exists; release the buffers allocated above so a
        // failed open() leaves the encoder in the same state as close().
        m_session = nullptr;
        close();
        return false;
    }

    // Configure session properties. These are best-effort: a rejected
    // property is logged but does not abort open().
    auto setProperty = [this](CFStringRef key, CFTypeRef value, const char* label) {
        OSStatus rc = VTSessionSetProperty(m_session, key, value);
        if (rc != noErr) {
            NSLog(@"H264Encoder: failed to set %s: %d", label, (int)rc);
        }
    };

    // Real-time encoding
    setProperty(kVTCompressionPropertyKey_RealTime, kCFBooleanTrue, "RealTime");

    // Profile: Baseline for compatibility
    setProperty(kVTCompressionPropertyKey_ProfileLevel,
                kVTProfileLevel_H264_Baseline_AutoLevel, "ProfileLevel");

    // Allow frame reordering: false for low latency
    setProperty(kVTCompressionPropertyKey_AllowFrameReordering,
                kCFBooleanFalse, "AllowFrameReordering");

    // Max keyframe interval (GOP size) - match Windows x264 setting (15 seconds)
    int32_t keyframeInterval = m_fps * 15;  // Keyframe every 15 seconds
    CFNumberRef keyframeRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &keyframeInterval);
    setProperty(kVTCompressionPropertyKey_MaxKeyFrameInterval, keyframeRef, "MaxKeyFrameInterval");
    CFRelease(keyframeRef);

    // Expected frame rate
    int32_t fpsValue = m_fps;
    CFNumberRef fpsRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &fpsValue);
    setProperty(kVTCompressionPropertyKey_ExpectedFrameRate, fpsRef, "ExpectedFrameRate");
    CFRelease(fpsRef);

    // Average bitrate
    int32_t bitrateValue = m_bitrate;
    CFNumberRef bitrateRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &bitrateValue);
    setProperty(kVTCompressionPropertyKey_AverageBitRate, bitrateRef, "AverageBitRate");
    CFRelease(bitrateRef);

    // Data rate limits (for more consistent bitrate)
    // [bytes per second, duration in seconds]
    int64_t dataRateLimit = m_bitrate / 8;
    double duration = 1.0;
    CFNumberRef bytesRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt64Type, &dataRateLimit);
    CFNumberRef durationRef = CFNumberCreate(kCFAllocatorDefault, kCFNumberFloat64Type, &duration);
    CFTypeRef limits[2] = { bytesRef, durationRef };
    CFArrayRef limitsArray = CFArrayCreate(kCFAllocatorDefault, limits, 2, &kCFTypeArrayCallBacks);
    setProperty(kVTCompressionPropertyKey_DataRateLimits, limitsArray, "DataRateLimits");
    CFRelease(bytesRef);
    CFRelease(durationRef);
    CFRelease(limitsArray);

    // Prepare to encode
    status = VTCompressionSessionPrepareToEncodeFrames(m_session);
    if (status != noErr) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "VTCompressionSessionPrepareToEncodeFrames failed: %d", (int)status);
        NSLog(@"H264Encoder: %s", m_lastError);
        close();
        return false;
    }

    m_frameCount = 0;
    m_forceKeyframe = true;  // First frame is always keyframe

    NSLog(@"H264Encoder opened: %dx%d @ %d fps, bitrate=%d",
          m_width, m_height, m_fps, m_bitrate);

    return true;
}

// Shuts the encoder down: invalidates and releases the compression session
// (if one exists) and empties the YUV plane and output buffers. Calling it
// when no session is open only empties the buffers.
void H264Encoder::close()
{
    if (m_session != nullptr) {
        // Invalidate before dropping our reference to the session.
        VTCompressionSessionInvalidate(m_session);
        CFRelease(m_session);
    }
    m_session = nullptr;

    // Discard any encoded output and the intermediate I420 planes.
    m_outputBuffer.clear();
    m_vPlane.clear();
    m_uPlane.clear();
    m_yPlane.clear();
}

// Converts a BGRA image into the I420 (YUV 4:2:0 planar) member planes
// m_yPlane, m_uPlane and m_vPlane using integer fixed-point coefficients.
//
// Parameters:
//   bgra         - source pixels, 4 bytes per pixel in B, G, R, (unused) order
//   stride       - bytes between the starts of consecutive source rows
//   width/height - image size in pixels
//   flipVertical - when true, source rows are read bottom-to-top
//
// Luma is computed per pixel. Chroma is computed once per 2x2 block from the
// average of the block's pixels. The destination planes must already be sized
// for width x height; this function does not resize them.
void H264Encoder::convertBGRAtoI420(const uint8_t* bgra, uint32_t stride,
                                     uint32_t width, uint32_t height,
                                     bool flipVertical)
{
    // Clamp an intermediate result into the 0..255 byte range.
    auto saturate = [](int value) -> uint8_t {
        if (value < 0) return 0;
        if (value > 255) return 255;
        return (uint8_t)value;
    };

    uint8_t* const lumaPlane = m_yPlane.data();
    uint8_t* const cbPlane = m_uPlane.data();
    uint8_t* const crPlane = m_vPlane.data();

    const int chromaStride = width / 2;

    for (uint32_t row = 0; row < height; ++row) {
        // Pick the source line for this output row, honouring the flip.
        const uint32_t topIndex = flipVertical ? (height - 1 - row) : row;
        const uint8_t* topLine = bgra + topIndex * stride;

        // Luma: one sample per pixel.
        uint8_t* lumaOut = lumaPlane + row * width;
        for (uint32_t col = 0; col < width; ++col) {
            const uint8_t blue = topLine[col * 4 + 0];
            const uint8_t green = topLine[col * 4 + 1];
            const uint8_t red = topLine[col * 4 + 2];

            lumaOut[col] = saturate(((66 * red + 129 * green + 25 * blue + 128) >> 8) + 16);
        }

        // Chroma is produced on even output rows only (2x2 subsampling).
        if (row % 2 != 0) {
            continue;
        }

        // Second source line of the 2x2 block; when it would fall outside the
        // image, reuse the first line instead.
        uint32_t bottomIndex = flipVertical ? (height - 2 - row) : (row + 1);
        if (bottomIndex >= height) {
            bottomIndex = topIndex;
        }
        const uint8_t* bottomLine = bgra + bottomIndex * stride;

        uint8_t* cbOut = cbPlane + (row / 2) * chromaStride;
        uint8_t* crOut = crPlane + (row / 2) * chromaStride;

        for (uint32_t col = 0; col < width; col += 2) {
            // Left column of the block (top + bottom).
            int sumB = topLine[col * 4 + 0] + bottomLine[col * 4 + 0];
            int sumG = topLine[col * 4 + 1] + bottomLine[col * 4 + 1];
            int sumR = topLine[col * 4 + 2] + bottomLine[col * 4 + 2];

            // Right column of the block, when it exists.
            if (col + 1 < width) {
                sumB += topLine[(col + 1) * 4 + 0] + bottomLine[(col + 1) * 4 + 0];
                sumG += topLine[(col + 1) * 4 + 1] + bottomLine[(col + 1) * 4 + 1];
                sumR += topLine[(col + 1) * 4 + 2] + bottomLine[(col + 1) * 4 + 2];
            }

            // The sums are always divided by four, even for a partial block.
            const int avgR = sumR / 4;
            const int avgG = sumG / 4;
            const int avgB = sumB / 4;

            // U (Cb) and V (Cr) components
            cbOut[col / 2] = saturate(((-38 * avgR - 74 * avgG + 112 * avgB + 128) >> 8) + 128);
            crOut[col / 2] = saturate(((112 * avgR - 94 * avgG - 18 * avgB + 128) >> 8) + 128);
        }
    }
}

// Encodes one BGRA frame and returns the resulting Annex B H.264 bytes.
//
// Parameters:
//   bgra         - source pixels, 4 bytes per pixel (B, G, R, unused)
//   bpp          - not consulted; the conversion always reads 4 bytes per pixel
//   stride       - bytes per source row; must be at least width * 4
//   width/height - must equal the (even) size the encoder was opened with
//   outData      - receives a pointer into the encoder's internal output
//                  buffer; it is only valid until the next encode() or close()
//   outSize      - receives the number of encoded bytes
//   flipVertical - when true, the source is read bottom-to-top
//
// Returns the number of encoded bytes, or 0 when no output was produced
// (error, or the encoder emitted nothing for this frame). On every 0 return
// *outData is nullptr and *outSize is 0. Error paths store a description in
// m_lastError.
int H264Encoder::encode(const uint8_t* bgra, uint8_t bpp, uint32_t stride,
                         uint32_t width, uint32_t height,
                         uint8_t** outData, uint32_t* outSize,
                         bool flipVertical)
{
    (void)bpp;

    // Give the outputs a defined value on every failure path.
    if (outData) {
        *outData = nullptr;
    }
    if (outSize) {
        *outSize = 0;
    }

    if (!m_session) {
        snprintf(m_lastError, sizeof(m_lastError), "Encoder not initialized");
        return 0;
    }

    if (width != (uint32_t)m_width || height != (uint32_t)m_height) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "Frame size mismatch: expected %dx%d, got %dx%d",
                 m_width, m_height, (int)width, (int)height);
        return 0;
    }

    if (!bgra) {
        snprintf(m_lastError, sizeof(m_lastError), "Source frame is null");
        return 0;
    }

    // The converter reads width * 4 bytes from every row; a shorter stride
    // would make it run past the end of the source image.
    if ((uint64_t)stride < (uint64_t)width * 4) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "Stride %u too small for width %u", (unsigned)stride, (unsigned)width);
        return 0;
    }

    // Convert BGRA to I420
    convertBGRAtoI420(bgra, stride, width, height, flipVertical);

    // Create CVPixelBuffer. The options dictionary is an autoreleased
    // Objective-C object, so scope it to a local pool in case the calling
    // thread does not drain one.
    CVPixelBufferRef pixelBuffer = nullptr;
    CVReturn cvRet = kCVReturnSuccess;
    @autoreleasepool {
        NSDictionary* options = @{
            (id)kCVPixelBufferIOSurfacePropertiesKey: @{}
        };

        cvRet = CVPixelBufferCreate(
            kCFAllocatorDefault,
            m_width,
            m_height,
            kCVPixelFormatType_420YpCbCr8Planar,
            (__bridge CFDictionaryRef)options,
            &pixelBuffer
        );
    }

    if (cvRet != kCVReturnSuccess) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "CVPixelBufferCreate failed: %d", (int)cvRet);
        return 0;
    }

    // Lock and copy YUV data
    cvRet = CVPixelBufferLockBaseAddress(pixelBuffer, 0);
    if (cvRet != kCVReturnSuccess) {
        CVPixelBufferRelease(pixelBuffer);
        snprintf(m_lastError, sizeof(m_lastError),
                 "CVPixelBufferLockBaseAddress failed: %d", (int)cvRet);
        return 0;
    }

    size_t planeCount = CVPixelBufferGetPlaneCount(pixelBuffer);
    if (planeCount < 3) {
        CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);
        CVPixelBufferRelease(pixelBuffer);
        snprintf(m_lastError, sizeof(m_lastError),
                 "CVPixelBuffer has %zu planes, expected 3", planeCount);
        return 0;
    }

    uint8_t* yDst = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 0);
    uint8_t* uDst = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 1);
    uint8_t* vDst = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 2);
    if (!yDst || !uDst || !vDst) {
        CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);
        CVPixelBufferRelease(pixelBuffer);
        snprintf(m_lastError, sizeof(m_lastError),
                 "CVPixelBuffer plane base address is null");
        return 0;
    }

    // Copy row by row: the pixel buffer's rows may be padded beyond the
    // tightly packed width used by the member planes.

    // Y plane
    size_t yStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 0);
    for (int y = 0; y < m_height; y++) {
        memcpy(yDst + y * yStride, m_yPlane.data() + y * m_width, m_width);
    }

    // U plane
    size_t uStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 1);
    int uvHeight = m_height / 2;
    int uvWidth = m_width / 2;
    for (int y = 0; y < uvHeight; y++) {
        memcpy(uDst + y * uStride, m_uPlane.data() + y * uvWidth, uvWidth);
    }

    // V plane
    size_t vStride = CVPixelBufferGetBytesPerRowOfPlane(pixelBuffer, 2);
    for (int y = 0; y < uvHeight; y++) {
        memcpy(vDst + y * vStride, m_vPlane.data() + y * uvWidth, uvWidth);
    }

    CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);

    // Prepare frame properties. The keyframe request is consumed here and
    // re-armed below if this frame ends up producing no output.
    const bool forcedKeyframe = m_forceKeyframe.exchange(false);
    CFMutableDictionaryRef frameProps = nullptr;
    if (forcedKeyframe) {
        frameProps = CFDictionaryCreateMutable(
            kCFAllocatorDefault, 1,
            &kCFTypeDictionaryKeyCallBacks,
            &kCFTypeDictionaryValueCallBacks
        );
        CFDictionarySetValue(frameProps,
            kVTEncodeFrameOptionKey_ForceKeyFrame,
            kCFBooleanTrue);
    }

    // Clear output buffer
    {
        std::lock_guard<std::mutex> lock(m_outputMutex);
        m_outputBuffer.clear();
    }

    // Presentation timestamp: frame index over the frame rate
    CMTime pts = CMTimeMake(m_frameCount++, m_fps);

    // Encode frame
    OSStatus status = VTCompressionSessionEncodeFrame(
        m_session,
        pixelBuffer,
        pts,
        kCMTimeInvalid,
        frameProps,
        nullptr,
        nullptr
    );

    if (frameProps) {
        CFRelease(frameProps);
    }
    CVPixelBufferRelease(pixelBuffer);

    if (status != noErr) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "VTCompressionSessionEncodeFrame failed: %d", (int)status);
        if (forcedKeyframe) {
            m_forceKeyframe = true;  // Request was not honoured; retry next frame
        }
        return 0;
    }

    // Wait for encoding to complete. A failure is recorded, but any output
    // the callback already delivered is still returned.
    status = VTCompressionSessionCompleteFrames(m_session, kCMTimeInvalid);
    if (status != noErr) {
        snprintf(m_lastError, sizeof(m_lastError),
                 "VTCompressionSessionCompleteFrames failed: %d", (int)status);
    }

    // Return encoded data
    std::lock_guard<std::mutex> lock(m_outputMutex);
    if (m_outputBuffer.empty()) {
        if (forcedKeyframe) {
            m_forceKeyframe = true;  // No frame came out; keep the request pending
        }
        return 0;
    }

    if (outData) {
        *outData = m_outputBuffer.data();
    }
    if (outSize) {
        *outSize = (uint32_t)m_outputBuffer.size();
    }
    return (int)m_outputBuffer.size();
}

// Output callback handed to VTCompressionSessionCreate in open(). The callback
// context is the owning H264Encoder. Successful, non-null sample buffers are
// forwarded to processSampleBuffer(); a failed status is logged and the frame
// is discarded.
void H264Encoder::compressionCallback(void* outputCallbackRefCon,
                                       void* sourceFrameRefCon,
                                       OSStatus status,
                                       VTEncodeInfoFlags infoFlags,
                                       CMSampleBufferRef sampleBuffer)
{
    // Neither the per-frame context nor the info flags are used.
    (void)infoFlags;
    (void)sourceFrameRefCon;

    if (status == noErr) {
        if (sampleBuffer) {
            H264Encoder* owner = static_cast<H264Encoder*>(outputCallbackRefCon);
            owner->processSampleBuffer(sampleBuffer);
        }
        return;
    }

    NSLog(@"H264Encoder: Compression callback error: %d", (int)status);
}

// Converts one encoded sample buffer into an Annex B byte stream stored in
// m_outputBuffer (replacing its previous contents) while holding m_outputMutex.
//
// For keyframes the SPS (parameter set 0) and PPS (parameter set 1) from the
// format description are written first, each behind a 4-byte start code.
// The sample data is then walked as length-prefixed (AVCC) NAL units, and each
// unit is rewritten with a 4-byte start code in place of its length prefix.
// Truncated or malformed trailing data is dropped rather than read past.
void H264Encoder::processSampleBuffer(CMSampleBufferRef sampleBuffer)
{
    static const uint8_t kStartCode[4] = {0x00, 0x00, 0x00, 0x01};

    // Check if keyframe: a sample is a sync sample unless flagged NotSync
    CFArrayRef attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, false);
    bool isKeyframe = false;
    if (attachments && CFArrayGetCount(attachments) > 0) {
        CFDictionaryRef dict = (CFDictionaryRef)CFArrayGetValueAtIndex(attachments, 0);
        CFBooleanRef notSync = (CFBooleanRef)CFDictionaryGetValue(dict,
            kCMSampleAttachmentKey_NotSync);
        isKeyframe = (notSync == nullptr || !CFBooleanGetValue(notSync));
    }

    std::lock_guard<std::mutex> lock(m_outputMutex);
    m_outputBuffer.clear();

    // Get format description for SPS/PPS
    CMFormatDescriptionRef formatDesc = CMSampleBufferGetFormatDescription(sampleBuffer);

    // If keyframe, prepend SPS (index 0) and PPS (index 1)
    if (isKeyframe && formatDesc) {
        for (size_t index = 0; index < 2; index++) {
            const uint8_t* paramSet = nullptr;
            size_t paramSetSize = 0;
            OSStatus psStatus = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(
                formatDesc, index, &paramSet, &paramSetSize, nullptr, nullptr);

            if (psStatus == noErr && paramSet && paramSetSize > 0) {
                // Write NAL start code + parameter set
                m_outputBuffer.insert(m_outputBuffer.end(), kStartCode, kStartCode + 4);
                m_outputBuffer.insert(m_outputBuffer.end(), paramSet, paramSet + paramSetSize);
            }
        }
    }

    // Get encoded data
    CMBlockBufferRef blockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer);
    if (!blockBuffer) {
        return;
    }

    size_t totalLength = 0;
    size_t lengthAtOffset = 0;
    char* dataPointer = nullptr;

    OSStatus status = CMBlockBufferGetDataPointer(
        blockBuffer, 0, &lengthAtOffset, &totalLength, &dataPointer);

    if (status != noErr || !dataPointer) {
        return;
    }

    // dataPointer is only guaranteed to address lengthAtOffset contiguous
    // bytes. If the block buffer is fragmented, copy it into one contiguous
    // scratch buffer before walking all totalLength bytes.
    const uint8_t* data = (const uint8_t*)dataPointer;
    std::vector<uint8_t> contiguous;
    if (lengthAtOffset < totalLength) {
        contiguous.resize(totalLength);
        status = CMBlockBufferCopyDataBytes(blockBuffer, 0, totalLength, contiguous.data());
        if (status != noErr) {
            return;
        }
        data = contiguous.data();
    }

    // Get NAL unit length size from format description (usually 4 bytes)
    int nalLengthSize = 4;
    if (formatDesc) {
        int tmpNalLengthSize = 0;
        status = CMVideoFormatDescriptionGetH264ParameterSetAtIndex(
            formatDesc, 0, nullptr, nullptr, nullptr, &tmpNalLengthSize);
        if (status == noErr && tmpNalLengthSize > 0 && tmpNalLengthSize <= 4) {
            nalLengthSize = tmpNalLengthSize;
        }
    }

    // Convert AVCC format (length-prefixed) to Annex B (start code prefixed)
    size_t offset = 0;
    while (offset < totalLength) {
        // Stop if fewer bytes remain than a complete length prefix.
        if (totalLength - offset < (size_t)nalLengthSize) {
            break;
        }

        // Read NAL unit length (big-endian, variable size)
        uint32_t nalLength = 0;
        const uint8_t* lengthPtr = data + offset;
        for (int i = 0; i < nalLengthSize; i++) {
            nalLength = (nalLength << 8) | lengthPtr[i];
        }
        offset += nalLengthSize;

        // A length that overruns the buffer means the rest is unusable.
        if (nalLength > totalLength - offset) {
            break;
        }

        if (nalLength > 0) {
            // Write NAL start code followed by the NAL data
            m_outputBuffer.insert(m_outputBuffer.end(), kStartCode, kStartCode + 4);
            m_outputBuffer.insert(m_outputBuffer.end(),
                                  data + offset,
                                  data + offset + nalLength);
        }

        offset += nalLength;
    }
}