#include "mf_encoder.h"
#include "audio_sample_utils.h"

// NOTE(review): the names of the six angle-bracket includes were lost when this
// file was extracted; the list below is reconstructed from the names this file
// actually uses — TODO confirm against the original.
#include <mfapi.h>
#include <mfidl.h>
#include <mfreadwrite.h>

#include <algorithm>
#include <cstring>
#include <iostream>

namespace {

/// Reports whether `hr` is a success code; on failure logs
/// "ERROR: <label><suffix> failed (hr=0x...)" to stderr.
///
/// @param hr     Result code to test.
/// @param label  Name of the call that produced `hr`.
/// @param suffix Optional text appended directly after `label` (e.g. "(audio)").
/// @return true when SUCCEEDED(hr), false otherwise.
bool succeeded(HRESULT hr, const char* label, const char* suffix = "") {
    if (SUCCEEDED(hr)) {
        return true;
    }
    std::cerr << "ERROR: " << label << suffix << " failed (hr=0x" << std::hex << hr << std::dec << ")"
              << std::endl;
    return false;
}

/// Sets MF_MT_FRAME_SIZE on `type`.
void setFrameSize(IMFMediaType* type, UINT32 width, UINT32 height) {
    MFSetAttributeSize(type, MF_MT_FRAME_SIZE, width, height);
}

/// Sets MF_MT_FRAME_RATE on `type` to `fps`/1.
void setFrameRate(IMFMediaType* type, UINT32 fps) {
    MFSetAttributeRatio(type, MF_MT_FRAME_RATE, fps, 1);
}

/// Sets MF_MT_PIXEL_ASPECT_RATIO on `type` to 1:1.
void setPixelAspectRatio(IMFMediaType* type) {
    MFSetAttributeRatio(type, MF_MT_PIXEL_ASPECT_RATIO, 1, 1);
}

/// Sets channel count, sample rate and bits-per-sample on an audio media type.
void setAudioFormat(IMFMediaType* type, UINT32 channels, UINT32 sampleRate, UINT32 bitsPerSample) {
    type->SetUINT32(MF_MT_AUDIO_NUM_CHANNELS, channels);
    type->SetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, sampleRate);
    type->SetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, bitsPerSample);
}

/// Wraps `buffer` in a new sample and hands it to the sink writer.
///
/// Every HRESULT is checked so that a failed AddBuffer/SetSampleTime cannot
/// result in an empty or mistimed sample being written.
///
/// @param writer         Sink writer to receive the sample (must be non-null).
/// @param streamIndex    Stream index returned by IMFSinkWriter::AddStream.
/// @param buffer         Unlocked media buffer holding the payload.
/// @param byteCount      Number of valid bytes in `buffer`.
/// @param sampleTime     Presentation time in 100 ns units.
/// @param sampleDuration Duration in 100 ns units.
/// @param tag            Suffix appended to the call names in error logs.
/// @return true when the sample was accepted by the sink writer.
bool submitSample(
    IMFSinkWriter* writer,
    DWORD streamIndex,
    IMFMediaBuffer* buffer,
    DWORD byteCount,
    LONGLONG sampleTime,
    LONGLONG sampleDuration,
    const char* tag) {
    if (!succeeded(buffer->SetCurrentLength(byteCount), "IMFMediaBuffer::SetCurrentLength", tag)) {
        return false;
    }

    Microsoft::WRL::ComPtr<IMFSample> sample;
    if (!succeeded(MFCreateSample(&sample), "MFCreateSample", tag)) {
        return false;
    }
    if (!succeeded(sample->AddBuffer(buffer), "IMFSample::AddBuffer", tag) ||
        !succeeded(sample->SetSampleTime(sampleTime), "IMFSample::SetSampleTime", tag) ||
        !succeeded(sample->SetSampleDuration(sampleDuration), "IMFSample::SetSampleDuration", tag)) {
        return false;
    }

    return succeeded(writer->WriteSample(streamIndex, sample.Get()), "WriteSample", tag);
}

/// Draws a nearest-neighbour scaled copy of `webcamFrame` into the bottom-right
/// corner of a tightly packed 4-bytes-per-pixel destination image.
///
/// The overlay is at most a quarter of the destination width and a third of its
/// height, keeps the webcam aspect ratio, and is inset by a margin of at least
/// 16 pixels where the destination is large enough. Alpha is forced to 255.
///
/// @param destination Destination pixels, `width * height * 4` bytes.
/// @param width       Destination width in pixels.
/// @param height      Destination height in pixels.
/// @param webcamFrame Source frame; ignored when empty.
void compositeWebcam(BYTE* destination, int width, int height, const BgraFrameView& webcamFrame) {
    if (!destination || !webcamFrame.data || webcamFrame.width <= 0 || webcamFrame.height <= 0 || width <= 0 ||
        height <= 0) {
        return;
    }

    const int margin = std::max(16, std::min(width, height) / 60);
    const int maxOverlayWidth = std::max(2, width / 4);
    const int maxOverlayHeight = std::max(2, height / 3);

    // Start from the widest allowed overlay and shrink it if that makes it too tall.
    // The webcam dimensions are known to be positive here, so the divisions are safe.
    int overlayWidth = maxOverlayWidth;
    int overlayHeight =
        static_cast<int>((static_cast<int64_t>(overlayWidth) * webcamFrame.height) / webcamFrame.width);
    if (overlayHeight > maxOverlayHeight) {
        overlayHeight = maxOverlayHeight;
        overlayWidth =
            static_cast<int>((static_cast<int64_t>(overlayHeight) * webcamFrame.width) / webcamFrame.height);
    }

    // Keep at least 2 pixels, but never more than the destination itself: without
    // the outer clamp a destination narrower/shorter than 2 pixels was overrun.
    overlayWidth = std::min(width, std::max(2, std::min(overlayWidth, width - margin * 2)));
    overlayHeight = std::min(height, std::max(2, std::min(overlayHeight, height - margin * 2)));

    const int originX = std::max(0, width - overlayWidth - margin);
    const int originY = std::max(0, height - overlayHeight - margin);

    for (int y = 0; y < overlayHeight; ++y) {
        const int sourceY = static_cast<int>((static_cast<int64_t>(y) * webcamFrame.height) / overlayHeight);
        const BYTE* sourceRow = webcamFrame.data + (sourceY * webcamFrame.width) * 4;
        BYTE* destinationRow = destination + ((originY + y) * width + originX) * 4;
        for (int x = 0; x < overlayWidth; ++x) {
            const int sourceX = static_cast<int>((static_cast<int64_t>(x) * webcamFrame.width) / overlayWidth);
            const BYTE* source = sourceRow + sourceX * 4;
            BYTE* target = destinationRow + x * 4;
            target[0] = source[0];
            target[1] = source[1];
            target[2] = source[2];
            target[3] = 255;
        }
    }
}

}  // namespace

/// Finalizes the output (if not already done) when the encoder is destroyed.
MFEncoder::~MFEncoder() {
    finalize();
}

/// Starts Media Foundation, creates the sink writer for `outputPath`, and
/// configures an H.264 video stream fed with RGB32 frames (plus an optional
/// AAC audio stream).
///
/// @param outputPath  Destination URL/path passed to MFCreateSinkWriterFromURL.
/// @param width       Requested frame width; rounded down to an even value, minimum 2.
/// @param height      Requested frame height; rounded down to an even value, minimum 2.
/// @param fps         Frame rate; clamped to at least 1.
/// @param bitrate     Average video bitrate; clamped to at least 1.
/// @param device      D3D11 device used later to create the staging texture.
/// @param context     D3D11 context used later to copy and map frames.
/// @param audioFormat Optional audio input format; null disables the audio stream.
/// @return true when the sink writer is ready to accept samples.
bool MFEncoder::initialize(
    const std::wstring& outputPath,
    int width,
    int height,
    int fps,
    int bitrate,
    ID3D11Device* device,
    ID3D11DeviceContext* context,
    const AudioInputFormat* audioFormat) {
    width_ = (std::max(2, width) / 2) * 2;
    height_ = (std::max(2, height) / 2) * 2;
    fps_ = std::max(1, fps);
    device_ = device;
    context_ = context;

    if (!succeeded(MFStartup(MF_VERSION), "MFStartup")) {
        return false;
    }

    // Encoded (output) side of the video stream.
    Microsoft::WRL::ComPtr<IMFMediaType> outputType;
    if (!succeeded(MFCreateMediaType(&outputType), "MFCreateMediaType(output)")) {
        return false;
    }
    outputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
    outputType->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_H264);
    outputType->SetUINT32(MF_MT_AVG_BITRATE, static_cast<UINT32>(std::max(1, bitrate)));
    outputType->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
    setFrameSize(outputType.Get(), static_cast<UINT32>(width_), static_cast<UINT32>(height_));
    setFrameRate(outputType.Get(), static_cast<UINT32>(fps_));
    setPixelAspectRatio(outputType.Get());

    if (!succeeded(
            MFCreateSinkWriterFromURL(outputPath.c_str(), nullptr, nullptr, &sinkWriter_),
            "MFCreateSinkWriterFromURL")) {
        return false;
    }
    if (!succeeded(sinkWriter_->AddStream(outputType.Get(), &videoStreamIndex_), "AddStream")) {
        return false;
    }

    // The audio stream is added before any input media type is set on the writer.
    if (audioFormat && !configureAudioStream(*audioFormat)) {
        return false;
    }

    // Uncompressed (input) side of the video stream: RGB32 with a positive stride.
    Microsoft::WRL::ComPtr<IMFMediaType> inputType;
    if (!succeeded(MFCreateMediaType(&inputType), "MFCreateMediaType(input)")) {
        return false;
    }
    inputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
    inputType->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_RGB32);
    inputType->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
    inputType->SetUINT32(MF_MT_DEFAULT_STRIDE, static_cast<UINT32>(width_ * 4));
    setFrameSize(inputType.Get(), static_cast<UINT32>(width_), static_cast<UINT32>(height_));
    setFrameRate(inputType.Get(), static_cast<UINT32>(fps_));
    setPixelAspectRatio(inputType.Get());

    if (!succeeded(
            sinkWriter_->SetInputMediaType(videoStreamIndex_, inputType.Get(), nullptr),
            "SetInputMediaType")) {
        return false;
    }
    if (!succeeded(sinkWriter_->BeginWriting(), "BeginWriting")) {
        return false;
    }
    return true;
}

/// Adds an AAC output stream to the sink writer and sets its input type to the
/// format returned by makeAacCompatibleAudioFormat(audioFormat).
///
/// @param audioFormat Format of the audio the caller captures.
/// @return true when the stream was added and its input type was accepted;
///         false when there is no sink writer, the format has a zero sample
///         rate / channel count / block alignment, or a Media Foundation call fails.
bool MFEncoder::configureAudioStream(const AudioInputFormat& audioFormat) {
    if (!sinkWriter_) {
        return false;
    }
    if (audioFormat.sampleRate == 0 || audioFormat.channels == 0 || audioFormat.blockAlign == 0) {
        std::cerr << "ERROR: Invalid audio input format" << std::endl;
        return false;
    }

    const AudioInputFormat encoderFormat = makeAacCompatibleAudioFormat(audioFormat);
    const UINT32 aacBytesPerSecond = 24'000;  // 24,000 bytes/s == 192 kbit/s

    Microsoft::WRL::ComPtr<IMFMediaType> outputType;
    if (!succeeded(MFCreateMediaType(&outputType), "MFCreateMediaType(audio output)")) {
        return false;
    }
    outputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Audio);
    outputType->SetGUID(MF_MT_SUBTYPE, MFAudioFormat_AAC);
    setAudioFormat(outputType.Get(), encoderFormat.channels, encoderFormat.sampleRate, 16);
    outputType->SetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, aacBytesPerSecond);
    outputType->SetUINT32(MF_MT_AAC_PAYLOAD_TYPE, 0);

    if (!succeeded(sinkWriter_->AddStream(outputType.Get(), &audioStreamIndex_), "AddStream(audio)")) {
        return false;
    }

    Microsoft::WRL::ComPtr<IMFMediaType> inputType;
    if (!succeeded(MFCreateMediaType(&inputType), "MFCreateMediaType(audio input)")) {
        return false;
    }
    inputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Audio);
    inputType->SetGUID(MF_MT_SUBTYPE, encoderFormat.subtype);
    setAudioFormat(
        inputType.Get(), encoderFormat.channels, encoderFormat.sampleRate, encoderFormat.bitsPerSample);
    inputType->SetUINT32(MF_MT_AUDIO_BLOCK_ALIGNMENT, encoderFormat.blockAlign);
    inputType->SetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, encoderFormat.avgBytesPerSec);
    inputType->SetUINT32(MF_MT_ALL_SAMPLES_INDEPENDENT, TRUE);

    if (!succeeded(
            sinkWriter_->SetInputMediaType(audioStreamIndex_, inputType.Get(), nullptr),
            "SetInputMediaType(audio)")) {
        return false;
    }

    hasAudioStream_ = true;
    return true;
}

/// Creates, on first use, the CPU-readable BGRA staging texture of
/// `width_ x height_` that frames are copied into before being mapped.
///
/// The source texture is not consulted: every descriptor field is fixed by the
/// encoder configuration, so the parameter is kept only for interface
/// compatibility.
///
/// @return true when the staging texture exists after the call.
bool MFEncoder::ensureStagingTexture(ID3D11Texture2D* /*texture*/) {
    if (stagingTexture_) {
        return true;
    }
    if (!device_) {
        std::cerr << "ERROR: No D3D11 device available for the staging texture" << std::endl;
        return false;
    }

    D3D11_TEXTURE2D_DESC desc{};
    desc.Width = static_cast<UINT>(width_);
    desc.Height = static_cast<UINT>(height_);
    desc.MipLevels = 1;
    desc.ArraySize = 1;
    desc.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
    desc.SampleDesc.Count = 1;
    desc.SampleDesc.Quality = 0;
    desc.Usage = D3D11_USAGE_STAGING;
    desc.BindFlags = 0;
    desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
    desc.MiscFlags = 0;
    return succeeded(device_->CreateTexture2D(&desc, nullptr, &stagingTexture_), "CreateTexture2D(staging)");
}

/// Reads `texture` back to the CPU into a tightly packed `width_ x height_`
/// 4-bytes-per-pixel buffer and optionally overlays a webcam frame.
///
/// @param texture         GPU texture holding the frame.
/// @param destination     Output buffer.
/// @param destinationSize Capacity of `destination` in bytes.
/// @param webcamFrame     Optional picture-in-picture frame; may be null.
/// @return true when `destination` holds a complete frame.
bool MFEncoder::copyFrameToBuffer(
    ID3D11Texture2D* texture,
    BYTE* destination,
    DWORD destinationSize,
    const BgraFrameView* webcamFrame) {
    if (!texture || !destination || !context_) {
        std::cerr << "ERROR: Missing texture, destination buffer, or D3D11 context" << std::endl;
        return false;
    }

    // Validate the destination before issuing any GPU work.
    const DWORD rowBytes = static_cast<DWORD>(width_ * 4);
    const DWORD requiredBytes = rowBytes * static_cast<DWORD>(height_);
    if (destinationSize < requiredBytes) {
        std::cerr << "ERROR: Media Foundation buffer is too small" << std::endl;
        return false;
    }

    if (!ensureStagingTexture(texture)) {
        return false;
    }

    // CopyResource requires identical dimensions, mip count and array size. The
    // staging texture is width_ x height_ (rounded down to even) with one mip, so
    // any other source layout must go through a region copy of subresource 0.
    D3D11_TEXTURE2D_DESC sourceDesc{};
    texture->GetDesc(&sourceDesc);
    const UINT targetWidth = static_cast<UINT>(width_);
    const UINT targetHeight = static_cast<UINT>(height_);
    const bool sameLayout = sourceDesc.Width == targetWidth && sourceDesc.Height == targetHeight &&
                            sourceDesc.MipLevels == 1 && sourceDesc.ArraySize == 1;
    if (sameLayout) {
        context_->CopyResource(stagingTexture_.Get(), texture);
    } else {
        D3D11_BOX region{};
        region.left = 0;
        region.top = 0;
        region.front = 0;
        region.right = std::min(sourceDesc.Width, targetWidth);
        region.bottom = std::min(sourceDesc.Height, targetHeight);
        region.back = 1;
        context_->CopySubresourceRegion(stagingTexture_.Get(), 0, 0, 0, 0, texture, 0, &region);
    }

    D3D11_MAPPED_SUBRESOURCE mapped{};
    if (!succeeded(context_->Map(stagingTexture_.Get(), 0, D3D11_MAP_READ, 0, &mapped), "Map")) {
        return false;
    }

    // The mapped row pitch may be larger than rowBytes, so copy row by row.
    const auto* source = static_cast<const BYTE*>(mapped.pData);
    for (int y = 0; y < height_; ++y) {
        std::memcpy(destination + rowBytes * y, source + mapped.RowPitch * y, rowBytes);
    }
    context_->Unmap(stagingTexture_.Get(), 0);

    if (webcamFrame) {
        compositeWebcam(destination, width_, height_, *webcamFrame);
    }
    return true;
}

/// Copies a CPU-side frame into a tightly packed `width_ x height_`
/// 4-bytes-per-pixel buffer, nearest-neighbour scaling it when its size differs
/// from the encoder size. Alpha is forced to 255.
///
/// @param frame           Source frame.
/// @param destination     Output buffer.
/// @param destinationSize Capacity of `destination` in bytes.
/// @return true when `destination` holds a complete frame.
bool MFEncoder::copyBgraFrameToBuffer(const BgraFrameView& frame, BYTE* destination, DWORD destinationSize) {
    if (!destination || !frame.data || frame.width <= 0 || frame.height <= 0) {
        return false;
    }

    const DWORD rowBytes = static_cast<DWORD>(width_ * 4);
    const DWORD requiredBytes = rowBytes * static_cast<DWORD>(height_);
    if (destinationSize < requiredBytes) {
        std::cerr << "ERROR: Media Foundation webcam buffer is too small" << std::endl;
        return false;
    }

    // Same size: straight pixel copy, only the alpha byte is rewritten.
    if (frame.width == width_ && frame.height == height_) {
        for (DWORD i = 0; i < requiredBytes; i += 4) {
            destination[i] = frame.data[i];
            destination[i + 1] = frame.data[i + 1];
            destination[i + 2] = frame.data[i + 2];
            destination[i + 3] = 255;
        }
        return true;
    }

    for (int y = 0; y < height_; ++y) {
        const int sourceY = static_cast<int>((static_cast<int64_t>(y) * frame.height) / height_);
        const BYTE* sourceRow = frame.data + (sourceY * frame.width) * 4;
        BYTE* destinationRow = destination + rowBytes * y;
        for (int x = 0; x < width_; ++x) {
            const int sourceX = static_cast<int>((static_cast<int64_t>(x) * frame.width) / width_);
            const BYTE* source = sourceRow + sourceX * 4;
            BYTE* target = destinationRow + x * 4;
            target[0] = source[0];
            target[1] = source[1];
            target[2] = source[2];
            target[3] = 255;
        }
    }
    return true;
}

/// Encodes one GPU frame, optionally with a webcam overlay.
///
/// Sample times are relative to the first frame written. A timestamp that does
/// not advance past the previous sample is replaced by "previous + one frame
/// duration" so sample times stay strictly increasing.
///
/// @param texture      GPU texture holding the frame.
/// @param timestampHns Capture timestamp in 100 ns units.
/// @param webcamFrame  Optional picture-in-picture frame; may be null.
/// @return true when the sample was handed to the sink writer.
bool MFEncoder::writeFrame(ID3D11Texture2D* texture, int64_t timestampHns, const BgraFrameView* webcamFrame) {
    std::scoped_lock writerLock(writerMutex_);
    if (!sinkWriter_ || finalized_) {
        return false;
    }

    if (firstTimestampHns_ < 0) {
        firstTimestampHns_ = timestampHns;
    }
    const int64_t sampleDuration = 10'000'000LL / fps_;
    int64_t sampleTime = timestampHns - firstTimestampHns_;
    if (sampleTime <= lastTimestampHns_) {
        sampleTime = lastTimestampHns_ + sampleDuration;
    }
    lastTimestampHns_ = sampleTime;

    const DWORD frameBytes = static_cast<DWORD>(width_ * height_ * 4);
    Microsoft::WRL::ComPtr<IMFMediaBuffer> buffer;
    if (!succeeded(MFCreateMemoryBuffer(frameBytes, &buffer), "MFCreateMemoryBuffer")) {
        return false;
    }

    BYTE* data = nullptr;
    DWORD maxLength = 0;
    DWORD currentLength = 0;
    if (!succeeded(buffer->Lock(&data, &maxLength, &currentLength), "IMFMediaBuffer::Lock")) {
        return false;
    }
    const bool copied = copyFrameToBuffer(texture, data, maxLength, webcamFrame);
    buffer->Unlock();
    if (!copied) {
        return false;
    }

    return submitSample(
        sinkWriter_.Get(), videoStreamIndex_, buffer.Get(), frameBytes, sampleTime, sampleDuration, "");
}

/// Encodes one CPU-side frame on the video stream.
///
/// Uses the same timestamp normalisation as writeFrame().
///
/// @param frame        Source frame; scaled to the encoder size if needed.
/// @param timestampHns Capture timestamp in 100 ns units.
/// @return true when the sample was handed to the sink writer.
bool MFEncoder::writeBgraFrame(const BgraFrameView& frame, int64_t timestampHns) {
    std::scoped_lock writerLock(writerMutex_);
    if (!sinkWriter_ || finalized_) {
        return false;
    }

    if (firstTimestampHns_ < 0) {
        firstTimestampHns_ = timestampHns;
    }
    const int64_t sampleDuration = 10'000'000LL / fps_;
    int64_t sampleTime = timestampHns - firstTimestampHns_;
    if (sampleTime <= lastTimestampHns_) {
        sampleTime = lastTimestampHns_ + sampleDuration;
    }
    lastTimestampHns_ = sampleTime;

    const DWORD frameBytes = static_cast<DWORD>(width_ * height_ * 4);
    Microsoft::WRL::ComPtr<IMFMediaBuffer> buffer;
    if (!succeeded(MFCreateMemoryBuffer(frameBytes, &buffer), "MFCreateMemoryBuffer(webcam)")) {
        return false;
    }

    BYTE* data = nullptr;
    DWORD maxLength = 0;
    DWORD currentLength = 0;
    if (!succeeded(buffer->Lock(&data, &maxLength, &currentLength), "IMFMediaBuffer::Lock(webcam)")) {
        return false;
    }
    const bool copied = copyBgraFrameToBuffer(frame, data, maxLength);
    buffer->Unlock();
    if (!copied) {
        return false;
    }

    return submitSample(
        sinkWriter_.Get(), videoStreamIndex_, buffer.Get(), frameBytes, sampleTime, sampleDuration, "(webcam)");
}

/// Writes one chunk of audio to the audio stream.
///
/// @param data         Audio bytes in the configured input format.
/// @param byteCount    Number of bytes in `data`.
/// @param timestampHns Presentation time in 100 ns units; negative values are clamped to 0.
/// @param durationHns  Duration of the chunk in 100 ns units.
/// @return false when there is no writer / no audio stream or a Media Foundation
///         call fails; true when the chunk was written or was empty (no-op).
bool MFEncoder::writeAudio(const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
    std::scoped_lock writerLock(writerMutex_);
    if (!sinkWriter_ || finalized_ || !hasAudioStream_) {
        return false;
    }
    // An empty chunk is not an error; there is simply nothing to write.
    if (!data || byteCount == 0 || durationHns <= 0) {
        return true;
    }

    Microsoft::WRL::ComPtr<IMFMediaBuffer> buffer;
    if (!succeeded(MFCreateMemoryBuffer(byteCount, &buffer), "MFCreateMemoryBuffer(audio)")) {
        return false;
    }

    BYTE* destination = nullptr;
    DWORD maxLength = 0;
    DWORD currentLength = 0;
    if (!succeeded(buffer->Lock(&destination, &maxLength, &currentLength), "IMFMediaBuffer::Lock(audio)")) {
        return false;
    }
    if (maxLength < byteCount) {
        buffer->Unlock();
        std::cerr << "ERROR: Media Foundation audio buffer is too small" << std::endl;
        return false;
    }
    std::memcpy(destination, data, byteCount);
    buffer->Unlock();

    return submitSample(
        sinkWriter_.Get(),
        audioStreamIndex_,
        buffer.Get(),
        byteCount,
        std::max<int64_t>(0, timestampHns),
        durationHns,
        "(audio)");
}

/// Finalizes the sink writer, releases the D3D objects and shuts Media
/// Foundation down. Calling it again is a no-op that returns true.
///
/// @return false only when IMFSinkWriter::Finalize reported a failure.
bool MFEncoder::finalize() {
    std::scoped_lock writerLock(writerMutex_);
    if (finalized_) {
        return true;
    }
    finalized_ = true;

    bool ok = true;
    if (sinkWriter_) {
        ok = succeeded(sinkWriter_->Finalize(), "SinkWriter::Finalize");
        sinkWriter_.Reset();
    }
    stagingTexture_.Reset();
    context_.Reset();
    device_.Reset();

    // NOTE(review): this runs even when initialize() was never called or MFStartup
    // failed, which leaves MFStartup/MFShutdown unbalanced. Tracking that needs a
    // member flag declared in mf_encoder.h — TODO confirm and fix there.
    MFShutdown();
    return ok;
}