feat: add native Windows webcam composition

This commit is contained in:
EtienneLescot
2026-05-05 17:50:22 +02:00
parent 048189da72
commit fb85f66875
11 changed files with 510 additions and 31 deletions
+8 -4
View File
@@ -35,15 +35,18 @@ Current V2 JSON shape:
"captureMic": false,
"microphoneDeviceId": "default",
"microphoneGain": 1.4,
"webcamEnabled": false,
"webcamEnabled": true,
"webcamDeviceId": "default",
"webcamWidth": 1280,
"webcamHeight": 720,
"webcamFps": 30,
"outputs": {
"screenPath": "C:\\path\\recording-123.mp4",
"webcamPath": "C:\\path\\recording-123-webcam.mp4"
"screenPath": "C:\\path\\recording-123.mp4"
}
}
```
The current helper implementation supports display/window video capture, system audio loopback, and initial default-microphone capture. Webcam capture now fails explicitly in the helper rather than silently falling back to Electron capture on Windows. See `docs/engineering/windows-native-recorder-roadmap.md` for the phased implementation plan.
The current helper implementation supports display/window video capture, system audio loopback, default-microphone capture, and Media Foundation webcam capture. Webcam frames are currently composed into the primary MP4 as a bottom-right picture-in-picture overlay. Browser `deviceId` values do not always map to Media Foundation symbolic links; when the requested webcam is not matched, the helper logs a warning and uses the default webcam.
Smoke-test the helper with:
@@ -53,4 +56,5 @@ npm run test:wgc-window:win
npm run test:wgc-audio:win
npm run test:wgc-mic:win
npm run test:wgc-mixed-audio:win
npm run test:wgc-webcam:win
```
@@ -23,6 +23,8 @@ add_executable(wgc-capture
src/monitor_utils.h
src/wasapi_loopback_capture.cpp
src/wasapi_loopback_capture.h
src/webcam_capture.cpp
src/webcam_capture.h
src/wgc_session.cpp
src/wgc_session.h
)
+54 -6
View File
@@ -2,6 +2,7 @@
#include "mf_encoder.h"
#include "monitor_utils.h"
#include "wasapi_loopback_capture.h"
#include "webcam_capture.h"
#include "wgc_session.h"
#include <winrt/Windows.Foundation.h>
@@ -303,11 +304,6 @@ int main(int argc, char* argv[]) {
std::cout << "{\"event\":\"ready\",\"schemaVersion\":2}" << std::endl;
if (config.webcamEnabled) {
std::cerr << "ERROR: Native webcam capture is not implemented in this helper yet" << std::endl;
return 1;
}
WgcSession session;
if (config.sourceType == "display") {
HMONITOR monitor = findMonitorForCapture(
@@ -347,6 +343,22 @@ int main(int argc, char* argv[]) {
const int pixels = width * height;
const int bitrate = pixels >= 3840 * 2160 ? 45'000'000 : pixels >= 2560 * 1440 ? 28'000'000 : 18'000'000;
WebcamCapture webcamCapture;
bool webcamActive = false;
if (config.webcamEnabled) {
if (!webcamCapture.initialize(
utf8ToWide(config.webcamDeviceId),
config.webcamWidth,
config.webcamHeight,
config.webcamFps > 0 ? config.webcamFps : config.fps)) {
std::cerr << "ERROR: Failed to initialize native webcam capture" << std::endl;
return 1;
}
std::cout << "{\"event\":\"webcam-format\",\"schemaVersion\":2,\"width\":" << webcamCapture.width()
<< ",\"height\":" << webcamCapture.height()
<< ",\"fps\":" << webcamCapture.fps() << "}" << std::endl;
}
WasapiLoopbackCapture loopbackCapture;
WasapiLoopbackCapture microphoneCapture;
const AudioInputFormat* audioFormat = nullptr;
@@ -398,6 +410,9 @@ int main(int argc, char* argv[]) {
std::atomic<bool> firstFrameWritten = false;
std::atomic<bool> encodeFailed = false;
Microsoft::WRL::ComPtr<ID3D11Texture2D> latestFrameTexture;
std::vector<BYTE> latestWebcamFrame;
int latestWebcamWidth = 0;
int latestWebcamHeight = 0;
session.setFrameCallback([&](ID3D11Texture2D* texture, int64_t timestampHns) {
(void)timestampHns;
@@ -433,9 +448,18 @@ int main(int argc, char* argv[]) {
while (!stopRequested && !encodeFailed) {
{
std::scoped_lock lock(mutex);
if (webcamActive) {
webcamCapture.copyLatestFrame(latestWebcamFrame, latestWebcamWidth, latestWebcamHeight);
}
const BgraFrameView webcamFrame{
latestWebcamFrame.empty() ? nullptr : latestWebcamFrame.data(),
latestWebcamWidth,
latestWebcamHeight,
};
if (latestFrameTexture && !encoder.writeFrame(
latestFrameTexture.Get(),
static_cast<int64_t>((frameIndex * 10'000'000ULL) / config.fps))) {
static_cast<int64_t>((frameIndex * 10'000'000ULL) / config.fps),
webcamFrame.data ? &webcamFrame : nullptr)) {
encodeFailed = true;
stopRequested = true;
cv.notify_all();
@@ -528,8 +552,30 @@ int main(int argc, char* argv[]) {
if (!startAudioCaptures()) {
return 1;
}
if (config.webcamEnabled) {
if (!webcamCapture.start()) {
microphoneCapture.stop();
loopbackCapture.stop();
if (audioMixer) {
audioMixer->stop();
}
std::cerr << "ERROR: Failed to start native webcam capture" << std::endl;
return 1;
}
webcamActive = true;
const auto webcamDeadline = std::chrono::steady_clock::now() + std::chrono::seconds(3);
while (std::chrono::steady_clock::now() < webcamDeadline &&
!webcamCapture.copyLatestFrame(latestWebcamFrame, latestWebcamWidth, latestWebcamHeight)) {
std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
if (latestWebcamFrame.empty()) {
std::cerr << "WARNING: Native webcam started but no frame was available before screen capture"
<< std::endl;
}
}
if (!session.start()) {
webcamCapture.stop();
microphoneCapture.stop();
loopbackCapture.stop();
if (audioMixer) {
@@ -554,6 +600,7 @@ int main(int argc, char* argv[]) {
}
microphoneCapture.stop();
loopbackCapture.stop();
webcamCapture.stop();
if (audioMixer) {
audioMixer->stop();
}
@@ -580,6 +627,7 @@ int main(int argc, char* argv[]) {
microphoneCapture.stop();
loopbackCapture.stop();
webcamCapture.stop();
if (audioMixer) {
audioMixer->stop();
}
+47 -3
View File
@@ -38,6 +38,43 @@ void setAudioFormat(IMFMediaType* type, UINT32 channels, UINT32 sampleRate, UINT
type->SetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, bitsPerSample);
}
// Draws `webcamFrame` as a picture-in-picture overlay in the bottom-right corner of
// `destination`.
//
// destination  Writable frame of `width` x `height` pixels, 4 bytes per pixel, rows packed
//              with a stride of `width * 4` bytes.
// webcamFrame  Source frame read with the same 4-bytes-per-pixel, tightly packed layout.
//
// The overlay is at most a quarter of the destination width and a third of its height,
// keeps the webcam aspect ratio while those limits allow it, and is scaled with
// nearest-neighbour sampling. The fourth byte of every written pixel is forced to 255.
// The call is a no-op when any pointer is null or any dimension is non-positive.
void compositeWebcam(BYTE* destination, int width, int height, const BgraFrameView& webcamFrame) {
    if (!destination || !webcamFrame.data || webcamFrame.width <= 0 || webcamFrame.height <= 0 ||
        width <= 0 || height <= 0) {
        return;
    }

    const int margin = std::max(16, std::min(width, height) / 60);
    const int maxOverlayWidth = std::max(2, width / 4);
    const int maxOverlayHeight = std::max(2, height / 3);

    // Start from the widest allowed overlay and derive the height from the webcam aspect ratio.
    int overlayWidth = maxOverlayWidth;
    int overlayHeight = static_cast<int>(
        (static_cast<int64_t>(overlayWidth) * webcamFrame.height) / webcamFrame.width);
    if (overlayHeight > maxOverlayHeight) {
        // Too tall: pin the height instead and derive the width.
        overlayHeight = maxOverlayHeight;
        overlayWidth = static_cast<int>(
            (static_cast<int64_t>(overlayHeight) * webcamFrame.width) / webcamFrame.height);
    }

    overlayWidth = std::max(2, std::min(overlayWidth, width - margin * 2));
    overlayHeight = std::max(2, std::min(overlayHeight, height - margin * 2));
    // The 2-pixel minimum above must never exceed the destination itself, otherwise the loops
    // below would write past the end of a row or past the last row.
    overlayWidth = std::min(overlayWidth, width);
    overlayHeight = std::min(overlayHeight, height);

    const int originX = std::max(0, width - overlayWidth - margin);
    const int originY = std::max(0, height - overlayHeight - margin);

    // Byte offsets are computed in size_t so large frames cannot overflow int arithmetic.
    const size_t destinationStride = static_cast<size_t>(width) * 4;
    const size_t sourceStride = static_cast<size_t>(webcamFrame.width) * 4;

    for (int y = 0; y < overlayHeight; y += 1) {
        const int sourceY = static_cast<int>((static_cast<int64_t>(y) * webcamFrame.height) / overlayHeight);
        const BYTE* sourceRow = webcamFrame.data + static_cast<size_t>(sourceY) * sourceStride;
        BYTE* destinationRow = destination +
            static_cast<size_t>(originY + y) * destinationStride +
            static_cast<size_t>(originX) * 4;
        for (int x = 0; x < overlayWidth; x += 1) {
            const int sourceX = static_cast<int>((static_cast<int64_t>(x) * webcamFrame.width) / overlayWidth);
            const BYTE* source = sourceRow + static_cast<size_t>(sourceX) * 4;
            BYTE* target = destinationRow + static_cast<size_t>(x) * 4;
            target[0] = source[0];
            target[1] = source[1];
            target[2] = source[2];
            target[3] = 255;  // the overlay is always fully opaque
        }
    }
}
} // namespace
MFEncoder::~MFEncoder() {
@@ -179,7 +216,11 @@ bool MFEncoder::ensureStagingTexture(ID3D11Texture2D* texture) {
"CreateTexture2D(staging)");
}
bool MFEncoder::copyFrameToBuffer(ID3D11Texture2D* texture, BYTE* destination, DWORD destinationSize) {
bool MFEncoder::copyFrameToBuffer(
ID3D11Texture2D* texture,
BYTE* destination,
DWORD destinationSize,
const BgraFrameView* webcamFrame) {
if (!ensureStagingTexture(texture)) {
return false;
}
@@ -203,12 +244,15 @@ bool MFEncoder::copyFrameToBuffer(ID3D11Texture2D* texture, BYTE* destination, D
for (int y = 0; y < height_; y += 1) {
std::memcpy(destination + rowBytes * y, source + mapped.RowPitch * y, rowBytes);
}
if (webcamFrame) {
compositeWebcam(destination, width_, height_, *webcamFrame);
}
context_->Unmap(stagingTexture_.Get(), 0);
return true;
}
bool MFEncoder::writeFrame(ID3D11Texture2D* texture, int64_t timestampHns) {
bool MFEncoder::writeFrame(ID3D11Texture2D* texture, int64_t timestampHns, const BgraFrameView* webcamFrame) {
std::scoped_lock writerLock(writerMutex_);
if (!sinkWriter_ || finalized_) {
return false;
@@ -238,7 +282,7 @@ bool MFEncoder::writeFrame(ID3D11Texture2D* texture, int64_t timestampHns) {
return false;
}
const bool copied = copyFrameToBuffer(texture, data, maxLength);
const bool copied = copyFrameToBuffer(texture, data, maxLength, webcamFrame);
buffer->Unlock();
if (!copied) {
return false;
+12 -2
View File
@@ -11,6 +11,12 @@
#include <mutex>
#include <string>
// Lightweight description of a CPU-side video frame handed to the encoder for compositing.
// Holds only a pointer and dimensions; the struct itself does not allocate or free the pixels.
// NOTE(review): the name suggests BGRA byte order with 4 bytes per pixel and tightly packed
// rows (stride = width * 4) - confirm against the producer of the buffer.
struct BgraFrameView {
// First byte of the pixel data; nullptr means "no frame available".
const BYTE* data = nullptr;
// Frame width in pixels.
int width = 0;
// Frame height in pixels.
int height = 0;
};
struct AudioInputFormat {
GUID subtype = MFAudioFormat_PCM;
UINT32 sampleRate = 0;
@@ -37,13 +43,17 @@ public:
ID3D11Device* device,
ID3D11DeviceContext* context,
const AudioInputFormat* audioFormat = nullptr);
bool writeFrame(ID3D11Texture2D* texture, int64_t timestampHns);
bool writeFrame(ID3D11Texture2D* texture, int64_t timestampHns, const BgraFrameView* webcamFrame = nullptr);
bool writeAudio(const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns);
bool finalize();
private:
bool ensureStagingTexture(ID3D11Texture2D* texture);
bool copyFrameToBuffer(ID3D11Texture2D* texture, BYTE* destination, DWORD destinationSize);
bool copyFrameToBuffer(
ID3D11Texture2D* texture,
BYTE* destination,
DWORD destinationSize,
const BgraFrameView* webcamFrame);
bool configureAudioStream(const AudioInputFormat& audioFormat);
Microsoft::WRL::ComPtr<IMFSinkWriter> sinkWriter_;
@@ -0,0 +1,275 @@
#include "webcam_capture.h"
#include <mfapi.h>
#include <mferror.h>
#include <propvarutil.h>
#include <algorithm>
#include <chrono>
#include <iostream>
namespace {
// Reports whether `hr` is a success code. On failure an "ERROR: <label> failed" line carrying
// the HRESULT in hexadecimal is written to stderr so call sites can simply `return false`.
bool succeeded(HRESULT hr, const char* label) {
    const bool ok = SUCCEEDED(hr);
    if (!ok) {
        std::cerr << "ERROR: " << label << " failed (hr=0x" << std::hex << hr << std::dec << ")"
                  << std::endl;
    }
    return ok;
}
// Reads the string attribute `key` from `activate` and returns it as a std::wstring.
// The buffer handed out by GetAllocatedString is released with CoTaskMemFree once copied.
// Returns an empty string when the attribute cannot be read.
std::wstring readAllocatedString(IMFActivate* activate, REFGUID key) {
    WCHAR* raw = nullptr;
    UINT32 charCount = 0;
    const HRESULT hr = activate->GetAllocatedString(key, &raw, &charCount);
    if (SUCCEEDED(hr) && raw) {
        std::wstring text(raw, raw + charCount);
        CoTaskMemFree(raw);
        return text;
    }
    return {};
}
// Case-insensitive, bidirectional substring test: true when either string contains the other
// after lower-casing both with towlower. Two empty-or-partially-empty inputs never match.
bool containsInsensitive(const std::wstring& haystack, const std::wstring& needle) {
    if (haystack.empty() || needle.empty()) {
        return false;
    }

    // Works on a by-value copy so the caller's strings stay untouched.
    const auto lowered = [](std::wstring text) {
        for (wchar_t& ch : text) {
            ch = static_cast<wchar_t>(::towlower(ch));
        }
        return text;
    };
    const std::wstring first = lowered(haystack);
    const std::wstring second = lowered(needle);

    if (first.find(second) != std::wstring::npos) {
        return true;
    }
    return second.find(first) != std::wstring::npos;
}
} // namespace
// Tears the capture down via stop(), which joins the capture thread (if running), shuts down
// and releases the media source and reader, and calls MFShutdown when MFStartup succeeded.
WebcamCapture::~WebcamCapture() {
stop();
}
bool WebcamCapture::initialize(const std::wstring& deviceId, int requestedWidth, int requestedHeight, int requestedFps) {
fps_ = std::clamp(requestedFps > 0 ? requestedFps : 30, 1, 60);
if (!succeeded(MFStartup(MF_VERSION), "MFStartup(webcam)")) {
return false;
}
mfStarted_ = true;
if (!selectDevice(deviceId)) {
return false;
}
return configureReader(requestedWidth, requestedHeight, fps_);
}
// Enumerates the Media Foundation video capture devices and activates one as mediaSource_.
//
// deviceId  Identifier requested by the caller. It is matched case-insensitively as a
//           substring (in either direction) against each device's symbolic link and friendly
//           name. An empty id, or the literal "default" that the recorder config uses as its
//           placeholder value, means "no specific device" and selects the first enumerated
//           device without attempting a match.
//
// When a specific device was requested but nothing matches, a warning is logged and the first
// enumerated device is used. Returns false when enumeration fails, no device exists, or the
// chosen device cannot be activated.
bool WebcamCapture::selectDevice(const std::wstring& deviceId) {
    Microsoft::WRL::ComPtr<IMFAttributes> attributes;
    if (!succeeded(MFCreateAttributes(&attributes, 1), "MFCreateAttributes(webcam enumeration)")) {
        return false;
    }
    if (!succeeded(attributes->SetGUID(
                       MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE,
                       MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_GUID),
                   "SetGUID(webcam source type)")) {
        return false;
    }

    IMFActivate** devices = nullptr;
    UINT32 deviceCount = 0;
    HRESULT hr = MFEnumDeviceSources(attributes.Get(), &devices, &deviceCount);
    if (!succeeded(hr, "MFEnumDeviceSources") || deviceCount == 0) {
        if (devices) {
            CoTaskMemFree(devices);
        }
        std::cerr << "ERROR: No native Windows webcam devices were found" << std::endl;
        return false;
    }

    // "default" is a placeholder, not a device name: matching it as a substring would either
    // warn on every run or pick an unrelated device whose name happens to contain "default".
    const bool specificDeviceRequested = !deviceId.empty() && deviceId != L"default";

    UINT32 selectedIndex = 0;
    if (specificDeviceRequested) {
        bool matched = false;
        for (UINT32 index = 0; index < deviceCount && !matched; index += 1) {
            const std::wstring name = readAllocatedString(devices[index], MF_DEVSOURCE_ATTRIBUTE_FRIENDLY_NAME);
            const std::wstring symbolicLink =
                readAllocatedString(devices[index], MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_SYMBOLIC_LINK);
            if (containsInsensitive(symbolicLink, deviceId) || containsInsensitive(name, deviceId)) {
                selectedIndex = index;
                matched = true;
            }
        }
        if (!matched) {
            std::cerr << "WARNING: Requested webcam device was not found by Media Foundation; using default webcam"
                      << std::endl;
        }
    }

    selectedDeviceName_ = readAllocatedString(devices[selectedIndex], MF_DEVSOURCE_ATTRIBUTE_FRIENDLY_NAME);
    hr = devices[selectedIndex]->ActivateObject(IID_PPV_ARGS(&mediaSource_));

    // The enumeration hands over one reference per activation object plus the array itself.
    for (UINT32 index = 0; index < deviceCount; index += 1) {
        devices[index]->Release();
    }
    CoTaskMemFree(devices);

    return succeeded(hr, "ActivateObject(webcam)");
}
bool WebcamCapture::configureReader(int requestedWidth, int requestedHeight, int requestedFps) {
Microsoft::WRL::ComPtr<IMFAttributes> attributes;
if (!succeeded(MFCreateAttributes(&attributes, 2), "MFCreateAttributes(webcam reader)")) {
return false;
}
attributes->SetUINT32(MF_SOURCE_READER_ENABLE_VIDEO_PROCESSING, TRUE);
attributes->SetUINT32(MF_READWRITE_DISABLE_CONVERTERS, FALSE);
if (!succeeded(MFCreateSourceReaderFromMediaSource(mediaSource_.Get(), attributes.Get(), &sourceReader_),
"MFCreateSourceReaderFromMediaSource(webcam)")) {
return false;
}
Microsoft::WRL::ComPtr<IMFMediaType> mediaType;
if (!succeeded(MFCreateMediaType(&mediaType), "MFCreateMediaType(webcam output)")) {
return false;
}
mediaType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
mediaType->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_RGB32);
if (requestedWidth > 0 && requestedHeight > 0) {
MFSetAttributeSize(mediaType.Get(), MF_MT_FRAME_SIZE, static_cast<UINT32>(requestedWidth), static_cast<UINT32>(requestedHeight));
}
MFSetAttributeRatio(mediaType.Get(), MF_MT_FRAME_RATE, static_cast<UINT32>(std::max(1, requestedFps)), 1);
if (!succeeded(sourceReader_->SetCurrentMediaType(MF_SOURCE_READER_FIRST_VIDEO_STREAM, nullptr, mediaType.Get()),
"SetCurrentMediaType(webcam RGB32)")) {
return false;
}
sourceReader_->SetStreamSelection(MF_SOURCE_READER_ALL_STREAMS, FALSE);
sourceReader_->SetStreamSelection(MF_SOURCE_READER_FIRST_VIDEO_STREAM, TRUE);
Microsoft::WRL::ComPtr<IMFMediaType> currentType;
if (!succeeded(sourceReader_->GetCurrentMediaType(MF_SOURCE_READER_FIRST_VIDEO_STREAM, &currentType),
"GetCurrentMediaType(webcam)")) {
return false;
}
UINT32 width = 0;
UINT32 height = 0;
if (FAILED(MFGetAttributeSize(currentType.Get(), MF_MT_FRAME_SIZE, &width, &height)) || width == 0 || height == 0) {
width = static_cast<UINT32>(requestedWidth > 0 ? requestedWidth : 1280);
height = static_cast<UINT32>(requestedHeight > 0 ? requestedHeight : 720);
}
width_ = static_cast<int>(width);
height_ = static_cast<int>(height);
return true;
}
// Launches the background capture thread running captureLoop().
// Returns false when no source reader has been configured or a capture thread already exists.
bool WebcamCapture::start() {
    const bool readerReady = sourceReader_.Get() != nullptr;
    const bool alreadyRunning = thread_.joinable();
    if (readerReady && !alreadyRunning) {
        stopRequested_ = false;
        thread_ = std::thread(&WebcamCapture::captureLoop, this);
        return true;
    }
    return false;
}
// Stops capturing and releases every Media Foundation resource held by this object.
// Safe to call repeatedly and without a prior start(): each step is skipped when there is
// nothing to undo.
void WebcamCapture::stop() {
    // Ask the capture loop to exit, then wait for it before touching the reader it uses.
    stopRequested_ = true;
    if (thread_.joinable()) {
        thread_.join();
    }

    if (mediaSource_.Get() != nullptr) {
        mediaSource_->Shutdown();
    }
    sourceReader_.Reset();
    mediaSource_.Reset();

    // Balance the MFStartup performed by initialize(), exactly once.
    if (!mfStarted_) {
        return;
    }
    MFShutdown();
    mfStarted_ = false;
}
void WebcamCapture::captureLoop() {
CoInitializeEx(nullptr, COINIT_MULTITHREADED);
while (!stopRequested_) {
DWORD streamIndex = 0;
DWORD flags = 0;
LONGLONG timestamp = 0;
Microsoft::WRL::ComPtr<IMFSample> sample;
HRESULT hr = sourceReader_->ReadSample(
MF_SOURCE_READER_FIRST_VIDEO_STREAM,
0,
&streamIndex,
&flags,
&timestamp,
&sample);
(void)streamIndex;
(void)timestamp;
if (FAILED(hr)) {
std::cerr << "WARNING: Failed to read webcam sample (hr=0x" << std::hex << hr << std::dec << ")"
<< std::endl;
std::this_thread::sleep_for(std::chrono::milliseconds(20));
continue;
}
if ((flags & MF_SOURCE_READERF_ENDOFSTREAM) != 0) {
break;
}
if (!sample) {
continue;
}
Microsoft::WRL::ComPtr<IMFMediaBuffer> buffer;
if (FAILED(sample->ConvertToContiguousBuffer(&buffer)) || !buffer) {
continue;
}
BYTE* data = nullptr;
DWORD maxLength = 0;
DWORD currentLength = 0;
if (FAILED(buffer->Lock(&data, &maxLength, &currentLength)) || !data) {
continue;
}
const DWORD expectedLength = static_cast<DWORD>(std::max(0, width_) * std::max(0, height_) * 4);
if (currentLength >= expectedLength && expectedLength > 0) {
std::scoped_lock lock(frameMutex_);
latestFrame_.assign(data, data + expectedLength);
}
buffer->Unlock();
}
CoUninitialize();
}
bool WebcamCapture::copyLatestFrame(std::vector<BYTE>& destination, int& width, int& height) {
std::scoped_lock lock(frameMutex_);
if (latestFrame_.empty() || width_ <= 0 || height_ <= 0) {
return false;
}
destination = latestFrame_;
width = width_;
height = height_;
return true;
}
// Frame width in pixels as stored by configureReader(); 0 until the reader is configured.
int WebcamCapture::width() const {
return width_;
}
// Frame height in pixels as stored by configureReader(); 0 until the reader is configured.
int WebcamCapture::height() const {
return height_;
}
// Frame rate chosen by initialize(): the requested rate clamped to [1, 60]. This is the
// requested value, not one read back from the device.
int WebcamCapture::fps() const {
return fps_;
}
// Friendly name of the device picked by selectDevice(); empty until a device was selected.
// The returned reference stays valid for the lifetime of this object.
const std::wstring& WebcamCapture::selectedDeviceName() const {
return selectedDeviceName_;
}
@@ -0,0 +1,49 @@
#pragma once
#include <Windows.h>
#include <mfidl.h>
#include <mfreadwrite.h>
#include <wrl/client.h>
#include <atomic>
#include <cstdint>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
// Captures frames from a Media Foundation video capture device on a background thread and
// keeps the most recent frame available for polling through copyLatestFrame().
//
// Typical use: initialize() -> start() -> copyLatestFrame() as needed -> stop().
// stop() releases the source reader, so a stopped instance cannot be started again without
// calling initialize() first. The object is neither copyable nor movable.
class WebcamCapture {
public:
WebcamCapture() = default;
// Calls stop().
~WebcamCapture();
WebcamCapture(const WebcamCapture&) = delete;
WebcamCapture& operator=(const WebcamCapture&) = delete;
// Starts Media Foundation, selects a device for `deviceId`, and configures the reader.
// Returns false when any of those steps fails.
bool initialize(const std::wstring& deviceId, int requestedWidth, int requestedHeight, int requestedFps);
// Spawns the capture thread. Returns false without a configured reader or when a capture
// thread already exists.
bool start();
// Signals and joins the capture thread, then shuts down and releases the media source,
// the reader, and Media Foundation. May be called more than once.
void stop();
// Copies the latest captured frame and its size into the arguments; returns false (leaving
// them untouched) while no frame is available.
bool copyLatestFrame(std::vector<BYTE>& destination, int& width, int& height);
// Frame size stored by configureReader() and frame rate chosen by initialize().
int width() const;
int height() const;
int fps() const;
// Friendly name of the device picked during initialize().
const std::wstring& selectedDeviceName() const;
private:
// Enumerates capture devices and activates one into mediaSource_.
bool selectDevice(const std::wstring& deviceId);
// Creates sourceReader_ and negotiates the RGB32 output format.
bool configureReader(int requestedWidth, int requestedHeight, int requestedFps);
// Capture thread body; reads samples until stopRequested_ is set or the stream ends.
void captureLoop();
Microsoft::WRL::ComPtr<IMFMediaSource> mediaSource_;
Microsoft::WRL::ComPtr<IMFSourceReader> sourceReader_;
std::thread thread_;
// Set by stop() and polled by captureLoop().
std::atomic<bool> stopRequested_ = false;
// Guards latestFrame_, which is written by the capture thread and read by copyLatestFrame().
std::mutex frameMutex_;
// Most recent frame, exactly width_ * height_ * 4 bytes once populated.
std::vector<BYTE> latestFrame_;
int width_ = 0;
int height_ = 0;
int fps_ = 30;
// True between a successful MFStartup and the matching MFShutdown.
bool mfStarted_ = false;
std::wstring selectedDeviceName_;
};