fix: resolve selected Windows microphone

This commit is contained in:
EtienneLescot
2026-05-05 20:36:59 +02:00
parent 0ebf5c143b
commit c7b43a50ef
11 changed files with 355 additions and 21 deletions
+10 -1
View File
@@ -34,6 +34,7 @@ Current V2 JSON shape:
"captureSystemAudio": false,
"captureMic": false,
"microphoneDeviceId": "default",
"microphoneDeviceName": "Microphone (NVIDIA Broadcast)",
"microphoneGain": 1.4,
"webcamEnabled": true,
"webcamDeviceId": "default",
@@ -47,7 +48,7 @@ Current V2 JSON shape:
}
```
The current helper implementation supports display/window video capture, system audio loopback, default-microphone capture, Media Foundation webcam capture, and a DirectShow webcam fallback for virtual cameras that are not exposed through Media Foundation. Webcam frames are currently composed into the primary MP4 as a bottom-right picture-in-picture overlay. Browser `deviceId` values do not always map to Media Foundation symbolic links, so the renderer passes both `webcamDeviceId` and `webcamDeviceName`. Electron resolves a matching DirectShow filter CLSID for the selected label; the helper uses Media Foundation first, then that exact DirectShow filter when the requested camera is absent from Media Foundation.
The current helper implementation supports display/window video capture, system audio loopback, selected-microphone capture, Media Foundation webcam capture, and a DirectShow webcam fallback for virtual cameras that are not exposed through Media Foundation. Webcam frames are currently composed into the primary MP4 as a bottom-right picture-in-picture overlay. Browser `deviceId` values do not always map to Media Foundation symbolic links or WASAPI endpoint IDs, so the renderer passes both browser IDs and user-visible device names. For microphones, the helper tries the requested WASAPI endpoint ID first, then resolves an active capture endpoint by `microphoneDeviceName`, then falls back to the default endpoint. For webcams, Electron resolves a matching DirectShow filter CLSID for the selected label; the helper uses Media Foundation first, then that exact DirectShow filter when the requested camera is absent from Media Foundation.
Smoke-test the helper with:
@@ -67,3 +68,11 @@ $env:OPENSCREEN_WGC_TEST_WEBCAM_DEVICE_NAME = "NVIDIA Broadcast"
npm run test:wgc-webcam:win
Remove-Item Env:OPENSCREEN_WGC_TEST_WEBCAM_DEVICE_NAME
```
To validate a specific native microphone manually:
```powershell
$env:OPENSCREEN_WGC_TEST_MICROPHONE_DEVICE_NAME = "Microphone (NVIDIA Broadcast)"
npm run test:wgc-mic:win
Remove-Item Env:OPENSCREEN_WGC_TEST_MICROPHONE_DEVICE_NAME
```
@@ -25,6 +25,68 @@ T clampTo(double value) {
return static_cast<T>(std::clamp(std::round(value), minValue, maxValue));
}
// Width in bytes of one single-channel sample, derived from the format's bit depth.
size_t bytesPerSample(const AudioInputFormat& format) {
    constexpr auto bitsPerByte = 8;
    return format.bitsPerSample / bitsPerByte;
}
double readSampleAsDouble(const BYTE* source, const AudioInputFormat& format, size_t frameIndex, UINT32 channelIndex) {
if (!source || format.blockAlign == 0 || channelIndex >= format.channels) {
return 0.0;
}
const size_t offset = frameIndex * format.blockAlign + channelIndex * bytesPerSample(format);
if (isFloatFormat(format)) {
return static_cast<double>(*reinterpret_cast<const float*>(source + offset));
}
if (isPcmFormat(format, 16)) {
return static_cast<double>(*reinterpret_cast<const int16_t*>(source + offset)) / 32768.0;
}
if (isPcmFormat(format, 32)) {
return static_cast<double>(*reinterpret_cast<const int32_t*>(source + offset)) / 2147483648.0;
}
return 0.0;
}
// Stores `value` as the sample for `channelIndex` of frame `frameIndex` in an
// interleaved buffer, encoded according to `format`.
//
// The value is limited to [-1.0, 1.0] before encoding; NaN is written as
// silence (0.0). Float formats store the limited value directly, 16-bit and
// 32-bit integer PCM scale it by 32767 and 2147483647 respectively.
// Does nothing when `destination` is null, `format.blockAlign` is zero, the
// channel index is out of range, or the format is none of the three handled
// encodings. The caller must guarantee that `frameIndex` lies inside the buffer.
void writeSampleFromDouble(BYTE* destination, const AudioInputFormat& format, size_t frameIndex, UINT32 channelIndex, double value) {
    if (!destination || format.blockAlign == 0 || channelIndex >= format.channels) {
        return;
    }

    // std::clamp passes NaN through unchanged, and converting NaN to an integer
    // type is undefined behaviour, so map it to silence up front.
    const double clamped = std::isnan(value) ? 0.0 : std::clamp(value, -1.0, 1.0);
    BYTE* target = destination + frameIndex * format.blockAlign + channelIndex * bytesPerSample(format);

    // Store through memcpy instead of a reinterpret_cast pointer: the byte
    // buffer carries no alignment guarantee for the wider sample types.
    if (isFloatFormat(format)) {
        const float encoded = static_cast<float>(clamped);
        std::memcpy(target, &encoded, sizeof(encoded));
        return;
    }
    if (isPcmFormat(format, 16)) {
        const int16_t encoded = clampTo<int16_t>(clamped * 32767.0);
        std::memcpy(target, &encoded, sizeof(encoded));
        return;
    }
    if (isPcmFormat(format, 32)) {
        const int32_t encoded = clampTo<int32_t>(clamped * 2147483647.0);
        std::memcpy(target, &encoded, sizeof(encoded));
    }
}
double readMappedChannel(const BYTE* source, const AudioInputFormat& format, size_t frameIndex, UINT32 targetChannel, UINT32 targetChannels) {
if (format.channels == 0) {
return 0.0;
}
if (format.channels == targetChannels && targetChannel < format.channels) {
return readSampleAsDouble(source, format, frameIndex, targetChannel);
}
if (format.channels == 1) {
return readSampleAsDouble(source, format, frameIndex, 0);
}
if (targetChannels == 1) {
double sum = 0.0;
for (UINT32 channel = 0; channel < format.channels; ++channel) {
sum += readSampleAsDouble(source, format, frameIndex, channel);
}
return sum / static_cast<double>(format.channels);
}
return readSampleAsDouble(source, format, frameIndex, std::min(targetChannel, format.channels - 1));
}
} // namespace
constexpr int64_t HnsPerSecond = 10'000'000;
@@ -88,6 +150,53 @@ void copyAudioWithGain(
std::memcpy(destination.data(), source, byteCount);
}
void convertAudioWithGain(
const BYTE* source,
DWORD byteCount,
const AudioInputFormat& sourceFormat,
const AudioInputFormat& targetFormat,
double gain,
std::vector<BYTE>& destination) {
if (!source || byteCount == 0 || sourceFormat.blockAlign == 0 || targetFormat.blockAlign == 0 ||
sourceFormat.sampleRate == 0 || targetFormat.sampleRate == 0 || sourceFormat.channels == 0 ||
targetFormat.channels == 0) {
destination.clear();
return;
}
if (sameAudioFormatForMixing(sourceFormat, targetFormat)) {
copyAudioWithGain(source, byteCount, targetFormat, gain, destination);
return;
}
const size_t sourceFrames = byteCount / sourceFormat.blockAlign;
if (sourceFrames == 0) {
destination.clear();
return;
}
const double rateRatio = static_cast<double>(targetFormat.sampleRate) /
static_cast<double>(sourceFormat.sampleRate);
const size_t targetFrames = std::max<size_t>(1, static_cast<size_t>(std::llround(sourceFrames * rateRatio)));
destination.assign(targetFrames * targetFormat.blockAlign, 0);
for (size_t targetFrame = 0; targetFrame < targetFrames; ++targetFrame) {
const double sourcePosition = static_cast<double>(targetFrame) / rateRatio;
const size_t sourceFrame = std::min(
sourceFrames - 1,
static_cast<size_t>(std::llround(sourcePosition)));
for (UINT32 channel = 0; channel < targetFormat.channels; ++channel) {
const double sample = readMappedChannel(
source,
sourceFormat,
sourceFrame,
channel,
targetFormat.channels);
writeSampleFromDouble(destination.data(), targetFormat, targetFrame, channel, sample * gain);
}
}
}
void mixAudioInPlace(
std::vector<BYTE>& destination,
const BYTE* source,
@@ -133,11 +242,15 @@ void mixAudioInPlace(
AudioMixer::AudioMixer(
const AudioInputFormat& format,
const AudioInputFormat& systemFormat,
const AudioInputFormat& microphoneFormat,
bool includeSystem,
bool includeMicrophone,
double microphoneGain,
OutputCallback output)
: format_(format),
systemFormat_(systemFormat),
microphoneFormat_(microphoneFormat),
includeSystem_(includeSystem),
includeMicrophone_(includeMicrophone),
microphoneGain_(microphoneGain),
@@ -187,7 +300,7 @@ void AudioMixer::pushSystem(const BYTE* data, DWORD byteCount) {
{
std::scoped_lock lock(mutex_);
append(systemQueue_, data, byteCount, 1.0);
append(systemQueue_, data, byteCount, systemFormat_, 1.0);
}
cv_.notify_all();
}
@@ -199,17 +312,22 @@ void AudioMixer::pushMicrophone(const BYTE* data, DWORD byteCount) {
{
std::scoped_lock lock(mutex_);
append(microphoneQueue_, data, byteCount, microphoneGain_);
append(microphoneQueue_, data, byteCount, microphoneFormat_, microphoneGain_);
}
cv_.notify_all();
}
void AudioMixer::append(std::vector<BYTE>& queue, const BYTE* data, DWORD byteCount, double gain) {
void AudioMixer::append(
std::vector<BYTE>& queue,
const BYTE* data,
DWORD byteCount,
const AudioInputFormat& sourceFormat,
double gain) {
if (!data || byteCount == 0) {
return;
}
copyAudioWithGain(data, byteCount, format_, gain, gainBuffer_);
convertAudioWithGain(data, byteCount, sourceFormat, format_, gain, gainBuffer_);
queue.insert(queue.end(), gainBuffer_.begin(), gainBuffer_.end());
}
@@ -19,6 +19,13 @@ void copyAudioWithGain(
const AudioInputFormat& format,
double gain,
std::vector<BYTE>& destination);
void convertAudioWithGain(
const BYTE* source,
DWORD byteCount,
const AudioInputFormat& sourceFormat,
const AudioInputFormat& targetFormat,
double gain,
std::vector<BYTE>& destination);
void mixAudioInPlace(
std::vector<BYTE>& destination,
const BYTE* source,
@@ -31,6 +38,8 @@ public:
AudioMixer(
const AudioInputFormat& format,
const AudioInputFormat& systemFormat,
const AudioInputFormat& microphoneFormat,
bool includeSystem,
bool includeMicrophone,
double microphoneGain,
@@ -47,11 +56,18 @@ public:
void pushMicrophone(const BYTE* data, DWORD byteCount);
private:
void append(std::vector<BYTE>& queue, const BYTE* data, DWORD byteCount, double gain);
void append(
std::vector<BYTE>& queue,
const BYTE* data,
DWORD byteCount,
const AudioInputFormat& sourceFormat,
double gain);
bool pop(std::vector<BYTE>& queue, std::vector<BYTE>& chunk, size_t byteCount);
void mixLoop();
AudioInputFormat format_{};
AudioInputFormat systemFormat_{};
AudioInputFormat microphoneFormat_{};
bool includeSystem_ = false;
bool includeMicrophone_ = false;
double microphoneGain_ = 1.0;
+17 -6
View File
@@ -38,6 +38,7 @@ struct CaptureConfig {
bool captureMic = false;
bool webcamEnabled = false;
std::string microphoneDeviceId;
std::string microphoneDeviceName;
double microphoneGain = 1.0;
std::string webcamDeviceId;
std::string webcamDeviceName;
@@ -303,6 +304,7 @@ bool parseConfig(const std::string& json, CaptureConfig& config) {
config.captureMic = findBool(json, "captureMic", false);
config.webcamEnabled = findBool(json, "webcamEnabled", false);
config.microphoneDeviceId = findString(json, "microphoneDeviceId");
config.microphoneDeviceName = findString(json, "microphoneDeviceName");
config.microphoneGain = findDouble(json, "microphoneGain", 1.0);
config.webcamDeviceId = findString(json, "webcamDeviceId");
config.webcamDeviceName = findString(json, "webcamDeviceName");
@@ -406,24 +408,26 @@ int main(int argc, char* argv[]) {
WasapiLoopbackCapture loopbackCapture;
WasapiLoopbackCapture microphoneCapture;
const AudioInputFormat* audioFormat = nullptr;
AudioInputFormat systemAudioFormat{};
AudioInputFormat microphoneAudioFormat{};
if (config.captureSystemAudio) {
if (!loopbackCapture.initializeSystemLoopback()) {
std::cerr << "ERROR: Failed to initialize WASAPI loopback capture" << std::endl;
return 1;
}
systemAudioFormat = loopbackCapture.inputFormat();
audioFormat = &loopbackCapture.inputFormat();
}
if (config.captureMic) {
if (!microphoneCapture.initializeMicrophone(utf8ToWide(config.microphoneDeviceId))) {
if (!microphoneCapture.initializeMicrophone(
utf8ToWide(config.microphoneDeviceId),
utf8ToWide(config.microphoneDeviceName))) {
std::cerr << "ERROR: Failed to initialize WASAPI microphone capture" << std::endl;
return 1;
}
microphoneAudioFormat = microphoneCapture.inputFormat();
if (!audioFormat) {
audioFormat = &microphoneCapture.inputFormat();
} else if (!sameAudioFormatForMixing(*audioFormat, microphoneCapture.inputFormat())) {
std::cerr << "ERROR: System audio and microphone formats differ; native mixing is not supported yet"
<< std::endl;
return 1;
}
}
if (audioFormat) {
@@ -431,7 +435,12 @@ int main(int argc, char* argv[]) {
<< ",\"channels\":" << audioFormat->channels
<< ",\"bitsPerSample\":" << audioFormat->bitsPerSample
<< ",\"system\":" << (config.captureSystemAudio ? "true" : "false")
<< ",\"microphone\":" << (config.captureMic ? "true" : "false") << "}" << std::endl;
<< ",\"microphone\":" << (config.captureMic ? "true" : "false");
if (config.captureMic) {
std::cout << ",\"microphoneDeviceName\":\""
<< jsonEscape(wideToUtf8(microphoneCapture.selectedDeviceName())) << "\"";
}
std::cout << "}" << std::endl;
}
MFEncoder encoder;
@@ -549,6 +558,8 @@ int main(int argc, char* argv[]) {
audioMixer = std::make_unique<AudioMixer>(
*audioFormat,
config.captureSystemAudio ? systemAudioFormat : *audioFormat,
config.captureMic ? microphoneAudioFormat : *audioFormat,
config.captureSystemAudio,
config.captureMic,
config.microphoneGain,
@@ -1,9 +1,12 @@
#include "wasapi_loopback_capture.h"
#include <Functiondiscoverykeys_devpkey.h>
#include <ksmedia.h>
#include <propvarutil.h>
#include <algorithm>
#include <chrono>
#include <cwctype>
#include <iostream>
namespace {
@@ -41,6 +44,86 @@ GUID audioSubtypeFromFormat(WAVEFORMATEX* format) {
return GUID_NULL;
}
// Produces a comparison key for a device name: alphanumeric characters are
// lower-cased, every run of other characters becomes a single space, and no
// space appears at the start or end of the result.
std::wstring normalizeDeviceName(const std::wstring& value) {
    std::wstring normalized;
    normalized.reserve(value.size());

    // Set when a separator has been seen after at least one emitted character;
    // the space is only written once the next alphanumeric character arrives,
    // so leading and trailing separators never reach the output.
    bool separatorPending = false;
    for (const wchar_t ch : value) {
        if (!std::iswalnum(ch)) {
            separatorPending = !normalized.empty();
            continue;
        }
        if (separatorPending) {
            normalized.push_back(L' ');
            separatorPending = false;
        }
        normalized.push_back(static_cast<wchar_t>(std::towlower(ch)));
    }
    return normalized;
}
// Rates how well a capture endpoint (friendly name plus endpoint id) matches a
// requested device name. All three strings are compared in normalized form.
//
// Scores:
//   0    - the requested name normalizes to nothing, or nothing matched
//   1000 - normalized friendly name equals the requested name
//   900  - one of friendly name / requested name contains the other
//   800  - one of endpoint id / requested name contains the other
//   else - per requested word: +100 if found in the friendly name, otherwise
//          +50 if found in the endpoint id. Single-character words and the
//          generic words "microphone", "mic", "audio", "input" are ignored.
int scoreDeviceName(const std::wstring& candidateName, const std::wstring& candidateId, const std::wstring& requestedName) {
    const std::wstring wanted = normalizeDeviceName(requestedName);
    if (wanted.empty()) {
        return 0;
    }
    const std::wstring friendly = normalizeDeviceName(candidateName);
    const std::wstring endpointId = normalizeDeviceName(candidateId);

    // True when `text` is non-empty and either string is a substring of the other.
    const auto overlapsWanted = [&wanted](const std::wstring& text) {
        if (text.empty()) {
            return false;
        }
        return text.find(wanted) != std::wstring::npos || wanted.find(text) != std::wstring::npos;
    };

    if (friendly == wanted) {
        return 1000;
    }
    if (overlapsWanted(friendly)) {
        return 900;
    }
    if (overlapsWanted(endpointId)) {
        return 800;
    }

    const auto isGenericWord = [](const std::wstring& token) {
        return token == L"microphone" || token == L"mic" || token == L"audio" || token == L"input";
    };

    // Fall back to word-by-word matching; normalized text separates words with single spaces.
    int total = 0;
    for (size_t start = 0; start < wanted.size();) {
        const size_t space = wanted.find(L' ', start);
        const bool isLastWord = space == std::wstring::npos;
        const std::wstring token = isLastWord ? wanted.substr(start) : wanted.substr(start, space - start);

        if (token.size() > 1 && !isGenericWord(token)) {
            if (friendly.find(token) != std::wstring::npos) {
                total += 100;
            } else if (endpointId.find(token) != std::wstring::npos) {
                total += 50;
            }
        }

        if (isLastWord) {
            break;
        }
        start = space + 1;
    }
    return total;
}
// Reads PKEY_Device_FriendlyName from the device's property store.
// Returns an empty string when `device` is null, the property store cannot be
// opened, or the property is missing or not a wide-string value.
std::wstring getDeviceFriendlyName(IMMDevice* device) {
    std::wstring friendlyName;
    if (device == nullptr) {
        return friendlyName;
    }

    Microsoft::WRL::ComPtr<IPropertyStore> store;
    const HRESULT openResult = device->OpenPropertyStore(STGM_READ, &store);
    if (FAILED(openResult) || !store) {
        return friendlyName;
    }

    PROPVARIANT property;
    PropVariantInit(&property);
    const HRESULT readResult = store->GetValue(PKEY_Device_FriendlyName, &property);
    const bool holdsString = SUCCEEDED(readResult) && property.vt == VT_LPWSTR && property.pwszVal;
    if (holdsString) {
        friendlyName = property.pwszVal;
    }
    // Always clear: the variant was initialized above regardless of the GetValue result.
    PropVariantClear(&property);
    return friendlyName;
}
} // namespace
WasapiLoopbackCapture::~WasapiLoopbackCapture() {
@@ -52,14 +135,14 @@ WasapiLoopbackCapture::~WasapiLoopbackCapture() {
}
bool WasapiLoopbackCapture::initializeSystemLoopback() {
return initialize(WasapiCaptureEndpoint::SystemLoopback, {});
return initialize(WasapiCaptureEndpoint::SystemLoopback, {}, {});
}
bool WasapiLoopbackCapture::initializeMicrophone(const std::wstring& deviceId) {
return initialize(WasapiCaptureEndpoint::Microphone, deviceId);
bool WasapiLoopbackCapture::initializeMicrophone(const std::wstring& deviceId, const std::wstring& deviceName) {
return initialize(WasapiCaptureEndpoint::Microphone, deviceId, deviceName);
}
bool WasapiLoopbackCapture::initialize(WasapiCaptureEndpoint endpoint, const std::wstring& deviceId) {
bool WasapiLoopbackCapture::initialize(WasapiCaptureEndpoint endpoint, const std::wstring& deviceId, const std::wstring& deviceName) {
HRESULT hr = CoCreateInstance(
__uuidof(MMDeviceEnumerator),
nullptr,
@@ -72,12 +155,19 @@ bool WasapiLoopbackCapture::initialize(WasapiCaptureEndpoint endpoint, const std
if (endpoint == WasapiCaptureEndpoint::Microphone && !deviceId.empty() && deviceId != L"default") {
hr = deviceEnumerator_->GetDevice(deviceId.c_str(), &device_);
if (FAILED(hr)) {
std::wcerr << L"WARNING: Could not resolve microphone device id; using default capture endpoint"
std::wcerr << L"WARNING: Could not resolve microphone device id directly"
<< std::endl;
device_.Reset();
}
}
if (endpoint == WasapiCaptureEndpoint::Microphone && !device_ && !deviceName.empty()) {
if (!resolveMicrophoneByName(deviceName)) {
std::wcerr << L"WARNING: Could not resolve microphone by name; using default capture endpoint"
<< std::endl;
}
}
if (!device_) {
const EDataFlow flow =
endpoint == WasapiCaptureEndpoint::SystemLoopback ? eRender : eCapture;
@@ -87,6 +177,8 @@ bool WasapiLoopbackCapture::initialize(WasapiCaptureEndpoint endpoint, const std
}
}
selectedDeviceName_ = getDeviceFriendlyName(device_.Get());
hr = device_->Activate(__uuidof(IAudioClient), CLSCTX_ALL, nullptr, &audioClient_);
if (!succeeded(hr, "IMMDevice::Activate(IAudioClient)")) {
return false;
@@ -123,6 +215,61 @@ bool WasapiLoopbackCapture::initialize(WasapiCaptureEndpoint endpoint, const std
return true;
}
bool WasapiLoopbackCapture::resolveMicrophoneByName(const std::wstring& deviceName) {
if (!deviceEnumerator_ || deviceName.empty()) {
return false;
}
Microsoft::WRL::ComPtr<IMMDeviceCollection> devices;
HRESULT hr = deviceEnumerator_->EnumAudioEndpoints(eCapture, DEVICE_STATE_ACTIVE, &devices);
if (!succeeded(hr, "IMMDeviceEnumerator::EnumAudioEndpoints(eCapture)")) {
return false;
}
UINT count = 0;
hr = devices->GetCount(&count);
if (!succeeded(hr, "IMMDeviceCollection::GetCount")) {
return false;
}
Microsoft::WRL::ComPtr<IMMDevice> bestDevice;
std::wstring bestId;
std::wstring bestName;
int bestScore = 0;
for (UINT i = 0; i < count; ++i) {
Microsoft::WRL::ComPtr<IMMDevice> candidate;
hr = devices->Item(i, &candidate);
if (FAILED(hr) || !candidate) {
continue;
}
LPWSTR rawId = nullptr;
std::wstring candidateId;
if (SUCCEEDED(candidate->GetId(&rawId)) && rawId) {
candidateId = rawId;
CoTaskMemFree(rawId);
}
const std::wstring candidateName = getDeviceFriendlyName(candidate.Get());
const int score = scoreDeviceName(candidateName, candidateId, deviceName);
std::wcerr << L"Native microphone candidate: " << candidateName << L" score=" << score << std::endl;
if (score > bestScore) {
bestScore = score;
bestDevice = candidate;
bestId = candidateId;
bestName = candidateName;
}
}
if (!bestDevice || bestScore <= 0) {
return false;
}
device_ = bestDevice;
std::wcerr << L"Selected native microphone endpoint: " << bestName << L" id=" << bestId << std::endl;
return true;
}
bool WasapiLoopbackCapture::resolveInputFormat(WAVEFORMATEX* mixFormat) {
const GUID subtype = audioSubtypeFromFormat(mixFormat);
if (subtype == GUID_NULL) {
@@ -172,6 +319,10 @@ const AudioInputFormat& WasapiLoopbackCapture::inputFormat() const {
return inputFormat_;
}
// Returns the stored friendly name of the selected endpoint. The reference
// points at a member, so it is only valid while this object is alive.
const std::wstring& WasapiLoopbackCapture::selectedDeviceName() const {
return selectedDeviceName_;
}
void WasapiLoopbackCapture::captureLoop() {
while (!stopRequested_) {
UINT32 packetFrames = 0;
@@ -30,14 +30,16 @@ public:
WasapiLoopbackCapture& operator=(const WasapiLoopbackCapture&) = delete;
bool initializeSystemLoopback();
bool initializeMicrophone(const std::wstring& deviceId);
bool initializeMicrophone(const std::wstring& deviceId, const std::wstring& deviceName);
bool start(AudioCallback callback);
void stop();
const AudioInputFormat& inputFormat() const;
const std::wstring& selectedDeviceName() const;
private:
bool initialize(WasapiCaptureEndpoint endpoint, const std::wstring& deviceId);
bool initialize(WasapiCaptureEndpoint endpoint, const std::wstring& deviceId, const std::wstring& deviceName);
bool resolveMicrophoneByName(const std::wstring& deviceName);
void captureLoop();
bool resolveInputFormat(WAVEFORMATEX* mixFormat);
@@ -47,6 +49,7 @@ private:
Microsoft::WRL::ComPtr<IAudioCaptureClient> captureClient_;
WAVEFORMATEX* mixFormat_ = nullptr;
AudioInputFormat inputFormat_{};
std::wstring selectedDeviceName_;
AudioCallback callback_;
std::thread thread_;
std::atomic<bool> stopRequested_ = false;