diff --git a/docs/engineering/windows-native-recorder-roadmap.md b/docs/engineering/windows-native-recorder-roadmap.md index 2fb6ab5..12c6d49 100644 --- a/docs/engineering/windows-native-recorder-roadmap.md +++ b/docs/engineering/windows-native-recorder-roadmap.md @@ -138,7 +138,7 @@ SSOT rules for this phase: ### 3. WASAPI Microphone -Status: initial implementation in progress. The helper can open the default WASAPI capture endpoint, apply the OpenScreen microphone gain, encode mic-only audio, and mix mic into system-loopback packets when both endpoints expose the same runtime format. Browser `deviceId` to MMDevice id mapping, resampling between mismatched endpoint formats, and drift correction remain follow-up hardening work. +Status: initial implementation in progress. The helper can open the default WASAPI capture endpoint, apply the OpenScreen microphone gain, encode mic-only audio, and mix system loopback plus microphone through a single queued `AudioMixer` timeline when both endpoints expose the same runtime format. Audio endpoints are warmed before WGC starts, the mixer drops pre-roll and begins its paced timeline on the first encoded video frame, then cuts queued tail audio on stop so the MP4 does not drift past the video. Browser `deviceId` to MMDevice id mapping, resampling between mismatched endpoint formats, and drift correction remain follow-up hardening work. - Add microphone device enumeration and stable device-id mapping. - Capture selected/default microphone through WASAPI. diff --git a/electron/native/wgc-capture/src/audio_sample_utils.cpp b/electron/native/wgc-capture/src/audio_sample_utils.cpp index 6537d8e..1e5e1bb 100644 --- a/electron/native/wgc-capture/src/audio_sample_utils.cpp +++ b/electron/native/wgc-capture/src/audio_sample_utils.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,8 @@ T clampTo(double value) { } // namespace +constexpr int64_t HnsPerSecond = 10'000'000; + bool sameAudioFormatForMixing(const AudioInputFormat& left, const AudioInputFormat& right) { return left.subtype == right.subtype && left.sampleRate == right.sampleRate && @@ -43,6 +46,7 @@ void copyAudioWithGain( std::vector& destination) { destination.resize(byteCount); if (!source || byteCount == 0) { + std::fill(destination.begin(), destination.end(), static_cast(0)); return; } @@ -126,3 +130,162 @@ void mixAudioInPlace( } } } + +AudioMixer::AudioMixer( + const AudioInputFormat& format, + bool includeSystem, + bool includeMicrophone, + double microphoneGain, + OutputCallback output) + : format_(format), + includeSystem_(includeSystem), + includeMicrophone_(includeMicrophone), + microphoneGain_(microphoneGain), + output_(std::move(output)) {} + +AudioMixer::~AudioMixer() { + stop(); +} + +bool AudioMixer::start() { + if (!output_ || format_.sampleRate == 0 || format_.blockAlign == 0) { + return false; + } + + stopRequested_ = false; + emittedFrames_ = 0; + timelineStarted_ = false; + thread_ = std::thread([this] { + mixLoop(); + }); + return true; +} + +void AudioMixer::beginTimeline() { + { + std::scoped_lock lock(mutex_); + systemQueue_.clear(); + microphoneQueue_.clear(); + emittedFrames_ = 0; + timelineStarted_ = true; + } + cv_.notify_all(); +} + +void AudioMixer::stop() { + stopRequested_ = true; + cv_.notify_all(); + if (thread_.joinable()) { + thread_.join(); + } +} + +void AudioMixer::pushSystem(const BYTE* data, DWORD byteCount) { + if (!includeSystem_ || stopRequested_) { + return; + } + + { + std::scoped_lock lock(mutex_); + append(systemQueue_, data, byteCount, 1.0); + } + cv_.notify_all(); +} + +void AudioMixer::pushMicrophone(const BYTE* data, DWORD byteCount) { + if (!includeMicrophone_ || stopRequested_) { + return; + } + + { + std::scoped_lock lock(mutex_); + append(microphoneQueue_, data, byteCount, microphoneGain_); + } + cv_.notify_all(); +} + +void AudioMixer::append(std::vector& queue, const BYTE* data, DWORD byteCount, double gain) { + if (!data || byteCount == 0) { + return; + } + + copyAudioWithGain(data, byteCount, format_, gain, gainBuffer_); + queue.insert(queue.end(), gainBuffer_.begin(), gainBuffer_.end()); +} + +bool AudioMixer::pop(std::vector& queue, std::vector& chunk, size_t byteCount) { + if (queue.empty()) { + chunk.assign(byteCount, 0); + return false; + } + + chunk.assign(byteCount, 0); + const size_t copiedBytes = std::min(byteCount, queue.size()); + std::memcpy(chunk.data(), queue.data(), copiedBytes); + queue.erase(queue.begin(), queue.begin() + static_cast(copiedBytes)); + return copiedBytes > 0; +} + +void AudioMixer::mixLoop() { + const uint32_t chunkFrames = std::max(1, format_.sampleRate / 100); + const size_t chunkBytes = static_cast(chunkFrames) * format_.blockAlign; + std::vector mixedChunk; + std::vector sourceChunk; + std::chrono::steady_clock::time_point audioClockStart; + bool audioClockStarted = false; + + while (true) { + { + std::unique_lock lock(mutex_); + cv_.wait_for(lock, std::chrono::milliseconds(20), [&] { + const bool hasSystem = !includeSystem_ || systemQueue_.size() >= chunkBytes; + const bool hasMicrophone = !includeMicrophone_ || microphoneQueue_.size() >= chunkBytes; + const bool hasAnySource = !systemQueue_.empty() || !microphoneQueue_.empty(); + return stopRequested_.load() || + (timelineStarted_ && (hasSystem || hasMicrophone) && hasAnySource); + }); + + if (stopRequested_) { + break; + } + if (!timelineStarted_) { + continue; + } + + const bool hasAnyQueuedAudio = !systemQueue_.empty() || !microphoneQueue_.empty(); + if (!hasAnyQueuedAudio) { + continue; + } + + mixedChunk.assign(chunkBytes, 0); + if (includeSystem_) { + pop(systemQueue_, sourceChunk, chunkBytes); + mixAudioInPlace(mixedChunk, sourceChunk.data(), static_cast(sourceChunk.size()), format_); + } + if (includeMicrophone_) { + pop(microphoneQueue_, sourceChunk, chunkBytes); + mixAudioInPlace(mixedChunk, sourceChunk.data(), static_cast(sourceChunk.size()), format_); + } + } + + if (!audioClockStarted) { + audioClockStart = std::chrono::steady_clock::now(); + audioClockStarted = true; + } + + const int64_t timestampHns = + static_cast((emittedFrames_ * HnsPerSecond) / format_.sampleRate); + const int64_t durationHns = + static_cast((static_cast(chunkFrames) * HnsPerSecond) / format_.sampleRate); + if (!output_(mixedChunk.data(), static_cast(mixedChunk.size()), timestampHns, durationHns)) { + stopRequested_ = true; + break; + } + emittedFrames_ += chunkFrames; + + const auto nextDeadline = audioClockStart + + std::chrono::duration_cast( + std::chrono::duration(static_cast(emittedFrames_) / format_.sampleRate)); + std::this_thread::sleep_until(nextDeadline); + } +} diff --git a/electron/native/wgc-capture/src/audio_sample_utils.h b/electron/native/wgc-capture/src/audio_sample_utils.h index 8022ae3..b2b6821 100644 --- a/electron/native/wgc-capture/src/audio_sample_utils.h +++ b/electron/native/wgc-capture/src/audio_sample_utils.h @@ -4,6 +4,12 @@ #include +#include +#include +#include +#include +#include +#include #include bool sameAudioFormatForMixing(const AudioInputFormat& left, const AudioInputFormat& right); @@ -18,3 +24,45 @@ void mixAudioInPlace( const BYTE* source, DWORD byteCount, const AudioInputFormat& format); + +class AudioMixer { +public: + using OutputCallback = std::function; + + AudioMixer( + const AudioInputFormat& format, + bool includeSystem, + bool includeMicrophone, + double microphoneGain, + OutputCallback output); + ~AudioMixer(); + + AudioMixer(const AudioMixer&) = delete; + AudioMixer& operator=(const AudioMixer&) = delete; + + bool start(); + void beginTimeline(); + void stop(); + void pushSystem(const BYTE* data, DWORD byteCount); + void pushMicrophone(const BYTE* data, DWORD byteCount); + +private: + void append(std::vector& queue, const BYTE* data, DWORD byteCount, double gain); + bool pop(std::vector& queue, std::vector& chunk, size_t byteCount); + void mixLoop(); + + AudioInputFormat format_{}; + bool includeSystem_ = false; + bool includeMicrophone_ = false; + double microphoneGain_ = 1.0; + OutputCallback output_; + std::mutex mutex_; + std::condition_variable cv_; + std::vector systemQueue_; + std::vector microphoneQueue_; + std::vector gainBuffer_; + std::thread thread_; + std::atomic stopRequested_ = false; + bool timelineStarted_ = false; + uint64_t emittedFrames_ = 0; +}; diff --git a/electron/native/wgc-capture/src/main.cpp b/electron/native/wgc-capture/src/main.cpp index 603fda3..50e55ef 100644 --- a/electron/native/wgc-capture/src/main.cpp +++ b/electron/native/wgc-capture/src/main.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -372,82 +373,78 @@ int main(int argc, char* argv[]) { } }); - std::mutex microphoneAudioMutex; - std::vector latestMicrophoneAudio; - std::vector mixedAudioBuffer; - std::vector microphoneGainBuffer; + std::unique_ptr audioMixer; + auto startAudioCaptures = [&]() -> bool { + if (!audioFormat) { + return true; + } - if (config.captureMic) { - if (!microphoneCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) { - if (stopRequested || !audioFormat) { - return; - } - - copyAudioWithGain( - data, - byteCount, - microphoneCapture.inputFormat(), - config.microphoneGain, - microphoneGainBuffer); - - if (config.captureSystemAudio) { - std::scoped_lock lock(microphoneAudioMutex); - latestMicrophoneAudio = microphoneGainBuffer; - return; - } - - if (!encoder.writeAudio( - microphoneGainBuffer.data(), - static_cast(microphoneGainBuffer.size()), - timestampHns, - durationHns)) { + audioMixer = std::make_unique( + *audioFormat, + config.captureSystemAudio, + config.captureMic, + config.microphoneGain, + [&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) { + if (!encoder.writeAudio(data, byteCount, timestampHns, durationHns)) { encodeFailed = true; stopRequested = true; cv.notify_all(); + return false; } - })) { - std::cerr << "ERROR: Failed to start WASAPI microphone capture" << std::endl; - return 1; + return true; + }); + + if (!audioMixer->start()) { + std::cerr << "ERROR: Failed to start native audio mixer" << std::endl; + return false; } - } - if (config.captureSystemAudio) { - if (!loopbackCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) { - if (stopRequested) { - return; - } - - const BYTE* encodedData = data; - DWORD encodedByteCount = byteCount; - if (config.captureMic && audioFormat) { - mixedAudioBuffer.assign(data, data + byteCount); - { - std::scoped_lock lock(microphoneAudioMutex); - mixAudioInPlace( - mixedAudioBuffer, - latestMicrophoneAudio.data(), - static_cast(latestMicrophoneAudio.size()), - *audioFormat); + if (config.captureMic) { + if (!microphoneCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) { + (void)timestampHns; + (void)durationHns; + if (stopRequested || !audioMixer) { + return; } - encodedData = mixedAudioBuffer.data(); - encodedByteCount = static_cast(mixedAudioBuffer.size()); - } - if (!encoder.writeAudio(encodedData, encodedByteCount, timestampHns, durationHns)) { - encodeFailed = true; - stopRequested = true; - cv.notify_all(); - } - })) { - std::cerr << "ERROR: Failed to start WASAPI loopback capture" << std::endl; - microphoneCapture.stop(); - return 1; + audioMixer->pushMicrophone(data, byteCount); + })) { + std::cerr << "ERROR: Failed to start WASAPI microphone capture" << std::endl; + audioMixer->stop(); + return false; + } } + + if (config.captureSystemAudio) { + if (!loopbackCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) { + (void)timestampHns; + (void)durationHns; + if (stopRequested || !audioMixer) { + return; + } + + audioMixer->pushSystem(data, byteCount); + })) { + std::cerr << "ERROR: Failed to start WASAPI loopback capture" << std::endl; + microphoneCapture.stop(); + audioMixer->stop(); + return false; + } + } + + return true; + }; + + if (!startAudioCaptures()) { + return 1; } if (!session.start()) { microphoneCapture.stop(); loopbackCapture.stop(); + if (audioMixer) { + audioMixer->stop(); + } std::cerr << "ERROR: Failed to start WGC session" << std::endl; return 1; } @@ -467,11 +464,19 @@ int main(int argc, char* argv[]) { } microphoneCapture.stop(); loopbackCapture.stop(); + if (audioMixer) { + audioMixer->stop(); + } + session.stop(); std::cerr << "ERROR: Timed out waiting for first WGC frame" << std::endl; return 1; } } + if (audioMixer) { + audioMixer->beginTimeline(); + } + std::cout << "{\"event\":\"recording-started\",\"schemaVersion\":2}" << std::endl; std::cout << "Recording started" << std::endl; @@ -484,6 +489,9 @@ int main(int argc, char* argv[]) { microphoneCapture.stop(); loopbackCapture.stop(); + if (audioMixer) { + audioMixer->stop(); + } session.stop(); { std::scoped_lock lock(mutex); diff --git a/scripts/test-windows-wgc-helper.mjs b/scripts/test-windows-wgc-helper.mjs index 45dab7d..bb69819 100644 --- a/scripts/test-windows-wgc-helper.mjs +++ b/scripts/test-windows-wgc-helper.mjs @@ -1,4 +1,5 @@ import { spawn, spawnSync } from "node:child_process"; +import { randomUUID } from "node:crypto"; import fs from "node:fs"; import os from "node:os"; import path from "node:path"; @@ -105,7 +106,7 @@ if (!fs.existsSync(HELPER_PATH)) { const outputPath = path.join( os.tmpdir(), - `openscreen-wgc-helper-${WITH_SYSTEM_AUDIO || WITH_MICROPHONE ? "audio" : "video"}-${Date.now()}.mp4`, + `openscreen-wgc-helper-${WITH_SYSTEM_AUDIO || WITH_MICROPHONE ? "audio" : "video"}-${process.pid}-${Date.now()}-${randomUUID()}.mp4`, ); const config = {