fix: align native mixed audio timeline

This commit is contained in:
EtienneLescot
2026-05-05 16:49:07 +02:00
parent 588a0a7be8
commit 7929aea908
5 changed files with 283 additions and 63 deletions
@@ -138,7 +138,7 @@ SSOT rules for this phase:
### 3. WASAPI Microphone
Status: initial implementation in progress. The helper can open the default WASAPI capture endpoint, apply the OpenScreen microphone gain, encode mic-only audio, and mix mic into system-loopback packets when both endpoints expose the same runtime format. Browser `deviceId` to MMDevice id mapping, resampling between mismatched endpoint formats, and drift correction remain follow-up hardening work.
Status: initial implementation in progress. The helper can open the default WASAPI capture endpoint, apply the OpenScreen microphone gain, encode mic-only audio, and mix system loopback plus microphone through a single queued `AudioMixer` timeline when both endpoints expose the same runtime format. Audio endpoints are warmed before WGC starts, the mixer drops pre-roll and begins its paced timeline on the first encoded video frame, then cuts queued tail audio on stop so the MP4 does not drift past the video. Browser `deviceId` to MMDevice id mapping, resampling between mismatched endpoint formats, and drift correction remain follow-up hardening work.
- Add microphone device enumeration and stable device-id mapping.
- Capture selected/default microphone through WASAPI.
@@ -3,6 +3,7 @@
#include <mfapi.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <limits>
@@ -26,6 +27,8 @@ T clampTo(double value) {
} // namespace
// Number of 100-nanosecond ("hns") ticks in one second. Used to convert frame
// counts into the hns timestamps and durations handed to the output callback.
constexpr int64_t HnsPerSecond = 10'000'000;
bool sameAudioFormatForMixing(const AudioInputFormat& left, const AudioInputFormat& right) {
return left.subtype == right.subtype &&
left.sampleRate == right.sampleRate &&
@@ -43,6 +46,7 @@ void copyAudioWithGain(
std::vector<BYTE>& destination) {
destination.resize(byteCount);
if (!source || byteCount == 0) {
std::fill(destination.begin(), destination.end(), static_cast<BYTE>(0));
return;
}
@@ -126,3 +130,162 @@ void mixAudioInPlace(
}
}
}
// Captures the mixer configuration. No thread is created here; call start()
// to launch the mixing thread.
//   format            - audio format used for gain, mixing and frame/byte math
//                       (sampleRate and blockAlign are read by the mixer).
//   includeSystem     - when false, pushSystem() ignores its input.
//   includeMicrophone - when false, pushMicrophone() ignores its input.
//   microphoneGain    - gain handed to copyAudioWithGain for microphone buffers.
//   output            - callback that receives every mixed chunk; moved into the mixer.
AudioMixer::AudioMixer(
const AudioInputFormat& format,
bool includeSystem,
bool includeMicrophone,
double microphoneGain,
OutputCallback output)
: format_(format),
includeSystem_(includeSystem),
includeMicrophone_(includeMicrophone),
microphoneGain_(microphoneGain),
output_(std::move(output)) {}
// Stops and joins the mixing thread (if any) before the members it uses are destroyed.
AudioMixer::~AudioMixer() {
stop();
}
bool AudioMixer::start() {
if (!output_ || format_.sampleRate == 0 || format_.blockAlign == 0) {
return false;
}
stopRequested_ = false;
emittedFrames_ = 0;
timelineStarted_ = false;
thread_ = std::thread([this] {
mixLoop();
});
return true;
}
// Marks the start of the output timeline: everything queued so far is thrown
// away, the emitted-frame counter restarts at zero, and the mixing thread is
// woken so it can begin emitting chunks.
void AudioMixer::beginTimeline() {
    std::unique_lock guard(mutex_);
    timelineStarted_ = true;
    emittedFrames_ = 0;
    microphoneQueue_.clear();
    systemQueue_.clear();
    guard.unlock();

    // Notify after releasing the mutex so the woken thread can take it immediately.
    cv_.notify_all();
}
void AudioMixer::stop() {
stopRequested_ = true;
cv_.notify_all();
if (thread_.joinable()) {
thread_.join();
}
}
// Queues a system-audio buffer at unity gain and wakes the mixing thread.
// The buffer is ignored when system audio is not part of the mix or once a
// stop has been requested.
void AudioMixer::pushSystem(const BYTE* data, DWORD byteCount) {
    const bool accepting = includeSystem_ && !stopRequested_.load();
    if (accepting) {
        std::unique_lock guard(mutex_);
        append(systemQueue_, data, byteCount, 1.0);
        guard.unlock();
        cv_.notify_all();
    }
}
// Queues a microphone buffer, scaled by the configured microphone gain, and
// wakes the mixing thread. The buffer is ignored when the microphone is not
// part of the mix or once a stop has been requested.
void AudioMixer::pushMicrophone(const BYTE* data, DWORD byteCount) {
    const bool accepting = includeMicrophone_ && !stopRequested_.load();
    if (accepting) {
        std::unique_lock guard(mutex_);
        append(microphoneQueue_, data, byteCount, microphoneGain_);
        guard.unlock();
        cv_.notify_all();
    }
}
void AudioMixer::append(std::vector<BYTE>& queue, const BYTE* data, DWORD byteCount, double gain) {
if (!data || byteCount == 0) {
return;
}
copyAudioWithGain(data, byteCount, format_, gain, gainBuffer_);
queue.insert(queue.end(), gainBuffer_.begin(), gainBuffer_.end());
}
// Fills `chunk` with exactly `byteCount` bytes taken from the front of `queue`.
// When the queue holds fewer bytes, the remainder of `chunk` stays zero.
// The consumed bytes are removed from the queue.
// Returns true when at least one byte was taken from the queue.
bool AudioMixer::pop(std::vector<BYTE>& queue, std::vector<BYTE>& chunk, size_t byteCount) {
    // Start from silence so any shortfall is already zero-padded.
    chunk.assign(byteCount, 0);

    const size_t available = std::min(byteCount, queue.size());
    if (available == 0) {
        return false;
    }

    std::memcpy(chunk.data(), queue.data(), available);
    const auto consumedEnd = queue.begin() + static_cast<std::ptrdiff_t>(available);
    queue.erase(queue.begin(), consumedEnd);
    return true;
}
void AudioMixer::mixLoop() {
const uint32_t chunkFrames = std::max<uint32_t>(1, format_.sampleRate / 100);
const size_t chunkBytes = static_cast<size_t>(chunkFrames) * format_.blockAlign;
std::vector<BYTE> mixedChunk;
std::vector<BYTE> sourceChunk;
std::chrono::steady_clock::time_point audioClockStart;
bool audioClockStarted = false;
while (true) {
{
std::unique_lock lock(mutex_);
cv_.wait_for(lock, std::chrono::milliseconds(20), [&] {
const bool hasSystem = !includeSystem_ || systemQueue_.size() >= chunkBytes;
const bool hasMicrophone = !includeMicrophone_ || microphoneQueue_.size() >= chunkBytes;
const bool hasAnySource = !systemQueue_.empty() || !microphoneQueue_.empty();
return stopRequested_.load() ||
(timelineStarted_ && (hasSystem || hasMicrophone) && hasAnySource);
});
if (stopRequested_) {
break;
}
if (!timelineStarted_) {
continue;
}
const bool hasAnyQueuedAudio = !systemQueue_.empty() || !microphoneQueue_.empty();
if (!hasAnyQueuedAudio) {
continue;
}
mixedChunk.assign(chunkBytes, 0);
if (includeSystem_) {
pop(systemQueue_, sourceChunk, chunkBytes);
mixAudioInPlace(mixedChunk, sourceChunk.data(), static_cast<DWORD>(sourceChunk.size()), format_);
}
if (includeMicrophone_) {
pop(microphoneQueue_, sourceChunk, chunkBytes);
mixAudioInPlace(mixedChunk, sourceChunk.data(), static_cast<DWORD>(sourceChunk.size()), format_);
}
}
if (!audioClockStarted) {
audioClockStart = std::chrono::steady_clock::now();
audioClockStarted = true;
}
const int64_t timestampHns =
static_cast<int64_t>((emittedFrames_ * HnsPerSecond) / format_.sampleRate);
const int64_t durationHns =
static_cast<int64_t>((static_cast<uint64_t>(chunkFrames) * HnsPerSecond) / format_.sampleRate);
if (!output_(mixedChunk.data(), static_cast<DWORD>(mixedChunk.size()), timestampHns, durationHns)) {
stopRequested_ = true;
break;
}
emittedFrames_ += chunkFrames;
const auto nextDeadline = audioClockStart +
std::chrono::duration_cast<std::chrono::steady_clock::duration>(
std::chrono::duration<double>(static_cast<double>(emittedFrames_) / format_.sampleRate));
std::this_thread::sleep_until(nextDeadline);
}
}
@@ -4,6 +4,12 @@
#include <Windows.h>
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>
bool sameAudioFormatForMixing(const AudioInputFormat& left, const AudioInputFormat& right);
@@ -18,3 +24,45 @@ void mixAudioInPlace(
const BYTE* source,
DWORD byteCount,
const AudioInputFormat& format);
// Mixes queued system-loopback and microphone audio on a dedicated thread and
// delivers fixed-size mixed chunks, with timestamps derived from the number of
// frames emitted, to an output callback.
//
// Typical use: construct, start(), feed pushSystem()/pushMicrophone() from the
// capture callbacks, call beginTimeline() when output should begin, stop() at the end.
class AudioMixer {
public:
// Receives one mixed chunk. timestampHns/durationHns are in 100-nanosecond units.
// Returning false makes the mixing thread stop.
using OutputCallback = std::function<bool(const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns)>;
// Stores the configuration; `output` is moved into the mixer. Does not start any thread.
AudioMixer(
const AudioInputFormat& format,
bool includeSystem,
bool includeMicrophone,
double microphoneGain,
OutputCallback output);
// Calls stop(), so the mixing thread is joined before members are destroyed.
~AudioMixer();
// Not copyable: the object owns a thread, a mutex and a condition variable.
AudioMixer(const AudioMixer&) = delete;
AudioMixer& operator=(const AudioMixer&) = delete;
// Launches the mixing thread. Returns false when the output callback is empty
// or the format has a zero sampleRate or blockAlign.
bool start();
// Discards all queued audio, resets the emitted-frame counter and lets the
// mixing thread begin emitting chunks.
void beginTimeline();
// Requests the mixing thread to exit and joins it if it is joinable.
void stop();
// Queue one captured buffer for the respective source. Ignored when that
// source is not included in the mix or after a stop has been requested.
void pushSystem(const BYTE* data, DWORD byteCount);
void pushMicrophone(const BYTE* data, DWORD byteCount);
private:
// Applies `gain` to the buffer and appends it to `queue`; callers hold mutex_.
void append(std::vector<BYTE>& queue, const BYTE* data, DWORD byteCount, double gain);
// Moves up to `byteCount` bytes from the front of `queue` into `chunk`,
// zero-padding the rest; returns whether any bytes were taken.
bool pop(std::vector<BYTE>& queue, std::vector<BYTE>& chunk, size_t byteCount);
// Body of the mixing thread.
void mixLoop();
AudioInputFormat format_{};
bool includeSystem_ = false;
bool includeMicrophone_ = false;
double microphoneGain_ = 1.0;
OutputCallback output_;
// mutex_ is held while the queues and gainBuffer_ are accessed; cv_ wakes the mixing thread.
std::mutex mutex_;
std::condition_variable cv_;
std::vector<BYTE> systemQueue_;
std::vector<BYTE> microphoneQueue_;
// Scratch buffer reused by append() for the gain-adjusted copy.
std::vector<BYTE> gainBuffer_;
std::thread thread_;
std::atomic<bool> stopRequested_ = false;
// Set by beginTimeline(); the mixing thread emits nothing while this is false.
bool timelineStarted_ = false;
// Frames handed to the output callback since the timeline began; drives timestamps and pacing.
uint64_t emittedFrames_ = 0;
};
+69 -61
View File
@@ -13,6 +13,7 @@
#include <cctype>
#include <cstdint>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
@@ -372,82 +373,78 @@ int main(int argc, char* argv[]) {
}
});
std::mutex microphoneAudioMutex;
std::vector<BYTE> latestMicrophoneAudio;
std::vector<BYTE> mixedAudioBuffer;
std::vector<BYTE> microphoneGainBuffer;
std::unique_ptr<AudioMixer> audioMixer;
auto startAudioCaptures = [&]() -> bool {
if (!audioFormat) {
return true;
}
if (config.captureMic) {
if (!microphoneCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
if (stopRequested || !audioFormat) {
return;
}
copyAudioWithGain(
data,
byteCount,
microphoneCapture.inputFormat(),
config.microphoneGain,
microphoneGainBuffer);
if (config.captureSystemAudio) {
std::scoped_lock lock(microphoneAudioMutex);
latestMicrophoneAudio = microphoneGainBuffer;
return;
}
if (!encoder.writeAudio(
microphoneGainBuffer.data(),
static_cast<DWORD>(microphoneGainBuffer.size()),
timestampHns,
durationHns)) {
audioMixer = std::make_unique<AudioMixer>(
*audioFormat,
config.captureSystemAudio,
config.captureMic,
config.microphoneGain,
[&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
if (!encoder.writeAudio(data, byteCount, timestampHns, durationHns)) {
encodeFailed = true;
stopRequested = true;
cv.notify_all();
return false;
}
})) {
std::cerr << "ERROR: Failed to start WASAPI microphone capture" << std::endl;
return 1;
return true;
});
if (!audioMixer->start()) {
std::cerr << "ERROR: Failed to start native audio mixer" << std::endl;
return false;
}
}
if (config.captureSystemAudio) {
if (!loopbackCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
if (stopRequested) {
return;
}
const BYTE* encodedData = data;
DWORD encodedByteCount = byteCount;
if (config.captureMic && audioFormat) {
mixedAudioBuffer.assign(data, data + byteCount);
{
std::scoped_lock lock(microphoneAudioMutex);
mixAudioInPlace(
mixedAudioBuffer,
latestMicrophoneAudio.data(),
static_cast<DWORD>(latestMicrophoneAudio.size()),
*audioFormat);
if (config.captureMic) {
if (!microphoneCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
(void)timestampHns;
(void)durationHns;
if (stopRequested || !audioMixer) {
return;
}
encodedData = mixedAudioBuffer.data();
encodedByteCount = static_cast<DWORD>(mixedAudioBuffer.size());
}
if (!encoder.writeAudio(encodedData, encodedByteCount, timestampHns, durationHns)) {
encodeFailed = true;
stopRequested = true;
cv.notify_all();
}
})) {
std::cerr << "ERROR: Failed to start WASAPI loopback capture" << std::endl;
microphoneCapture.stop();
return 1;
audioMixer->pushMicrophone(data, byteCount);
})) {
std::cerr << "ERROR: Failed to start WASAPI microphone capture" << std::endl;
audioMixer->stop();
return false;
}
}
if (config.captureSystemAudio) {
if (!loopbackCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
(void)timestampHns;
(void)durationHns;
if (stopRequested || !audioMixer) {
return;
}
audioMixer->pushSystem(data, byteCount);
})) {
std::cerr << "ERROR: Failed to start WASAPI loopback capture" << std::endl;
microphoneCapture.stop();
audioMixer->stop();
return false;
}
}
return true;
};
if (!startAudioCaptures()) {
return 1;
}
if (!session.start()) {
microphoneCapture.stop();
loopbackCapture.stop();
if (audioMixer) {
audioMixer->stop();
}
std::cerr << "ERROR: Failed to start WGC session" << std::endl;
return 1;
}
@@ -467,11 +464,19 @@ int main(int argc, char* argv[]) {
}
microphoneCapture.stop();
loopbackCapture.stop();
if (audioMixer) {
audioMixer->stop();
}
session.stop();
std::cerr << "ERROR: Timed out waiting for first WGC frame" << std::endl;
return 1;
}
}
if (audioMixer) {
audioMixer->beginTimeline();
}
std::cout << "{\"event\":\"recording-started\",\"schemaVersion\":2}" << std::endl;
std::cout << "Recording started" << std::endl;
@@ -484,6 +489,9 @@ int main(int argc, char* argv[]) {
microphoneCapture.stop();
loopbackCapture.stop();
if (audioMixer) {
audioMixer->stop();
}
session.stop();
{
std::scoped_lock lock(mutex);
+2 -1
View File
@@ -1,4 +1,5 @@
import { spawn, spawnSync } from "node:child_process";
import { randomUUID } from "node:crypto";
import fs from "node:fs";
import os from "node:os";
import path from "node:path";
@@ -105,7 +106,7 @@ if (!fs.existsSync(HELPER_PATH)) {
const outputPath = path.join(
os.tmpdir(),
`openscreen-wgc-helper-${WITH_SYSTEM_AUDIO || WITH_MICROPHONE ? "audio" : "video"}-${Date.now()}.mp4`,
`openscreen-wgc-helper-${WITH_SYSTEM_AUDIO || WITH_MICROPHONE ? "audio" : "video"}-${process.pid}-${Date.now()}-${randomUUID()}.mp4`,
);
const config = {