fix: align native mixed audio timeline
This commit is contained in:
@@ -138,7 +138,7 @@ SSOT rules for this phase:
|
||||
|
||||
### 3. WASAPI Microphone
|
||||
|
||||
Status: initial implementation in progress. The helper can open the default WASAPI capture endpoint, apply the OpenScreen microphone gain, encode mic-only audio, and mix mic into system-loopback packets when both endpoints expose the same runtime format. Browser `deviceId` to MMDevice id mapping, resampling between mismatched endpoint formats, and drift correction remain follow-up hardening work.
|
||||
Status: initial implementation in progress. The helper can open the default WASAPI capture endpoint, apply the OpenScreen microphone gain, encode mic-only audio, and mix system loopback plus microphone through a single queued `AudioMixer` timeline when both endpoints expose the same runtime format. Audio endpoints are warmed before WGC starts, the mixer drops pre-roll and begins its paced timeline on the first encoded video frame, then cuts queued tail audio on stop so the MP4 does not drift past the video. Browser `deviceId` to MMDevice id mapping, resampling between mismatched endpoint formats, and drift correction remain follow-up hardening work.
|
||||
|
||||
- Add microphone device enumeration and stable device-id mapping.
|
||||
- Capture selected/default microphone through WASAPI.
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include <mfapi.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
@@ -26,6 +27,8 @@ T clampTo(double value) {
|
||||
|
||||
} // namespace
|
||||
|
||||
// Number of 100-nanosecond ticks in one second; used to convert frame counts
// into the *Hns timestamps and durations handed to the mixer's output callback.
constexpr int64_t HnsPerSecond = 10'000'000;
|
||||
|
||||
bool sameAudioFormatForMixing(const AudioInputFormat& left, const AudioInputFormat& right) {
|
||||
return left.subtype == right.subtype &&
|
||||
left.sampleRate == right.sampleRate &&
|
||||
@@ -43,6 +46,7 @@ void copyAudioWithGain(
|
||||
std::vector<BYTE>& destination) {
|
||||
destination.resize(byteCount);
|
||||
if (!source || byteCount == 0) {
|
||||
std::fill(destination.begin(), destination.end(), static_cast<BYTE>(0));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -126,3 +130,162 @@ void mixAudioInPlace(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a mixer for a single shared audio format.
//
// Records the format, which sources participate in the mix, the gain applied
// to microphone data, and the callback that receives every mixed chunk. No
// thread is created here; that happens in start().
AudioMixer::AudioMixer(const AudioInputFormat& format,
                       bool includeSystem,
                       bool includeMicrophone,
                       double microphoneGain,
                       OutputCallback output)
    : format_(format)
    , includeSystem_(includeSystem)
    , includeMicrophone_(includeMicrophone)
    , microphoneGain_(microphoneGain)
    , output_(std::move(output)) {
}
|
||||
|
||||
// Requests the worker to stop and joins it (via stop()) before the members it
// uses are destroyed.
AudioMixer::~AudioMixer() { stop(); }
|
||||
|
||||
bool AudioMixer::start() {
|
||||
if (!output_ || format_.sampleRate == 0 || format_.blockAlign == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
stopRequested_ = false;
|
||||
emittedFrames_ = 0;
|
||||
timelineStarted_ = false;
|
||||
thread_ = std::thread([this] {
|
||||
mixLoop();
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
void AudioMixer::beginTimeline() {
|
||||
{
|
||||
std::scoped_lock lock(mutex_);
|
||||
systemQueue_.clear();
|
||||
microphoneQueue_.clear();
|
||||
emittedFrames_ = 0;
|
||||
timelineStarted_ = true;
|
||||
}
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
void AudioMixer::stop() {
|
||||
stopRequested_ = true;
|
||||
cv_.notify_all();
|
||||
if (thread_.joinable()) {
|
||||
thread_.join();
|
||||
}
|
||||
}
|
||||
|
||||
void AudioMixer::pushSystem(const BYTE* data, DWORD byteCount) {
|
||||
if (!includeSystem_ || stopRequested_) {
|
||||
return;
|
||||
}
|
||||
|
||||
{
|
||||
std::scoped_lock lock(mutex_);
|
||||
append(systemQueue_, data, byteCount, 1.0);
|
||||
}
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
void AudioMixer::pushMicrophone(const BYTE* data, DWORD byteCount) {
|
||||
if (!includeMicrophone_ || stopRequested_) {
|
||||
return;
|
||||
}
|
||||
|
||||
{
|
||||
std::scoped_lock lock(mutex_);
|
||||
append(microphoneQueue_, data, byteCount, microphoneGain_);
|
||||
}
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
// Appends `byteCount` bytes from `data`, scaled by `gain`, to the end of
// `queue`. A null pointer or a zero byte count leaves the queue untouched.
//
// The gain is applied into the shared scratch buffer gainBuffer_ first; both
// callers in this file (pushSystem / pushMicrophone) invoke this with mutex_
// held.
void AudioMixer::append(std::vector<BYTE>& queue, const BYTE* data, DWORD byteCount, double gain) {
    const bool hasPayload = data != nullptr && byteCount != 0;
    if (hasPayload) {
        copyAudioWithGain(data, byteCount, format_, gain, gainBuffer_);
        queue.insert(queue.end(), gainBuffer_.begin(), gainBuffer_.end());
    }
}
|
||||
|
||||
bool AudioMixer::pop(std::vector<BYTE>& queue, std::vector<BYTE>& chunk, size_t byteCount) {
|
||||
if (queue.empty()) {
|
||||
chunk.assign(byteCount, 0);
|
||||
return false;
|
||||
}
|
||||
|
||||
chunk.assign(byteCount, 0);
|
||||
const size_t copiedBytes = std::min(byteCount, queue.size());
|
||||
std::memcpy(chunk.data(), queue.data(), copiedBytes);
|
||||
queue.erase(queue.begin(), queue.begin() + static_cast<std::ptrdiff_t>(copiedBytes));
|
||||
return copiedBytes > 0;
|
||||
}
|
||||
|
||||
void AudioMixer::mixLoop() {
|
||||
const uint32_t chunkFrames = std::max<uint32_t>(1, format_.sampleRate / 100);
|
||||
const size_t chunkBytes = static_cast<size_t>(chunkFrames) * format_.blockAlign;
|
||||
std::vector<BYTE> mixedChunk;
|
||||
std::vector<BYTE> sourceChunk;
|
||||
std::chrono::steady_clock::time_point audioClockStart;
|
||||
bool audioClockStarted = false;
|
||||
|
||||
while (true) {
|
||||
{
|
||||
std::unique_lock lock(mutex_);
|
||||
cv_.wait_for(lock, std::chrono::milliseconds(20), [&] {
|
||||
const bool hasSystem = !includeSystem_ || systemQueue_.size() >= chunkBytes;
|
||||
const bool hasMicrophone = !includeMicrophone_ || microphoneQueue_.size() >= chunkBytes;
|
||||
const bool hasAnySource = !systemQueue_.empty() || !microphoneQueue_.empty();
|
||||
return stopRequested_.load() ||
|
||||
(timelineStarted_ && (hasSystem || hasMicrophone) && hasAnySource);
|
||||
});
|
||||
|
||||
if (stopRequested_) {
|
||||
break;
|
||||
}
|
||||
if (!timelineStarted_) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const bool hasAnyQueuedAudio = !systemQueue_.empty() || !microphoneQueue_.empty();
|
||||
if (!hasAnyQueuedAudio) {
|
||||
continue;
|
||||
}
|
||||
|
||||
mixedChunk.assign(chunkBytes, 0);
|
||||
if (includeSystem_) {
|
||||
pop(systemQueue_, sourceChunk, chunkBytes);
|
||||
mixAudioInPlace(mixedChunk, sourceChunk.data(), static_cast<DWORD>(sourceChunk.size()), format_);
|
||||
}
|
||||
if (includeMicrophone_) {
|
||||
pop(microphoneQueue_, sourceChunk, chunkBytes);
|
||||
mixAudioInPlace(mixedChunk, sourceChunk.data(), static_cast<DWORD>(sourceChunk.size()), format_);
|
||||
}
|
||||
}
|
||||
|
||||
if (!audioClockStarted) {
|
||||
audioClockStart = std::chrono::steady_clock::now();
|
||||
audioClockStarted = true;
|
||||
}
|
||||
|
||||
const int64_t timestampHns =
|
||||
static_cast<int64_t>((emittedFrames_ * HnsPerSecond) / format_.sampleRate);
|
||||
const int64_t durationHns =
|
||||
static_cast<int64_t>((static_cast<uint64_t>(chunkFrames) * HnsPerSecond) / format_.sampleRate);
|
||||
if (!output_(mixedChunk.data(), static_cast<DWORD>(mixedChunk.size()), timestampHns, durationHns)) {
|
||||
stopRequested_ = true;
|
||||
break;
|
||||
}
|
||||
emittedFrames_ += chunkFrames;
|
||||
|
||||
const auto nextDeadline = audioClockStart +
|
||||
std::chrono::duration_cast<std::chrono::steady_clock::duration>(
|
||||
std::chrono::duration<double>(static_cast<double>(emittedFrames_) / format_.sampleRate));
|
||||
std::this_thread::sleep_until(nextDeadline);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,12 @@
|
||||
|
||||
#include <Windows.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
bool sameAudioFormatForMixing(const AudioInputFormat& left, const AudioInputFormat& right);
|
||||
@@ -18,3 +24,45 @@ void mixAudioInPlace(
|
||||
const BYTE* source,
|
||||
DWORD byteCount,
|
||||
const AudioInputFormat& format);
|
||||
|
||||
class AudioMixer {
|
||||
public:
|
||||
using OutputCallback = std::function<bool(const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns)>;
|
||||
|
||||
AudioMixer(
|
||||
const AudioInputFormat& format,
|
||||
bool includeSystem,
|
||||
bool includeMicrophone,
|
||||
double microphoneGain,
|
||||
OutputCallback output);
|
||||
~AudioMixer();
|
||||
|
||||
AudioMixer(const AudioMixer&) = delete;
|
||||
AudioMixer& operator=(const AudioMixer&) = delete;
|
||||
|
||||
bool start();
|
||||
void beginTimeline();
|
||||
void stop();
|
||||
void pushSystem(const BYTE* data, DWORD byteCount);
|
||||
void pushMicrophone(const BYTE* data, DWORD byteCount);
|
||||
|
||||
private:
|
||||
void append(std::vector<BYTE>& queue, const BYTE* data, DWORD byteCount, double gain);
|
||||
bool pop(std::vector<BYTE>& queue, std::vector<BYTE>& chunk, size_t byteCount);
|
||||
void mixLoop();
|
||||
|
||||
AudioInputFormat format_{};
|
||||
bool includeSystem_ = false;
|
||||
bool includeMicrophone_ = false;
|
||||
double microphoneGain_ = 1.0;
|
||||
OutputCallback output_;
|
||||
std::mutex mutex_;
|
||||
std::condition_variable cv_;
|
||||
std::vector<BYTE> systemQueue_;
|
||||
std::vector<BYTE> microphoneQueue_;
|
||||
std::vector<BYTE> gainBuffer_;
|
||||
std::thread thread_;
|
||||
std::atomic<bool> stopRequested_ = false;
|
||||
bool timelineStarted_ = false;
|
||||
uint64_t emittedFrames_ = 0;
|
||||
};
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <cctype>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
@@ -372,82 +373,78 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
});
|
||||
|
||||
std::mutex microphoneAudioMutex;
|
||||
std::vector<BYTE> latestMicrophoneAudio;
|
||||
std::vector<BYTE> mixedAudioBuffer;
|
||||
std::vector<BYTE> microphoneGainBuffer;
|
||||
std::unique_ptr<AudioMixer> audioMixer;
|
||||
auto startAudioCaptures = [&]() -> bool {
|
||||
if (!audioFormat) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (config.captureMic) {
|
||||
if (!microphoneCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
|
||||
if (stopRequested || !audioFormat) {
|
||||
return;
|
||||
}
|
||||
|
||||
copyAudioWithGain(
|
||||
data,
|
||||
byteCount,
|
||||
microphoneCapture.inputFormat(),
|
||||
config.microphoneGain,
|
||||
microphoneGainBuffer);
|
||||
|
||||
if (config.captureSystemAudio) {
|
||||
std::scoped_lock lock(microphoneAudioMutex);
|
||||
latestMicrophoneAudio = microphoneGainBuffer;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!encoder.writeAudio(
|
||||
microphoneGainBuffer.data(),
|
||||
static_cast<DWORD>(microphoneGainBuffer.size()),
|
||||
timestampHns,
|
||||
durationHns)) {
|
||||
audioMixer = std::make_unique<AudioMixer>(
|
||||
*audioFormat,
|
||||
config.captureSystemAudio,
|
||||
config.captureMic,
|
||||
config.microphoneGain,
|
||||
[&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
|
||||
if (!encoder.writeAudio(data, byteCount, timestampHns, durationHns)) {
|
||||
encodeFailed = true;
|
||||
stopRequested = true;
|
||||
cv.notify_all();
|
||||
return false;
|
||||
}
|
||||
})) {
|
||||
std::cerr << "ERROR: Failed to start WASAPI microphone capture" << std::endl;
|
||||
return 1;
|
||||
return true;
|
||||
});
|
||||
|
||||
if (!audioMixer->start()) {
|
||||
std::cerr << "ERROR: Failed to start native audio mixer" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (config.captureSystemAudio) {
|
||||
if (!loopbackCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
|
||||
if (stopRequested) {
|
||||
return;
|
||||
}
|
||||
|
||||
const BYTE* encodedData = data;
|
||||
DWORD encodedByteCount = byteCount;
|
||||
if (config.captureMic && audioFormat) {
|
||||
mixedAudioBuffer.assign(data, data + byteCount);
|
||||
{
|
||||
std::scoped_lock lock(microphoneAudioMutex);
|
||||
mixAudioInPlace(
|
||||
mixedAudioBuffer,
|
||||
latestMicrophoneAudio.data(),
|
||||
static_cast<DWORD>(latestMicrophoneAudio.size()),
|
||||
*audioFormat);
|
||||
if (config.captureMic) {
|
||||
if (!microphoneCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
|
||||
(void)timestampHns;
|
||||
(void)durationHns;
|
||||
if (stopRequested || !audioMixer) {
|
||||
return;
|
||||
}
|
||||
encodedData = mixedAudioBuffer.data();
|
||||
encodedByteCount = static_cast<DWORD>(mixedAudioBuffer.size());
|
||||
}
|
||||
|
||||
if (!encoder.writeAudio(encodedData, encodedByteCount, timestampHns, durationHns)) {
|
||||
encodeFailed = true;
|
||||
stopRequested = true;
|
||||
cv.notify_all();
|
||||
}
|
||||
})) {
|
||||
std::cerr << "ERROR: Failed to start WASAPI loopback capture" << std::endl;
|
||||
microphoneCapture.stop();
|
||||
return 1;
|
||||
audioMixer->pushMicrophone(data, byteCount);
|
||||
})) {
|
||||
std::cerr << "ERROR: Failed to start WASAPI microphone capture" << std::endl;
|
||||
audioMixer->stop();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (config.captureSystemAudio) {
|
||||
if (!loopbackCapture.start([&](const BYTE* data, DWORD byteCount, int64_t timestampHns, int64_t durationHns) {
|
||||
(void)timestampHns;
|
||||
(void)durationHns;
|
||||
if (stopRequested || !audioMixer) {
|
||||
return;
|
||||
}
|
||||
|
||||
audioMixer->pushSystem(data, byteCount);
|
||||
})) {
|
||||
std::cerr << "ERROR: Failed to start WASAPI loopback capture" << std::endl;
|
||||
microphoneCapture.stop();
|
||||
audioMixer->stop();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
if (!startAudioCaptures()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!session.start()) {
|
||||
microphoneCapture.stop();
|
||||
loopbackCapture.stop();
|
||||
if (audioMixer) {
|
||||
audioMixer->stop();
|
||||
}
|
||||
std::cerr << "ERROR: Failed to start WGC session" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
@@ -467,11 +464,19 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
microphoneCapture.stop();
|
||||
loopbackCapture.stop();
|
||||
if (audioMixer) {
|
||||
audioMixer->stop();
|
||||
}
|
||||
session.stop();
|
||||
std::cerr << "ERROR: Timed out waiting for first WGC frame" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (audioMixer) {
|
||||
audioMixer->beginTimeline();
|
||||
}
|
||||
|
||||
std::cout << "{\"event\":\"recording-started\",\"schemaVersion\":2}" << std::endl;
|
||||
std::cout << "Recording started" << std::endl;
|
||||
|
||||
@@ -484,6 +489,9 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
microphoneCapture.stop();
|
||||
loopbackCapture.stop();
|
||||
if (audioMixer) {
|
||||
audioMixer->stop();
|
||||
}
|
||||
session.stop();
|
||||
{
|
||||
std::scoped_lock lock(mutex);
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { spawn, spawnSync } from "node:child_process";
|
||||
import { randomUUID } from "node:crypto";
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
@@ -105,7 +106,7 @@ if (!fs.existsSync(HELPER_PATH)) {
|
||||
|
||||
const outputPath = path.join(
|
||||
os.tmpdir(),
|
||||
`openscreen-wgc-helper-${WITH_SYSTEM_AUDIO || WITH_MICROPHONE ? "audio" : "video"}-${Date.now()}.mp4`,
|
||||
`openscreen-wgc-helper-${WITH_SYSTEM_AUDIO || WITH_MICROPHONE ? "audio" : "video"}-${process.pid}-${Date.now()}-${randomUUID()}.mp4`,
|
||||
);
|
||||
|
||||
const config = {
|
||||
|
||||
Reference in New Issue
Block a user