fix: normalize native Windows audio for AAC

This commit is contained in:
EtienneLescot
2026-05-10 15:02:24 +02:00
parent 4e5b7a4f5a
commit 8137e816fd
4 changed files with 31 additions and 10 deletions
@@ -100,6 +100,17 @@ bool sameAudioFormatForMixing(const AudioInputFormat& left, const AudioInputForm
left.avgBytesPerSec == right.avgBytesPerSec;
}
// Derives the PCM format that is handed to the AAC encoder from a captured
// audio format.
//
// The result is always 16-bit, 2-channel PCM. Only the sample rate is taken
// from `source`; when `source.sampleRate` is not positive, 48000 is used.
// Block alignment and average byte rate are recomputed from those values, and
// every other member of the returned struct is left value-initialized.
//
// NOTE(review): the sample rate is passed through unchanged. Media
// Foundation's AAC encoder documents 44.1 kHz and 48 kHz PCM input only, so
// confirm that callers never supply another rate (or that a resampler exists
// upstream) before relying on this for arbitrary capture devices.
AudioInputFormat makeAacCompatibleAudioFormat(const AudioInputFormat& source) {
    AudioInputFormat pcm{};

    // Fixed sample layout: interleaved 16-bit stereo PCM.
    pcm.subtype = MFAudioFormat_PCM;
    pcm.bitsPerSample = 16;
    pcm.channels = 2;

    // Keep the capture rate when it is usable, otherwise fall back to 48 kHz.
    if (source.sampleRate > 0) {
        pcm.sampleRate = source.sampleRate;
    } else {
        pcm.sampleRate = 48000;
    }

    // Derived sizes: bytes per frame, then bytes per second.
    pcm.blockAlign = (pcm.bitsPerSample / 8) * pcm.channels;
    pcm.avgBytesPerSec = pcm.blockAlign * pcm.sampleRate;

    return pcm;
}
void copyAudioWithGain(
const BYTE* source,
DWORD byteCount,
@@ -13,6 +13,7 @@
#include <vector>
bool sameAudioFormatForMixing(const AudioInputFormat& left, const AudioInputFormat& right);
// Returns a 16-bit, 2-channel PCM format derived from `source`: the sample
// rate is kept when positive (48000 otherwise) and the block alignment and
// average byte rate are recomputed to match.
AudioInputFormat makeAacCompatibleAudioFormat(const AudioInputFormat& source);
void copyAudioWithGain(
const BYTE* source,
DWORD byteCount,
+11 -4
View File
@@ -410,6 +410,7 @@ int main(int argc, char* argv[]) {
WasapiLoopbackCapture loopbackCapture;
WasapiLoopbackCapture microphoneCapture;
const AudioInputFormat* audioFormat = nullptr;
AudioInputFormat encoderAudioFormat{};
AudioInputFormat systemAudioFormat{};
AudioInputFormat microphoneAudioFormat{};
if (config.captureSystemAudio) {
@@ -443,6 +444,12 @@ int main(int argc, char* argv[]) {
<< jsonEscape(wideToUtf8(microphoneCapture.selectedDeviceName())) << "\"";
}
std::cout << "}" << std::endl;
encoderAudioFormat = makeAacCompatibleAudioFormat(*audioFormat);
std::cout << "{\"event\":\"encoder-audio-format\",\"schemaVersion\":2,\"sampleRate\":"
<< encoderAudioFormat.sampleRate
<< ",\"channels\":" << encoderAudioFormat.channels
<< ",\"bitsPerSample\":" << encoderAudioFormat.bitsPerSample
<< "}" << std::endl;
}
MFEncoder encoder;
@@ -454,7 +461,7 @@ int main(int argc, char* argv[]) {
bitrate,
session.device(),
session.context(),
audioFormat)) {
audioFormat ? &encoderAudioFormat : nullptr)) {
std::cerr << "ERROR: Failed to initialize Media Foundation encoder" << std::endl;
return 1;
}
@@ -579,9 +586,9 @@ int main(int argc, char* argv[]) {
}
audioMixer = std::make_unique<AudioMixer>(
*audioFormat,
config.captureSystemAudio ? systemAudioFormat : *audioFormat,
config.captureMic ? microphoneAudioFormat : *audioFormat,
encoderAudioFormat,
config.captureSystemAudio ? systemAudioFormat : encoderAudioFormat,
config.captureMic ? microphoneAudioFormat : encoderAudioFormat,
config.captureSystemAudio,
config.captureMic,
config.microphoneGain,
@@ -1,5 +1,7 @@
#include "mf_encoder.h"
#include "audio_sample_utils.h"
#include <mfapi.h>
#include <mferror.h>
#include <propvarutil.h>
@@ -156,7 +158,7 @@ bool MFEncoder::configureAudioStream(const AudioInputFormat& audioFormat) {
return false;
}
const UINT32 bitsPerSample = std::max<UINT32>(8, audioFormat.bitsPerSample);
const AudioInputFormat encoderFormat = makeAacCompatibleAudioFormat(audioFormat);
const UINT32 aacBytesPerSecond = 24'000;
Microsoft::WRL::ComPtr<IMFMediaType> outputType;
@@ -165,7 +167,7 @@ bool MFEncoder::configureAudioStream(const AudioInputFormat& audioFormat) {
}
outputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Audio);
outputType->SetGUID(MF_MT_SUBTYPE, MFAudioFormat_AAC);
setAudioFormat(outputType.Get(), audioFormat.channels, audioFormat.sampleRate, 16);
setAudioFormat(outputType.Get(), encoderFormat.channels, encoderFormat.sampleRate, 16);
outputType->SetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, aacBytesPerSecond);
outputType->SetUINT32(MF_MT_AAC_PAYLOAD_TYPE, 0);
@@ -178,10 +180,10 @@ bool MFEncoder::configureAudioStream(const AudioInputFormat& audioFormat) {
return false;
}
inputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Audio);
inputType->SetGUID(MF_MT_SUBTYPE, audioFormat.subtype);
setAudioFormat(inputType.Get(), audioFormat.channels, audioFormat.sampleRate, bitsPerSample);
inputType->SetUINT32(MF_MT_AUDIO_BLOCK_ALIGNMENT, audioFormat.blockAlign);
inputType->SetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, audioFormat.avgBytesPerSec);
inputType->SetGUID(MF_MT_SUBTYPE, encoderFormat.subtype);
setAudioFormat(inputType.Get(), encoderFormat.channels, encoderFormat.sampleRate, encoderFormat.bitsPerSample);
inputType->SetUINT32(MF_MT_AUDIO_BLOCK_ALIGNMENT, encoderFormat.blockAlign);
inputType->SetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, encoderFormat.avgBytesPerSec);
inputType->SetUINT32(MF_MT_ALL_SAMPLES_INDEPENDENT, TRUE);
if (!succeeded(sinkWriter_->SetInputMediaType(audioStreamIndex_, inputType.Get(), nullptr),