Merge pull request #434 from Enriquefft/fix/export-audio-duration-validation

fix: validate export duration and fix audio trim in speed-aware path
This commit is contained in:
Sid
2026-04-18 10:41:38 -07:00
committed by GitHub
4 changed files with 211 additions and 31 deletions
+103 -20
View File
@@ -5,6 +5,7 @@ import type { VideoMuxer } from "./muxer";
const AUDIO_BITRATE = 128_000;
const DECODE_BACKPRESSURE_LIMIT = 20;
const MIN_SPEED_REGION_DELTA_MS = 0.0001;
const SEEK_TIMEOUT_MS = 5_000;
export class AudioProcessor {
private cancelled = false;
@@ -18,9 +19,9 @@ export class AudioProcessor {
demuxer: WebDemuxer,
muxer: VideoMuxer,
videoUrl: string,
trimRegions?: TrimRegion[],
speedRegions?: SpeedRegion[],
readEndSec?: number,
trimRegions: TrimRegion[] | undefined,
speedRegions: SpeedRegion[] | undefined,
validatedDurationSec: number,
): Promise<void> {
const sortedTrims = trimRegions ? [...trimRegions].sort((a, b) => a.startMs - b.startMs) : [];
const sortedSpeedRegions = speedRegions
@@ -35,14 +36,19 @@ export class AudioProcessor {
videoUrl,
sortedTrims,
sortedSpeedRegions,
validatedDurationSec,
);
if (!this.cancelled) {
if (!this.cancelled && renderedAudioBlob.size > 0) {
await this.muxRenderedAudioBlob(renderedAudioBlob, muxer);
return;
}
return;
}
// No speed edits: keep the original demux/decode/encode path with trim timestamp remap.
// The +0.5s buffer mirrors streamingDecoder.decodeAll's read window so the trim-only
// and speed-aware paths agree on how far to read past the validated duration boundary.
const readEndSec = validatedDurationSec + 0.5;
await this.processTrimOnlyAudio(demuxer, muxer, sortedTrims, readEndSec);
}
@@ -55,7 +61,7 @@ export class AudioProcessor {
): Promise<void> {
let audioConfig: AudioDecoderConfig;
try {
audioConfig = (await demuxer.getDecoderConfig("audio")) as AudioDecoderConfig;
audioConfig = await demuxer.getDecoderConfig("audio");
} catch {
console.warn("[AudioProcessor] No audio track found, skipping");
return;
@@ -80,11 +86,10 @@ export class AudioProcessor {
typeof readEndSec === "number" && Number.isFinite(readEndSec)
? Math.max(0, readEndSec)
: undefined;
const audioStream = (
const audioStream =
safeReadEndSec !== undefined
? demuxer.read("audio", 0, safeReadEndSec)
: demuxer.read("audio")
) as ReadableStream<EncodedAudioChunk>;
: demuxer.read("audio");
const reader = audioStream.getReader();
try {
@@ -187,6 +192,7 @@ export class AudioProcessor {
videoUrl: string,
trimRegions: TrimRegion[],
speedRegions: SpeedRegion[],
validatedDurationSec: number,
): Promise<Blob> {
const media = document.createElement("audio");
media.src = videoUrl;
@@ -211,15 +217,44 @@ export class AudioProcessor {
const destinationNode = audioContext.createMediaStreamDestination();
sourceNode.connect(destinationNode);
const { recorder, recordedBlobPromise } = this.startAudioRecording(destinationNode.stream);
let rafId: number | null = null;
let recorder: MediaRecorder | null = null;
let recordedBlobPromise: Promise<Blob> | null = null;
try {
if (audioContext.state === "suspended") {
await audioContext.resume();
}
await this.seekTo(media, 0);
// Skip past any initial trim region(s) before recording starts to avoid
// capturing trimmed audio during the first rAF frames of playback.
// Loops to handle back-to-back or overlapping trims at t=0.
const effectiveEnd = validatedDurationSec;
let startPosition = 0;
for (let i = 0; i <= trimRegions.length; i++) {
const activeTrim = this.findActiveTrimRegion(startPosition * 1000, trimRegions);
if (!activeTrim) break;
startPosition = activeTrim.endMs / 1000;
if (startPosition >= effectiveEnd) break;
}
if (startPosition >= effectiveEnd) {
// All content is trimmed — return an empty (zero-byte) blob; the caller skips muxing when size is 0
return new Blob([], { type: "audio/webm" });
}
await this.seekTo(media, startPosition);
// Set initial playback rate for the starting position
const initialSpeedRegion = this.findActiveSpeedRegion(startPosition * 1000, speedRegions);
if (initialSpeedRegion) {
media.playbackRate = initialSpeedRegion.speed;
}
// Start recording only AFTER seeking past trims
const recording = this.startAudioRecording(destinationNode.stream);
recorder = recording.recorder;
recordedBlobPromise = recording.recordedBlobPromise;
await media.play();
await new Promise<void>((resolve, reject) => {
@@ -249,24 +284,66 @@ export class AudioProcessor {
return;
}
// Stop playback at validated duration — browser's media.duration
// may be inflated from bad container metadata.
if (media.currentTime >= validatedDurationSec) {
media.pause();
cleanup();
resolve();
return;
}
const currentTimeMs = media.currentTime * 1000;
const activeTrimRegion = this.findActiveTrimRegion(currentTimeMs, trimRegions);
if (activeTrimRegion && !media.paused && !media.ended) {
const skipToTime = activeTrimRegion.endMs / 1000;
if (skipToTime >= media.duration) {
if (skipToTime >= media.duration || skipToTime >= validatedDurationSec) {
media.pause();
cleanup();
resolve();
return;
}
// Pause recording during trim seek to prevent capturing
// silence/noise as the audio element seeks.
media.pause();
if (recorder?.state === "recording") recorder.pause();
const onSeeked = () => {
clearTimeout(seekTimer);
if (this.cancelled) {
cleanup();
resolve();
return;
}
if (recorder?.state === "paused") recorder.resume();
media
.play()
.then(() => {
if (!this.cancelled) rafId = requestAnimationFrame(tick);
})
.catch((err) => {
cleanup();
reject(
new Error(
`Failed to resume playback after trim seek: ${err instanceof Error ? err.message : String(err)}`,
),
);
});
};
const seekTimer = window.setTimeout(() => {
media.removeEventListener("seeked", onSeeked);
cleanup();
reject(new Error("Audio seek timed out while skipping trim region"));
}, SEEK_TIMEOUT_MS);
media.addEventListener("seeked", onSeeked, { once: true });
media.currentTime = skipToTime;
} else {
const activeSpeedRegion = this.findActiveSpeedRegion(currentTimeMs, speedRegions);
const playbackRate = activeSpeedRegion ? activeSpeedRegion.speed : 1;
if (Math.abs(media.playbackRate - playbackRate) > 0.0001) {
media.playbackRate = playbackRate;
}
return;
}
const activeSpeedRegion = this.findActiveSpeedRegion(currentTimeMs, speedRegions);
const playbackRate = activeSpeedRegion ? activeSpeedRegion.speed : 1;
if (Math.abs(media.playbackRate - playbackRate) > 0.0001) {
media.playbackRate = playbackRate;
}
if (!media.paused && !media.ended) {
@@ -286,7 +363,7 @@ export class AudioProcessor {
cancelAnimationFrame(rafId);
}
media.pause();
if (recorder.state !== "inactive") {
if (recorder && recorder.state !== "inactive") {
recorder.stop();
}
destinationNode.stream.getTracks().forEach((track) => track.stop());
@@ -297,6 +374,12 @@ export class AudioProcessor {
media.load();
}
if (!recordedBlobPromise) {
// Invariant: either an early return above fires, or startAudioRecording ran and
// populated recordedBlobPromise before the playback Promise resolved. Reaching
// here means that contract was broken — fail loud instead of returning silence.
throw new Error("Audio recorder finished without assigning recordedBlobPromise");
}
const recordedBlob = await recordedBlobPromise;
if (this.cancelled) {
throw new Error("Export cancelled");
@@ -314,8 +397,8 @@ export class AudioProcessor {
try {
await demuxer.load(file);
const audioConfig = (await demuxer.getDecoderConfig("audio")) as AudioDecoderConfig;
const reader = (demuxer.read("audio") as ReadableStream<EncodedAudioChunk>).getReader();
const audioConfig = await demuxer.getDecoderConfig("audio");
const reader = demuxer.read("audio").getReader();
let isFirstChunk = true;
try {
+40 -1
View File
@@ -1,5 +1,44 @@
import { describe, expect, it } from "vitest";
import { shouldFailDecodeEndedEarly } from "./streamingDecoder";
import { shouldFailDecodeEndedEarly, validateDuration } from "./streamingDecoder";
describe("validateDuration", () => {
  // Each row: [test title, container duration (s), scanned duration (s), expected result (s)].
  const scenarios: ReadonlyArray<readonly [string, number, number, number]> = [
    ["returns scanned duration when container reports Infinity", Infinity, 15.3, 15.3],
    ["returns scanned duration when container reports 0", 0, 15.3, 15.3],
    ["returns scanned duration when container reports NaN", NaN, 15.3, 15.3],
    ["returns scanned duration when container is inflated beyond threshold", 42, 15.3, 15.3],
    ["returns container duration when values are close", 15.5, 15.3, 15.5],
    // container < scanned (scanned overshoot from last frame duration)
    ["returns container duration when scanned is slightly higher", 15.0, 15.3, 15.0],
    ["returns scanned duration when container under-reports beyond threshold", 10, 15.3, 15.3],
    ["returns container duration when scanned is zero (corrupted/empty file)", 10, 0, 10],
    ["returns 0 when both container is NaN and scanned is zero", NaN, 0, 0],
  ];

  // Register one test per row, in table order, so titles and ordering match a hand-written suite.
  for (const [title, containerSec, scannedSec, expectedSec] of scenarios) {
    it(title, () => {
      const picked = validateDuration(containerSec, scannedSec);
      expect(picked).toBe(expectedSec);
    });
  }
});
describe("shouldFailDecodeEndedEarly", () => {
it("does not fail once every segment has been satisfied", () => {
+66 -2
View File
@@ -70,6 +70,37 @@ type EarlyDecodeEndCheck = {
const EARLY_DECODE_END_THRESHOLD_SEC = 1;
const METADATA_TAIL_TOLERANCE_SEC = 1.5;
const STREAM_DURATION_MATCH_TOLERANCE_SEC = 0.25;
const DURATION_DIVERGENCE_THRESHOLD_SEC = 1.5;
// Fallback upper bound for the packet scan when no reliable duration hint is
// available. Explicit end is required (some containers are truncated without
// one), but the hint-derived bound would cap the scan prematurely when
// container/stream duration are missing or corrupt.
const SCAN_UNBOUNDED_FALLBACK_SEC = 24 * 60 * 60;

/**
 * Validate container duration against actual packet timestamps.
 *
 * Chrome/Electron's MediaRecorder writes WebM containers with unreliable
 * Duration fields (often Infinity, 0, or inflated) — especially on Linux.
 * This function picks the most trustworthy duration value.
 *
 * Selection rules, in order:
 * 1. Scanned duration unusable (NaN, ±Infinity, or <= 0): return the container
 *    duration if it is finite and positive, otherwise 0.
 * 2. Container duration unusable (NaN, ±Infinity, or <= 0): return the scanned duration.
 * 3. The two differ by more than DURATION_DIVERGENCE_THRESHOLD_SEC: return the
 *    scanned duration.
 * 4. Otherwise return the container duration.
 *
 * The result is always a finite number >= 0.
 *
 * @param containerDuration Duration from the container-level metadata, in seconds
 * @param scannedDuration Duration derived from actual packet timestamps (ground truth), in seconds
 * @returns The duration to trust, in seconds
 */
export function validateDuration(containerDuration: number, scannedDuration: number): number {
  const containerIsUsable = Number.isFinite(containerDuration) && containerDuration > 0;

  // A non-finite scan result is as untrustworthy as an empty one. Without this
  // guard an Infinity scan would win the divergence check below and a NaN scan
  // could be returned as-is, poisoning every downstream "duration + buffer" read bound.
  if (!Number.isFinite(scannedDuration) || scannedDuration <= 0) {
    // Zero scanned duration means corrupted/empty file — fall back to container
    // (downstream shouldFailDecodeEndedEarly will catch truly empty files)
    return containerIsUsable ? containerDuration : 0;
  }

  if (!containerIsUsable) {
    return scannedDuration;
  }

  // Both values are sane: only override the container when it disagrees with
  // the packet timestamps by more than the tolerance.
  const divergenceSec = Math.abs(containerDuration - scannedDuration);
  return divergenceSec > DURATION_DIVERGENCE_THRESHOLD_SEC ? scannedDuration : containerDuration;
}
export function shouldFailDecodeEndedEarly({
cancelled,
@@ -201,10 +232,43 @@ export class StreamingVideoDecoder {
const audioStream = mediaInfo.streams.find((s) => s.codec_type_string === "audio");
// Scan video packets to find the true content boundary.
// MediaRecorder (especially on Linux) writes unreliable container durations.
// Packet timestamps are ground truth — no decode needed, just timestamp reads.
// Pass explicit range because some containers are truncated without one.
// Sanitize because mediaInfo.duration can be NaN/Infinity (Chromium Linux bug),
// which would propagate into demuxer.read() as an invalid endpoint.
const containerDurationSec = Number.isFinite(mediaInfo.duration) ? mediaInfo.duration : 0;
const streamDurationSec =
typeof videoStream?.duration === "number" && Number.isFinite(videoStream.duration)
? videoStream.duration
: 0;
const hintedDurationSec = Math.max(containerDurationSec, streamDurationSec, 0);
const scanEndSec =
hintedDurationSec > 0 ? hintedDurationSec + 0.5 : SCAN_UNBOUNDED_FALLBACK_SEC;
let maxPacketEndUs = 0;
const scanReader = this.demuxer.read("video", 0, scanEndSec).getReader();
try {
while (true) {
const { done, value } = await scanReader.read();
if (done || !value) break;
const endUs = value.timestamp + (value.duration ?? 0);
if (endUs > maxPacketEndUs) maxPacketEndUs = endUs;
}
} finally {
try {
await scanReader.cancel();
} catch {
/* already closed */
}
}
const scannedDuration = maxPacketEndUs / 1_000_000;
const validatedDuration = validateDuration(mediaInfo.duration, scannedDuration);
this.metadata = {
width: videoStream?.width || 1920,
height: videoStream?.height || 1080,
duration: mediaInfo.duration,
duration: validatedDuration,
streamDuration:
typeof videoStream?.duration === "number" && Number.isFinite(videoStream.duration)
? videoStream.duration
@@ -305,7 +369,7 @@ export class StreamingVideoDecoder {
// One forward stream through the whole file.
// Pass explicit range because some containers are truncated when no end is provided.
const readEndSec = Math.max(this.metadata.duration, this.metadata.streamDuration ?? 0) + 0.5;
const readEndSec = this.metadata.duration + 0.5;
const reader = this.demuxer.read("video", 0, readEndSec).getReader();
// Feed chunks to decoder in background with backpressure
+2 -8
View File
@@ -157,17 +157,11 @@ export class VideoExporter {
this.muxer = muxer;
await muxer.initialize();
const { effectiveDuration, totalFrames } = streamingDecoder.getExportMetrics(
const { totalFrames } = streamingDecoder.getExportMetrics(
this.config.frameRate,
this.config.trimRegions,
this.config.speedRegions,
);
const readEndSec = Math.max(videoInfo.duration, videoInfo.streamDuration ?? 0) + 0.5;
console.log("[VideoExporter] Original duration:", videoInfo.duration, "s");
console.log("[VideoExporter] Effective duration:", effectiveDuration, "s");
console.log("[VideoExporter] Total frames to export:", totalFrames);
console.log("[VideoExporter] Using streaming decode (web-demuxer + VideoDecoder)");
const frameDuration = 1_000_000 / this.config.frameRate;
let frameIndex = 0;
@@ -346,7 +340,7 @@ export class VideoExporter {
this.config.videoUrl,
this.config.trimRegions,
this.config.speedRegions,
readEndSec,
videoInfo.duration,
);
}
}