Testing Audio Applications: Speech Recognition, TTS, and Podcast Processing
Audio applications are hard to test because their output is binary audio data whose quality is subjective. Testing speech-to-text accuracy, validating TTS output quality, and verifying podcast processing pipelines all require different strategies. This guide covers practical approaches that give you reliable automated test coverage without requiring audio playback.
Testing Speech Recognition Integration
Mocking Whisper/Deepgram/AssemblyAI
// services/transcriptionService.ts
import Deepgram from '@deepgram/sdk'
// Single Deepgram client, constructed once when this module is first imported.
// The trailing `!` only silences the compiler: if DEEPGRAM_API_KEY is unset,
// `undefined` is what reaches the constructor at runtime.
// NOTE(review): confirm the installed @deepgram/sdk version really exposes
// `Deepgram` as a default export, as the import above assumes.
const deepgram = new Deepgram(process.env.DEEPGRAM_API_KEY!)
/**
 * Provider-independent shape returned by `transcribeAudio`.
 */
export interface TranscriptionResult {
  /** Full transcript text of the recording. */
  transcript: string
  /**
   * Per-word detail, in the order the provider returned it.
   * NOTE(review): `start`/`end` are presumably offsets in seconds — confirm against the provider docs.
   */
  words: {
    word: string
    start: number
    end: number
    confidence: number
  }[]
  /** Confidence reported for the transcript as a whole. */
  confidence: number
  /** Duration taken from the response metadata. */
  duration: number
}
/**
 * Sends an in-memory recording to Deepgram's pre-recorded transcription call
 * and flattens the response into a `TranscriptionResult`.
 *
 * @param audioBuffer Raw audio bytes to transcribe.
 * @param mimetype MIME type declared for `audioBuffer`. Defaults to
 *   `'audio/wav'`, the value that used to be hard-coded, so existing callers
 *   are unaffected; pass e.g. `'audio/mpeg'` for other containers.
 * @returns Transcript, per-word details, overall confidence and duration.
 *   `words` falls back to `[]` and `duration` to `0` when the response omits them.
 * @throws Error with message 'No transcription result returned' when the
 *   response has no first alternative on its first channel.
 */
export async function transcribeAudio(
  audioBuffer: Buffer,
  mimetype = 'audio/wav'
): Promise<TranscriptionResult> {
  const response = await deepgram.transcription.preRecorded(
    { buffer: audioBuffer, mimetype },
    {
      punctuate: true,
      diarize: true,
      utterances: true,
      model: 'general',
    }
  )

  // Only the first alternative of the first channel is surfaced to callers.
  const channel = response.results?.channels?.[0]?.alternatives?.[0]
  if (!channel) throw new Error('No transcription result returned')

  return {
    transcript: channel.transcript,
    // Copy just the four documented fields so extra provider fields do not leak out.
    words: channel.words?.map((w) => ({
      word: w.word,
      start: w.start,
      end: w.end,
      confidence: w.confidence,
    })) ?? [],
    confidence: channel.confidence,
    duration: response.metadata?.duration ?? 0,
  }
}
// services/transcriptionService.test.ts
import Deepgram from '@deepgram/sdk'
import { transcribeAudio } from './transcriptionService'
// Shared stub that the tests below configure and assert against.
const mockDeepgram = {
  transcription: {
    preRecorded: jest.fn(),
  },
}

// transcriptionService runs `new Deepgram(...)` while it is being imported,
// which happens before any statement in this file body executes. Configuring
// the constructor afterwards (mockImplementation) is therefore too late: the
// service would keep an automock instance that has no `transcription`
// property. A module factory is in place before the first import, so the
// instance the service captures is already wired up.
//
// The constructor may run before `mockDeepgram` is initialised, so the
// factory must not read it eagerly; `preRecorded` forwards at call time
// instead. (The `mock` name prefix is what allows a hoisted factory to
// reference this variable.)
jest.mock('@deepgram/sdk', () => ({
  __esModule: true,
  default: jest.fn(() => ({
    transcription: {
      preRecorded: (...args: unknown[]) => mockDeepgram.transcription.preRecorded(...args),
    },
  })),
}))
// Unit tests for transcribeAudio: every case feeds a canned provider response
// through the mocked `preRecorded` call and inspects the mapped result.
describe('transcribeAudio', () => {
  const mockDeepgramResponse = {
    metadata: { duration: 12.5 },
    results: {
      channels: [
        {
          alternatives: [
            {
              transcript: 'Hello world this is a test',
              confidence: 0.98,
              words: [
                { word: 'hello', start: 0.0, end: 0.5, confidence: 0.99 },
                { word: 'world', start: 0.5, end: 1.0, confidence: 0.97 },
              ],
            },
          ],
        },
      ],
    },
  }

  // Fresh placeholder bytes for each call; the content is irrelevant because the SDK is mocked.
  const fakeAudio = (): Buffer => Buffer.from('fake-audio')

  beforeEach(() => {
    mockDeepgram.transcription.preRecorded.mockResolvedValue(mockDeepgramResponse)
  })

  it('returns the transcript text', async () => {
    const { transcript } = await transcribeAudio(fakeAudio())

    expect(transcript).toBe('Hello world this is a test')
  })

  it('returns confidence score', async () => {
    const { confidence } = await transcribeAudio(fakeAudio())

    expect(confidence).toBe(0.98)
  })

  it('returns word-level timestamps', async () => {
    const { words } = await transcribeAudio(fakeAudio())

    expect(words).toHaveLength(2)
    expect(words[0]).toEqual({
      word: 'hello',
      start: 0.0,
      end: 0.5,
      confidence: 0.99,
    })
  })

  it('returns duration from metadata', async () => {
    const { duration } = await transcribeAudio(fakeAudio())

    expect(duration).toBe(12.5)
  })

  it('throws when no transcription result is returned', async () => {
    // A response without `results` leaves the service with no alternative to read.
    const emptyResponse = { results: null, metadata: {} }
    mockDeepgram.transcription.preRecorded.mockResolvedValue(emptyResponse)

    const attempt = transcribeAudio(Buffer.from('silence'))

    await expect(attempt).rejects.toThrow('No transcription result returned')
  })

  it('sends audio with correct MIME type', async () => {
    await transcribeAudio(fakeAudio())

    const expectedSource = expect.objectContaining({ mimetype: 'audio/wav' })
    expect(mockDeepgram.transcription.preRecorded).toHaveBeenCalledWith(
      expectedSource,
      expect.any(Object)
    )
  })
})
Testing TTS (Text-to-Speech) Output
// services/ttsService.ts
import OpenAI from 'openai'
import fs from 'fs/promises'
// Single OpenAI client, constructed once when this module is first imported.
// No options are passed here — presumably the SDK picks up its credentials
// from the environment; confirm against the SDK docs.
const openai = new OpenAI()
/**
 * Synthesises `text` with the `tts-1` model and saves the audio to disk.
 *
 * @param text Text to be spoken.
 * @param voice One of the voice presets accepted by the speech endpoint.
 * @param outputPath File the audio bytes are written to.
 * @returns An object carrying `outputPath` as `path`. The optional `duration`
 *   field in the return type is never populated by this implementation.
 */
export async function generateSpeech(
  text: string,
  voice: 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer',
  outputPath: string
): Promise<{ path: string; duration?: number }> {
  const speechResponse = await openai.audio.speech.create({
    model: 'tts-1',
    voice,
    input: text,
  })

  // The response body is read as an ArrayBuffer and wrapped in a Node Buffer for fs.
  const audioBytes = await speechResponse.arrayBuffer()
  await fs.writeFile(outputPath, Buffer.from(audioBytes))

  return { path: outputPath }
}
// services/ttsService.test.ts
import OpenAI from 'openai'
import fs from 'fs/promises'
import { generateSpeech } from './ttsService'
// Shared stub that the tests below configure and assert against.
const mockOpenAI = {
  audio: {
    speech: {
      create: jest.fn(),
    },
  },
}

// ttsService runs `new OpenAI()` while it is being imported, which happens
// before any statement in this file body executes. Configuring the
// constructor afterwards (mockImplementation) is therefore too late: the
// service would keep an automock instance without a usable `audio.speech`.
// A module factory is in place before the first import.
//
// The constructor may run before `mockOpenAI` is initialised, so the factory
// must not read it eagerly; `create` forwards at call time instead. (The
// `mock` name prefix is what allows a hoisted factory to reference it.)
jest.mock('openai', () => ({
  __esModule: true,
  default: jest.fn(() => ({
    audio: {
      speech: {
        create: (...args: unknown[]) => mockOpenAI.audio.speech.create(...args),
      },
    },
  })),
}))

// fs/promises is automocked so no test ever touches the real filesystem.
jest.mock('fs/promises')
const mockFs = fs as jest.Mocked<typeof fs>
// Unit tests for generateSpeech: the OpenAI client and fs/promises are both
// mocked, so only the request parameters, the write call and the return
// value are checked.
describe('generateSpeech', () => {
  const fakeAudioBuffer = Buffer.from('fake-mp3-data')

  beforeEach(() => {
    // Minimal stand-in for the SDK response: only `arrayBuffer()` is consumed by the service.
    const fakeSpeechResponse = {
      arrayBuffer: async () => fakeAudioBuffer,
    }
    mockOpenAI.audio.speech.create.mockResolvedValue(fakeSpeechResponse)
    mockFs.writeFile.mockResolvedValue(undefined)
  })

  it('calls OpenAI with correct parameters', async () => {
    await generateSpeech('Hello world', 'nova', '/tmp/output.mp3')

    const expectedRequest = {
      model: 'tts-1',
      voice: 'nova',
      input: 'Hello world',
    }
    expect(mockOpenAI.audio.speech.create).toHaveBeenCalledWith(expectedRequest)
  })

  it('writes audio to the specified output path', async () => {
    await generateSpeech('Test text', 'alloy', '/tmp/speech.mp3')

    expect(mockFs.writeFile).toHaveBeenCalledWith('/tmp/speech.mp3', expect.any(Buffer))
  })

  it('returns the output path', async () => {
    const { path: writtenPath } = await generateSpeech('Test', 'onyx', '/tmp/out.mp3')

    expect(writtenPath).toBe('/tmp/out.mp3')
  })
})
Testing Podcast Processing Pipeline
// pipelines/podcastPipeline.ts
import { transcribeAudio } from '../services/transcriptionService'
import { generateSpeech } from '../services/ttsService'
import { extractAudioChapters } from '../services/chapterService'
/**
 * Combined output of `processPodcastEpisode`.
 */
export interface PodcastProcessingResult {
  /** Transcript text produced by the transcription step. */
  transcript: string
  /**
   * Chapter markers produced by the chapter-extraction step.
   * NOTE(review): `startTime` is presumably an offset in seconds — confirm against chapterService.
   */
  chapters: {
    title: string
    startTime: number
  }[]
  /** Text returned by the caller-supplied summary function. */
  summary: string
}
/**
 * Runs the full episode pipeline: transcription and chapter extraction are
 * started together, then the transcript text is handed to `generateSummary`.
 *
 * @param audioBuffer Raw audio bytes of the episode.
 * @param generateSummary Caller-supplied function that turns the transcript
 *   text into a summary.
 * @returns Transcript text, extracted chapters and the generated summary.
 *   A rejection from any of the three steps rejects the returned promise.
 */
export async function processPodcastEpisode(
  audioBuffer: Buffer,
  generateSummary: (text: string) => Promise<string>
): Promise<PodcastProcessingResult> {
  // Kick off both audio-dependent steps before awaiting either of them.
  const transcriptionTask = transcribeAudio(audioBuffer)
  const chaptersTask = extractAudioChapters(audioBuffer)
  const [transcription, chapters] = await Promise.all([transcriptionTask, chaptersTask])

  // The summary needs the transcript, so it can only start once transcription is done.
  const { transcript } = transcription
  const summary = await generateSummary(transcript)

  return { transcript, chapters, summary }
}
// pipelines/podcastPipeline.test.ts
import { processPodcastEpisode } from './podcastPipeline'
import * as transcriptionService from '../services/transcriptionService'
import * as chapterService from '../services/chapterService'
// Replace both service modules with Jest automocks so the pipeline under test
// never reaches a real transcription or chapter-extraction implementation.
jest.mock('../services/transcriptionService')
jest.mock('../services/chapterService')
// Cast the (now mocked) exports to jest.Mock so each test can script what
// they resolve or reject with.
const mockTranscribe = transcriptionService.transcribeAudio as jest.Mock
const mockExtractChapters = chapterService.extractAudioChapters as jest.Mock
// Unit tests for processPodcastEpisode with both services mocked: result
// assembly, concurrent start of the two audio steps, data flow into the
// summary function, and error propagation.
describe('processPodcastEpisode', () => {
  const mockTranscriptionResult = {
    transcript: 'Welcome to the podcast. Today we discuss testing.',
    words: [],
    confidence: 0.96,
    duration: 3600,
  }
  const mockChapters = [
    { title: 'Introduction', startTime: 0 },
    { title: 'Main Topic', startTime: 300 },
  ]

  beforeEach(() => {
    // Drop calls recorded by earlier tests; otherwise toHaveBeenCalledTimes
    // would count invocations made by every preceding test in this suite.
    jest.clearAllMocks()
    mockTranscribe.mockResolvedValue(mockTranscriptionResult)
    mockExtractChapters.mockResolvedValue(mockChapters)
  })

  it('returns transcript, chapters, and summary', async () => {
    const mockSummary = jest.fn().mockResolvedValue('A discussion about testing practices.')

    const result = await processPodcastEpisode(Buffer.from('audio'), mockSummary)

    expect(result.transcript).toBe(mockTranscriptionResult.transcript)
    expect(result.chapters).toEqual(mockChapters)
    expect(result.summary).toBe('A discussion about testing practices.')
  })

  it('runs transcription and chapter extraction in parallel', async () => {
    // Keep transcription pending so the test can observe what has been
    // started while it is still in flight.
    let finishTranscription: (value: typeof mockTranscriptionResult) => void = () => undefined
    mockTranscribe.mockReturnValue(
      new Promise<typeof mockTranscriptionResult>((resolve) => {
        finishTranscription = resolve
      })
    )
    const mockSummary = jest.fn().mockResolvedValue('Summary')

    const pending = processPodcastEpisode(Buffer.from('audio'), mockSummary)

    // A sequential implementation would not have reached chapter extraction
    // yet, because transcription has not resolved.
    expect(mockTranscribe).toHaveBeenCalledTimes(1)
    expect(mockExtractChapters).toHaveBeenCalledTimes(1)
    expect(mockSummary).not.toHaveBeenCalled()

    finishTranscription(mockTranscriptionResult)
    await pending

    expect(mockSummary).toHaveBeenCalledTimes(1)
  })

  it('passes the transcript to the summary function', async () => {
    const mockSummary = jest.fn().mockResolvedValue('Summary')

    await processPodcastEpisode(Buffer.from('audio'), mockSummary)

    expect(mockSummary).toHaveBeenCalledWith(mockTranscriptionResult.transcript)
  })

  it('propagates transcription errors', async () => {
    mockTranscribe.mockRejectedValue(new Error('Audio format not supported'))

    await expect(
      processPodcastEpisode(Buffer.from('bad-audio'), jest.fn())
    ).rejects.toThrow('Audio format not supported')
  })
})
Testing Transcription Accuracy (Integration)
For integration tests that measure real accuracy, use a Word Error Rate (WER) metric:
// tests/integration/transcriptionAccuracy.test.ts
import { transcribeAudio } from '../../services/transcriptionService'
import fs from 'fs'
import path from 'path'
/**
 * Word Error Rate: the word-level edit distance (substitutions, insertions,
 * deletions) between `hypothesis` and `reference`, divided by the number of
 * reference words.
 *
 * Both texts are normalised before comparison: lower-cased, punctuation
 * (everything except letters, digits and apostrophes) replaced by spaces, and
 * empty tokens dropped. Without this, a perfectly transcribed sentence that
 * merely gained a trailing full stop from the provider's punctuation option,
 * or that carries leading/trailing whitespace, would be scored as an error.
 *
 * @param reference The known-correct transcript.
 * @param hypothesis The transcript produced by the system under test.
 * @returns 0 for an exact match after normalisation; may exceed 1 when the
 *   hypothesis contains many insertions. When the reference has no words the
 *   result is the number of hypothesis words (0 if both are empty).
 */
function wordErrorRate(reference: string, hypothesis: string): number {
  const tokenize = (text: string): string[] =>
    text
      .toLowerCase()
      .replace(/[^\p{L}\p{N}'\s]/gu, ' ')
      .split(/\s+/)
      .filter((token) => token.length > 0)

  const refWords = tokenize(reference)
  const hypWords = tokenize(hypothesis)

  // Levenshtein distance over words, keeping only the previous row.
  // previous[j] = cost of turning the first i-1 reference words into the first j hypothesis words.
  let previous = Array.from({ length: hypWords.length + 1 }, (_, j) => j)
  for (let i = 1; i <= refWords.length; i++) {
    const current = [i]
    for (let j = 1; j <= hypWords.length; j++) {
      current[j] =
        refWords[i - 1] === hypWords[j - 1]
          ? previous[j - 1]
          : 1 + Math.min(previous[j], current[j - 1], previous[j - 1])
    }
    previous = current
  }

  // Guard the denominator so an empty reference never divides by zero.
  return previous[hypWords.length] / Math.max(refWords.length, 1)
}
// Integration check: no mocks are installed in this file, so transcribeAudio
// is the real service function and the audio comes from a fixture on disk.
describe('Transcription accuracy', () => {
  it('achieves WER below 5% on clear speech', async () => {
    const expectedTranscript = 'The quick brown fox jumps over the lazy dog'
    const fixturePath = path.join(__dirname, 'fixtures/clear-speech.wav')

    const { transcript } = await transcribeAudio(fs.readFileSync(fixturePath))

    const wer = wordErrorRate(expectedTranscript, transcript)
    expect(wer).toBeLessThan(0.05) // < 5% error rate
  }, 30000) // per-test timeout (ms) passed to Jest
})
What Automated Tests Miss
Mocked audio tests cover pipeline logic but won't catch:
- Background noise degradation — real-world audio quality drops accuracy
- Speaker accent differences — models are calibrated on specific accents and lose accuracy on others
- Audio codec compatibility — WAV vs MP3 vs OGG handling varies by provider
- Latency under load — transcription APIs have variable response times
HelpMeTest monitors your audio processing pipelines end-to-end — uploading a real audio file and verifying the transcript appears correctly in your UI. Start free with 10 tests.
Summary
Testing audio processing pipelines:
- Mock STT/TTS SDKs — test request parameters, error handling, response parsing
- Test the pipeline composition — verify parallel execution, correct data flow
- Word Error Rate for accuracy integration tests — a quantitative metric for transcription quality
- Test error paths — empty audio, unsupported format, API unavailability
- Separate unit from integration — mock everything in unit tests; use real audio fixtures only in integration