Testing Whisper and AssemblyAI: Transcription API Unit and Integration Tests
OpenAI's Whisper and AssemblyAI are the two most common transcription APIs in production Python applications. They have different architectures, pricing models, and accuracy profiles — but they share the same testing challenges. This guide walks through a complete test suite for both, covering unit tests, integration tests, and edge cases you will encounter in real deployments.
Whisper vs AssemblyAI: What Matters for Testing
Whisper comes in two forms: the open-source model you run locally, and the OpenAI API endpoint. Testing these is fundamentally different:
- Local Whisper: deterministic output for identical inputs, no rate limits, higher latency
- OpenAI Whisper API: a hosted service like AssemblyAI — API key required, network dependency, rate limits apply — but the transcript comes back directly in the response rather than through an async job
AssemblyAI is API-first with a polling model for long audio. Key testing implications:
- Async by default — tests must handle job submission + polling
- Webhook support means you can test event-driven integrations
- Multiple models (`best`, `nano`) have different accuracy/speed tradeoffs
Setting Up Your Test Environment
# tests/conftest.py
import pytest
import os
from pathlib import Path
# Anchor the audio fixture directory to this file's own location
# (<this directory>/fixtures/audio) so it resolves the same way regardless of
# the working directory pytest is started from.
FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures" / "audio"
@pytest.fixture(scope="session")
def whisper_client():
    """Real OpenAI client for integration tests.

    Built once per test session from the ``OPENAI_API_KEY`` environment
    variable.

    Raises:
        KeyError: if ``OPENAI_API_KEY`` is not set.
    """
    # Imported inside the fixture so that loading this conftest does not by
    # itself import the openai package.
    from openai import OpenAI

    return OpenAI(api_key=os.environ["OPENAI_API_KEY"])
@pytest.fixture(scope="session")
def assemblyai_client():
    """Real AssemblyAI client for integration tests.

    Sets the SDK-wide ``aai.settings.api_key`` from the
    ``ASSEMBLYAI_API_KEY`` environment variable and returns the
    ``assemblyai`` module itself, once per test session.

    Raises:
        KeyError: if ``ASSEMBLYAI_API_KEY`` is not set.
    """
    # Imported inside the fixture so that loading this conftest does not by
    # itself import the assemblyai package.
    import assemblyai as aai

    aai.settings.api_key = os.environ["ASSEMBLYAI_API_KEY"]
    return aai
@pytest.fixture
def mock_whisper_response():
    """Canned transcription payload for unit tests.

    Returns a new dict for every requesting test: a 3.2 second English clip
    whose single segment spans the whole recording.
    """
    return {
        "text": "The quick brown fox jumps over the lazy dog.",
        "language": "en",
        "duration": 3.2,
        "segments": [
            {
                "id": 0,
                "start": 0.0,
                "end": 3.2,
                "text": "The quick brown fox jumps over the lazy dog.",
                # NOTE(review): real Whisper responses may not carry a
                # per-segment confidence; confirm against the API schema.
                "confidence": 0.98,
            }
        ],
    }


# Unit Testing the Whisper Integration
Unit tests mock the API entirely and focus on your wrapper code:
# tests/unit/test_whisper_client.py
import pytest
from unittest.mock import patch, MagicMock, mock_open
import io
class TestWhisperClient:
    """Unit tests for ``WhisperTranscriber`` with the OpenAI SDK call mocked out."""

    # Dotted path of the SDK method each test replaces with a mock.
    CREATE = "openai.resources.audio.transcriptions.Transcriptions.create"

    @pytest.fixture
    def make_audio(self, tmp_path):
        """Return a factory that writes a tiny silent WAV under ``tmp_path``.

        The tests hand the transcriber a file that really exists, so they do
        not depend on the working directory or on fixture audio being present.
        The factory takes the file name and returns the path as a string.
        """
        import wave

        def _make(name="audio.wav"):
            path = tmp_path / name
            with wave.open(str(path), "wb") as wav:
                wav.setnchannels(1)
                wav.setsampwidth(2)
                wav.setframerate(16000)
                wav.writeframes(b"\x00\x00" * 1600)  # 0.1 s of silence
            return str(path)

        return _make

    def test_transcribe_sends_correct_file_format(self, mock_whisper_response, make_audio):
        """Whisper API requires specific audio formats — verify we send them correctly."""
        from myapp.transcription import WhisperTranscriber

        expected_text = mock_whisper_response["text"]
        with patch(self.CREATE) as mock_create:
            mock_create.return_value = MagicMock(text=expected_text)
            transcriber = WhisperTranscriber(api_key="test-key")
            result = transcriber.transcribe(make_audio("clean_studio.wav"))

        call_kwargs = mock_create.call_args.kwargs
        assert call_kwargs["model"] == "whisper-1"
        # The SDK's create() is keyword-only, so the upload must arrive as file=.
        assert call_kwargs.get("file") is not None
        assert result == expected_text

    def test_transcribe_with_language_hint(self, make_audio):
        """Providing a language hint should improve accuracy for non-English audio."""
        from myapp.transcription import WhisperTranscriber

        with patch(self.CREATE) as mock_create:
            mock_create.return_value = MagicMock(text="Hola mundo")
            transcriber = WhisperTranscriber(api_key="test-key")
            result = transcriber.transcribe(make_audio(), language="es")

        assert mock_create.call_args.kwargs.get("language") == "es"
        assert result == "Hola mundo"

    def test_handles_file_too_large_error(self, make_audio):
        """Files > 25MB should be rejected before sending to API."""
        from myapp.transcription import WhisperTranscriber, FileTooLargeError

        transcriber = WhisperTranscriber(api_key="test-key")
        audio_path = make_audio("large_file.wav")
        with patch(self.CREATE) as mock_create, \
                patch("os.path.getsize", return_value=26 * 1024 * 1024):  # 26MB
            with pytest.raises(FileTooLargeError) as exc_info:
                transcriber.transcribe(audio_path)

        assert "25MB" in str(exc_info.value)
        # "Before sending" means the API must never have been reached.
        mock_create.assert_not_called()

    def test_handles_openai_rate_limit(self, make_audio):
        """Should retry with backoff on 429 responses."""
        from openai import RateLimitError
        from myapp.transcription import WhisperTranscriber

        rate_limited = RateLimitError(
            "Rate limit exceeded", response=MagicMock(status_code=429), body={}
        )
        # time.sleep is stubbed so that, if the transcriber backs off through
        # it, the retry does not make the test wait.
        with patch(self.CREATE) as mock_create, patch("time.sleep"):
            mock_create.side_effect = [rate_limited, MagicMock(text="hello world")]
            transcriber = WhisperTranscriber(api_key="test-key", max_retries=2)
            result = transcriber.transcribe(make_audio())

        assert result == "hello world"
        assert mock_create.call_count == 2

    def test_handles_unsupported_format(self):
        """Should raise clear error for unsupported audio formats."""
        from myapp.transcription import WhisperTranscriber, UnsupportedFormatError

        transcriber = WhisperTranscriber(api_key="test-key")
        with pytest.raises(UnsupportedFormatError):
            transcriber.transcribe("audio.amr")  # AMR not supported


# Unit Testing AssemblyAI Integration
AssemblyAI's async polling model requires testing the polling logic:
# tests/unit/test_assemblyai_client.py
import pytest
from unittest.mock import patch, MagicMock
import time
class TestAssemblyAIClient:
    """Unit tests for ``AssemblyAITranscriber`` with the SDK's ``Transcriber.transcribe`` mocked."""

    # Dotted path of the SDK method each test replaces with a mock.
    TRANSCRIBE = "assemblyai.Transcriber.transcribe"

    def test_submit_and_poll_success(self):
        """Happy path: submit job, poll until complete, return transcript."""
        from myapp.transcription import AssemblyAITranscriber

        mock_transcript = MagicMock()
        mock_transcript.status = "completed"
        mock_transcript.text = "This is the transcribed text."
        mock_transcript.words = []

        with patch(self.TRANSCRIBE) as mock_transcribe:
            mock_transcribe.return_value = mock_transcript
            transcriber = AssemblyAITranscriber(api_key="test-key")
            result = transcriber.transcribe("audio.wav")

        assert result.text == "This is the transcribed text."

    def test_transcription_failure_raises_exception(self):
        """Failed transcription jobs should raise a clear exception."""
        from myapp.transcription import AssemblyAITranscriber, TranscriptionFailedError

        mock_transcript = MagicMock()
        mock_transcript.status = "error"
        mock_transcript.error = "Audio file corrupted or unreadable"

        with patch(self.TRANSCRIBE) as mock_transcribe:
            mock_transcribe.return_value = mock_transcript
            transcriber = AssemblyAITranscriber(api_key="test-key")
            with pytest.raises(TranscriptionFailedError) as exc_info:
                transcriber.transcribe("corrupted.wav")

        assert "corrupted or unreadable" in str(exc_info.value)

    def test_speaker_diarization_enabled(self):
        """Diarization config should be passed through to AssemblyAI."""
        from myapp.transcription import AssemblyAITranscriber

        mock_transcript = MagicMock()
        mock_transcript.status = "completed"
        mock_transcript.text = "Speaker A said something. Speaker B replied."
        mock_transcript.utterances = [
            MagicMock(speaker="A", text="Speaker A said something."),
            MagicMock(speaker="B", text="Speaker B replied."),
        ]

        with patch(self.TRANSCRIBE) as mock_transcribe:
            mock_transcribe.return_value = mock_transcript
            transcriber = AssemblyAITranscriber(api_key="test-key")
            transcriber.transcribe("meeting.wav", diarize=True)

        # Verify diarization config was passed. It may arrive as config=... or
        # as the second positional argument, so accept either form instead of
        # failing with IndexError on the keyword form.
        call = mock_transcribe.call_args
        call_config = call.kwargs.get("config")
        if call_config is None and len(call.args) > 1:
            call_config = call.args[1]
        assert call_config is not None, "no config was passed to transcribe()"
        assert call_config.speaker_labels is True


# Edge Case Tests
These are the tests that catch real production bugs:
# tests/unit/test_edge_cases.py
import pytest
from unittest.mock import patch, MagicMock


class TestTranscriptionEdgeCases:
    """Edge-case behaviour of ``WhisperTranscriber`` with the OpenAI SDK call mocked."""

    # Dotted path of the SDK method each test replaces with a mock.
    CREATE = "openai.resources.audio.transcriptions.Transcriptions.create"

    @pytest.fixture
    def make_audio(self, tmp_path):
        """Return a factory that writes a tiny silent WAV under ``tmp_path``.

        The API is mocked, so the audio content is irrelevant; the file only
        has to exist. The factory takes the file name and returns the path as
        a string.
        """
        import wave

        def _make(name):
            path = tmp_path / name
            with wave.open(str(path), "wb") as wav:
                wav.setnchannels(1)
                wav.setsampwidth(2)
                wav.setframerate(16000)
                wav.writeframes(b"\x00\x00" * 1600)  # 0.1 s of silence
            return str(path)

        return _make

    def _transcribe_with_api_text(self, audio_path, api_text):
        """Run ``WhisperTranscriber.transcribe`` while the API returns ``api_text``."""
        from myapp.transcription import WhisperTranscriber

        with patch(self.CREATE) as mock_create:
            mock_create.return_value = MagicMock(text=api_text)
            transcriber = WhisperTranscriber(api_key="test-key")
            return transcriber.transcribe(audio_path)

    def test_silence_only_audio(self, make_audio):
        """Pure silence should return empty string, not error."""
        result = self._transcribe_with_api_text(make_audio("silence.wav"), "")
        # Should return empty string, not raise exception
        assert result == ""

    def test_very_short_audio_under_one_second(self, make_audio):
        """Sub-second audio clips should be handled gracefully."""
        result = self._transcribe_with_api_text(make_audio("very_short.wav"), "Hi")
        assert isinstance(result, str)

    # Whisper is known to hallucinate on music — test that we handle
    # both empty string and non-empty string gracefully
    @pytest.mark.parametrize("api_text", ["", "♪ Thank you for the music ♪"])
    def test_audio_with_only_music_no_speech(self, make_audio, api_text):
        """Music-only audio may return hallucinated text or empty string."""
        result = self._transcribe_with_api_text(make_audio("music_only.wav"), api_text)
        # We should get a string back regardless
        assert isinstance(result, str)

    def test_mixed_language_audio(self, make_audio):
        """Code-switching audio (two languages in one clip) is a known edge case."""
        # Should not raise; mixed output is acceptable
        result = self._transcribe_with_api_text(
            make_audio("mixed_language.wav"),
            "Hello, ¿cómo estás? I am fine, gracias.",
        )
        assert len(result) > 0

    @pytest.mark.parametrize("accent_file,min_confidence", [
        ("american_english.wav", 0.90),
        ("british_english.wav", 0.85),
        ("australian_english.wav", 0.82),
        ("indian_english.wav", 0.75),
        ("scottish_english.wav", 0.70),
    ])
    def test_accent_coverage(self, accent_file, min_confidence):
        """Different accents should meet minimum confidence thresholds."""
        # Note: Whisper API doesn't return confidence; this tests AssemblyAI path
        # Fixture-based test runs real API in integration suite. Report an
        # explicit skip here so an empty body cannot pass and look like
        # coverage.
        pytest.skip(
            f"{accent_file} (min confidence {min_confidence}): "
            "skipped in unit tests; covered by integration tests"
        )


# Integration Tests: Accent and Noise Conditions
# tests/integration/test_transcription_quality.py
import pytest
import jiwer
# Each case: (audio file, reference transcript file, maximum allowed WER).
WHISPER_QUALITY_CASES = [
    ("clean_studio.wav", "clean_studio.txt", 0.04),
    ("office_background.wav", "office_background.txt", 0.10),
    ("accented_speech.wav", "accented_speech.txt", 0.15),
    ("technical_jargon.wav", "technical_jargon.txt", 0.08),
]


def _normalize_for_wer(text):
    """Lowercase, drop ASCII punctuation and collapse whitespace.

    Without this, "dog." and "dog" count as different words and punctuation
    choices inflate the word error rate.
    """
    import string

    without_punctuation = text.lower().translate(
        str.maketrans("", "", string.punctuation)
    )
    return " ".join(without_punctuation.split())


@pytest.mark.integration
@pytest.mark.slow  # lets a '-m "integration and slow"' run select these tests
@pytest.mark.parametrize("audio,reference,max_wer", WHISPER_QUALITY_CASES)
def test_whisper_wer_thresholds(whisper_client, audio, reference, max_wer):
    """Transcribe a fixture clip with the real API and check its WER.

    Args:
        whisper_client: real OpenAI client fixture.
        audio: file name under ``tests/fixtures/audio``.
        reference: file name under ``tests/fixtures/transcripts``.
        max_wer: highest acceptable word error rate for this clip.
    """
    with open(f"tests/fixtures/transcripts/{reference}", encoding="utf-8") as f:
        expected = _normalize_for_wer(f.read())

    with open(f"tests/fixtures/audio/{audio}", "rb") as f:
        response = whisper_client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
        )

    actual = _normalize_for_wer(response.text)
    wer = jiwer.wer(expected, actual)
    assert wer <= max_wer, (
        f"Whisper WER {wer:.1%} > {max_wer:.1%} for {audio}\n"
        f"Expected: {expected[:80]}\n"
        f"Got: {actual[:80]}"
    )


# Running in CI
Separate expensive integration tests from cheap unit tests:
# Makefile
# These targets run commands and produce no files, so declare them phony.
# The --timeout option comes from the pytest-timeout plugin.
.PHONY: test-unit test-integration test-quality

test-unit:
	pytest tests/unit/ -v --timeout=10

test-integration:
	pytest tests/integration/ -v --timeout=120 -m integration

test-quality:
	pytest tests/integration/ -v -m "integration and slow" --timeout=300

The integration tests require real API keys. Store them in your CI secrets manager and only run quality tests on a schedule — daily is sufficient for catching upstream model regressions. For real-time alerts when transcription quality degrades in production, HelpMeTest monitoring can trigger on WER thresholds without maintaining your own monitoring stack.
Conclusion
Testing Whisper and AssemblyAI is a two-layer problem. Unit tests cover your integration code — error handling, retry logic, format validation — and run in seconds with no API keys. Integration tests cover real quality metrics and catch upstream model changes. Build both layers, keep your fixture library current with real production audio samples, and gate production deployments on WER thresholds rather than just "the API returned 200."