How to Mock LLM APIs in Tests: Mock OpenAI, Claude, and Gemini
Calling the real OpenAI, Claude, or Gemini API in every test is slow, expensive, and non-deterministic. Mocking replaces those API calls with controlled, predictable responses — making your unit tests fast and your CI pipeline cheap. Here's every mocking pattern you need.
Why Mock LLM APIs
Before diving in: mocking is for unit tests. You still need integration tests that call the real API. The goal is to isolate what you're testing.
Mock when:
- Testing your application logic (parsing, error handling, routing)
- Testing your prompt construction
- Running tests in CI on every commit
- Avoiding test cost accumulation
Don't mock when:
- Testing that the model actually understands your prompts
- Validating behavior changes after prompt updates
- Running scheduled integration test suites
Pattern 1: unittest.mock — The Standard Approach
The most portable method: it works with any Python testing setup.
Mocking the OpenAI Client
# tests/test_openai_mock.py
from unittest.mock import MagicMock, patch
def make_chat_response(content: str, finish_reason: str = "stop") -> MagicMock:
    """Create a mock OpenAI ChatCompletion response.

    Args:
        content: Text exposed as ``choices[0].message.content``.
        finish_reason: Value exposed as ``choices[0].finish_reason``.

    Returns:
        A ``MagicMock`` with one choice, fixed token usage (50 prompt,
        100 completion, 150 total), a model name and an id.
    """
    # Any attribute not set explicitly below resolves to an auto-created
    # MagicMock, so only the fields assigned here carry meaningful values.
    choice = MagicMock()
    choice.message.content = content
    choice.message.role = "assistant"
    choice.finish_reason = finish_reason

    response = MagicMock()
    response.choices = [choice]
    response.usage = MagicMock()
    response.usage.prompt_tokens = 50
    response.usage.completion_tokens = 100
    response.usage.total_tokens = 150
    response.model = "gpt-4o-mini"
    response.id = "chatcmpl-mock-123"
    return response
def test_my_feature():
with patch("myapp.llm.openai.OpenAI") as mock_cls:
mock_client = MagicMock()
mock_cls.return_value = mock_client
mock_client.chat.completions.create.return_value = make_chat_response(
"The answer is 42."
)
from myapp.llm import MyLLMFeature
feature = MyLLMFeature()
result = feature.ask("What is the answer?")
assert "42" in result
mock_client.chat.completions.create.assert_called_once()
Mocking the Claude (Anthropic) Client
def make_claude_response(text: str, stop_reason: str = "end_turn") -> MagicMock:
    """Create a mock Anthropic Messages response.

    Args:
        text: Text exposed as ``content[0].text``.
        stop_reason: Value exposed as ``stop_reason``.

    Returns:
        A ``MagicMock`` with a single text content block, fixed token usage
        (50 input, 100 output), a model name and an id.
    """
    block = MagicMock()
    block.text = text
    block.type = "text"

    response = MagicMock()
    response.content = [block]
    response.stop_reason = stop_reason
    response.usage = MagicMock()
    response.usage.input_tokens = 50
    response.usage.output_tokens = 100
    response.model = "claude-3-5-sonnet-20241022"
    response.id = "msg_mock_abc123"
    return response
def test_claude_feature():
with patch("myapp.claude_feature.anthropic.Anthropic") as mock_cls:
mock_client = MagicMock()
mock_cls.return_value = mock_client
mock_client.messages.create.return_value = make_claude_response(
"This is a helpful response."
)
from myapp.claude_feature import ClaudeFeature
feature = ClaudeFeature()
result = feature.generate("Tell me something helpful")
assert len(result) > 0
Mocking the Gemini Client
def make_gemini_response(text: str, blocked: bool = False) -> MagicMock:
    """Create a mock google-generativeai response.

    Args:
        text: Value exposed as ``response.text``.
        blocked: When true, ``prompt_feedback.block_reason`` is ``"SAFETY"``;
            otherwise it is ``None``.

    Returns:
        A ``MagicMock`` with ``text``, ``prompt_feedback`` and fixed
        ``usage_metadata`` counts (40 prompt, 80 candidates, 120 total).
    """
    response = MagicMock()
    response.text = text

    response.prompt_feedback = MagicMock()
    response.prompt_feedback.block_reason = "SAFETY" if blocked else None

    response.usage_metadata = MagicMock()
    response.usage_metadata.total_token_count = 120
    response.usage_metadata.prompt_token_count = 40
    response.usage_metadata.candidates_token_count = 80
    return response
def test_gemini_feature():
with patch("myapp.gemini_feature.genai") as mock_genai:
mock_model = MagicMock()
mock_genai.GenerativeModel.return_value = mock_model
mock_model.generate_content.return_value = make_gemini_response(
"Gemini's response here."
)
from myapp.gemini_feature import GeminiFeature
feature = GeminiFeature()
result = feature.generate("Some prompt")
assert isinstance(result, str)
Pattern 2: pytest Fixtures
Encapsulate your mock setup in reusable fixtures:
# tests/conftest.py
import pytest
from unittest.mock import MagicMock, patch
@pytest.fixture
def mock_openai():
    """Fixture that provides a mocked OpenAI client.

    Patches ``myapp.llm.openai.OpenAI`` for the duration of the test so that
    constructing the client returns the yielded ``MagicMock``. The patch is
    undone when the test finishes and the ``with`` block exits.
    """
    with patch("myapp.llm.openai.OpenAI") as mock_cls:
        client = MagicMock()
        mock_cls.return_value = client
        # Default response — tests can override
        # NOTE(review): `_make_chat_response` is not defined in this snippet;
        # presumably it is the chat-response factory from Pattern 1, imported
        # into conftest.py under this name — confirm.
        client.chat.completions.create.return_value = _make_chat_response(
            "Default mock response"
        )
        yield client
@pytest.fixture
def mock_claude():
    """Fixture that provides a mocked Anthropic client.

    Patches ``myapp.claude.anthropic.Anthropic`` for the duration of the test
    so that constructing the client returns the yielded ``MagicMock``.
    """
    with patch("myapp.claude.anthropic.Anthropic") as mock_cls:
        client = MagicMock()
        mock_cls.return_value = client
        # Default response; a test can assign a different return_value.
        # NOTE(review): `_make_claude_response` is not defined in this
        # snippet; presumably it is the Claude response factory from
        # Pattern 1, imported into conftest.py under this name — confirm.
        client.messages.create.return_value = _make_claude_response(
            "Default mock response"
        )
        yield client
@pytest.fixture
def mock_gemini():
    """Fixture that provides a mocked Gemini model.

    Patches the ``genai`` name in ``myapp.gemini`` for the duration of the
    test. ``genai.GenerativeModel(...)`` then returns the yielded
    ``MagicMock`` model.
    """
    with patch("myapp.gemini.genai") as mock_genai:
        model = MagicMock()
        mock_genai.GenerativeModel.return_value = model
        # Default response; a test can assign a different return_value.
        # NOTE(review): `_make_gemini_response` is not defined in this
        # snippet; presumably it is the Gemini response factory from
        # Pattern 1, imported into conftest.py under this name — confirm.
        model.generate_content.return_value = _make_gemini_response(
            "Default mock response"
        )
        yield model
# Usage in tests:
def test_with_custom_response(mock_openai):
mock_openai.chat.completions.create.return_value = _make_chat_response("Custom response")
# ... test code
Pattern 3: Parameterized Responses
Test your parsing logic against many different response formats:
# tests/test_response_parsing.py
import pytest
from myapp.parser import parse_sentiment
# Each entry pairs a raw LLM output with the label the parser is expected
# to extract from it: (raw LLM output, expected parsed result).
SENTIMENT_RESPONSE_VARIATIONS = [
    # Positive: bare label, casing, punctuation, and label embedded in prose.
    ("positive", "positive"),
    ("POSITIVE", "positive"),
    ("Positive.", "positive"),
    ("The sentiment is: positive", "positive"),
    ("Based on this, I'd say positive", "positive"),
    # Negative.
    ("negative", "negative"),
    # Neutral: bare label and label embedded in a sentence.
    ("neutral", "neutral"),
    ("This text is neutral in tone.", "neutral"),
]
@pytest.mark.parametrize("raw_output,expected", SENTIMENT_RESPONSE_VARIATIONS)
def test_parse_sentiment_variants(raw_output, expected, mock_openai):
mock_openai.chat.completions.create.return_value = _make_chat_response(raw_output)
from myapp.sentiment import SentimentAnalyzer
analyzer = SentimentAnalyzer()
result = analyzer.analyze("some customer review text")
assert result == expected
Pattern 4: VCR Cassettes (Record and Replay)
VCR cassettes record real API responses during a recording session, then replay them in tests — without hitting the network. Perfect for integration-style tests that need realistic responses.
pip install vcrpy pytest-recording
# tests/test_with_vcr.py
import pytest
import vcr
# Shared VCR configuration for cassette-backed tests.
my_vcr = vcr.VCR(
    cassette_library_dir="tests/cassettes",
    record_mode="none",  # "new_episodes" to record new, "none" to replay only
    match_on=["uri", "method", "body"],
    # Don't record API keys. OpenAI sends the key in `authorization`;
    # Anthropic uses `x-api-key`; Gemini's REST API accepts it either as the
    # `x-goog-api-key` header or as a `key` query parameter.
    filter_headers=["authorization", "x-api-key", "x-goog-api-key"],
    filter_query_parameters=["key"],
)
@my_vcr.use_cassette("openai_classify.yaml")
def test_classify_with_cassette():
"""Uses recorded response — no real API call."""
from myapp.classifier import classify_text
result = classify_text("This is a wonderful product!")
assert result == "positive"
To record cassettes:
# Set record_mode="new_episodes" temporarily
RUN_INTEGRATION_TESTS=true pytest tests/test_with_vcr.py --vcr-record=new_episodes
# Commit the cassette files to source control
git add tests/cassettes/
git commit -m "chore: record VCR cassettes for LLM tests"
With pytest-recording:
@pytest.mark.vcr
def test_classify_with_cassette():
"""pytest-recording auto-manages cassette files."""
from myapp.classifier import classify_text
result = classify_text("This is a wonderful product!")
assert result == "positive"
Pattern 5: Custom Mock Classes
For complex LLM integrations, build proper mock classes instead of MagicMock:
# tests/mocks/mock_openai.py
from dataclasses import dataclass
from typing import Optional
@dataclass
class MockMessage:
    """Stand-in for the message object inside a chat-completion choice."""

    content: str
    role: str = "assistant"


@dataclass
class MockChoice:
    """Stand-in for one entry of a chat completion's ``choices`` list."""

    message: MockMessage
    finish_reason: str = "stop"
    index: int = 0


@dataclass
class MockUsage:
    """Fixed token-usage numbers attached to every mock completion."""

    prompt_tokens: int = 50
    completion_tokens: int = 100
    total_tokens: int = 150


@dataclass
class MockChatCompletion:
    """Stand-in for the object returned by ``chat.completions.create``."""

    choices: list
    usage: MockUsage
    model: str = "gpt-4o-mini"
    id: str = "chatcmpl-mock"


class MockOpenAIClient:
    """Realistic mock of the OpenAI client.

    Supports the ``client.chat.completions.create(**kwargs)`` call chain.
    Each call returns the next scripted response, cycling back to the first
    once the list is exhausted; with no scripted responses every call
    returns ``"Mock response"``.

    Args:
        responses: Optional list of response texts to return in order.
    """

    def __init__(self, responses: Optional[list] = None):
        self._responses = responses or []
        self._call_count = 0
        # Keyword arguments of the most recent create() call (None until called).
        self.last_call_kwargs = None

    @property
    def chat(self):
        """Return ``self`` so ``client.chat.completions.create`` resolves."""
        return self

    @property
    def completions(self):
        """Return ``self`` so ``client.chat.completions.create`` resolves."""
        return self

    @property
    def call_count(self) -> int:
        """Number of ``create`` calls made so far."""
        return self._call_count

    def create(self, **kwargs):
        """Record the call and return the next scripted completion.

        Args:
            **kwargs: Whatever the caller would pass to the real API; stored
                in ``last_call_kwargs`` and otherwise ignored.

        Returns:
            A ``MockChatCompletion`` holding one ``MockChoice``.
        """
        self.last_call_kwargs = kwargs
        self._call_count += 1
        if self._responses:
            # Modulo makes the scripted responses repeat instead of running out.
            position = (self._call_count - 1) % len(self._responses)
            content = self._responses[position]
        else:
            content = "Mock response"
        return MockChatCompletion(
            choices=[MockChoice(message=MockMessage(content=content))],
            usage=MockUsage(),
        )
# Usage:
def test_with_custom_mock():
mock_client = MockOpenAIClient(responses=["positive", "negative", "positive"])
from myapp.classifier import Classifier
classifier = Classifier(client=mock_client)
assert classifier.classify("text 1") == "positive"
assert classifier.classify("text 2") == "negative"
assert classifier.classify("text 3") == "positive"
assert mock_client._call_count == 3
Pattern 6: Simulating Errors
Test your error handling without triggering real API errors:
# tests/test_error_scenarios.py
import openai
import anthropic
import pytest
from unittest.mock import MagicMock, patch
class TestOpenAIErrorScenarios:
    """Error-path tests driven by exceptions raised from the mocked OpenAI client."""

    def test_rate_limit_error(self, mock_openai):
        """A RateLimitError raised by the client surfaces from ``classify``."""
        mock_openai.chat.completions.create.side_effect = openai.RateLimitError(
            message="Rate limit exceeded. Please try again later.",
            response=MagicMock(status_code=429),
            body={"error": {"type": "rate_limit_error", "code": "rate_limit_exceeded"}},
        )
        from myapp.classifier import Classifier

        classifier = Classifier()
        with pytest.raises(openai.RateLimitError):
            classifier.classify("test text")

    def test_context_length_exceeded(self, mock_openai):
        """A BadRequestError raised by the client surfaces from ``classify``."""
        mock_openai.chat.completions.create.side_effect = openai.BadRequestError(
            message="This model's maximum context length is 128000 tokens.",
            response=MagicMock(status_code=400),
            body={"error": {"code": "context_length_exceeded"}},
        )
        from myapp.classifier import Classifier

        classifier = Classifier()
        with pytest.raises(openai.BadRequestError):
            classifier.classify("x" * 1000000)

    def test_retry_logic(self, mock_openai):
        """Two failing calls followed by a success yield the result after three attempts."""
        attempts = []

        def fail_twice_then_succeed(**kwargs):
            # Record the attempt number, then fail on attempts 1 and 2.
            attempts.append(len(attempts) + 1)
            if len(attempts) < 3:
                raise openai.APIStatusError(
                    message="Server overloaded",
                    response=MagicMock(status_code=529),
                    body={},
                )
            return _make_chat_response("positive")

        mock_openai.chat.completions.create.side_effect = fail_twice_then_succeed
        from myapp.classifier_with_retry import ClassifierWithRetry

        # A tiny base_delay is passed so the test does not wait long between retries.
        classifier = ClassifierWithRetry(max_retries=3, base_delay=0.001)
        result = classifier.classify("test text")
        assert result == "positive"
        assert len(attempts) == 3
class TestClaudeErrorScenarios:
def test_overloaded_error(self, mock_claude):
mock_claude.messages.create.side_effect = anthropic.APIStatusError(
message="Overloaded",
response=MagicMock(status_code=529),
body={"error": {"type": "overloaded_error"}}
)
from myapp.claude_client import ClaudeClient
client = ClaudeClient()
with pytest.raises(anthropic.APIStatusError):
client.generate("test")
def test_content_policy_violation(self, mock_claude):
mock_claude.messages.create.side_effect = anthropic.BadRequestError(
message="Output blocked by content filtering policy",
response=MagicMock(status_code=400),
body={"error": {"type": "invalid_request_error"}}
)
from myapp.claude_client import ClaudeClient
client = ClaudeClient()
with pytest.raises(anthropic.BadRequestError):
client.generate("problematic content")
Pattern 7: Streaming Mocks
Testing streaming responses requires mocking the iterator:
# tests/test_streaming_mock.py
from unittest.mock import MagicMock
def make_streaming_response(text: str):
    """Create a mock streaming response for OpenAI.

    Args:
        text: The full text to stream; one chunk is produced per character.

    Returns:
        A one-shot iterator of ``MagicMock`` chunks. Each chunk exposes its
        character as ``choices[0].delta.content``; ``finish_reason`` is
        ``None`` on every chunk except the last, where it is ``"stop"``.
        An empty ``text`` yields an empty iterator.
    """
    last_index = len(text) - 1
    chunks = []
    for i, char in enumerate(text):
        chunk = MagicMock()
        chunk.choices = [MagicMock()]
        chunk.choices[0].delta.content = char
        chunk.choices[0].finish_reason = None if i < last_index else "stop"
        chunks.append(chunk)
    return iter(chunks)
def test_streaming_accumulates_correctly(mock_openai):
    """The streamed per-character chunks are joined back into the full text."""
    expected = "Hello, world!"
    # The mocked create() hands back an iterator of chunks, one per character.
    mock_openai.chat.completions.create.return_value = make_streaming_response(expected)
    from myapp.streamer import StreamingClient

    client = StreamingClient()
    result = client.stream_generate("Say hello world")
    assert result == expected
def test_streaming_calls_callback_per_chunk(mock_openai):
mock_openai.chat.completions.create.return_value = make_streaming_response("abc")
received_chunks = []
from myapp.streamer import StreamingClient
client = StreamingClient()
client.stream_with_callback("prompt", callback=received_chunks.append)
assert received_chunks == ["a", "b", "c"]
Organizing Your Mocks
Keep your mock factories in one place:
tests/
├── conftest.py              # pytest fixtures
├── mocks/
│   ├── __init__.py
│   ├── openai.py            # OpenAI mock factories
│   ├── anthropic.py         # Claude mock factories
│   ├── gemini.py            # Gemini mock factories
│   └── responses/
│       ├── positive.json    # Sample LLM responses
│       └── negative.json
└── cassettes/               # VCR recordings
    ├── classify_positive.yaml
    └── summarize_long.yaml
Load response fixtures from JSON:
# tests/mocks/responses.py
import json
from pathlib import Path
RESPONSES_DIR = Path(__file__).parent / "responses"
def load_response(name: str) -> str:
"""Load a saved LLM response from disk."""
return json.loads((RESPONSES_DIR / f"{name}.json").read_text())["content"]
End-to-End Testing Beyond Mocks
Mocking covers your application logic. But users interact with your LLM features through a UI — a chat interface, an upload form, a results page. That layer needs end-to-end testing that no amount of mocking can replace.
HelpMeTest runs automated browser tests against your live application. You write the tests in plain English; HelpMeTest runs them continuously:
Go To https://your-app.com
Fill In #prompt Summarize this document
Click #submit
Wait For .ai-response
Verify response contains at least 50 words
Verify response does not contain error messages
Use mocks for unit tests. Use HelpMeTest for end-to-end coverage. Use real API calls (gated) for integration tests. That combination gives you complete coverage without spending a fortune on API calls in CI.