OpenAI API Testing Guide: Automate GPT-4 Tests with Python
Building applications on top of the OpenAI API — GPT-4, GPT-4o, embeddings, DALL-E — requires a solid test strategy. LLM outputs are non-deterministic, calls cost money, and rate limits punish naive test suites. This guide gives you a complete Python testing framework for OpenAI-powered applications.
The Core Testing Problem
Traditional APIs return deterministic results: the same input always produces the same output. OpenAI's models don't work that way. GPT-4 might summarize the same document differently on each call — and that's fine, as long as the summary is accurate.
This means your tests need to check:
- Structure: Is the output in the right format?
- Completeness: Does it cover the required content?
- Safety: Does it stay within guardrails?
- Reliability: Does your error handling work?
What you should NOT test: exact string matching on GPT-4 outputs.
Project Setup
pip install openai pytest pytest-asyncio httpx pytest-mock

project/
├── src/
│ ├── __init__.py
│ └── gpt_client.py
├── tests/
│ ├── conftest.py
│ ├── test_unit.py
│ ├── test_integration.py
│ └── recordings/ # VCR cassettes for replay testing
├── .env
└── pytest.ini

.env:
OPENAI_API_KEY=sk-proj-...
OPENAI_MODEL=gpt-4o-mini

pytest.ini:
[pytest]
asyncio_mode = auto
markers =
    integration: marks tests that call real OpenAI API
    slow: marks tests that take more than 5 seconds

Unit Testing with Mocks
Never call the real OpenAI API in unit tests. The `mocker` fixture provided by pytest-mock makes this clean:
# tests/conftest.py
import pytest
from unittest.mock import MagicMock
@pytest.fixture
def mock_openai_response():
    """Provide a factory that builds mock OpenAI chat-completion responses.

    The returned callable creates a ``MagicMock`` exposing the attributes
    set below: ``choices[0].message.content``, ``choices[0].finish_reason``
    and the ``usage`` token counters.
    """

    def _make_response(
        content: str,
        finish_reason: str = "stop",
        prompt_tokens: int = 50,
        completion_tokens: int = 100,
    ):
        """Build a single mock response.

        Args:
            content: Text placed in ``choices[0].message.content``.
            finish_reason: Value for ``choices[0].finish_reason``.
            prompt_tokens: Value for ``usage.prompt_tokens``.
            completion_tokens: Value for ``usage.completion_tokens``.

        Returns:
            A ``MagicMock`` configured with the values above.
        """
        response = MagicMock()
        response.choices = [MagicMock()]
        response.choices[0].message.content = content
        response.choices[0].finish_reason = finish_reason
        response.usage.prompt_tokens = prompt_tokens
        response.usage.completion_tokens = completion_tokens
        # Derive the total so the three usage fields can never disagree.
        response.usage.total_tokens = prompt_tokens + completion_tokens
        return response

    return _make_response


# src/gpt_client.py
import openai
import os
class GPTClient:
    """Small wrapper around the OpenAI chat completions API for classification.

    Instance attributes (set in ``__init__``):
        client: The OpenAI SDK client, built from ``OPENAI_API_KEY``.
        model: Model name from ``OPENAI_MODEL`` (default ``gpt-4o-mini``).
        total_cost_usd: Running cost estimate of all calls made so far.
    """

    # Blended USD price per one million tokens used for the estimate.
    # gpt-4o-mini: $0.15/1M input + $0.60/1M output (approximate)
    _USD_PER_MILLION_TOKENS = 0.30

    def __init__(self):
        self.client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
        self.model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
        self.total_cost_usd = 0.0

    def classify(self, text: str, categories: list[str]) -> str:
        """Ask the model to assign ``text`` to one of ``categories``.

        Args:
            text: Input to classify; must contain non-whitespace characters.
            categories: Candidate category names; must not be empty.

        Returns:
            The model's reply with surrounding whitespace stripped. The reply
            is not checked against ``categories``.

        Raises:
            ValueError: If ``text`` is blank or ``categories`` is empty.
            RuntimeError: If the response carries no text content.
        """
        if not text.strip():
            raise ValueError("Input text cannot be empty")
        if not categories:
            raise ValueError("Categories list cannot be empty")

        category_list = ", ".join(categories)
        response = self.client.chat.completions.create(
            model=self.model,
            max_tokens=50,
            temperature=0,  # Deterministic output for classification
            messages=[
                {"role": "system", "content": f"Classify the input into exactly one of: {category_list}. Reply with only the category name."},
                {"role": "user", "content": text},
            ],
        )

        # Record spend first: the tokens were consumed even if the reply
        # turns out to be unusable below.
        self._track_cost(response.usage.total_tokens)

        choice = response.choices[0]
        content = choice.message.content
        if content is None:
            # Without this guard a missing content field surfaced as an
            # opaque AttributeError on ``None.strip()``.
            raise RuntimeError(
                f"Model returned no text content (finish_reason={choice.finish_reason!r})"
            )
        return content.strip()

    def _track_cost(self, tokens: int) -> None:
        """Add the estimated cost of ``tokens`` total tokens to the running total."""
        self.total_cost_usd += (tokens / 1_000_000) * self._USD_PER_MILLION_TOKENS


# tests/test_unit.py
import pytest
from unittest.mock import patch, MagicMock
from src.gpt_client import GPTClient
@pytest.fixture
def mock_client(mocker, mock_openai_response):
    """Patch the OpenAI constructor used by ``src.gpt_client`` and return the stub.

    The stubbed client's ``chat.completions.create`` is preset to return a
    mock response whose content is ``"positive"``.
    """
    patched_ctor = mocker.patch("src.gpt_client.openai.OpenAI")
    fake_client = patched_ctor.return_value
    create_stub = fake_client.chat.completions.create
    create_stub.return_value = mock_openai_response("positive")
    return fake_client
class TestGPTClassify:
    """Unit tests for ``GPTClient.classify`` running against the ``mock_client`` stub."""

    @staticmethod
    def _create_kwargs(stub):
        """Return the keyword arguments of the most recent ``create`` call on ``stub``."""
        return stub.chat.completions.create.call_args.kwargs

    def test_returns_category(self, mock_client):
        """The returned label is one of the categories that were offered."""
        allowed = ["positive", "negative", "neutral"]
        gpt = GPTClient()
        label = gpt.classify("I love this!", allowed)
        assert label in allowed

    def test_raises_on_empty_text(self, mock_client):
        """An empty input string is rejected with ``ValueError``."""
        gpt = GPTClient()
        with pytest.raises(ValueError, match="cannot be empty"):
            gpt.classify("", ["positive", "negative"])

    def test_raises_on_empty_categories(self, mock_client):
        """An empty category list is rejected with ``ValueError``."""
        gpt = GPTClient()
        with pytest.raises(ValueError, match="cannot be empty"):
            gpt.classify("Some text", [])

    def test_tracks_cost_after_call(self, mock_client):
        """The running cost starts at zero and grows after one call."""
        gpt = GPTClient()
        assert gpt.total_cost_usd == 0.0
        gpt.classify("Some text", ["a", "b"])
        assert gpt.total_cost_usd > 0.0

    def test_uses_temperature_zero_for_classification(self, mock_client):
        """The request is sent with ``temperature=0``."""
        gpt = GPTClient()
        gpt.classify("Some text", ["a", "b"])
        sent = self._create_kwargs(mock_client)
        assert sent["temperature"] == 0

    def test_passes_categories_in_system_prompt(self, mock_client):
        """Every category name appears in the system message."""
        gpt = GPTClient()
        gpt.classify("Some text", ["bug", "feature", "docs"])
        system_prompt = self._create_kwargs(mock_client)["messages"][0]["content"]
        assert "bug" in system_prompt
        assert "feature" in system_prompt
        assert "docs" in system_prompt


# Integration Tests (Real GPT-4 Calls)
# tests/test_integration.py
import pytest
import openai
import json
import os
# Module-wide marks:
# - ``integration`` tags every test here so that marker-based selection
#   (``pytest -m integration``) and ``"integration" in request.keywords``
#   checks actually match these tests.
# - ``skipif`` keeps the real-API tests from running unless the caller
#   opts in with RUN_INTEGRATION_TESTS=true.
pytestmark = [
    pytest.mark.integration,
    pytest.mark.skipif(
        os.environ.get("RUN_INTEGRATION_TESTS") != "true",
        reason="Set RUN_INTEGRATION_TESTS=true to run",
    ),
]
@pytest.fixture(scope="session")
def openai_client():
    """Build one OpenAI client, keyed by ``OPENAI_API_KEY``, for the whole session."""
    api_key = os.environ["OPENAI_API_KEY"]
    return openai.OpenAI(api_key=api_key)
class TestGPT4Integration:
    """Integration tests that send real requests through the ``openai_client`` fixture.

    Assertions check structure and key content rather than exact wording,
    and requests use ``temperature=0`` where consistent output matters.
    """

    def test_basic_completion(self, openai_client):
        """A trivial prompt completes normally and contains the requested word."""
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            max_tokens=20,
            temperature=0,
            messages=[{"role": "user", "content": "Reply with the single word: pineapple"}],
        )
        choice = response.choices[0]
        assert "pineapple" in choice.message.content.lower()
        assert choice.finish_reason == "stop"

    def test_json_mode_output(self, openai_client):
        """With the ``json_object`` response format the reply parses as JSON."""
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            max_tokens=100,
            temperature=0,
            response_format={"type": "json_object"},
            messages=[{
                "role": "user",
                "content": "Return JSON: {\"status\": \"ok\", \"count\": 42}",
            }],
        )
        data = json.loads(response.choices[0].message.content)
        assert data["status"] == "ok"
        assert data["count"] == 42

    def test_function_calling(self, openai_client):
        """Test OpenAI function calling / tool use."""
        tools = [{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string", "description": "City name"},
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    },
                    "required": ["city"],
                },
            },
        }]
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            tools=tools,
            tool_choice="auto",
            messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
        )
        message = response.choices[0].message
        assert message.tool_calls is not None
        tool_call = message.tool_calls[0]
        assert tool_call.function.name == "get_weather"
        args = json.loads(tool_call.function.arguments)
        # Substring match: the model may add a region or country to the
        # city name, so an exact-match allow-list is needlessly brittle.
        assert "tokyo" in args["city"].lower()

    def test_embeddings_shape(self, openai_client):
        """Verify embeddings have the expected dimensions."""
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input="Test sentence for embedding",
        )
        embedding = response.data[0].embedding
        assert len(embedding) == 1536  # text-embedding-3-small dimensions
        assert all(isinstance(v, float) for v in embedding)

    def test_streaming_completion(self, openai_client):
        """Test that streaming works and accumulates correctly."""
        stream = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            max_tokens=50,
            temperature=0,
            stream=True,
            messages=[{"role": "user", "content": "Count from 1 to 5."}],
        )
        chunks = []
        for chunk in stream:
            # A streamed chunk can arrive with an empty ``choices`` list
            # (the API documents this for usage-only chunks); indexing it
            # unguarded would raise IndexError.
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if piece:
                chunks.append(piece)
        full_text = "".join(chunks)
        assert len(full_text) > 0
        # Should mention numbers 1-5
        for num in ["1", "2", "3", "4", "5"]:
            assert num in full_text


# Testing GPT-4 Error Handling
# tests/test_errors.py
import openai
import pytest
from unittest.mock import patch, MagicMock
from src.gpt_client import GPTClient
class TestErrorHandling:
    """Unit tests for how the GPT clients surface or recover from OpenAI API errors."""

    @staticmethod
    def _stub_create(mocker):
        """Patch the OpenAI constructor and return the stubbed ``create`` method."""
        patched_ctor = mocker.patch("src.gpt_client.openai.OpenAI")
        return patched_ctor.return_value.chat.completions.create

    def test_rate_limit_raises(self, mocker):
        """A rate-limit error from the API propagates out of ``classify``."""
        rate_limited = openai.RateLimitError(
            message="Rate limit exceeded",
            response=MagicMock(status_code=429),
            body={},
        )
        self._stub_create(mocker).side_effect = rate_limited
        gpt = GPTClient()
        with pytest.raises(openai.RateLimitError):
            gpt.classify("test", ["a", "b"])

    def test_retry_on_service_unavailable(self, mocker):
        """Two 503 failures followed by a success yield the result on the third call.

        Exercises ``GPTClientWithRetry``, which the original author describes
        as an exponential-backoff retry wrapper; its implementation is not
        part of this file.
        """
        attempts = []

        def flaky_create(*args, **kwargs):
            attempts.append(1)
            if len(attempts) < 3:
                raise openai.APIStatusError(
                    message="Service unavailable",
                    response=MagicMock(status_code=503),
                    body={},
                )
            reply = MagicMock()
            reply.choices[0].message.content = "positive"
            reply.choices[0].finish_reason = "stop"
            reply.usage.total_tokens = 100
            return reply

        self._stub_create(mocker).side_effect = flaky_create

        # Imported only after the patch is installed, as in the original.
        from src.gpt_client_with_retry import GPTClientWithRetry

        retrying = GPTClientWithRetry(max_retries=3, base_delay=0.01)
        outcome = retrying.classify("test", ["positive", "negative"])
        assert outcome == "positive"
        assert len(attempts) == 3

    def test_context_length_error(self, mocker):
        """A context-length ``BadRequestError`` propagates out of ``classify``."""
        too_long = openai.BadRequestError(
            message="maximum context length exceeded",
            response=MagicMock(status_code=400),
            body={"error": {"code": "context_length_exceeded"}},
        )
        self._stub_create(mocker).side_effect = too_long
        gpt = GPTClient()
        with pytest.raises(openai.BadRequestError):
            gpt.classify("x" * 100000, ["a", "b"])


# Testing Prompt Versions
When you iterate on prompts, regression-test them:
# tests/test_prompt_regression.py
import pytest
from unittest.mock import patch
# Regression cases for intent classification. Each row is
# (customer message, expected intent label, minimum acceptable confidence).
PROMPT_TEST_CASES = [
    {"input": message, "expected_intent": intent, "min_confidence": threshold}
    for message, intent, threshold in (
        ("The package arrived damaged and I want a refund.", "refund_request", 0.9),
        ("When will my order ship?", "shipping_inquiry", 0.9),
        ("How do I change my password?", "account_help", 0.85),
    )
]
@pytest.mark.parametrize("case", PROMPT_TEST_CASES, ids=lambda entry: entry["expected_intent"])
def test_intent_classification_regression(case, mock_openai_classify):
    """Check that each regression case yields its expected intent and confidence.

    NOTE(review): the mock is preset to the expected intent before the
    classifier runs, so this presumably exercises the classifier's handling
    of the reply rather than the prompt itself; confirm against
    ``IntentClassifier``.
    """
    from src.intent_classifier import IntentClassifier

    expected_intent = case["expected_intent"]
    mock_openai_classify.return_value = expected_intent

    outcome = IntentClassifier().classify(case["input"])

    assert outcome.intent == expected_intent
    assert outcome.confidence >= case["min_confidence"]


# Cost Tracking in Tests
Track API costs during integration test runs to catch runaway tests:
# tests/conftest.py (add to existing conftest)
import pytest
from dataclasses import dataclass, field
@dataclass
class TokenUsage:
    """Counters for prompt and completion tokens, with a cost estimate."""

    prompt_tokens: int = 0
    completion_tokens: int = 0

    @property
    def total(self):
        """Sum of prompt and completion tokens."""
        return sum((self.prompt_tokens, self.completion_tokens))

    @property
    def estimated_cost_usd(self):
        """Estimated USD cost using gpt-4o-mini pricing per million tokens."""
        prompt_cost = self.prompt_tokens / 1_000_000 * 0.15
        completion_cost = self.completion_tokens / 1_000_000 * 0.60
        return prompt_cost + completion_cost


# Module-level accumulator read by the reporting code below.
_session_usage = TokenUsage()
@pytest.fixture(autouse=True)
def track_token_usage(request, monkeypatch):
    """Print the accumulated token usage after tests marked ``integration``.

    Applied automatically to every test. Tests without the ``integration``
    keyword pass through untouched; after an integration test the totals
    held in ``_session_usage`` are printed as long as no test has failed.
    """
    is_integration = "integration" in request.keywords

    # A generator fixture must yield on every path. Returning before the
    # yield makes pytest error out with "did not yield a value" for each
    # non-integration test.
    yield

    if not is_integration:
        return

    # The original comparison
    #   testscollected == testsfailed + testscollected
    # is algebraically "no test has failed so far"; spelled out here.
    # NOTE(review): nothing in the code shown here adds to _session_usage,
    # and this prints after each integration test rather than once per
    # session - a session-level hook may be the better home for it.
    if request.session.testsfailed == 0:
        print(f"\nTotal tokens used: {_session_usage.total}")
        print(f"Estimated cost: ${_session_usage.estimated_cost_usd:.4f}")


# CI/CD Setup
# .github/workflows/test.yml
# Unit tests run on every push to main and on the weekly schedule;
# integration tests (real API calls) run only on the weekly schedule.
name: Test OpenAI App

on:
  push:
    branches: [main]
  schedule:
    - cron: '0 8 * * 1'  # Weekly integration run

jobs:
  unit-tests:
    name: Unit Tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - run: pip install -r requirements.txt
      - run: pytest tests/test_unit.py tests/test_errors.py tests/test_prompt_regression.py -v

  integration-tests:
    name: Integration Tests (Weekly)
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    environment: integration
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      RUN_INTEGRATION_TESTS: "true"
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - run: pip install -r requirements.txt
      # No `-m integration` filter: the tests in this file are gated by
      # RUN_INTEGRATION_TESTS, and a marker filter would deselect every
      # test that does not carry the `integration` mark.
      # `--timeout` comes from the pytest-timeout plugin, which must be
      # listed in requirements.txt.
      - run: pytest tests/test_integration.py -v --timeout=120

# End-to-End Testing with HelpMeTest
Unit and integration tests cover your API layer, but they don't cover the user experience. If you've built a GPT-4-powered app — a chatbot, an AI writing tool, a code review assistant — you need end-to-end tests that exercise the full stack.
HelpMeTest runs Robot Framework tests against your live application, checking that AI features work correctly from the user's perspective:
*** Test Cases ***
GPT Assistant Answers Customer Questions
    # Open the assistant page, send one question and wait for the reply.
    Go To    https://your-app.com/assistant
    Fill Text    \#chat-input    What is your return policy?
    Click Button    Send
    Wait Until Element Is Visible    .assistant-response    timeout=30s
    # The reply should mention the return policy.
    Element Should Contain    .assistant-response    return
    Element Should Contain    .assistant-response    day

This catches issues that no unit test will find: a broken UI, a prompt change that breaks production behavior, or a rate limit that only hits in real traffic.
Summary
- Unit tests: Always mock the OpenAI client. Test your logic, not the API.
- Integration tests: Real API calls, gated behind env vars, run weekly not on every commit.
- Use `temperature=0` in tests where you need consistent outputs for classification/extraction.
- Use the `json_object` response format when testing structured output — it's more reliable than prompting for JSON.
- Test all error paths: rate limits, context overflow, auth failures.
- Track costs during integration test sessions — runaway loops get expensive.
- Add end-to-end tests for the full user experience with tools like HelpMeTest.