# Testing ML Model Services with BentoML: Unit, Integration, and Load Tests
BentoML packages machine learning models as production-ready API services. Testing BentoML services means verifying the model logic, the API layer, and the deployment — each at a different level.
## BentoML Service Structure
A BentoML service wraps a model and exposes it as an HTTP API:
# service.py
import bentoml
import numpy as np
from pydantic import BaseModel, Field
# The iris model consumes exactly four measurements per sample.
IRIS_FEATURE_COUNT = 4


class ClassificationInput(BaseModel):
    """Request payload for ``ClassifierService.classify``.

    The vector length is validated here so that malformed requests (too few,
    too many, or no features) are rejected during input validation instead of
    failing later inside the estimator.
    """

    # Flat feature vector for a single sample.
    features: list[float] = Field(
        min_length=IRIS_FEATURE_COUNT,
        max_length=IRIS_FEATURE_COUNT,
    )
class ClassificationOutput(BaseModel):
    """Response payload returned by ``ClassifierService.classify``."""

    # Predicted class, stringified by the service.
    label: str
    # Highest class probability reported by the model for this sample.
    confidence: float
@bentoml.service(
    resources={"cpu": "2"},
    traffic={"timeout": 10},
)
class ClassifierService:
    """BentoML service that exposes the stored ``iris_classifier`` model."""

    # Evaluated when the class body runs, i.e. when this module is imported.
    model_ref = bentoml.sklearn.get("iris_classifier:latest")

    def __init__(self):
        """Load the estimator referenced by ``model_ref``."""
        self.model = self.model_ref.load_model()

    @bentoml.api
    def classify(self, input: ClassificationInput) -> ClassificationOutput:
        """Classify a single feature vector.

        Args:
            input: Request payload carrying the feature values.

        Returns:
            The predicted label (as a string) and the highest class
            probability for the sample.
        """
        # Turn the flat list into a single-row 2-D array.
        sample = np.array(input.features).reshape(1, -1)
        predicted = self.model.predict(sample)[0]
        top_probability = self.model.predict_proba(sample).max()
        return ClassificationOutput(
            label=str(predicted),
            confidence=float(top_probability),
        )


## Unit Testing the Model Logic
Test the prediction logic without running the BentoML server:
# tests/test_model_logic.py
import pytest
import numpy as np
import bentoml
@pytest.fixture(scope="session")
def model():
    """Load the ``iris_classifier:latest`` estimator once per test session."""
    return bentoml.sklearn.get("iris_classifier:latest").load_model()
def test_model_predicts_valid_class(model):
    """A typical sample must map to one of the three class ids."""
    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    predicted = model.predict(sample)[0]
    assert predicted in [0, 1, 2], f"Unexpected class: {predicted}"
def test_model_returns_probabilities(model):
    """``predict_proba`` yields one row of three probabilities summing to 1."""
    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    probs = model.predict_proba(sample)
    assert probs.shape == (1, 3)
    total = probs.sum()
    assert abs(total - 1.0) < 1e-6, "Probabilities must sum to 1"
def test_model_handles_boundary_values(model):
    """An all-zero sample (extreme but valid) still yields a known class id."""
    # Test with extreme but valid feature values
    zero_sample = np.zeros((1, 4))
    predicted = model.predict(zero_sample)[0]
    assert predicted in [0, 1, 2]
def test_model_rejects_wrong_shape(model):
    """A 1-D array that was never reshaped to a row must raise ``ValueError``."""
    flat_sample = np.array([5.1, 3.5])  # Missing reshape
    with pytest.raises(ValueError):
        model.predict(flat_sample)


## Integration Testing with BentoML's Test Client
BentoML provides a synchronous test client that runs the service in-process:
# tests/test_service_api.py
import pytest
from bentoml.testing.server import TestServer
from service import ClassifierService
@pytest.fixture(scope="session")
def server():
    """Yield a ``TestServer`` wrapping ``ClassifierService`` for the session.

    NOTE(review): confirm that ``bentoml.testing.server.TestServer`` is
    available in the BentoML version this project pins.
    """
    with TestServer(ClassifierService) as test_server:
        yield test_server
def test_classify_setosa(server):
    """A setosa-like sample returns a label and a confidence within [0, 1]."""
    result = server.client.classify(features=[5.1, 3.5, 1.4, 0.2])
    assert result.label is not None
    assert 0.0 <= result.confidence <= 1.0
def test_classify_versicolor(server):
    """A versicolor-like sample is classified with confidence above 0.5."""
    result = server.client.classify(features=[6.4, 3.2, 4.5, 1.5])
    assert result.confidence > 0.5
def test_invalid_input_returns_422(server):
    """A payload with too few features must be answered with HTTP 422."""
    # Post raw JSON via ``server.client.session`` so the status code of the
    # response can be inspected directly.
    raw = server.client.session.post(
        "/classify",
        json={"features": [5.1, 3.5]},  # Too few features
    )
    assert raw.status_code == 422
def test_empty_features_returns_error(server):
    """An empty feature list must be answered with HTTP 400 or 422."""
    # Post raw JSON via ``server.client.session`` so the status code of the
    # response can be inspected directly.
    raw = server.client.session.post(
        "/classify",
        json={"features": []},
    )
    assert raw.status_code in [400, 422]


## Testing with the HTTP Runner Directly
For lower-level control, use the runner API:
import pytest
import asyncio
import bentoml
@pytest.fixture(scope="session")
def runner():
    """Yield a locally initialised runner for ``iris_classifier:latest``.

    The runner is destroyed once the session's tests have finished.
    """
    iris_runner = bentoml.sklearn.get("iris_classifier:latest").to_runner()
    iris_runner.init_local()
    yield iris_runner
    iris_runner.destroy()
@pytest.mark.asyncio
async def test_runner_async_predict(runner):
    """``runner.async_run`` returns a known class id for a typical sample."""
    import numpy as np

    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    predictions = await runner.async_run(sample)
    assert predictions[0] in [0, 1, 2]
def test_runner_sync_predict(runner):
    """``runner.run`` returns a known class id for a typical sample."""
    import numpy as np

    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    predictions = runner.run(sample)
    assert predictions[0] in [0, 1, 2]


## Saving Test Models for CI
In CI, you can't rely on a production model store. Save a fixture model during test setup:
# tests/conftest.py
import pytest
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import bentoml
@pytest.fixture(scope="session", autouse=True)
def save_test_model():
    """Train and save a minimal model for tests."""
    samples, targets = load_iris(return_X_y=True)
    # ``fit`` returns the estimator itself, so training can be chained.
    classifier = LogisticRegression(max_iter=200).fit(samples, targets)
    bentoml.sklearn.save_model(
        "iris_classifier",
        classifier,
        metadata={"accuracy": 0.97, "env": "test"},
    )


## Load Testing with Locust
Test throughput and latency of a running BentoML service:
# locustfile.py
import random
from locust import HttpUser, task, between
class ClassifierUser(HttpUser):
    """Simulated client that posts random feature vectors to ``/classify``."""

    # Pause between 0.1 and 0.5 seconds between tasks.
    wait_time = between(0.1, 0.5)

    @task
    def classify(self):
        """Send one request; report non-200 or slower-than-1s responses as failures."""
        # (low, high) sampling range for each of the four features, in order.
        bounds = ((4.0, 8.0), (2.0, 5.0), (1.0, 7.0), (0.1, 2.5))
        payload = {
            "features": [
                round(random.uniform(low, high), 1) for low, high in bounds
            ]
        }
        with self.client.post(
            "/classify", json=payload, catch_response=True
        ) as resp:
            if resp.status_code != 200:
                resp.failure(f"Unexpected status: {resp.status_code}")
                return
            if resp.elapsed.total_seconds() > 1.0:
                resp.failure("Response too slow")


# Run the load test:
# locust -f locustfile.py --host=http://localhost:3000 \
#     --users=50 --spawn-rate=10 --run-time=60s --headless


## Testing Bento Builds
Verify the packaged bento before deployment:
# tests/test_bento_build.py
import subprocess
import bentoml
def test_bento_builds_successfully():
    """``bentoml build -f bentofile.yaml`` must exit with status 0."""
    build_cmd = ["bentoml", "build", "-f", "bentofile.yaml"]
    completed = subprocess.run(build_cmd, capture_output=True, text=True)
    assert completed.returncode == 0, f"Build failed: {completed.stderr}"
def test_latest_bento_has_required_metadata():
    """The latest ``classifier_service`` bento carries version and framework labels."""
    labels = bentoml.get("classifier_service:latest").info.labels
    assert labels.get("version") is not None
    assert labels.get("framework") in ["sklearn", "pytorch", "tensorflow"]


## CI Pipeline
# .github/workflows/ml-service-tests.yml
name: BentoML Service Tests
on: [push, pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - run: pip install bentoml scikit-learn pytest pytest-asyncio
      - name: Setup test model
        # Literal block scalar (|) keeps the newlines of the inline Python
        # program; a plain multi-line scalar would fold them into spaces and
        # hand Python a single invalid line.
        run: |
          python -c "
          import numpy as np
          from sklearn.datasets import load_iris
          from sklearn.linear_model import LogisticRegression
          import bentoml
          X, y = load_iris(return_X_y=True)
          clf = LogisticRegression(max_iter=200).fit(X, y)
          bentoml.sklearn.save_model('iris_classifier', clf)
          "
      - name: Run tests
        run: pytest tests/ -v


## Key Takeaways
- Unit test model logic separately from the BentoML service layer
- Use `TestServer` for integration tests — it runs the full service in-process with no networking overhead
- Save deterministic test models in `conftest.py` so CI doesn't depend on a model registry
- Load test before scaling decisions with Locust or k6
- Test the bento build step in CI to catch packaging issues before deployment