Testing ML Model Services with BentoML: Unit, Integration, and Load Tests

Testing ML Model Services with BentoML: Unit, Integration, and Load Tests

BentoML packages machine learning models as production-ready API services. Testing a BentoML service means verifying three things, each at a different level: the model logic (unit tests), the API layer (integration tests), and the packaged, running service (build and load tests).

BentoML Service Structure

A BentoML service wraps a model and exposes it as an HTTP API:

# service.py
import bentoml
import numpy as np
from pydantic import BaseModel, Field

class ClassificationInput(BaseModel):
    # Request body for the classify endpoint.
    #
    # The iris classifier takes exactly four numeric features per sample, so
    # the length is enforced by the schema: a payload with too few, too many
    # or no features is rejected during request validation instead of failing
    # later inside the model's predict() call.
    # NOTE(review): min_length/max_length is the pydantic v2 spelling for list
    # length constraints (v1 used min_items/max_items) -- confirm the
    # installed pydantic version.
    features: list[float] = Field(..., min_length=4, max_length=4)

class ClassificationOutput(BaseModel):
    # Response body returned by ClassifierService.classify.

    # Predicted class, converted to a string from the model's raw prediction.
    label: str
    # Largest class probability the model reports for the submitted sample.
    confidence: float

@bentoml.service(
    resources={"cpu": "2"},
    traffic={"timeout": 10},
)
class ClassifierService:
    """BentoML service exposing the stored iris classifier through ``classify``."""

    # Looked up while the class body executes, i.e. when this module is
    # imported, so "iris_classifier:latest" must already be in the model store.
    model_ref = bentoml.sklearn.get("iris_classifier:latest")

    def __init__(self):
        """Load the estimator referenced by ``model_ref``."""
        self.model = self.model_ref.load_model()

    @bentoml.api
    def classify(self, input: ClassificationInput) -> ClassificationOutput:
        """Classify a single sample.

        Args:
            input: Request payload carrying the feature values of one sample.
                NOTE(review): the name shadows the ``input`` builtin but is
                presumably part of the request schema -- confirm before
                renaming.

        Returns:
            The predicted label (as a string) and the highest class
            probability reported by the model.
        """
        # The estimator expects a 2-D array, so wrap the sample as one row.
        sample = np.array(input.features).reshape(1, -1)
        predicted = self.model.predict(sample)[0]
        top_probability = self.model.predict_proba(sample).max()
        return ClassificationOutput(
            label=str(predicted),
            confidence=float(top_probability),
        )

Unit Testing the Model Logic

Test the prediction logic without running the BentoML server:

# tests/test_model_logic.py
import pytest
import numpy as np
import bentoml

@pytest.fixture(scope="session")
def model():
    """Return the estimator stored as ``iris_classifier:latest``.

    Session-scoped, so the model is loaded once and shared by every test.
    """
    return bentoml.sklearn.get("iris_classifier:latest").load_model()

def test_model_predicts_valid_class(model):
    """The prediction for a single sample is one of the class ids 0, 1 or 2."""
    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    predicted = model.predict(sample)[0]
    assert predicted in [0, 1, 2], f"Unexpected class: {predicted}"

def test_model_returns_probabilities(model):
    """``predict_proba`` yields one row of three probabilities that sum to 1."""
    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    probabilities = model.predict_proba(sample)
    assert probabilities.shape == (1, 3)
    # Only one row is returned, so the grand total is that row's sum.
    total = probabilities.sum()
    assert abs(total - 1.0) < 1e-6, "Probabilities must sum to 1"

def test_model_handles_boundary_values(model):
    """An all-zero sample still maps to one of the known class ids."""
    # Zeros lie at the edge of the feature range but are well-formed input.
    zero_sample = np.array([[0.0, 0.0, 0.0, 0.0]])
    predicted = model.predict(zero_sample)[0]
    assert predicted in [0, 1, 2]

def test_model_rejects_wrong_shape(model):
    """Passing a 1-D array (no reshape to a single row) raises ``ValueError``."""
    flat_sample = np.array([5.1, 3.5])  # 1-D: never reshaped into a row
    with pytest.raises(ValueError):
        model.predict(flat_sample)

Integration Testing with BentoML's Test Client

BentoML provides a synchronous test client that runs the service in-process:

# tests/test_service_api.py
import pytest
from bentoml.testing.server import TestServer

from service import ClassifierService

@pytest.fixture(scope="session")
def server():
    """Start ``ClassifierService`` once for the test session and yield the server.

    The server is used as a context manager, so it is shut down when the
    session ends.
    """
    # NOTE(review): confirm that bentoml.testing.server.TestServer exists in
    # the installed BentoML version and exposes the `client` attribute the
    # tests rely on.
    with TestServer(ClassifierService) as running_server:
        yield running_server

def test_classify_setosa(server):
    """A typical sample yields a label and a confidence within [0, 1].

    Only the shape of the response is checked, not which class is predicted.
    """
    sample = [5.1, 3.5, 1.4, 0.2]
    result = server.client.classify(features=sample)
    assert result.label is not None
    assert 0.0 <= result.confidence <= 1.0

def test_classify_versicolor(server):
    """The model is more than 50% confident about this sample.

    The predicted label itself is not asserted.
    """
    sample = [6.4, 3.2, 4.5, 1.5]
    result = server.client.classify(features=sample)
    assert result.confidence > 0.5

def test_invalid_input_returns_422(server):
    """A payload with too few features is rejected with HTTP 422.

    The request is posted through the client's HTTP session and the raw
    status code is checked.
    """
    # NOTE(review): this passes only if the service validates the number of
    # features before calling the model, and if the server reports validation
    # failures as 422 rather than 400 -- confirm both against the service
    # schema and the installed BentoML version.
    raw = server.client.session.post(
        "/classify",
        json={"features": [5.1, 3.5]},  # two features instead of four
    )
    assert raw.status_code == 422

def test_empty_features_returns_error(server):
    """An empty feature list is rejected with a client error (400 or 422).

    The request is posted through the client's HTTP session and the raw
    status code is checked.
    """
    # NOTE(review): this relies on the service rejecting an empty list during
    # request validation -- confirm the input schema enforces the feature
    # count.
    raw = server.client.session.post(
        "/classify",
        json={"features": []},
    )
    assert raw.status_code in [400, 422]

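Testing with the Runner Directly

For lower-level control, use the runner API. `init_local()` runs the runner inside the test process, so no HTTP server is involved. Runners belong to BentoML's older 1.x service API, so check that your installed version still supports them: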

import pytest
import asyncio
import bentoml

@pytest.fixture(scope="session")
def runner():
    """Yield a locally initialised runner for ``iris_classifier:latest``.

    The runner is created once per session and destroyed after the last test
    that uses it.
    """
    local_runner = bentoml.sklearn.get("iris_classifier:latest").to_runner()
    local_runner.init_local()
    yield local_runner
    # Teardown: runs once the session no longer needs the fixture.
    local_runner.destroy()

@pytest.mark.asyncio
async def test_runner_async_predict(runner):
    """``async_run`` returns a known class id for a single sample."""
    import numpy as np

    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    predictions = await runner.async_run(sample)
    assert predictions[0] in [0, 1, 2]

def test_runner_sync_predict(runner):
    """The blocking ``run`` call returns a known class id for a single sample."""
    import numpy as np

    sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    predictions = runner.run(sample)
    assert predictions[0] in [0, 1, 2]

Saving Test Models for CI

In CI, you can't rely on a production model store. Save a fixture model during test setup:

# tests/conftest.py
import pytest
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import bentoml

@pytest.fixture(scope="session", autouse=True)
def save_test_model():
    """Train a small iris classifier and save it as ``iris_classifier``.

    Runs automatically once per test session, so fixtures and tests that look
    up ``iris_classifier:latest`` find a model without a production model
    store.

    The recorded accuracy is measured on the training data instead of being
    hard-coded, so the metadata describes the model that was actually saved.
    """
    X, y = load_iris(return_X_y=True)
    clf = LogisticRegression(max_iter=200)
    clf.fit(X, y)
    # float() keeps the metadata value a plain Python float.
    train_accuracy = float(clf.score(X, y))
    bentoml.sklearn.save_model(
        "iris_classifier",
        clf,
        metadata={"accuracy": train_accuracy, "env": "test"},
    )

Load Testing with Locust

Test throughput and latency of a running BentoML service:

# locustfile.py
import random
from locust import HttpUser, task, between

class ClassifierUser(HttpUser):
    """Simulated client that repeatedly posts random samples to ``/classify``."""

    # Each simulated user pauses between 0.1 and 0.5 seconds between tasks.
    wait_time = between(0.1, 0.5)

    @task
    def classify(self):
        """Send one request; mark it failed if it is not a 200 or takes over 1 s."""
        # (low, high) sampling bounds for the four features, in payload order.
        feature_bounds = ((4.0, 8.0), (2.0, 5.0), (1.0, 7.0), (0.1, 2.5))
        features = [
            round(random.uniform(low, high), 1) for low, high in feature_bounds
        ]
        with self.client.post(
            "/classify",
            json={"features": features},
            catch_response=True,
        ) as resp:
            if resp.status_code != 200:
                resp.failure(f"Unexpected status: {resp.status_code}")
                return
            if resp.elapsed.total_seconds() > 1.0:
                resp.failure("Response too slow")
# Shell command (not part of locustfile.py): run the load test without the
# web UI against http://localhost:3000 with 50 users, spawned at 10 per
# second, for 60 seconds.
locust -f locustfile.py --host=http://localhost:3000 \
  --users=50 --spawn-rate=10 --run-time=60s --headless

Testing Bento Builds

Verify the packaged bento before deployment:

# tests/test_bento_build.py
import subprocess
import bentoml

def test_bento_builds_successfully():
    """``bentoml build`` exits with status 0 for ``bentofile.yaml``."""
    build_command = ["bentoml", "build", "-f", "bentofile.yaml"]
    # check=False: the exit status is asserted below so that a failure
    # message can include the captured stderr.
    completed = subprocess.run(
        build_command,
        capture_output=True,
        text=True,
        check=False,
    )
    assert completed.returncode == 0, f"Build failed: {completed.stderr}"

def test_latest_bento_has_required_metadata():
    """The latest ``classifier_service`` bento has ``version`` and ``framework`` labels.

    Requires a ``classifier_service`` bento to exist in the local bento store
    already.
    """
    labels = bentoml.get("classifier_service:latest").info.labels
    assert labels.get("version") is not None
    assert labels.get("framework") in ["sklearn", "pytorch", "tensorflow"]

CI Pipeline

# .github/workflows/ml-service-tests.yml
# Runs the pytest suite for the BentoML service on every push and pull request.
name: BentoML Service Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - run: pip install bentoml scikit-learn pytest pytest-asyncio
      # Seed the local model store before pytest starts: service.py looks up
      # "iris_classifier:latest" when it is imported, which happens during
      # test collection, before any conftest.py fixture has run.
      - name: Setup test model
        # A literal block scalar (|) keeps the newlines of the embedded
        # script; a quoted scalar would fold them into a single line.
        run: |
          python - <<'EOF'
          from sklearn.datasets import load_iris
          from sklearn.linear_model import LogisticRegression
          import bentoml
          X, y = load_iris(return_X_y=True)
          clf = LogisticRegression(max_iter=200).fit(X, y)
          bentoml.sklearn.save_model('iris_classifier', clf)
          EOF
      - name: Run tests
        run: pytest tests/ -v

Key Takeaways

  • Unit test model logic separately from the BentoML service layer
  • Use TestServer for integration tests — it runs the full service in-process with no networking overhead
  • Save deterministic test models in conftest.py so CI doesn't depend on a model registry
  • Load test before scaling decisions with Locust or k6
  • Test the bento build step in CI to catch packaging issues before deployment

Read more