Testing ML Model Deployments with Seldon Core and KServe

Testing ML Model Deployments with Seldon Core and KServe

Seldon Core and KServe deploy machine learning models as Kubernetes-native inference services. Testing these deployments means verifying the inference API, the Kubernetes resource state, canary rollouts, and model-level correctness — before and after going to production.

What to Test

| Layer | What to verify |
| --- | --- |
| Kubernetes resources | InferenceService is Ready, replicas running |
| Inference API | Correct output format, valid predictions |
| Canary rollouts | Traffic split working, new version quality |
| Model correctness | Known inputs return expected outputs |
| Performance | p99 latency under threshold |

KServe: Checking InferenceService Readiness

# tests/test_kserve_deployment.py
import pytest
import subprocess
import json

# Kubernetes namespace passed to every kubectl call via ``-n``.
NAMESPACE = "ml-models"
# Name of the InferenceService resource these tests fetch and inspect.
MODEL_NAME = "iris-classifier"

def kubectl(*args) -> dict:
    """Run ``kubectl`` against NAMESPACE and return its parsed JSON output.

    The positional *args* are placed after ``-n NAMESPACE`` and followed by
    ``-o json``. Because the command runs with ``check=True``, a non-zero
    exit status raises ``subprocess.CalledProcessError``.
    """
    command = ["kubectl", "-n", NAMESPACE]
    command.extend(args)
    command += ["-o", "json"]
    completed = subprocess.run(
        command,
        check=True,
        text=True,
        capture_output=True,
    )
    return json.loads(completed.stdout)

@pytest.fixture(scope="session")
def inference_service():
    """Fetch the InferenceService named MODEL_NAME once per test session."""
    manifest = kubectl("get", "inferenceservice", MODEL_NAME)
    return manifest

def test_inference_service_exists(inference_service):
    """The fetched resource's metadata name matches MODEL_NAME."""
    metadata = inference_service["metadata"]
    assert metadata["name"] == MODEL_NAME

def test_inference_service_ready(inference_service):
    conditions = inference_service["status"].get("conditions", [])
    ready = next((c for c in conditions if c["type"] == "Ready"), None)
    assert ready is not None, "No Ready condition found"
    assert ready["status"] == "True", (
        f"InferenceService not ready: {ready.get('message', 'no message')}"
    )

def test_predictor_ready(inference_service):
    conditions = inference_service["status"].get("conditions", [])
    predictor = next(
        (c for c in conditions if c["type"] == "PredictorReady"), None
    )
    assert predictor is not None
    assert predictor["status"] == "True"

def test_url_is_set(inference_service):
    url = inference_service["status"].get("url")
    assert url is not None and url.startswith("http")

KServe: Inference API Tests

KServe exposes the V2 inference protocol (/v2/models/{name}/infer):

# tests/test_kserve_inference.py
import pytest
import requests
import numpy as np

# V2-protocol infer endpoint for the iris-classifier model.
# NOTE(review): the *.svc.cluster.local host is a cluster-internal DNS name,
# so these tests presumably have to run from inside the cluster (or behind a
# port-forward/ingress with this URL adjusted) — confirm for your setup.
KSERVE_URL = "http://iris-classifier.ml-models.svc.cluster.local/v2/models/iris-classifier/infer"

# Known ground truth for regression testing; each entry feeds one
# parametrized run of the known-input classification test.
KNOWN_CASES = [
    # (features, expected_class)
    ([5.1, 3.5, 1.4, 0.2], 0),  # setosa
    ([6.4, 3.2, 4.5, 1.5], 1),  # versicolor
    ([6.3, 3.3, 6.0, 2.5], 2),  # virginica
]

@pytest.fixture(scope="session")
def inference_request():
    """Provide a callable that sends one sample to KSERVE_URL.

    The returned function wraps *features* in a single-input V2 request,
    POSTs it with a 10 second timeout, raises ``requests.HTTPError`` on a
    non-2xx response (``raise_for_status``), and returns the decoded JSON body.
    """
    def _request(features: list[float]) -> dict:
        payload = {
            "inputs": [{
                "name": "input-0",
                # Derive the shape from the data so the declared tensor shape
                # always matches the number of values actually sent.
                "shape": [1, len(features)],
                "datatype": "FP32",
                "data": features,
            }]
        }
        resp = requests.post(KSERVE_URL, json=payload, timeout=10)
        resp.raise_for_status()
        return resp.json()
    return _request

def test_inference_returns_200(inference_request):
    result = inference_request([5.1, 3.5, 1.4, 0.2])
    assert "outputs" in result

@pytest.mark.parametrize("features,expected_class", KNOWN_CASES)
def test_known_input_classification(inference_request, features, expected_class):
    """The first value of the first output tensor equals the expected class.

    Runs once per entry of KNOWN_CASES; the value is converted with ``int``
    before it is compared.
    """
    response = inference_request(features)
    first_output = response["outputs"][0]
    predicted_class = int(first_output["data"][0])
    assert predicted_class == expected_class, (
        f"Expected class {expected_class}, got {predicted_class} for input {features}"
    )

def test_inference_latency(inference_request):
    import time
    start = time.perf_counter()
    inference_request([5.1, 3.5, 1.4, 0.2])
    elapsed_ms = (time.perf_counter() - start) * 1000
    assert elapsed_ms < 200, f"Inference took {elapsed_ms:.1f}ms, max is 200ms"

Seldon Core: InferenceGraph Tests

Seldon Core uses SeldonDeployment resources and supports multi-step pipelines (inference graphs). The tests below check the deployment's reported state and replica availability:

# tests/test_seldon_deployment.py
import pytest
import subprocess
import json

# Kubernetes namespace passed to kubectl via ``-n`` by seldon_get.
NAMESPACE = "seldon"
# Name of the SeldonDeployment resource the tests below fetch.
DEPLOYMENT_NAME = "iris-default"

def seldon_get(resource_type: str, name: str) -> dict:
    """Return ``kubectl get <resource_type> <name>`` in NAMESPACE as a dict.

    The command is run with ``-o json`` and ``check=True``, so a non-zero
    exit status raises ``subprocess.CalledProcessError``.
    """
    argv = ["kubectl", "-n", NAMESPACE, "get", resource_type, name, "-o", "json"]
    proc = subprocess.run(argv, capture_output=True, text=True, check=True)
    return json.loads(proc.stdout)

def test_seldon_deployment_available():
    """Assert the SeldonDeployment reports ``status.state == "Available"``.

    A resource with no ``status`` block yet fails with the state message
    (``state`` is ``None``) rather than raising ``KeyError``.
    """
    dep = seldon_get("seldondeployment", DEPLOYMENT_NAME)
    state = dep.get("status", {}).get("state")
    assert state == "Available", f"SeldonDeployment state: {state}"

def test_all_replicas_ready():
    """Assert every entry in ``status.deploymentStatus`` has all replicas available.

    The test fails when ``deploymentStatus`` is missing or empty; otherwise
    the loop below would not run and the test would pass without having
    checked anything.
    """
    dep = seldon_get("seldondeployment", DEPLOYMENT_NAME)
    deployment_status = dep.get("status", {}).get("deploymentStatus", {})
    assert deployment_status, "SeldonDeployment reports no deploymentStatus entries"
    for predictor in deployment_status.values():
        replicas = predictor.get("replicas", 0)
        available = predictor.get("availableReplicas", 0)
        assert available == replicas, (
            f"Only {available}/{replicas} replicas available"
        )

Canary Rollout Testing

Test traffic split during a canary deployment:

# tests/test_canary_rollout.py
import pytest
import requests
from collections import Counter

# Prediction endpoint of the iris-default deployment that the canary test samples.
SELDON_URL = "http://iris-default.seldon.svc.cluster.local/api/v1.0/predictions"
# Number of prediction requests sampled by the traffic-split test.
# NOTE(review): with 100 samples at a 20% target the binomial standard
# deviation is about 4 percentage points, so a ±5-point tolerance can fail by
# chance on a correctly configured split; consider a larger sample size.
SAMPLE_SIZE = 100
EXPECTED_CANARY_FRACTION = 0.2  # 20% canary traffic
TOLERANCE = 0.05  # ±5%

def predict_and_get_version(features: list[float]) -> str:
    """Returns the model version used for this prediction (from response header).

    POSTs *features* as a one-row ``ndarray`` payload to SELDON_URL and
    returns the ``x-seldon-route`` response header, or ``"unknown"`` when the
    header is absent. Raises ``requests.HTTPError`` on a non-2xx response.
    """
    payload = {"data": {"ndarray": [features]}}
    # Bound the call: without a timeout a single stalled request would hang
    # the whole sampling loop that calls this helper.
    resp = requests.post(SELDON_URL, json=payload, timeout=10)
    resp.raise_for_status()
    return resp.headers.get("x-seldon-route", "unknown")

def test_canary_traffic_split():
    """Sample SAMPLE_SIZE predictions and check the canary share of traffic.

    Routes whose name contains "canary" (case-insensitive) are counted as
    canary traffic. Their combined fraction must be within TOLERANCE of
    EXPECTED_CANARY_FRACTION. The per-route fractions are printed for
    diagnosis.
    """
    versions = [
        predict_and_get_version([5.1, 3.5, 1.4, 0.2]) for _ in range(SAMPLE_SIZE)
    ]

    counts = Counter(versions)
    total = sum(counts.values())
    assert total > 0, "No predictions were sampled; SAMPLE_SIZE must be positive"

    for version, count in counts.items():
        fraction = count / total
        print(f"Version {version}: {fraction:.1%} of traffic")

    # Verify canary version received approximately the right fraction.
    # The check is unconditional: if no canary route was observed at all
    # (zero canary traffic, or the route header missing), the fraction is 0
    # and the assertion fails instead of the test passing silently.
    canary_hits = sum(
        count for version, count in counts.items() if "canary" in version.lower()
    )
    canary_fraction = canary_hits / total
    assert abs(canary_fraction - EXPECTED_CANARY_FRACTION) <= TOLERANCE, (
        f"Canary traffic {canary_fraction:.1%} not within "
        f"{TOLERANCE:.0%} of target {EXPECTED_CANARY_FRACTION:.1%}"
    )

Health Check Endpoint Tests

Both KServe and Seldon expose health endpoints:

# tests/test_health_endpoints.py
import requests

def test_kserve_health():
    """The KServe V2 readiness endpoint answers with HTTP 200."""
    ready_url = "http://iris-classifier.ml-models.svc.cluster.local/v2/health/ready"
    response = requests.get(ready_url, timeout=5)
    assert response.status_code == 200

def test_kserve_model_metadata():
    """The model metadata endpoint returns the model name plus inputs and outputs."""
    metadata_url = (
        "http://iris-classifier.ml-models.svc.cluster.local/v2/models/iris-classifier"
    )
    response = requests.get(metadata_url, timeout=5)
    assert response.status_code == 200
    metadata = response.json()
    assert metadata["name"] == "iris-classifier"
    for field in ("inputs", "outputs"):
        assert field in metadata

def test_seldon_health():
    """The Seldon readiness endpoint answers with HTTP 200."""
    ready_url = "http://iris-default.seldon.svc.cluster.local/api/v1.0/health/ready"
    response = requests.get(ready_url, timeout=5)
    assert response.status_code == 200

CI Pipeline for Deployment Tests

# .github/workflows/model-deployment-tests.yml
name: Model Deployment Tests

on:
  push:
    paths:
      - "k8s/models/**"

jobs:
  test-deployment:
    # NOTE(review): the test files call *.svc.cluster.local URLs, which are
    # cluster-internal DNS names. A hosted runner presumably cannot resolve
    # them, so this job likely needs an in-cluster/self-hosted runner or a
    # port-forward with the URLs adjusted — confirm for your environment.
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Configure kubectl
        # Hand the secret to the script through the environment instead of
        # interpolating it into the shell source, so quotes or `$` inside the
        # kubeconfig are not re-interpreted by the shell.
        env:
          KUBECONFIG_DATA: ${{ secrets.KUBECONFIG }}
        run: |
          # The redirect below fails if ~/.kube does not exist yet.
          mkdir -p ~/.kube
          printf '%s\n' "$KUBECONFIG_DATA" > ~/.kube/config
          chmod 600 ~/.kube/config

      - name: Wait for InferenceService ready
        run: |
          kubectl -n ml-models wait inferenceservice/iris-classifier \
            --for=condition=Ready --timeout=300s

      - name: Run deployment tests
        run: |
          # numpy is imported by tests/test_kserve_inference.py.
          pip install pytest requests numpy
          pytest tests/test_kserve_deployment.py \
                 tests/test_kserve_inference.py \
                 tests/test_health_endpoints.py -v

      - name: Smoke test with known inputs
        run: pytest tests/test_kserve_inference.py::test_known_input_classification -v

Key Takeaways

  • Always check Kubernetes resource conditions (Ready, PredictorReady) before running inference tests
  • Use parameterized known-input tests as regression anchors after every model update
  • Test canary rollouts by sampling multiple requests and asserting traffic fraction
  • Include latency assertions — inference should complete within your SLA, typically under 200ms
  • Run deployment tests in CI after every manifest change, not just after model training

Read more