# Testing ML Model Deployments with Seldon Core and KServe
Seldon Core and KServe deploy machine learning models as Kubernetes-native inference services. Testing these deployments means verifying the inference API, the Kubernetes resource state, canary rollouts, and model-level correctness — before and after going to production.
## What to Test
| Layer | What to verify |
|---|---|
| Kubernetes resources | InferenceService is Ready, replicas running |
| Inference API | Correct output format, valid predictions |
| Canary rollouts | Traffic split working, new version quality |
| Model correctness | Known inputs return expected outputs |
| Performance | p99 latency under threshold |
## KServe: Checking InferenceService Readiness
# tests/test_kserve_deployment.py
import pytest
import subprocess
import json
# Kubernetes namespace passed to every kubectl call via "-n".
NAMESPACE = "ml-models"
# Name of the InferenceService fetched by the tests in this file.
MODEL_NAME = "iris-classifier"
def kubectl(*args) -> dict:
    """Run ``kubectl -n NAMESPACE <args> -o json`` and return the parsed JSON.

    Raises ``subprocess.CalledProcessError`` when kubectl exits non-zero
    (the call uses ``check=True``).
    """
    command = ["kubectl", "-n", NAMESPACE]
    command.extend(args)
    command += ["-o", "json"]
    completed = subprocess.run(
        command,
        check=True,
        text=True,
        capture_output=True,
    )
    return json.loads(completed.stdout)
@pytest.fixture(scope="session")
def inference_service():
    """Fetch the InferenceService resource once per test session."""
    resource = kubectl("get", "inferenceservice", MODEL_NAME)
    return resource
def test_inference_service_exists(inference_service):
    """The fetched resource must carry the expected metadata name."""
    metadata = inference_service["metadata"]
    assert metadata["name"] == MODEL_NAME
def test_inference_service_ready(inference_service):
    """The InferenceService must report a ``Ready`` condition with status "True"."""
    ready = None
    for condition in inference_service["status"].get("conditions", []):
        if condition["type"] == "Ready":
            ready = condition
            break
    assert ready is not None, "No Ready condition found"
    detail = ready.get("message", "no message")
    assert ready["status"] == "True", f"InferenceService not ready: {detail}"
def test_predictor_ready(inference_service):
    """The InferenceService must report ``PredictorReady`` with status "True".

    Both assertions carry a message so a failing run states whether the
    condition was absent or merely not "True", instead of raising a bare
    AssertionError.
    """
    conditions = inference_service["status"].get("conditions", [])
    predictor = next(
        (c for c in conditions if c["type"] == "PredictorReady"), None
    )
    assert predictor is not None, "No PredictorReady condition found"
    assert predictor["status"] == "True", (
        f"Predictor not ready: {predictor.get('message', 'no message')}"
    )
def test_url_is_set(inference_service):
    """The status block must expose a URL that starts with "http"."""
    url = inference_service["status"].get("url")
    assert url is not None
    assert url.startswith("http")


## KServe: Inference API Tests
KServe exposes the V2 inference protocol (/v2/models/{name}/infer):
# tests/test_kserve_inference.py
import pytest
import requests
import numpy as np
# In-cluster V2 inference endpoint that every request in this file is POSTed to.
KSERVE_URL = "http://iris-classifier.ml-models.svc.cluster.local/v2/models/iris-classifier/infer"
# Known ground truth for regression testing
# Consumed by the parametrized classification test; one test case per tuple.
KNOWN_CASES = [
# (features, expected_class)
([5.1, 3.5, 1.4, 0.2], 0), # setosa
([6.4, 3.2, 4.5, 1.5], 1), # versicolor
([6.3, 3.3, 6.0, 2.5], 2), # virginica
]
@pytest.fixture(scope="session")
def inference_request():
    """Provide a callable that POSTs one feature vector to ``KSERVE_URL``.

    The callable returns the decoded JSON response body and raises
    (via ``raise_for_status``) on an HTTP error status.
    """

    def _request(features: list[float]) -> dict:
        tensor = {
            "name": "input-0",
            "shape": [1, 4],
            "datatype": "FP32",
            "data": features,
        }
        response = requests.post(KSERVE_URL, json={"inputs": [tensor]}, timeout=10)
        response.raise_for_status()
        return response.json()

    return _request
def test_inference_returns_200(inference_request):
    """A successful call yields a body containing an ``outputs`` key."""
    body = inference_request([5.1, 3.5, 1.4, 0.2])
    assert "outputs" in body
@pytest.mark.parametrize("features,expected_class", KNOWN_CASES)
def test_known_input_classification(inference_request, features, expected_class):
    """Each known-input case must be classified as its expected class."""
    first_output = inference_request(features)["outputs"][0]
    predicted_class = int(first_output["data"][0])
    message = f"Expected class {expected_class}, got {predicted_class} for input {features}"
    assert predicted_class == expected_class, message
def test_inference_latency(inference_request):
    """A single inference round trip must finish in under 200 ms."""
    from time import perf_counter

    began = perf_counter()
    inference_request([5.1, 3.5, 1.4, 0.2])
    elapsed_ms = 1000 * (perf_counter() - began)
    assert elapsed_ms < 200, f"Inference took {elapsed_ms:.1f}ms, max is 200ms"


## Seldon Core: InferenceGraph Tests
Seldon Core uses SeldonDeployment resources and supports pipelines (InferenceGraph):
# tests/test_seldon_deployment.py
import pytest
import subprocess
import json
# Kubernetes namespace passed to kubectl via "-n".
NAMESPACE = "seldon"
# Name of the SeldonDeployment fetched by the tests in this file.
DEPLOYMENT_NAME = "iris-default"
def seldon_get(resource_type: str, name: str) -> dict:
    """Return ``kubectl get <resource_type> <name>`` from NAMESPACE as parsed JSON.

    Raises ``subprocess.CalledProcessError`` when kubectl exits non-zero.
    """
    command = ["kubectl", "-n", NAMESPACE, "get", resource_type, name]
    command += ["-o", "json"]
    completed = subprocess.run(
        command,
        check=True,
        text=True,
        capture_output=True,
    )
    return json.loads(completed.stdout)
def test_seldon_deployment_available():
    """The SeldonDeployment must report ``status.state == "Available"``."""
    status = seldon_get("seldondeployment", DEPLOYMENT_NAME)["status"]
    state = status.get("state")
    assert state == "Available", f"SeldonDeployment state: {state}"
def test_all_replicas_ready():
    """Every entry in ``status.deploymentStatus`` must have all replicas available.

    The test fails when ``deploymentStatus`` is missing or empty. Without
    that guard the loop body never runs and the test would pass without
    having checked a single replica.
    """
    dep = seldon_get("seldondeployment", DEPLOYMENT_NAME)
    statuses = dep["status"].get("deploymentStatus", {})
    assert statuses, "SeldonDeployment reports no deploymentStatus entries"
    for predictor in statuses.values():
        replicas = predictor.get("replicas", 0)
        available = predictor.get("availableReplicas", 0)
        assert available == replicas, (
            f"Only {available}/{replicas} replicas available"
        )


## Canary Rollout Testing
Test traffic split during a canary deployment:
# tests/test_canary_rollout.py
import pytest
import requests
from collections import Counter
# Seldon prediction endpoint that the canary test samples.
SELDON_URL = "http://iris-default.seldon.svc.cluster.local/api/v1.0/predictions"
# Number of requests sampled per run. The observed canary fraction has a
# standard deviation of sqrt(p * (1 - p) / n); at p = 0.2 that is 0.04 for
# n = 100, so a +/-0.05 tolerance would fail roughly one run in five by
# chance. n = 1000 gives ~0.013, putting the tolerance near four sigma.
SAMPLE_SIZE = 1000
EXPECTED_CANARY_FRACTION = 0.2 # 20% canary traffic
TOLERANCE = 0.05 # ±5%
def predict_and_get_version(features: list[float]) -> str:
    """Send one prediction and return the route reported in the response.

    The value is read from the ``x-seldon-route`` response header;
    ``"unknown"`` is returned when the header is absent. Raises on an HTTP
    error status via ``raise_for_status``.
    """
    payload = {"data": {"ndarray": [features]}}
    # requests never times out by default; bound the call so one stalled
    # connection cannot hang the whole sampling loop.
    resp = requests.post(SELDON_URL, json=payload, timeout=10)
    resp.raise_for_status()
    return resp.headers.get("x-seldon-route", "unknown")
def test_canary_traffic_split():
    """Sample SAMPLE_SIZE predictions and check the canary's share of traffic.

    A route counts as canary when its name contains "canary"
    (case-insensitive). The test fails when no canary route is observed
    at all: a canary receiving zero traffic is a broken split, and
    skipping the check in that case would let it pass unnoticed.
    """
    counts = Counter(
        predict_and_get_version([5.1, 3.5, 1.4, 0.2]) for _ in range(SAMPLE_SIZE)
    )
    total = sum(counts.values())
    for version, count in counts.items():
        fraction = count / total
        print(f"Version {version}: {fraction:.1%} of traffic")

    # Verify canary version received approximately the right fraction
    canary_hits = sum(
        count for version, count in counts.items() if "canary" in version.lower()
    )
    assert canary_hits > 0, (
        f"No canary-routed responses in {total} requests; "
        f"routes seen: {sorted(counts)}"
    )
    canary_fraction = canary_hits / total
    assert abs(canary_fraction - EXPECTED_CANARY_FRACTION) <= TOLERANCE, (
        f"Canary traffic {canary_fraction:.1%} not within "
        f"{TOLERANCE:.0%} of target {EXPECTED_CANARY_FRACTION:.1%}"
    )


## Health Check Endpoint Tests
Both KServe and Seldon expose health endpoints:
# tests/test_health_endpoints.py
import requests
def test_kserve_health():
    """KServe's V2 readiness endpoint must answer HTTP 200."""
    ready_url = "http://iris-classifier.ml-models.svc.cluster.local/v2/health/ready"
    response = requests.get(ready_url, timeout=5)
    assert response.status_code == 200
def test_kserve_model_metadata():
    """The model metadata endpoint must return the name plus inputs and outputs."""
    metadata_url = (
        "http://iris-classifier.ml-models.svc.cluster.local/v2/models/iris-classifier"
    )
    response = requests.get(metadata_url, timeout=5)
    assert response.status_code == 200
    metadata = response.json()
    assert metadata["name"] == "iris-classifier"
    for required_key in ("inputs", "outputs"):
        assert required_key in metadata
def test_seldon_health():
    """Seldon's readiness endpoint must answer HTTP 200."""
    ready_url = "http://iris-default.seldon.svc.cluster.local/api/v1.0/health/ready"
    response = requests.get(ready_url, timeout=5)
    assert response.status_code == 200


## CI Pipeline for Deployment Tests
# .github/workflows/model-deployment-tests.yml
# Runs the deployment tests whenever a manifest under k8s/models/ changes.
name: Model Deployment Tests
on:
  push:
    paths:
      - "k8s/models/**"
jobs:
  test-deployment:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Configure kubectl
        run: |
          # Create ~/.kube first; the redirect below fails if it is missing.
          mkdir -p ~/.kube
          echo "${{ secrets.KUBECONFIG }}" > ~/.kube/config
          # The kubeconfig holds cluster credentials; keep it owner-only.
          chmod 600 ~/.kube/config
      - name: Wait for InferenceService ready
        run: |
          kubectl -n ml-models wait inferenceservice/iris-classifier \
            --for=condition=Ready --timeout=300s
      - name: Run deployment tests
        run: |
          # numpy is imported by tests/test_kserve_inference.py
          pip install pytest requests numpy
          pytest tests/test_kserve_deployment.py \
            tests/test_kserve_inference.py \
            tests/test_health_endpoints.py -v
      - name: Smoke test with known inputs
        run: pytest tests/test_kserve_inference.py::test_known_input_classification -v

## Key Takeaways
- Always check Kubernetes resource conditions (`Ready`, `PredictorReady`) before running inference tests
- Use parameterized known-input tests as regression anchors after every model update
- Test canary rollouts by sampling multiple requests and asserting traffic fraction
- Include latency assertions — inference should complete within your SLA, typically under 200ms
- Run deployment tests in CI after every manifest change, not just after model training