Synthetic Monitoring and Canary Deployments with Datadog and Grafana
Synthetic monitoring runs scripted tests against your application continuously — from multiple geographic locations, using real browsers or API clients — to detect issues before users report them. Combined with canary deployments, it becomes your automated deployment safety net.
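At its core, a synthetic check is just a scheduled, scripted probe with assertions. Here is a minimal sketch of one check execution in Python; the thresholds are illustrative, and managed platforms like Datadog and Grafana Cloud run the equivalent on a schedule from many regions:

import time
import requests

def run_synthetic_check(url: str, timeout_ms: int = 2000) -> dict:
    """One execution of a scripted API check: request, assert, report pass/fail."""
    start = time.monotonic()
    try:
        response = requests.get(url, headers={"X-Synthetic": "true"}, timeout=timeout_ms / 1000)
        elapsed_ms = (time.monotonic() - start) * 1000
        passed = response.status_code == 200 and elapsed_ms < timeout_ms
    except requests.RequestException:
        elapsed_ms = (time.monotonic() - start) * 1000
        passed = False
    return {"url": url, "passed": passed, "elapsed_ms": round(elapsed_ms, 1)}

# A hosted platform runs this kind of probe continuously from many regions and
# handles retries, alerting, and dashboards; the rest of this section covers that tooling.
print(run_synthetic_check("https://api.yourservice.com/health"))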
Synthetic Monitoring Fundamentals
Synthetic monitors are different from real user monitoring (RUM):
| | Synthetic Monitoring | Real User Monitoring |
|---|---|---|
| Data source | Scripted tests | Actual user traffic |
| Availability | Always on (no real users needed) | Only when users visit |
| Control | Full (you define what to test) | Observational |
| Privacy | No user data | May contain PII |
| Use case | Detect regressions proactively | Understand real user experience |
Synthetics are best for: uptime monitoring, end-to-end checkout flows, cross-browser testing, performance baselines.
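One practical consequence of the "scripted tests vs. real traffic" split: tag synthetic requests so they can be excluded from RUM dashboards and business analytics. A minimal sketch, assuming your monitors send the `X-Synthetic` header used throughout this section; the WSGI middleware and `analytics_client` hook are illustrative, not a specific library's API:

class SyntheticTrafficFilter:
    """WSGI middleware that flags synthetic requests so analytics can ignore them."""

    def __init__(self, app):
        self.app = app

    def __call__(self, environ, start_response):
        # HTTP headers show up as HTTP_* keys in the WSGI environ.
        environ["is_synthetic"] = environ.get("HTTP_X_SYNTHETIC") is not None
        return self.app(environ, start_response)

def record_pageview(environ, analytics_client):
    """Hypothetical analytics hook: skip synthetic checks in real-user metrics."""
    if environ.get("is_synthetic"):
        return
    analytics_client.track(path=environ.get("PATH_INFO", "/"))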
Setting Up Synthetic API Monitors with Datadog
Terraform-Managed Synthetic Tests
# datadog_synthetics.tf
resource "datadog_synthetics_test" "api_health_check" {
name = "API Health Check - Global"
type = "api"
subtype = "http"
status = "live"
request_definition {
method = "GET"
url = "https://api.yourservice.com/health"
}
request_headers = {
"Accept" = "application/json"
"X-Synthetic" = "true" # Tag synthetic traffic in your logs
}
assertion {
type = "statusCode"
operator = "is"
target = "200"
}
assertion {
type = "responseTime"
operator = "lessThan"
target = "2000" # 2 seconds
}
assertion {
type = "body"
operator = "contains"
target = "\"status\":\"ok\""
}
options_list {
tick_every = 60 # Run every 60 seconds
min_failure_duration = 60 # Alert if failing for 60+ seconds
min_location_failed = 2 # Alert if 2+ locations fail
retry {
count = 2
interval = 1000 # Retry after 1 second
}
}
locations = [
"aws:us-east-1",
"aws:eu-west-1",
"aws:ap-southeast-1"
]
message = "@pagerduty-api-critical API health check failing in {{#is_recovery}}recovered in{{/is_recovery}}{{#is_alert}}failed from{{/is_alert}} {{location}}"
}
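The check above is only as good as the endpoint behind it. A minimal `/health` handler that satisfies the three assertions (status 200, fast response, body containing `"status":"ok"`) might look like the sketch below; the choice of Flask is illustrative and not part of the Datadog configuration:

import json
from flask import Flask, Response

app = Flask(__name__)

@app.get("/health")
def health():
    # Keep this handler cheap: the test asserts a response in under 2 seconds
    # from three regions. Serialize compactly so the body-contains assertion
    # on "status":"ok" matches (no space after the colon).
    body = json.dumps({"status": "ok"}, separators=(",", ":"))
    return Response(body, status=200, mimetype="application/json")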
# Multi-step API test for checkout flow
resource "datadog_synthetics_test" "checkout_flow" {
name = "E2E Checkout Flow"
type = "api"
subtype = "multi"
status = "live"
api_step {
name = "Get products"
subtype = "http"
request_definition {
method = "GET"
url = "https://api.yourservice.com/products"
}
assertion {
type = "statusCode"
operator = "is"
target = "200"
}
extracted_value {
name = "product_id"
type = "body"
field = "$.data[0].id"
}
}
api_step {
name = "Add to cart"
subtype = "http"
request_definition {
method = "POST"
url = "https://api.yourservice.com/cart"
body = "{\"product_id\": \"{{ product_id }}\", \"quantity\": 1}"
}
assertion {
type = "statusCode"
operator = "is"
target = "201"
}
extracted_value {
name = "cart_id"
type = "body"
field = "$.cart_id"
}
}
api_step {
name = "Checkout"
subtype = "http"
request_definition {
method = "POST"
url = "https://api.yourservice.com/checkout/{{ cart_id }}"
}
assertion {
type = "statusCode"
operator = "is"
target = "200"
}
}
options_list {
tick_every = 300 # Every 5 minutes
}
locations = ["aws:us-east-1", "aws:eu-west-1"]
}
Browser Synthetic Tests
# Using Datadog's Python client to create browser tests
from datadog_api_client import ApiClient, Configuration
from datadog_api_client.v1.api.synthetics_api import SyntheticsApi
from datadog_api_client.v1.model.synthetics_browser_test import SyntheticsBrowserTest
configuration = Configuration()  # picks up DD_API_KEY / DD_APP_KEY from the environment
with ApiClient(configuration) as api_client:
api_instance = SyntheticsApi(api_client)
body = SyntheticsBrowserTest(
name="Login Flow - Chrome",
type="browser",
config={
"request": {
"url": "https://app.yourservice.com/login"
}
},
steps=[
{
"name": "Enter email",
"type": "typeText",
"params": {
"element": {"userLocator": {"values": [{"type": "css", "value": "#email"}]}},
"value": "test@example.com"
}
},
{
"name": "Enter password",
"type": "typeText",
"params": {
"element": {"userLocator": {"values": [{"type": "css", "value": "#password"}]}},
"value": "{{ synthetics.SYNTHETIC_PASSWORD }}" # Use Datadog secret
}
},
{
"name": "Click login",
"type": "click",
"params": {
"element": {"userLocator": {"values": [{"type": "css", "value": "button[type=submit]"}]}}
}
},
{
"name": "Assert dashboard loaded",
"type": "assertElementContent",
"params": {
"element": {"userLocator": {"values": [{"type": "css", "value": "h1.dashboard-title"}]}},
"operator": "contains",
"value": "Dashboard"
}
}
],
options={
"tick_every": 3600, # Every hour
"device_ids": ["chrome.laptop_large"]
},
locations=["aws:us-east-1"],
status="live"
)
result = api_instance.create_synthetics_browser_test(body=body)
Grafana Synthetic Monitoring
Grafana's k6-based synthetic monitoring is a strong alternative, especially if you're already using Grafana Cloud:
// k6 synthetic script for Grafana Cloud
import { check } from 'k6';
import http from 'k6/http';
export const options = {
scenarios: {
api_check: {
executor: 'constant-arrival-rate',
rate: 1,
timeUnit: '1m', // 1 request per minute
      duration: '8760h', // 1 year (effectively continuous)
preAllocatedVUs: 2,
},
},
thresholds: {
http_req_failed: ['rate<0.01'],
http_req_duration: ['p(95)<500'],
},
};
export default function () {
const response = http.get('https://api.yourservice.com/health', {
headers: { 'X-Synthetic': 'grafana-k6' }
});
check(response, {
'status is 200': (r) => r.status === 200,
'response time OK': (r) => r.timings.duration < 500,
'body contains status ok': (r) => r.body.includes('"status":"ok"'),
});
}
Integrating Synthetics with Canary Deployments
The most powerful use of synthetic monitoring: automated canary deployment gates.
The Pattern
- Deploy new version to 5% of traffic (canary pods)
- Route synthetic monitors to canary endpoints
- If synthetic tests fail → automatic rollback
- If synthetics pass for 10 minutes → promote to 50%, then 100% (the gating loop is sketched below, then implemented with Argo Rollouts)
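Before handing this to Argo Rollouts, here is the gating loop in plain Python to make the control flow explicit. It is a sketch under assumptions: `set_canary_weight` and `synthetics_currently_passing` are hypothetical helpers standing in for your traffic-shifting mechanism and the Datadog results query shown later in this section.

import time

def set_canary_weight(image_tag: str, percent: int) -> None:
    """Hypothetical helper: shift `percent` of traffic to the canary (ingress/mesh specific)."""
    ...

def synthetics_currently_passing() -> bool:
    """Hypothetical helper: check recent synthetic results (see the gate script below)."""
    ...

def canary_rollout(image_tag: str) -> bool:
    """Shift traffic in stages; each stage must soak with synthetics passing."""
    for weight, soak_seconds in [(5, 600), (50, 600)]:
        set_canary_weight(image_tag, weight)
        time.sleep(soak_seconds)              # let several synthetic runs accumulate
        if not synthetics_currently_passing():
            set_canary_weight(image_tag, 0)   # roll back: all traffic to the stable version
            return False
    set_canary_weight(image_tag, 100)         # promote fully
    return True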
Implementation with Argo Rollouts
# argo-rollout.yaml
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
name: api-service
spec:
strategy:
canary:
steps:
- setWeight: 5 # 5% canary
- pause: {duration: 2m}
- analysis:
templates:
- templateName: synthetic-health-check
- setWeight: 25 # 25% after passing synthetics
- pause: {duration: 5m}
- analysis:
templates:
- templateName: synthetic-health-check
- setWeight: 100 # Full rollout
---
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
name: synthetic-health-check
spec:
metrics:
- name: datadog-synthetic-results
interval: 1m
count: 5 # Check 5 times
    successCondition: default(result, 0) == 0  # the query below returns the count of failed runs
    failureLimit: 1                            # tolerate at most one failed measurement
    provider:
      datadog:
        # The tag filter must match your synthetic test's tags (tag values cannot contain spaces)
        query: |
          sum:synthetics.test_runs{status:failed,test_name:api_health_check}.as_count()
Kubernetes Rollout with Synthetic Gate
import subprocess
import time
def deploy_with_synthetic_gate(image_tag: str, synthetic_test_id: str) -> bool:
"""
Deploy new image with synthetic monitoring gate.
Returns True if deployment succeeded, False if rolled back.
"""
print(f"Deploying {image_tag}...")
# Deploy canary (5%)
subprocess.run([
"kubectl", "set", "image",
"deployment/api-service-canary",
f"api-service={image_tag}"
], check=True)
print("Waiting 2 minutes for canary to stabilize...")
time.sleep(120)
# Check synthetic test results for the past 2 minutes
    dd_api = DatadogSyntheticsClient()  # assumed thin wrapper over the Datadog synthetics results API (not shown)
results = dd_api.get_test_results(
test_id=synthetic_test_id,
from_time=time.time() - 120
)
failed_results = [r for r in results if r["result"]["passed"] is False]
if len(failed_results) > 0:
print(f"❌ Synthetic tests failing ({len(failed_results)} failures). Rolling back.")
subprocess.run([
"kubectl", "rollout", "undo",
"deployment/api-service-canary"
], check=True)
return False
print("✅ Synthetics passing. Promoting to full rollout.")
subprocess.run([
"kubectl", "set", "image",
"deployment/api-service",
f"api-service={image_tag}"
], check=True)
    return True
Alerting Thresholds
# Alert on synthetic failures in Grafana Alerting
alert_rule = {
"title": "Synthetic Monitor Failure",
"condition": "C",
"data": [
{
"refId": "A",
"datasourceUid": "prometheus",
"model": {
"expr": 'probe_success{job="blackbox"} == 0',
"intervalMs": 10000,
"maxDataPoints": 43200
}
},
{
"refId": "C",
"datasourceUid": "__expr__",
"model": {
"type": "threshold",
"conditions": [{
"evaluator": {"params": [0], "type": "gt"},
"operator": {"type": "and"},
"query": {"params": ["A"]},
"reducer": {"params": [], "type": "last"}
}]
}
}
],
"for": "5m", # Must fail for 5 minutes before alerting
"labels": {"severity": "critical"},
"annotations": {
"summary": "Synthetic monitor {{ $labels.instance }} is failing",
"description": "The synthetic monitor has been failing for 5 minutes. Check the endpoint immediately."
}
}
SLO Integration
Wire synthetic results into your SLO calculations:
import time

def get_synthetic_availability(test_id: str, window_days: int = 28) -> float:
"""
Calculate availability SLI from synthetic monitor data.
Returns fraction of successful checks in the window.
"""
    # dd_client: assumed wrapper over the Datadog synthetics API (implementation not shown)
    results = dd_client.get_all_test_results(
test_id=test_id,
from_time=time.time() - (window_days * 86400)
)
total = len(results)
passed = sum(1 for r in results if r["result"]["passed"])
return passed / total if total > 0 else 1.0
# Check SLO compliance
sli_value = get_synthetic_availability("abc-123-def", window_days=28)
slo_target = 0.999
if sli_value < slo_target:
error_minutes = (slo_target - sli_value) * 28 * 24 * 60
print(f"SLO BREACH: Synthetic availability {sli_value:.4%} below {slo_target:.4%}")
print(f" Excess downtime: {error_minutes:.1f} minutes")Synthetic monitoring and canary deployments together form a deployment safety system that catches regressions within minutes rather than after user complaints. The investment in setup pays back on the first deployment regression it catches automatically.