Synthetic Monitoring and Canary Deployments with Datadog and Grafana

Synthetic monitoring runs scripted tests against your application continuously — from multiple geographic locations, using real browsers or API clients — to detect issues before users report them. Combined with canary deployments, it becomes your automated deployment safety net.

Synthetic Monitoring Fundamentals

Synthetic monitors are different from real user monitoring (RUM):

| | Synthetic Monitoring | Real User Monitoring |
|---|---|---|
| Data source | Scripted tests | Actual user traffic |
| Availability | Always on (no real users needed) | Only when users visit |
| Control | Full (you define what to test) | Observational |
| Privacy | No user data | May contain PII |
| Use case | Detect regressions proactively | Understand real user experience |

Synthetics are best for: uptime monitoring, end-to-end checkout flows, cross-browser testing, performance baselines.

Setting Up Synthetic API Monitors with Datadog

Terraform-Managed Synthetic Tests

# datadog_synthetics.tf
# Global HTTP health check: status, latency, and body assertions from three
# regions, with retries and multi-location confirmation to suppress flakes.
resource "datadog_synthetics_test" "api_health_check" {
  name    = "API Health Check - Global"
  type    = "api"
  subtype = "http"
  status  = "live"
  
  request_definition {
    method = "GET"
    url    = "https://api.yourservice.com/health"
  }
  
  request_headers = {
    "Accept"       = "application/json"
    "X-Synthetic"  = "true"  # Tag synthetic traffic in your logs
  }
  
  assertion {
    type     = "statusCode"
    operator = "is"
    target   = "200"
  }
  
  assertion {
    type     = "responseTime"
    operator = "lessThan"
    target   = "2000"  # 2 seconds
  }
  
  assertion {
    type     = "body"
    operator = "contains"
    target   = "\"status\":\"ok\""
  }
  
  options_list {
    tick_every       = 60   # Run every 60 seconds
    min_failure_duration = 60   # Alert if failing for 60+ seconds
    min_location_failed  = 2    # Alert if 2+ locations fail
    
    retry {
      count    = 2
      interval = 1000  # Retry after 1 second
    }
  }
  
  locations = [
    "aws:us-east-1",
    "aws:eu-west-1",
    "aws:ap-southeast-1"
  ]
  
  # FIX: the previous template mixed alert/recovery sections mid-sentence and
  # rendered as garbled text; each conditional section is now self-contained.
  message = "{{#is_alert}}API health check failing from {{location}}{{/is_alert}}{{#is_recovery}}API health check recovered in {{location}}{{/is_recovery}} @pagerduty-api-critical"
}

# Multi-step API test for checkout flow
# Multi-step API test for checkout flow.
# Each step extracts a value from its response body (JSONPath) and exposes it
# as a variable to later steps, chaining list -> add-to-cart -> checkout.
resource "datadog_synthetics_test" "checkout_flow" {
  name    = "E2E Checkout Flow"
  type    = "api"
  subtype = "multi"  # multi-step API test: steps run sequentially, sharing variables
  status  = "live"
  
  api_step {
    name = "Get products"
    subtype = "http"
    
    request_definition {
      method = "GET"
      url    = "https://api.yourservice.com/products"
    }
    
    assertion {
      type     = "statusCode"
      operator = "is"
      target   = "200"
    }
    
    # Capture the first product's id for use in the next step.
    extracted_value {
      name  = "product_id"
      type  = "body"
      field = "$.data[0].id"
    }
  }
  
  api_step {
    name = "Add to cart"
    subtype = "http"
    
    request_definition {
      method = "POST"
      url    = "https://api.yourservice.com/cart"
      # {{ product_id }} was extracted in the previous step.
      body   = "{\"product_id\": \"{{ product_id }}\", \"quantity\": 1}"
    }
    
    assertion {
      type     = "statusCode"
      operator = "is"
      target   = "201"
    }
    
    # Capture the cart id so the final step can check out this cart.
    extracted_value {
      name  = "cart_id"
      type  = "body"
      field = "$.cart_id"
    }
  }
  
  api_step {
    name = "Checkout"
    subtype = "http"
    
    request_definition {
      method = "POST"
      url    = "https://api.yourservice.com/checkout/{{ cart_id }}"
    }
    
    assertion {
      type     = "statusCode"
      operator = "is"
      target   = "200"
    }
  }
  
  options_list {
    tick_every = 300  # Every 5 minutes
  }
  
  locations = ["aws:us-east-1", "aws:eu-west-1"]
}

Browser Synthetic Tests

# Using Datadog's Python client to create browser tests
# Using Datadog's Python client to create a browser synthetic test.
# NOTE: running this script calls the Datadog API and creates a live test;
# credentials are read from the environment by Configuration().
from datadog_api_client import ApiClient, Configuration
from datadog_api_client.v1.api.synthetics_api import SyntheticsApi
from datadog_api_client.v1.model.synthetics_browser_test import SyntheticsBrowserTest

configuration = Configuration()
with ApiClient(configuration) as api_client:
    api_instance = SyntheticsApi(api_client)
    
    # A browser test drives a real browser through the steps below, in order.
    body = SyntheticsBrowserTest(
        name="Login Flow - Chrome",
        type="browser",
        config={
            "request": {
                "url": "https://app.yourservice.com/login"  # starting page for the scenario
            }
        },
        steps=[
            {
                "name": "Enter email",
                "type": "typeText",
                "params": {
                    "element": {"userLocator": {"values": [{"type": "css", "value": "#email"}]}},
                    "value": "test@example.com"
                }
            },
            {
                "name": "Enter password",
                "type": "typeText",
                "params": {
                    "element": {"userLocator": {"values": [{"type": "css", "value": "#password"}]}},
                    "value": "{{ synthetics.SYNTHETIC_PASSWORD }}"  # Use Datadog secret, never a literal password
                }
            },
            {
                "name": "Click login",
                "type": "click",
                "params": {
                    "element": {"userLocator": {"values": [{"type": "css", "value": "button[type=submit]"}]}}
                }
            },
            {
                # Final assertion: the test fails unless the dashboard heading appears.
                "name": "Assert dashboard loaded",
                "type": "assertElementContent",
                "params": {
                    "element": {"userLocator": {"values": [{"type": "css", "value": "h1.dashboard-title"}]}},
                    "operator": "contains",
                    "value": "Dashboard"
                }
            }
        ],
        options={
            "tick_every": 3600,  # Every hour
            "device_ids": ["chrome.laptop_large"]  # viewport/device profile to emulate
        },
        locations=["aws:us-east-1"],
        status="live"  # start running immediately after creation
    )
    
    result = api_instance.create_synthetics_browser_test(body=body)

Grafana Synthetic Monitoring

Grafana's k6-based synthetic monitoring is a strong alternative, especially if you're already using Grafana Cloud:

// k6 synthetic script for Grafana Cloud
// k6 synthetic script for Grafana Cloud.
// Probes the health endpoint once per minute and enforces error-rate and
// latency thresholds, which Grafana Cloud turns into pass/fail signals.
import { browser } from 'k6/experimental/browser';
import { check, sleep } from 'k6';
import http from 'k6/http';

export const options = {
  scenarios: {
    api_check: {
      executor: 'constant-arrival-rate',
      rate: 1,
      timeUnit: '1m',  // 1 request per minute
      duration: '8760h',  // 8760h = one year; effectively "always on" until redeployed
      preAllocatedVUs: 2,  // virtual users pre-allocated to sustain the rate
    },
  },
  // Thresholds fail the run (and can alert) when breached.
  thresholds: {
    http_req_failed: ['rate<0.01'],     // <1% of requests may fail
    http_req_duration: ['p(95)<500'],   // 95th percentile latency under 500ms
  },
};

export default function () {
  // Header lets the backend distinguish synthetic probes from real users.
  const response = http.get('https://api.yourservice.com/health', {
    headers: { 'X-Synthetic': 'grafana-k6' }
  });
  
  check(response, {
    'status is 200': (r) => r.status === 200,
    'response time OK': (r) => r.timings.duration < 500,
    'body contains status ok': (r) => r.body.includes('"status":"ok"'),
  });
}

Integrating Synthetics with Canary Deployments

The most powerful use of synthetic monitoring: automated canary deployment gates.

The Pattern

  1. Deploy new version to 5% of traffic (canary pods)
  2. Route synthetic monitors to canary endpoints
  3. If synthetic tests fail → automatic rollback
  4. If synthetics pass for 10 minutes → promote to 50%, then 100%

Implementation with Argo Rollouts

# argo-rollout.yaml
# Canary rollout gated on Datadog synthetic test results: each analysis step
# must pass before traffic weight increases; a failure aborts and rolls back.
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: api-service
spec:
  strategy:
    canary:
      steps:
        - setWeight: 5      # 5% canary
        - pause: {duration: 2m}
        - analysis:
            templates:
              - templateName: synthetic-health-check
        - setWeight: 25     # 25% after passing synthetics
        - pause: {duration: 5m}
        - analysis:
            templates:
              - templateName: synthetic-health-check
        - setWeight: 100    # Full rollout
---
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
  name: synthetic-health-check
spec:
  metrics:
    - name: datadog-synthetic-results
      interval: 1m
      count: 5    # Check 5 times
      # FIX: the Datadog provider returns a numeric value, not an object —
      # the condition must compare `result` directly, and the comparison
      # (`== 0`) belongs here, not inside the Datadog query string.
      successCondition: default(result, 0) == 0
      failureLimit: 1  # Rollback on first failure
      
      provider:
        datadog:
          # Count of failed synthetic runs in the evaluation window.
          # NOTE(review): tag values are normalized by Datadog — confirm the
          # exact test_name tag value in your account before relying on it.
          query: |
            avg:synthetics.test_runs{status:failed,test_name:api-health-check-global}.as_count()

Kubernetes Rollout with Synthetic Gate

import subprocess
import time
import requests

def deploy_with_synthetic_gate(image_tag: str, synthetic_test_id: str) -> bool:
    """
    Roll out ``image_tag`` behind a synthetic-monitoring gate.

    The canary deployment is updated first; after a 2-minute soak, the
    synthetic test results for that window are pulled from Datadog. Any
    failed run triggers an automatic rollback of the canary; otherwise the
    main deployment is promoted to the new image.

    Returns True when the image was promoted, False when rolled back.
    """
    print(f"Deploying {image_tag}...")

    # Step 1: update only the canary deployment (5% of traffic).
    canary_update = [
        "kubectl", "set", "image",
        "deployment/api-service-canary",
        f"api-service={image_tag}",
    ]
    subprocess.run(canary_update, check=True)

    print("Waiting 2 minutes for canary to stabilize...")
    time.sleep(120)

    # Step 2: fetch synthetic runs covering the soak window just elapsed.
    window_start = time.time() - 120
    client = DatadogSyntheticsClient()
    runs = client.get_test_results(
        test_id=synthetic_test_id,
        from_time=window_start,
    )

    failures = [run for run in runs if run["result"]["passed"] is False]

    # Step 3: gate — any failure rolls the canary back and aborts.
    if failures:
        print(f"❌ Synthetic tests failing ({len(failures)} failures). Rolling back.")
        subprocess.run(
            ["kubectl", "rollout", "undo", "deployment/api-service-canary"],
            check=True,
        )
        return False

    # Step 4: promote the image to the full (non-canary) deployment.
    print("✅ Synthetics passing. Promoting to full rollout.")
    subprocess.run(
        [
            "kubectl", "set", "image",
            "deployment/api-service",
            f"api-service={image_tag}",
        ],
        check=True,
    )
    return True

Alerting Thresholds

# Alert on synthetic failures in Grafana Alerting
alert_rule = {
    "title": "Synthetic Monitor Failure",
    "condition": "C",
    "data": [
        {
            "refId": "A",
            "datasourceUid": "prometheus",
            "model": {
                "expr": 'probe_success{job="blackbox"} == 0',
                "intervalMs": 10000,
                "maxDataPoints": 43200
            }
        },
        {
            "refId": "C",
            "datasourceUid": "__expr__",
            "model": {
                "type": "threshold",
                "conditions": [{
                    "evaluator": {"params": [0], "type": "gt"},
                    "operator": {"type": "and"},
                    "query": {"params": ["A"]},
                    "reducer": {"params": [], "type": "last"}
                }]
            }
        }
    ],
    "for": "5m",  # Must fail for 5 minutes before alerting
    "labels": {"severity": "critical"},
    "annotations": {
        "summary": "Synthetic monitor {{ $labels.instance }} is failing",
        "description": "The synthetic monitor has been failing for 5 minutes. Check the endpoint immediately."
    }
}

SLO Integration

Wire synthetic results into your SLO calculations:

def get_synthetic_availability(test_id: str, window_days: int = 28) -> float:
    """
    Calculate an availability SLI from synthetic monitor data.

    Fetches every run of the given synthetic test over the trailing
    ``window_days`` and returns the fraction that passed. With no runs in
    the window, reports 1.0 (no evidence of unavailability).
    """
    window_start = time.time() - window_days * 86400
    runs = dd_client.get_all_test_results(
        test_id=test_id,
        from_time=window_start,
    )

    if not runs:
        return 1.0

    passed_count = sum(1 for run in runs if run["result"]["passed"])
    return passed_count / len(runs)

# Check SLO compliance.
# FIX: the 28-day window was hard-coded in two separate places (the SLI call
# and the downtime conversion); binding it once keeps them from drifting.
WINDOW_DAYS = 28
sli_value = get_synthetic_availability("abc-123-def", window_days=WINDOW_DAYS)
slo_target = 0.999

if sli_value < slo_target:
    # Convert the SLI shortfall into minutes of excess downtime over the window.
    error_minutes = (slo_target - sli_value) * WINDOW_DAYS * 24 * 60
    print(f"SLO BREACH: Synthetic availability {sli_value:.4%} below {slo_target:.4%}")
    print(f"  Excess downtime: {error_minutes:.1f} minutes")

Synthetic monitoring and canary deployments together form a deployment safety system that catches regressions within minutes rather than after user complaints. The investment in setup pays back on the first deployment regression it catches automatically.

Read more