Testing AI Agent Observability: LangSmith Integration Tests, Trace Validation, and Cost Regression Testing

Testing AI Agent Observability: LangSmith Integration Tests, Trace Validation, and Cost Regression Testing

Observability for AI agents is different from observability for traditional software. A web server trace shows function calls and database queries. An AI agent trace shows LLM calls, tool invocations, token counts, and reasoning chains. The failure modes are different too: not just errors and timeouts, but hallucinations, reasoning loops, and cost explosions.

But here's the thing: if you're not testing your observability setup, you don't actually have observability — you have the hope of observability. Tests verify that traces are being captured, that they contain the right information, and that costs haven't regressed.

Here's how to test your AI agent observability stack.

What "Testing Observability" Means

Testing observability doesn't mean testing LangSmith itself (that's LangChain's job). It means testing:

  1. Your instrumentation — are your agents emitting the traces you expect?
  2. Trace completeness — does each trace contain the right spans, metadata, and timing?
  3. Token usage tracking — are token counts being recorded accurately?
  4. Cost regression — does a code change unexpectedly increase token usage or LLM calls?
  5. Alerting integration — do trace anomalies trigger the right alerts?

These are tests about your system's behavior from an observability perspective, not about LangSmith's API.

Testing LangSmith Integration

Before you can test traces, verify that your agents are correctly integrated with LangSmith.

import os
import pytest
from unittest.mock import patch, MagicMock, call
from langsmith import Client

class TestLangSmithIntegration:
    """Checks that the agent is wired to LangSmith without making real API calls."""

    @pytest.fixture(autouse=True)
    def mock_langsmith(self):
        """Mock LangSmith to avoid real API calls in tests.

        The mock client instance is also stored on ``self.langsmith_client``
        so individual tests can inspect it.
        """
        # NOTE(review): this replaces the ``Client`` attribute on the
        # ``langsmith`` module. Code that already ran
        # ``from langsmith import Client`` keeps its own reference to the real
        # class - confirm how your agent imports it and patch that name if so.
        with patch("langsmith.Client") as mock_client_class:
            mock_client = MagicMock()
            mock_client_class.return_value = mock_client
            self.langsmith_client = mock_client
            yield mock_client

    def test_agent_traces_are_created(self):
        """Verify that running an agent creates a LangSmith trace."""
        from your_agent import run_research_agent

        with patch("langchain_core.tracers.langchain.LangChainTracer") as mock_tracer_class:
            mock_tracer_class.return_value = MagicMock()

            run_research_agent("What is LangGraph?")

            # Tracer should have been initialized
            mock_tracer_class.assert_called()

    def test_trace_includes_run_name(self):
        """Traces should include a descriptive run name for filtering in LangSmith."""
        from your_agent import run_research_agent

        with patch.dict(os.environ, {"LANGCHAIN_TRACING_V2": "true"}):
            run_research_agent(
                "What is LangGraph?",
                run_name="test-research-run"
            )

            # TODO: assert that "test-research-run" reached the tracer.
            # How to observe it depends on your agent's tracer setup; until
            # that assertion exists this test only proves the call does not
            # raise when a run name is supplied.

    def test_langsmith_disabled_when_env_not_set(self):
        """Agent should work without LangSmith when env vars are not set."""
        # LangSmith reads both the legacy LANGCHAIN_* and the newer
        # LANGSMITH_* variables, so both families must be removed for the
        # "not configured" scenario to be real.
        tracing_prefixes = ("LANGCHAIN", "LANGSMITH")
        env = {
            key: value
            for key, value in os.environ.items()
            if not key.startswith(tracing_prefixes)
        }

        with patch.dict(os.environ, env, clear=True):
            from your_agent import run_research_agent
            # Should not raise even without LangSmith configured
            result = run_research_agent("test query")
            assert result is not None

Testing Trace Completeness

A complete trace should contain all the spans you expect: the root span, LLM call spans, and tool call spans.

from dataclasses import dataclass
from typing import Optional

@dataclass
class MockSpan:
    """In-memory record of one LLM or tool invocation observed during a test."""

    name: str
    inputs: dict
    outputs: dict
    metadata: dict
    # Wall-clock timestamps from time.time(); end_time stays 0 until the
    # span is finished.
    start_time: float
    end_time: float
    parent_id: Optional[str] = None
    # "<ExceptionType>: <message>" when the span ended in an error, else None.
    error: Optional[str] = None


class TraceCapture:
    """Captures trace spans during agent execution for test assertions.

    The ``on_*`` methods follow the callback hook names used when an instance
    is passed as ``callbacks=[...]``. Each start hook appends a ``MockSpan``;
    the matching end or error hook closes the most recent span of the same
    kind that is still open, so an LLM end event never overwrites a tool span
    (or vice versa).
    """

    _LLM_SPAN_NAME = "llm_call"
    _TOOL_SPAN_PREFIX = "tool_"

    def __init__(self):
        self.spans: list[MockSpan] = []

    @staticmethod
    def _now() -> float:
        """Return the current wall-clock time in seconds."""
        # Imported locally so the class does not depend on a module-level
        # ``import time`` being present in the file it is pasted into.
        import time

        return time.time()

    def _start_span(self, name, inputs, serialized):
        """Append a new open span (``end_time == 0``)."""
        self.spans.append(MockSpan(
            name=name,
            inputs=inputs,
            outputs={},
            # ``serialized`` may be None; keep ``metadata`` a dict regardless.
            metadata=serialized if serialized is not None else {},
            start_time=self._now(),
            end_time=0,
        ))

    def _latest_open_span(self, name_prefix):
        """Return the newest unfinished span whose name starts with *name_prefix*."""
        for span in reversed(self.spans):
            if span.end_time == 0 and span.name.startswith(name_prefix):
                return span
        return None

    def _finish_span(self, name_prefix, outputs=None, error=None):
        """Close the newest matching open span; do nothing if there is none."""
        span = self._latest_open_span(name_prefix)
        if span is None:
            return
        span.end_time = self._now()
        if outputs is not None:
            span.outputs = outputs
        if error is not None:
            # Include the type so the text is never empty, even for
            # exceptions raised without a message.
            span.error = f"{type(error).__name__}: {error}"

    def on_llm_start(self, serialized, prompts, **kwargs):
        """Open an ``llm_call`` span recording the prompts."""
        self._start_span(self._LLM_SPAN_NAME, {"prompts": prompts}, serialized)

    def on_llm_end(self, response, **kwargs):
        """Close the open LLM span and store the stringified response."""
        self._finish_span(self._LLM_SPAN_NAME, outputs={"response": str(response)})

    def on_llm_error(self, error, **kwargs):
        """Close the open LLM span and record the error on it."""
        self._finish_span(self._LLM_SPAN_NAME, error=error)

    def on_tool_start(self, serialized, input_str, **kwargs):
        """Open a ``tool_<name>`` span recording the tool input."""
        tool_name = (serialized or {}).get("name", "unknown")
        self._start_span(
            f"{self._TOOL_SPAN_PREFIX}{tool_name}",
            {"input": input_str},
            serialized,
        )

    def on_tool_end(self, output, **kwargs):
        """Close the open tool span and store its output."""
        self._finish_span(self._TOOL_SPAN_PREFIX, outputs={"output": output})

    def on_tool_error(self, error, **kwargs):
        """Close the open tool span and record the error on it."""
        self._finish_span(self._TOOL_SPAN_PREFIX, error=error)

class TestTraceCompleteness:
    """Checks that an agent run produces the spans a complete trace needs."""

    def test_research_workflow_produces_expected_spans(self):
        """A research run should yield at least one LLM span and one tool span."""
        from langchain_core.messages import AIMessage
        from your_agent import create_research_agent

        trace_capture = TraceCapture()
        agent = create_research_agent(callbacks=[trace_capture])

        with patch("your_agent.search_api") as mock_search, \
             patch("your_agent.llm") as mock_llm:
            mock_search.search.return_value = [{"title": "result"}]
            mock_llm.invoke.return_value = AIMessage(
                content="Research complete. Found relevant information."
            )

            agent.run("research AI testing")

        span_names = [s.name for s in trace_capture.spans]

        # Should have at least one LLM call
        assert "llm_call" in span_names, "No LLM spans captured"

        # Should have tool call if agent used tools
        tool_spans = [s for s in trace_capture.spans if s.name.startswith("tool_")]
        assert len(tool_spans) > 0, "No tool call spans captured"

    def test_spans_have_timing_information(self):
        """Every captured span needs a start time, and a sane end time if finished."""
        from langchain_core.messages import AIMessage
        from your_agent import create_research_agent

        trace_capture = TraceCapture()
        agent = create_research_agent(callbacks=[trace_capture])

        with patch("your_agent.llm") as mock_llm:
            mock_llm.invoke.return_value = AIMessage(content="done")
            agent.run("test")

        # Without this guard the loop below passes vacuously when the
        # callbacks never fired, which is exactly the failure being tested for.
        assert trace_capture.spans, "No spans captured; timing cannot be verified"

        for span in trace_capture.spans:
            assert span.start_time > 0, f"Span {span.name} missing start time"
            if span.end_time > 0:  # Not all spans may complete
                assert span.end_time >= span.start_time

    def test_error_spans_are_captured(self):
        """A failing tool call should leave a span with ``error`` set."""
        from your_agent import create_research_agent

        trace_capture = TraceCapture()
        agent = create_research_agent(callbacks=[trace_capture])

        with patch("your_agent.search_api") as mock_search:
            mock_search.search.side_effect = Exception("Search API down")

            # Deliberately swallowed: whether the agent re-raises or recovers
            # is not under test here, only what ends up in the trace.
            try:
                agent.run("test")
            except Exception:
                pass

        # Error should be captured in trace even if agent crashed
        error_spans = [s for s in trace_capture.spans if s.error]
        assert len(error_spans) > 0, "Error was not captured in trace"

Testing Token Usage Assertions

Token usage determines cost. Test that your agents report token usage accurately and that usage matches expectations.

class TokenUsageTracker:
    """Accumulates token usage reported through ``on_llm_end`` callbacks.

    Attributes:
        total_input_tokens: Sum of ``prompt_tokens`` over all recorded calls.
        total_output_tokens: Sum of ``completion_tokens`` over all recorded calls.
        calls: One ``{"input_tokens", "output_tokens"}`` dict per recorded call,
            in the order the calls finished.
    """

    def __init__(self):
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.calls: list[dict] = []

    def on_llm_end(self, response, **kwargs):
        """Record the token usage carried by *response*, if any.

        Responses without a truthy ``llm_output`` are ignored. A missing or
        ``None`` ``token_usage`` entry, or a missing or ``None`` individual
        count, is recorded as 0 instead of raising.
        """
        llm_output = getattr(response, "llm_output", None)
        if not llm_output:
            return

        # ``.get("token_usage", {})`` would still hand back None when the key
        # is present with a None value, so normalise with ``or``.
        usage = llm_output.get("token_usage") or {}
        input_tokens = usage.get("prompt_tokens") or 0
        output_tokens = usage.get("completion_tokens") or 0

        self.total_input_tokens += input_tokens
        self.total_output_tokens += output_tokens
        self.calls.append({
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
        })

class TestTokenUsage:
    """Checks that token usage reported by the LLM reaches the tracker."""

    def test_token_usage_is_tracked(self):
        """A run with reported usage should leave non-zero totals on the tracker."""
        from your_agent import create_research_agent

        tracker = TokenUsageTracker()
        agent = create_research_agent(callbacks=[tracker])

        with patch("your_agent.llm") as mock_llm:
            # Simulate LLM response with token usage
            mock_response = MagicMock()
            mock_response.generations = [[MagicMock(text="response")]]
            mock_response.llm_output = {
                "token_usage": {
                    "prompt_tokens": 150,
                    "completion_tokens": 50,
                    "total_tokens": 200
                }
            }
            mock_llm.generate.return_value = mock_response

            agent.run("test query")

        assert tracker.total_input_tokens > 0
        assert tracker.total_output_tokens > 0

    def test_agent_reports_token_usage_per_call(self):
        """Each LLM call should report its own token usage, not accumulated."""
        from your_agent import create_research_agent

        tracker = TokenUsageTracker()
        agent = create_research_agent(callbacks=[tracker])

        # What the tracker must record for each of the two simulated calls.
        expected_calls = [
            {"input_tokens": 100, "output_tokens": 50},
            {"input_tokens": 200, "output_tokens": 100},
        ]

        with patch("your_agent.llm") as mock_llm:
            # Simulate 2 LLM calls
            mock_llm.generate.side_effect = [
                MagicMock(
                    generations=[[MagicMock()]],
                    llm_output={
                        "token_usage": {
                            "prompt_tokens": usage["input_tokens"],
                            "completion_tokens": usage["output_tokens"],
                        }
                    },
                )
                for usage in expected_calls
            ]

            agent.run("multi-step query")

        # Should have recorded individual calls, not just totals. The length
        # check is unconditional so the test cannot pass with nothing recorded.
        assert len(tracker.calls) >= 2, (
            f"Expected at least 2 recorded LLM calls, got {len(tracker.calls)}"
        )
        # Compare exact values: a mere "differs" check would also pass if the
        # second entry were a running total (100 then 300).
        assert tracker.calls[:2] == expected_calls

Cost Regression Testing

A single code change — adding context to a prompt, extending the system message — can double your token usage. Test for cost regressions in CI.

import json
from pathlib import Path

# JSON file mapping test name -> baseline token count.
COST_BASELINE_FILE = Path("tests/cost-baselines.json")


def load_cost_baselines() -> dict:
    """Return the recorded baselines, or an empty dict if no file exists yet.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON. This
            is left to propagate so a corrupted baseline is noticed rather
            than silently treated as "no baselines".
    """
    # Read first and handle absence, instead of exists()-then-read, so a file
    # removed between the two steps cannot cause a crash.
    try:
        raw = COST_BASELINE_FILE.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {}
    return json.loads(raw)


def save_cost_baseline(test_name: str, token_count: int):
    """Store *token_count* as the baseline for *test_name*.

    Existing baselines for other tests are preserved. The parent directory is
    created when missing, and keys are written in sorted order so the file
    produces stable diffs when re-recorded.
    """
    baselines = load_cost_baselines()
    baselines[test_name] = token_count
    COST_BASELINE_FILE.parent.mkdir(parents=True, exist_ok=True)
    COST_BASELINE_FILE.write_text(
        json.dumps(baselines, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )

class TestCostRegression:
    """Cost regression tests. Run with RECORD_COSTS=true to update baselines."""

    # Allow 20% increase over the recorded baseline.
    MAX_GROWTH_FACTOR = 1.20

    def _run_and_count_tokens(self, query: str) -> int:
        """Run the research agent on *query* and return total tokens tracked.

        The LLM and search API are mocked; every mocked ``generate`` call
        reports 200 prompt tokens and 50 completion tokens.
        """
        from your_agent import create_research_agent
        tracker = TokenUsageTracker()
        agent = create_research_agent(callbacks=[tracker])

        with patch("your_agent.llm") as mock_llm, \
             patch("your_agent.search_api") as mock_search:

            mock_search.search.return_value = [{"title": "r", "content": "c"}]
            mock_response = MagicMock()
            mock_response.generations = [[MagicMock(text="response")]]
            mock_response.llm_output = {
                "token_usage": {"prompt_tokens": 200, "completion_tokens": 50}
            }
            mock_llm.generate.return_value = mock_response

            agent.run(query)

        return tracker.total_input_tokens + tracker.total_output_tokens

    @staticmethod
    def _recording_enabled() -> bool:
        """Return True when RECORD_COSTS is set to an affirmative value.

        A bare truthiness check would treat ``RECORD_COSTS=false`` or
        ``RECORD_COSTS=0`` as "record", silently overwriting baselines.
        """
        flag = os.getenv("RECORD_COSTS", "").strip().lower()
        return flag in {"1", "true", "yes", "on"}

    def _record_or_check(self, test_name: str, token_count: int) -> None:
        """Record *token_count* as the baseline, or assert it is within budget.

        Skips the test when no baseline has been recorded for *test_name*.
        """
        if self._recording_enabled():
            save_cost_baseline(test_name, token_count)
            return

        baselines = load_cost_baselines()
        if test_name not in baselines:
            pytest.skip(f"No baseline for {test_name}. Run with RECORD_COSTS=true.")

        baseline = baselines[test_name]
        max_allowed = baseline * self.MAX_GROWTH_FACTOR

        # Build the percentage up front so a zero baseline yields a readable
        # assertion failure instead of a ZeroDivisionError.
        if baseline:
            increase = f"{((token_count / baseline - 1) * 100):.1f}%"
        else:
            increase = "an undefined percentage (baseline is 0)"

        assert token_count <= max_allowed, (
            f"Token cost regression: {token_count} tokens exceeds baseline "
            f"{baseline} by {increase}"
        )

    def test_simple_research_query_token_cost(self):
        """Token cost of a simple query must stay within the allowed growth."""
        token_count = self._run_and_count_tokens("What is Python?")
        self._record_or_check("simple_research_query", token_count)

    def test_multi_tool_query_token_cost(self):
        """Token cost of a multi-tool query must stay within the allowed growth."""
        token_count = self._run_and_count_tokens("Research and compare LangGraph vs CrewAI")
        self._record_or_check("multi_tool_query", token_count)

Commit tests/cost-baselines.json to your repository. On PRs, if token usage increases by more than 20%, the test fails and you investigate before merging.

Testing Alerting Integration

Observability is only useful if alerts fire when things go wrong. Test your alerting logic.

from unittest.mock import patch, MagicMock

class TestAlertingIntegration:
    """Checks the alerting helpers: token-threshold alerts and loop detection."""

    def test_alert_fires_on_excessive_token_usage(self):
        """Usage far above the threshold must send exactly one token alert."""
        from your_observability import check_and_alert

        # Simulate a run that used 10x the expected tokens
        runaway_run = {
            "run_id": "test-run",
            "token_count": 50000,  # Way above threshold
            "token_threshold": 5000,
        }

        with patch("your_observability.send_alert") as alert_spy:
            check_and_alert(**runaway_run)

        alert_spy.assert_called_once()
        positional_args = alert_spy.call_args[0]
        message = positional_args[0]
        assert "token" in message.lower()

    def test_no_alert_on_normal_token_usage(self):
        """Usage below the threshold must not send any alert."""
        from your_observability import check_and_alert

        normal_run = {
            "run_id": "test-run",
            "token_count": 1000,  # Normal usage
            "token_threshold": 5000,
        }

        with patch("your_observability.send_alert") as alert_spy:
            check_and_alert(**normal_run)

        alert_spy.assert_not_called()

    def test_alert_fires_on_reasoning_loop_detection(self):
        """Detect when an agent is stuck in a loop (same tool called 5+ times)."""
        from your_observability import detect_reasoning_loop

        repeated_calls = ["search"] * 5
        looping = detect_reasoning_loop(repeated_calls)
        assert looping is True

    def test_no_loop_on_legitimate_repeated_calls(self):
        """A tool repeated among other tools is not reported as a loop."""
        from your_observability import detect_reasoning_loop

        # Legitimately calling the same tool with different inputs
        mixed_calls = ["search", "analyze", "search", "synthesize"]
        looping = detect_reasoning_loop(mixed_calls)
        assert looping is False

CI Configuration

# Runs every test under tests/observability/ with verbose output.
- name: Run observability tests
  run: pytest tests/observability/ -v
  env:
    LANGCHAIN_TRACING_V2: "false"  # Disable real tracing in CI

# Runs the cost regression test file on its own. The file lives under
# tests/observability/, so the step above collects it as well; this step
# gives cost regressions their own entry in the CI output.
- name: Check cost regressions
  run: pytest tests/observability/test_cost_regression.py -v
  env:
    LANGCHAIN_TRACING_V2: "false"  # Disable real tracing in CI

Observability without tests is instrumentation theater. You think you're monitoring your agents. You're hoping you are. Tests confirm the difference.

Read more