RAG Pipeline Testing with LangChain and LlamaIndex

RAG Pipeline Testing with LangChain and LlamaIndex

RAG (Retrieval-Augmented Generation) pipelines combine a retrieval system with an LLM to answer questions using your own data. Testing RAG pipelines is challenging because both the retrieval component and the generation component can fail independently — or fail together in ways that aren't obvious.

This guide covers testing RAG pipelines built with LangChain and LlamaIndex, from component-level unit tests to end-to-end evaluation.

RAG Pipeline Architecture

A typical RAG pipeline has these components:

User Query
    ↓
[Query Embedding] ← Embedding Model
    ↓
[Vector Retrieval] ← Vector Store
    ↓
[Context Assembly] — Chunk selection, deduplication, reranking
    ↓
[Prompt Construction] — System prompt + context + query
    ↓
[LLM Generation] ← Language Model
    ↓
Response

Each step can be tested independently. Component tests are fast and don't require LLM calls; integration tests validate the full pipeline.

What to Test at Each Stage

| Stage | What can go wrong | Test type |
|---|---|---|
| Query embedding | Wrong model, wrong dimensions | Unit |
| Retrieval | Low recall, wrong chunks retrieved | Unit/Integration |
| Context assembly | Too much context (token overflow), duplicate chunks | Unit |
| Prompt construction | Context not included, format errors | Unit |
| LLM generation | Hallucination, off-topic, too long/short | Integration/Eval |
| End-to-end | Query not answered despite relevant docs | Eval |

Testing LangChain Retrieval

# tests/test_langchain_retriever.py
import pytest
from unittest.mock import MagicMock, patch
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document

@pytest.fixture
def mock_vector_store():
    """Provide a Chroma vector store pre-loaded with four test documents.

    Despite the name, nothing is mocked: the documents are embedded with the
    real ``text-embedding-3-small`` model through ``OpenAIEmbeddings``, so
    tests using this fixture are integration tests.

    Yields:
        The populated ``Chroma`` store. Its collection is deleted after the
        test that requested the fixture has finished.
    """
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    docs = [
        Document(
            page_content="HelpMeTest is a cloud-based QA testing platform",
            metadata={"source": "homepage", "category": "product"},
        ),
        Document(
            page_content="HelpMeTest uses Robot Framework and Playwright for test automation",
            metadata={"source": "docs", "category": "technical"},
        ),
        Document(
            page_content="HelpMeTest pricing starts at $0 for the free tier with 10 tests",
            metadata={"source": "pricing", "category": "pricing"},
        ),
        Document(
            page_content="Machine learning model training requires GPU infrastructure",
            metadata={"source": "blog", "category": "ml"},
        ),
    ]

    store = Chroma.from_documents(docs, embeddings)
    yield store
    # from_documents is called without a collection name, so every test writes
    # to the same default collection. Drop it so documents do not pile up as
    # duplicates if the in-process store is reused by the next test.
    store.delete_collection()

class TestLangChainRetriever:
    """Retrieval checks against the store provided by ``mock_vector_store``."""

    def test_retrieves_relevant_documents(self, mock_vector_store):
        """Retriever should return documents relevant to the query."""
        retriever = mock_vector_store.as_retriever(search_kwargs={"k": 3})

        docs = retriever.invoke("What testing frameworks does HelpMeTest use?")

        # The document naming the frameworks must be among the top-k results.
        doc_contents = [d.page_content for d in docs]
        relevant = any("Robot Framework" in c or "Playwright" in c for c in doc_contents)
        assert relevant, f"No relevant docs retrieved. Got: {doc_contents}"

    def test_retriever_respects_k_parameter(self, mock_vector_store):
        """Retriever should return at most k documents."""
        for k in [1, 2, 3]:
            retriever = mock_vector_store.as_retriever(search_kwargs={"k": k})
            docs = retriever.invoke("HelpMeTest testing")
            assert len(docs) <= k, f"Expected <= {k} docs, got {len(docs)}"

    def test_metadata_filter_narrows_results(self, mock_vector_store):
        """Metadata filter should restrict retrieval to matching documents."""
        retriever = mock_vector_store.as_retriever(
            search_kwargs={"k": 5, "filter": {"category": "pricing"}}
        )

        docs = retriever.invoke("HelpMeTest cost")

        # Guard against a vacuous pass: the loop below asserts nothing when
        # the filter returns no documents at all.
        assert docs, "Metadata filter returned no documents"
        for doc in docs:
            assert doc.metadata["category"] == "pricing", (
                f"Got non-pricing doc: {doc.metadata}"
            )

    def test_handles_query_with_no_relevant_docs(self, mock_vector_store):
        """Retriever should return results even for off-topic queries."""
        retriever = mock_vector_store.as_retriever(search_kwargs={"k": 1})

        # Query about cooking — no relevant docs exist
        docs = retriever.invoke("best pasta carbonara recipe")

        # Should not raise — may return unrelated docs or empty
        assert isinstance(docs, list)

Testing LlamaIndex Query Engine

# tests/test_llamaindex_engine.py
import pytest
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.core.node_parser import SentenceSplitter
from unittest.mock import MagicMock, patch

@pytest.fixture(scope="session")
def test_index():
    """Return a VectorStoreIndex built once per session from one fixed document."""
    source_doc = Document(
        text="""
            HelpMeTest is a cloud-hosted SaaS platform for software quality assurance.
            It uses Robot Framework and Playwright for browser automation.
            The free tier includes 10 tests and unlimited health checks.
            The Pro plan costs $100 per month and includes unlimited tests.
            """,
        metadata={"source": "helpmetest-docs"},
    )

    # Split into sentence-aware chunks before indexing.
    splitter = SentenceSplitter(chunk_size=256, chunk_overlap=20)
    return VectorStoreIndex(splitter.get_nodes_from_documents([source_doc]))

class TestLlamaIndexQueryEngine:
    """Query-engine checks against the index provided by ``test_index``."""

    def test_query_returns_response(self, test_index):
        """Query engine should return a non-empty response."""
        engine = test_index.as_query_engine()
        response = engine.query("What is HelpMeTest?")

        assert response is not None
        assert len(str(response)) > 0

    def test_source_nodes_included(self, test_index):
        """Response should include source nodes for traceability."""
        engine = test_index.as_query_engine(response_mode="compact")
        response = engine.query("What frameworks does HelpMeTest use?")

        assert len(response.source_nodes) > 0

        # Source nodes should contain the relevant content
        node_texts = [n.text for n in response.source_nodes]
        relevant = any("Robot Framework" in t or "Playwright" in t for t in node_texts)
        assert relevant, f"Source nodes don't contain relevant content: {node_texts}"

    def test_retrieves_correct_chunk_count(self, test_index):
        """Query engine should retrieve between one and ``similarity_top_k`` chunks."""
        engine = test_index.as_query_engine(similarity_top_k=2)
        response = engine.query("HelpMeTest pricing and features")

        # The lower bound matters: with only an upper bound, a response that
        # retrieved nothing at all would still pass.
        assert 1 <= len(response.source_nodes) <= 2

Testing Context Assembly

The context assembly step joins the retrieved chunks into a single context string that fits within the token budget. Test it independently:

# tests/test_context_assembly.py
import pytest

def assemble_context(
    chunks: list[str],
    max_tokens: int = 2000,
    separator: str = "\n\n---\n\n",
) -> str:
    """Join retrieved chunks into one context string within a token budget.

    Chunks are taken in the given order. As soon as adding the next chunk
    would push the estimate over the budget, that chunk and everything after
    it are dropped, so the result is always a prefix of ``chunks``.

    Token usage is estimated as one token per four characters of the
    assembled string, separators included, so the returned string satisfies
    ``len(result) // 4 <= max_tokens``.

    Args:
        chunks: Chunk texts, most relevant first.
        max_tokens: Estimated-token budget for the whole context string.
        separator: Text inserted between consecutive chunks.

    Returns:
        The joined context, or ``""`` when no chunk fits or ``chunks`` is empty.
    """
    chars_per_token = 4  # Rough token estimate: 1 token ≈ 4 characters
    assembled: list[str] = []
    total_chars = 0  # Always equals len(separator.join(assembled))

    for chunk in chunks:
        # A separator is only inserted between chunks, never before the first.
        added_chars = len(chunk) + (len(separator) if assembled else 0)
        if (total_chars + added_chars) // chars_per_token > max_tokens:
            break
        assembled.append(chunk)
        total_chars += added_chars

    return separator.join(assembled)

class TestContextAssembly:
    """Unit tests for assemble_context; no retriever or LLM is involved."""

    def test_assembles_chunks_with_separator(self):
        """Joined output keeps the chunk text and contains the separator."""
        context = assemble_context(["Chunk A", "Chunk B", "Chunk C"])

        for expected in ("Chunk A", "Chunk B", "---"):
            assert expected in context

    def test_respects_token_limit(self):
        """Large context should be truncated to fit within token limit."""
        # Ten chunks of 400 characters, i.e. roughly 100 tokens each
        oversized = ["x" * 400 for _ in range(10)]

        context = assemble_context(oversized, max_tokens=500)

        # Allow a small buffer on top of the budget for separators
        budget_with_slack = 500 + 100
        assert len(context) // 4 <= budget_with_slack

    def test_empty_chunks_returns_empty_string(self):
        """No chunks in, empty string out."""
        assert assemble_context([]) == ""

    def test_single_chunk_no_separator(self):
        """A lone chunk comes back unchanged, with no separator attached."""
        context = assemble_context(["Only chunk"])

        assert context == "Only chunk"
        assert "---" not in context

    def test_preserves_chunk_order(self):
        """Most relevant chunks (first) should appear first in context."""
        ranked = ["Most relevant", "Less relevant", "Least relevant"]

        context = assemble_context(ranked)

        first, second, third = (context.index(text) for text in ranked)
        assert first < second
        assert second < third

Testing Prompt Construction

# tests/test_prompt_construction.py
def build_rag_prompt(query: str, context: str, system_prompt: str = "") -> dict:
    """Build the messages array for the LLM call.

    Args:
        query: The user's question; sent unchanged as the user message.
        context: Retrieved context the answer must be grounded in.
        system_prompt: Optional replacement for the built-in instructions.
            A ``{context}`` placeholder in it is replaced with ``context``;
            without a placeholder, the context is appended under a
            ``Context:`` heading so it is never silently dropped.

    Returns:
        A dict of the form ``{"messages": [system_message, user_message]}``.
    """
    base_system = """You are a helpful assistant. Answer the user's question using ONLY
    the context provided below. If the answer is not in the context, say 'I don't know.'
    
    Context:
    {context}
    """.strip()

    if not system_prompt:
        system = base_system.format(context=context)
    elif "{context}" in system_prompt:
        # str.replace rather than str.format: a caller-supplied prompt may
        # contain other literal braces (e.g. a JSON example) that would make
        # str.format raise.
        system = system_prompt.replace("{context}", context)
    else:
        system = f"{system_prompt}\n\nContext:\n{context}"

    return {
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": query},
        ]
    }

class TestPromptConstruction:
    """Unit tests for the messages payload returned by build_rag_prompt."""

    def test_context_included_in_system_prompt(self):
        """The retrieved context must appear in the system message."""
        messages = build_rag_prompt("What is X?", context="X is a testing tool")["messages"]

        assert "X is a testing tool" in messages[0]["content"]

    def test_query_is_user_message(self):
        """The question is passed through unchanged as the user message."""
        messages = build_rag_prompt("What is X?", context="X is a testing tool")["messages"]

        second = messages[1]
        assert second["role"] == "user"
        assert second["content"] == "What is X?"

    def test_message_structure_is_valid(self):
        """Every message carries a known role and a content field."""
        payload = build_rag_prompt("Query", context="Context")

        assert "messages" in payload
        assert len(payload["messages"]) >= 2

        allowed_roles = ("system", "user", "assistant")
        for message in payload["messages"]:
            assert "role" in message
            assert "content" in message
            assert message["role"] in allowed_roles

    def test_empty_context_handled(self):
        """An empty context must not crash prompt construction."""
        payload = build_rag_prompt("What is X?", context="")

        # The prompt may be less useful, but it still has both messages
        assert payload is not None
        assert len(payload["messages"]) == 2

End-to-End Evaluation with Real LLM Calls

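For integration tests that make real LLM calls, evaluate answer quality. The tests below assume a rag_pipeline fixture (defined in your own conftest.py) whose query() method returns an object with an answer attribute: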

# tests/evaluation/test_rag_quality.py
import pytest
from dataclasses import dataclass
from typing import Callable

@dataclass
class QATestCase:
    """One evaluation question together with the keyword checks for its answer."""

    question: str
    # Words that must appear in the answer
    expected_keywords: list[str]
    # Words that signal hallucination
    forbidden_keywords: list[str]
    must_cite_source: bool = True

# Evaluation cases. Keywords are matched as case-insensitive substrings of
# the answer: every expected keyword must be present and no forbidden
# keyword may be present.
QA_TEST_CASES = [
    QATestCase(
        question="What is the price of HelpMeTest Pro plan?",
        expected_keywords=["$100", "month", "pro"],
        forbidden_keywords=["$50", "$200", "free"],
        must_cite_source=True,
    ),
    QATestCase(
        question="What testing frameworks does HelpMeTest use?",
        expected_keywords=["robot framework", "playwright"],
        forbidden_keywords=["selenium", "cypress"],
        must_cite_source=True,
    ),
    QATestCase(
        question="Does HelpMeTest support Kubernetes deployment?",
        # Not answerable from the knowledge base. ALL expected keywords must
        # appear in the answer, so list one refusal phrase rather than several
        # alternatives that no single answer would contain together.
        expected_keywords=["don't know"],
        forbidden_keywords=[],
        must_cite_source=False,
    ),
]

@pytest.mark.integration
class TestRAGQuality:
    """Integration tests that make real LLM calls. Run separately from unit tests.

    The keyword tests are parametrized over ``QA_TEST_CASES`` so that each
    question is reported on its own and one failing case does not hide the
    results of the remaining ones.
    """

    @staticmethod
    def _normalize(answer: str) -> str:
        """Lower-case the answer and straighten typographic apostrophes.

        Phrases such as "don't know" are matched as plain substrings, which
        would miss an answer written with a curly apostrophe (U+2019).
        """
        return answer.lower().replace("\u2019", "'")

    @pytest.mark.parametrize(
        "case",
        [c for c in QA_TEST_CASES if c.expected_keywords],
        ids=lambda c: c.question,
    )
    def test_answer_contains_expected_content(self, rag_pipeline, case):
        """Answers should contain keywords from the ground-truth context."""
        result = rag_pipeline.query(case.question)
        answer_lower = self._normalize(result.answer)

        missing = [
            kw for kw in case.expected_keywords
            if kw.lower() not in answer_lower
        ]

        assert not missing, (
            f"Question: '{case.question}'\n"
            f"Missing keywords: {missing}\n"
            f"Got: {result.answer}"
        )

    @pytest.mark.parametrize(
        "case",
        [c for c in QA_TEST_CASES if c.forbidden_keywords],
        ids=lambda c: c.question,
    )
    def test_answer_does_not_hallucinate(self, rag_pipeline, case):
        """Answers should not contain made-up facts."""
        result = rag_pipeline.query(case.question)
        answer_lower = self._normalize(result.answer)

        hallucinated = [
            kw for kw in case.forbidden_keywords
            if kw.lower() in answer_lower
        ]

        assert not hallucinated, (
            f"Question: '{case.question}'\n"
            f"Hallucinated content: {hallucinated}\n"
            f"Got: {result.answer}"
        )

    def test_unanswerable_questions_get_honest_response(self, rag_pipeline):
        """Questions not in the knowledge base should get an honest 'I don't know'."""
        result = rag_pipeline.query("What is the recipe for chocolate cake?")
        answer_lower = self._normalize(result.answer)

        # Any one of these phrases counts as an honest refusal.
        honest_phrases = ["don't know", "not in", "cannot find", "no information"]
        is_honest = any(p in answer_lower for p in honest_phrases)

        assert is_honest, (
            f"Expected honest 'I don't know' response, got: {result.answer}"
        )

Regression Testing

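Track quality metrics over time to detect regressions. The test below assumes two fixtures of your own: retriever, whose retrieve() method returns results carrying a node_id, and test_cases, a list of cases with query and expected_doc_ids attributes: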

# tests/test_rag_regression.py
import json
import os
from pathlib import Path

# Stored baseline metrics; the path is relative to the current working directory.
BASELINE_FILE = Path("tests") / "baselines" / "rag_metrics.json"

def compute_recall_at_k(test_cases, retriever, k=3):
    """Compute average Recall@K across test cases.

    Args:
        test_cases: Iterable of cases exposing ``query`` and
            ``expected_doc_ids``.
        retriever: Object whose ``retrieve(query)`` returns a sliceable
            sequence of results, each exposing ``node_id``.
        k: Number of leading results considered per query.

    Returns:
        The mean, over all cases that list at least one expected id, of the
        fraction of expected ids found among the first ``k`` results.
        Returns ``0.0`` when there is no such case, instead of raising
        ``ZeroDivisionError``.
    """
    recalls = []
    for case in test_cases:
        expected_ids = set(case.expected_doc_ids)
        if not expected_ids:
            # Recall is undefined when there is nothing to find; skip the
            # case rather than divide by zero.
            continue
        result_ids = {r.node_id for r in retriever.retrieve(case.query)[:k]}
        recalls.append(len(result_ids & expected_ids) / len(expected_ids))
    return sum(recalls) / len(recalls) if recalls else 0.0

class TestRAGRegression:
    """Guards retrieval quality against the baseline stored in BASELINE_FILE."""

    def test_recall_does_not_regress(self, retriever, test_cases):
        """Recall@3 should not drop below the baseline.

        When no baseline file exists yet, the current value is written as the
        new baseline and the test passes.
        """
        current_recall = compute_recall_at_k(test_cases, retriever, k=3)

        if BASELINE_FILE.exists():
            baseline = json.loads(BASELINE_FILE.read_text())
            baseline_recall = baseline.get("recall_at_3", 0)

            # Allow 5% regression tolerance
            assert current_recall >= baseline_recall * 0.95, (
                f"Recall@3 regressed: {baseline_recall:.3f} -> {current_recall:.3f}"
            )
        else:
            # Save baseline on first run. parents=True because the baseline
            # directory is nested and its parent may not exist yet either.
            BASELINE_FILE.parent.mkdir(parents=True, exist_ok=True)
            BASELINE_FILE.write_text(json.dumps({"recall_at_3": current_recall}))

Summary

Testing RAG pipelines built with LangChain or LlamaIndex requires testing at multiple levels:

  • Embedding tests: Verify the embedding model produces meaningful vectors
  • Retrieval tests: Measure Recall@K against a curated ground-truth dataset
  • Context assembly tests: Verify token limits, ordering, and separator logic
  • Prompt construction tests: Ensure context is correctly injected into prompts
  • Integration tests: Validate end-to-end quality with real LLM calls
  • Regression tests: Track recall and quality metrics over time

Component tests run in milliseconds. Integration tests with real LLM calls should run on a schedule (not on every commit) to manage cost and latency. HelpMeTest can orchestrate both — fast unit tests on every PR, full integration evaluation nightly.

Read more