RAG Pipeline Testing with LangChain and LlamaIndex
RAG (Retrieval-Augmented Generation) pipelines combine a retrieval system with an LLM to answer questions using your own data. Testing RAG pipelines is challenging because both the retrieval component and the generation component can fail independently — or fail together in ways that aren't obvious.
This guide covers testing RAG pipelines built with LangChain and LlamaIndex, from component-level unit tests to end-to-end evaluation.
RAG Pipeline Architecture
A typical RAG pipeline has these components:
User Query
↓
[Query Embedding] ← Embedding Model
↓
[Vector Retrieval] ← Vector Store
↓
[Context Assembly] — Chunk selection, deduplication, reranking
↓
[Prompt Construction] — System prompt + context + query
↓
[LLM Generation] ← Language Model
↓
Response
Each step can be tested independently. Component tests are fast and don't require LLM calls; integration tests validate the full pipeline.
What to Test at Each Stage
| Stage | What can go wrong | Test type |
|---|---|---|
| Query embedding | Wrong model, wrong dimensions | Unit |
| Retrieval | Low recall, wrong chunks retrieved | Unit/Integration |
| Context assembly | Too much context (token overflow), duplicate chunks | Unit |
| Prompt construction | Context not included, format errors | Unit |
| LLM generation | Hallucination, off-topic, too long/short | Integration/Eval |
| End-to-end | Query not answered despite relevant docs | Eval |
Testing LangChain Retrieval
# tests/test_langchain_retriever.py
import pytest
from unittest.mock import MagicMock, patch
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
@pytest.fixture(scope="module")
def mock_vector_store():
    """Provide a Chroma vector store pre-loaded with four test documents.

    Despite the name, nothing here is mocked: the documents are embedded with
    the real ``text-embedding-3-small`` model, so tests that use this fixture
    need OpenAI credentials and network access.

    The store is built once per module (one embedding request instead of one
    per test) and lives in its own uniquely named collection. Without an
    explicit name, ``Chroma.from_documents`` writes to the default collection,
    which in-process Chroma clients can share, so every rebuild of the fixture
    would append the same documents again.

    Yields:
        The populated ``Chroma`` store. Its collection is deleted on teardown.
    """
    import uuid  # local import: only this fixture needs it

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    docs = [
        Document(
            page_content="HelpMeTest is a cloud-based QA testing platform",
            metadata={"source": "homepage", "category": "product"},
        ),
        Document(
            page_content="HelpMeTest uses Robot Framework and Playwright for test automation",
            metadata={"source": "docs", "category": "technical"},
        ),
        Document(
            page_content="HelpMeTest pricing starts at $0 for the free tier with 10 tests",
            metadata={"source": "pricing", "category": "pricing"},
        ),
        Document(
            page_content="Machine learning model training requires GPU infrastructure",
            metadata={"source": "blog", "category": "ml"},
        ),
    ]
    store = Chroma.from_documents(
        docs,
        embeddings,
        collection_name=f"rag-test-{uuid.uuid4().hex}",
    )
    try:
        yield store
    finally:
        # Drop the collection so later modules start from a clean slate.
        store.delete_collection()
class TestLangChainRetriever:
def test_retrieves_relevant_documents(self, mock_vector_store):
"""Retriever should return documents relevant to the query."""
retriever = mock_vector_store.as_retriever(search_kwargs={"k": 3})
docs = retriever.invoke("What testing frameworks does HelpMeTest use?")
# Should retrieve technical docs, not pricing or ML docs
doc_contents = [d.page_content for d in docs]
relevant = any("Robot Framework" in c or "Playwright" in c for c in doc_contents)
assert relevant, f"No relevant docs retrieved. Got: {doc_contents}"
def test_retriever_respects_k_parameter(self, mock_vector_store):
"""Retriever should return at most k documents."""
for k in [1, 2, 3]:
retriever = mock_vector_store.as_retriever(search_kwargs={"k": k})
docs = retriever.invoke("HelpMeTest testing")
assert len(docs) <= k, f"Expected <= {k} docs, got {len(docs)}"
def test_metadata_filter_narrows_results(self, mock_vector_store):
"""Metadata filter should restrict retrieval to matching documents."""
retriever = mock_vector_store.as_retriever(
search_kwargs={"k": 5, "filter": {"category": "pricing"}}
)
docs = retriever.invoke("HelpMeTest cost")
for doc in docs:
assert doc.metadata["category"] == "pricing", (
f"Got non-pricing doc: {doc.metadata}"
)
def test_handles_query_with_no_relevant_docs(self, mock_vector_store):
"""Retriever should return results even for off-topic queries."""
retriever = mock_vector_store.as_retriever(search_kwargs={"k": 1})
# Query about cooking — no relevant docs exist
docs = retriever.invoke("best pasta carbonara recipe")
# Should not raise — may return unrelated docs or empty
assert isinstance(docs, list)Testing LlamaIndex Query Engine
# tests/test_llamaindex_engine.py
import pytest
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.core.node_parser import SentenceSplitter
from unittest.mock import MagicMock, patch
@pytest.fixture(scope="session")
def test_index():
    """Index a single short HelpMeTest document for the query-engine tests."""
    product_notes = (
        "\n"
        "HelpMeTest is a cloud-hosted SaaS platform for software quality assurance.\n"
        "It uses Robot Framework and Playwright for browser automation.\n"
        "The free tier includes 10 tests and unlimited health checks.\n"
        "The Pro plan costs $100 per month and includes unlimited tests.\n"
    )
    source_doc = Document(text=product_notes, metadata={"source": "helpmetest-docs"})
    # Split into sentence-aware chunks before handing the nodes to the index.
    splitter = SentenceSplitter(chunk_size=256, chunk_overlap=20)
    return VectorStoreIndex(splitter.get_nodes_from_documents([source_doc]))
class TestLlamaIndexQueryEngine:
def test_query_returns_response(self, test_index):
"""Query engine should return a non-empty response."""
engine = test_index.as_query_engine()
response = engine.query("What is HelpMeTest?")
assert response is not None
assert len(str(response)) > 0
def test_source_nodes_included(self, test_index):
"""Response should include source nodes for traceability."""
engine = test_index.as_query_engine(response_mode="compact")
response = engine.query("What frameworks does HelpMeTest use?")
assert len(response.source_nodes) > 0
# Source nodes should contain the relevant content
node_texts = [n.text for n in response.source_nodes]
relevant = any("Robot Framework" in t or "Playwright" in t for t in node_texts)
assert relevant, f"Source nodes don't contain relevant content: {node_texts}"
def test_retrieves_correct_chunk_count(self, test_index):
"""Query engine should retrieve the configured number of chunks."""
engine = test_index.as_query_engine(similarity_top_k=2)
response = engine.query("HelpMeTest pricing and features")
assert len(response.source_nodes) <= 2Testing Context Assembly
The context assembly step joins the retrieved chunks into a single context string that is later inserted into the prompt. Test it independently:
# tests/test_context_assembly.py
import pytest
def assemble_context(
    chunks: list[str],
    max_tokens: int = 2000,
    separator: str = "\n\n---\n\n",
) -> str:
    """Assemble retrieved chunks into a context string respecting token limits.

    Chunks are taken in the order given and joined with ``separator``.
    Assembly stops at the first chunk that would push the estimated size past
    ``max_tokens``; later chunks are not considered, so the result is always a
    prefix of ``chunks``.

    The separators are counted against the budget as well, so the estimated
    size of the returned string never exceeds ``max_tokens``.

    Args:
        chunks: Retrieved text chunks, in the order they should appear.
        max_tokens: Upper bound on the estimated token count of the result.
        separator: Text placed between consecutive chunks.

    Returns:
        The joined context, or ``""`` when no chunk fits or ``chunks`` is empty.
    """

    def estimate_tokens(text: str) -> int:
        # Rough estimate: 1 token ≈ 4 characters. Round up so that short
        # texts are never counted as free.
        return (len(text) + 3) // 4

    separator_tokens = estimate_tokens(separator)
    assembled: list[str] = []
    used_tokens = 0
    for chunk in chunks:
        cost = estimate_tokens(chunk)
        if assembled:
            # Every chunk after the first also brings a separator with it.
            cost += separator_tokens
        if used_tokens + cost > max_tokens:
            break
        assembled.append(chunk)
        used_tokens += cost
    return separator.join(assembled)
class TestContextAssembly:
def test_assembles_chunks_with_separator(self):
chunks = ["Chunk A", "Chunk B", "Chunk C"]
result = assemble_context(chunks)
assert "Chunk A" in result
assert "Chunk B" in result
assert "---" in result # Separator present
def test_respects_token_limit(self):
"""Large context should be truncated to fit within token limit."""
# Each chunk is ~100 tokens (400 chars)
large_chunks = ["x" * 400] * 10
result = assemble_context(large_chunks, max_tokens=500)
# Should contain at most 500 tokens worth of content
estimated_tokens = len(result) // 4
assert estimated_tokens <= 500 + 100 # Small buffer for separators
def test_empty_chunks_returns_empty_string(self):
result = assemble_context([])
assert result == ""
def test_single_chunk_no_separator(self):
result = assemble_context(["Only chunk"])
assert result == "Only chunk"
assert "---" not in result
def test_preserves_chunk_order(self):
"""Most relevant chunks (first) should appear first in context."""
chunks = ["Most relevant", "Less relevant", "Least relevant"]
result = assemble_context(chunks)
assert result.index("Most relevant") < result.index("Less relevant")
assert result.index("Less relevant") < result.index("Least relevant")Testing Prompt Construction
# tests/test_prompt_construction.py
def build_rag_prompt(query: str, context: str, system_prompt: str = "") -> dict:
    """Build the messages array for the LLM call.

    The retrieved context always ends up in the system message:

    * With no ``system_prompt``, the built-in instruction template is used.
    * If ``system_prompt`` contains the literal ``{context}`` placeholder, the
      context is substituted there. Other braces are left untouched.
    * Otherwise the context is appended under a ``Context:`` heading, unless
      it is empty or already present in ``system_prompt``. Previously a custom
      system prompt silently dropped the context.

    Args:
        query: The user's question; becomes the user message verbatim.
        context: Assembled context text to ground the answer in.
        system_prompt: Optional replacement for the built-in instructions.

    Returns:
        ``{"messages": [system_message, user_message]}`` where each message is
        a dict with ``role`` and ``content`` keys.
    """
    placeholder = "{context}"
    if not system_prompt:
        base_system = (
            "You are a helpful assistant. Answer the user's question using ONLY\n"
            "the context provided below. If the answer is not in the context, say 'I don't know.'\n"
            "Context:\n"
            "{context}"
        )
        system = base_system.replace(placeholder, context)
    elif placeholder in system_prompt:
        # Plain replacement rather than str.format, so unrelated braces in a
        # caller-supplied prompt cannot raise KeyError/IndexError.
        system = system_prompt.replace(placeholder, context)
    elif context and context not in system_prompt:
        system = f"{system_prompt.rstrip()}\n\nContext:\n{context}"
    else:
        system = system_prompt
    return {
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": query},
        ]
    }
class TestPromptConstruction:
def test_context_included_in_system_prompt(self):
prompt = build_rag_prompt("What is X?", context="X is a testing tool")
system_content = prompt["messages"][0]["content"]
assert "X is a testing tool" in system_content
def test_query_is_user_message(self):
prompt = build_rag_prompt("What is X?", context="X is a testing tool")
user_message = prompt["messages"][1]
assert user_message["role"] == "user"
assert user_message["content"] == "What is X?"
def test_message_structure_is_valid(self):
prompt = build_rag_prompt("Query", context="Context")
assert "messages" in prompt
assert len(prompt["messages"]) >= 2
for msg in prompt["messages"]:
assert "role" in msg
assert "content" in msg
assert msg["role"] in ("system", "user", "assistant")
def test_empty_context_handled(self):
prompt = build_rag_prompt("What is X?", context="")
# Should not crash — may produce a less useful prompt
assert prompt is not None
assert len(prompt["messages"]) == 2End-to-End Evaluation with Real LLM Calls
For integration tests that make real LLM calls, evaluate answer quality:
# tests/evaluation/test_rag_quality.py
import pytest
from dataclasses import dataclass
from typing import Callable
@dataclass
class QATestCase:
    """A question together with the keyword expectations for its answer."""

    question: str
    # Words that must appear in the answer
    expected_keywords: list[str]
    # Words that signal hallucination
    forbidden_keywords: list[str]
    must_cite_source: bool = True
# Ground-truth cases for the end-to-end quality tests. Keywords are written in
# lower case because answers are lower-cased before matching.
QA_TEST_CASES = [
    QATestCase(
        question="What is the price of HelpMeTest Pro plan?",
        expected_keywords=["$100", "month", "pro"],
        forbidden_keywords=["$50", "$200", "free"],
        must_cite_source=True,
    ),
    QATestCase(
        question="What testing frameworks does HelpMeTest use?",
        expected_keywords=["robot framework", "playwright"],
        forbidden_keywords=["selenium", "cypress"],
        must_cite_source=True,
    ),
    QATestCase(
        question="Does HelpMeTest support Kubernetes deployment?",
        # Every expected keyword must appear in the answer, so list only the
        # one refusal phrase the system prompt asks for. Listing alternative
        # phrasings here would require all of them at once and make the case
        # impossible to satisfy.
        expected_keywords=["don't know"],
        forbidden_keywords=[],
        must_cite_source=False,
    ),
]
@pytest.mark.integration
class TestRAGQuality:
"""Integration tests that make real LLM calls. Run separately from unit tests."""
def test_answer_contains_expected_content(self, rag_pipeline):
"""Answers should contain keywords from the ground-truth context."""
for case in QA_TEST_CASES:
if not case.expected_keywords:
continue
result = rag_pipeline.query(case.question)
answer_lower = result.answer.lower()
missing = [
kw for kw in case.expected_keywords
if kw.lower() not in answer_lower
]
assert not missing, (
f"Question: '{case.question}'\n"
f"Missing keywords: {missing}\n"
f"Got: {result.answer}"
)
def test_answer_does_not_hallucinate(self, rag_pipeline):
"""Answers should not contain made-up facts."""
for case in QA_TEST_CASES:
if not case.forbidden_keywords:
continue
result = rag_pipeline.query(case.question)
answer_lower = result.answer.lower()
hallucinated = [
kw for kw in case.forbidden_keywords
if kw.lower() in answer_lower
]
assert not hallucinated, (
f"Question: '{case.question}'\n"
f"Hallucinated content: {hallucinated}\n"
f"Got: {result.answer}"
)
def test_unanswerable_questions_get_honest_response(self, rag_pipeline):
"""Questions not in the knowledge base should get an honest 'I don't know'."""
result = rag_pipeline.query("What is the recipe for chocolate cake?")
honest_phrases = ["don't know", "not in", "cannot find", "no information"]
is_honest = any(p in result.answer.lower() for p in honest_phrases)
assert is_honest, (
f"Expected honest 'I don't know' response, got: {result.answer}"
)Regression Testing
Track quality metrics over time to detect regressions:
# tests/test_rag_regression.py
import json
import os
from pathlib import Path
BASELINE_FILE = Path("tests/baselines/rag_metrics.json")
def compute_recall_at_k(test_cases, retriever, k=3):
    """Compute average Recall@K across test cases.

    For each case, the first ``k`` results of ``retriever.retrieve(case.query)``
    are compared by ``node_id`` against ``case.expected_doc_ids``; the case's
    recall is the fraction of expected ids that were retrieved.

    Cases with no expected ids are skipped, because recall is undefined
    without ground truth.

    Args:
        test_cases: Iterable of objects with ``query`` and ``expected_doc_ids``.
        retriever: Object whose ``retrieve(query)`` returns a sliceable
            sequence of results exposing ``node_id``.
        k: Number of top results to consider per query.

    Returns:
        The mean per-case recall as a float in ``[0, 1]``.

    Raises:
        ValueError: If no case has any expected ids, including an empty
            ``test_cases``. Failing loudly keeps a meaningless score from
            being stored as a regression baseline.
    """
    recalls = []
    for case in test_cases:
        expected_ids = set(case.expected_doc_ids)
        if not expected_ids:
            continue
        retrieved_ids = {r.node_id for r in retriever.retrieve(case.query)[:k]}
        recalls.append(len(retrieved_ids & expected_ids) / len(expected_ids))
    if not recalls:
        raise ValueError(
            "Cannot compute recall: no test cases with expected_doc_ids"
        )
    return sum(recalls) / len(recalls)
class TestRAGRegression:
    """Guards retrieval quality against the baseline stored in ``BASELINE_FILE``."""

    def test_recall_does_not_regress(self, retriever, test_cases):
        """Recall@3 should not drop below the baseline.

        On the first run, when no baseline file exists, the current score is
        written as the baseline and the test passes without comparing.
        """
        current_recall = compute_recall_at_k(test_cases, retriever, k=3)
        if BASELINE_FILE.exists():
            baseline = json.loads(BASELINE_FILE.read_text())
            baseline_recall = baseline.get("recall_at_3", 0)
            # Allow 5% regression tolerance
            assert current_recall >= baseline_recall * 0.95, (
                f"Recall@3 regressed: {baseline_recall:.3f} → {current_recall:.3f}"
            )
        else:
            # Save baseline on first run. parents=True also creates missing
            # intermediate directories, so this works from a fresh checkout
            # or a different working directory.
            BASELINE_FILE.parent.mkdir(parents=True, exist_ok=True)
            BASELINE_FILE.write_text(json.dumps({"recall_at_3": current_recall}))

Summary
Testing RAG pipelines built with LangChain or LlamaIndex requires testing at multiple levels:
- Embedding tests: Verify the embedding model produces meaningful vectors
- Retrieval tests: Measure Recall@K against a curated ground-truth dataset
- Context assembly tests: Verify token limits, ordering, and separator logic
- Prompt construction tests: Ensure context is correctly injected into prompts
- Integration tests: Validate end-to-end quality with real LLM calls
- Regression tests: Track recall and quality metrics over time
Component tests run in milliseconds. Integration tests with real LLM calls should run on a schedule (not on every commit) to manage cost and latency. HelpMeTest can orchestrate both — fast unit tests on every PR, full integration evaluation nightly.