Integration Testing for LangChain and LlamaIndex RAG Chains
LangChain and LlamaIndex are composable systems. A RAG chain is a sequence of components: a retriever, a prompt template, an LLM, an output parser. Each component can fail, and their integration can fail in ways that none of the individual components would catch in isolation.
Integration testing for RAG chains means testing the assembled pipeline — with realistic inputs, verifying that data flows correctly between components, and asserting on the outputs that matter for your application.
What Integration Tests Cover That Unit Tests Don't
Unit tests test individual components in isolation:
- "Does the retriever return documents with similarity scores?"
- "Does the prompt template format correctly?"
- "Does the LLM client handle rate limit errors?"
Integration tests verify the assembled chain:
- "When a user asks about topic X, does the chain retrieve X-related documents and generate a relevant answer?"
- "When the retriever returns no documents, does the chain return a graceful 'I don't know' response?"
- "When the LLM fails, does the chain surface a useful error rather than crashing?"
Both are necessary. Integration tests catch composition bugs that unit tests miss.
Testing LangChain RAG Chains
The Chain Structure
A typical LangChain RAG chain:
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompt_values import ChatPromptValue
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
def build_rag_chain(vector_store, k=4):
    """Assemble a RAG chain (retriever → prompt → LLM → string parser).

    Args:
        vector_store: Any LangChain vector store exposing ``as_retriever()``.
        k: Number of documents to retrieve per query. Defaults to 4, the
           value previously hard-coded, so existing callers are unaffected.

    Returns:
        A Runnable mapping a question string to an answer string.
    """
    retriever = vector_store.as_retriever(search_kwargs={"k": k})
    # The prompt pins the model to the retrieved context and gives it an
    # explicit refusal phrase, which the empty-retrieval tests assert on.
    prompt = ChatPromptTemplate.from_template("""
Answer the question based only on the following context.
If the context doesn't contain relevant information, say "I don't have enough information to answer this."
Context: {context}
Question: {question}
Answer:
""")
    # temperature=0 keeps generation as deterministic as the API allows.
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

Integration Test Setup with Mocked LLM
For deterministic integration tests, use a fake LLM:
import pytest
from langchain_core.language_models.fake import FakeListLLM
from unittest.mock import MagicMock
@pytest.fixture
def mock_vector_store():
    """Vector-store double whose retriever returns two canned PostgreSQL docs."""
    def _canned_docs(query):
        # Fresh mock documents per call, mirroring a real retriever invocation.
        return [
            MagicMock(
                page_content="PostgreSQL supports ACID transactions and complex queries.",
                metadata={"source": "postgres-docs", "page": 1}
            ),
            MagicMock(
                page_content="PostgreSQL was created at UC Berkeley in 1986.",
                metadata={"source": "postgres-docs", "page": 2}
            ),
        ]

    store = MagicMock()
    store.as_retriever.return_value = MagicMock(invoke=_canned_docs)
    return store
@pytest.fixture
def fake_llm_chain(mock_vector_store):
    """Build chain with fake LLM for deterministic testing."""
    # FakeListLLM returns responses from a predefined list in order
    fake_llm = FakeListLLM(responses=[
        "PostgreSQL supports ACID transactions and complex queries.",
        "I don't have enough information to answer this.",
    ])
    retriever = mock_vector_store.as_retriever()
    prompt = ChatPromptTemplate.from_template(
        "Context: {context}\nQuestion: {question}\nAnswer:"
    )
    # NOTE(review): `retriever` is a MagicMock, not a Runnable; the `|` below
    # resolves via MagicMock.__or__, so LCEL may never call the mocked invoke()
    # — confirm the composed chain formats real context, or wrap the mock's
    # invoke in a RunnableLambda.
    chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | fake_llm
        | StrOutputParser()
    )
    return chain

Testing Happy Path
def test_rag_chain_returns_answer_for_known_query(fake_llm_chain):
answer = fake_llm_chain.invoke("What does PostgreSQL support?")
assert isinstance(answer, str)
assert len(answer) > 0
assert "ACID" in answer or "transactions" in answer.lower()
def test_rag_chain_invokes_retriever_with_user_query(mock_vector_store):
    # Build the production chain against the mock store and run one query.
    chain = build_rag_chain(mock_vector_store)
    chain.invoke("test query")
    # The retriever double that the chain should have consulted.
    retriever = mock_vector_store.as_retriever.return_value
    retriever.invoke.assert_called_once_with("test query")

Testing Empty Retrieval Handling
@pytest.fixture
def empty_retriever_chain():
    """Chain whose retriever finds nothing, forcing the LLM to refuse."""
    store = MagicMock()
    store.as_retriever.return_value = MagicMock(
        invoke=lambda q: []  # returns no documents
    )
    refusal_llm = FakeListLLM(responses=["I don't have enough information to answer this."])
    retriever = store.as_retriever()
    prompt = ChatPromptTemplate.from_template(
        "Context: {context}\nQuestion: {question}\nAnswer:"
    )
    # NOTE(review): as elsewhere, a MagicMock retriever is not a Runnable;
    # `retriever | format_docs` resolves through MagicMock.__or__ — verify the
    # empty-context path is actually exercised against real components.
    pipeline = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | refusal_llm
        | StrOutputParser()
    )
    return pipeline
def test_chain_handles_empty_retrieval_gracefully(empty_retriever_chain):
    """With zero retrieved documents, the chain must refuse, not hallucinate."""
    answer = empty_retriever_chain.invoke("What is the capital of Mars?")
    # Should not crash or return empty string
    assert isinstance(answer, str)
    assert len(answer) > 0
    # Should indicate uncertainty (not hallucinate)
    assert any(phrase in answer.lower() for phrase in [
        "don't have", "insufficient", "cannot", "no information", "not sure"
    ])

Testing Context Passing
Verify that retrieved documents are correctly formatted and passed to the prompt:
def test_retrieved_documents_passed_to_prompt(mock_vector_store):
    """Data-flow test: retrieved context must reach the prompt stage intact.

    Bug fix: the original piped a bare ``MagicMock`` into the chain, but a
    MagicMock is not a Runnable — ``{...} | mock_prompt`` silently resolved
    through ``MagicMock.__ror__`` instead of building an LCEL chain, so
    ``capture_prompt`` was never called and the asserts failed on an empty
    dict. Wrapping the capture function in ``RunnableLambda`` (and likewise
    routing the mock retriever's ``invoke`` through a RunnableLambda) makes
    the composition real.
    """
    captured_prompt_input = {}

    def capture_prompt(input_dict):
        # Record what the prompt stage receives, then emit a minimal value
        # so the downstream fake LLM has something to consume.
        captured_prompt_input.update(input_dict)
        return ChatPromptValue(messages=[HumanMessage(content="mocked")])

    fake_llm = FakeListLLM(responses=["mocked answer"])
    # Route the mock's invoke through a real Runnable so `|` composes in LCEL.
    retrieve = RunnableLambda(mock_vector_store.as_retriever().invoke)
    chain = (
        {"context": retrieve | format_docs, "question": RunnablePassthrough()}
        | RunnableLambda(capture_prompt)
        | fake_llm
        | StrOutputParser()
    )
    chain.invoke("What is PostgreSQL?")
    assert "context" in captured_prompt_input
    assert "PostgreSQL" in captured_prompt_input["context"]
    assert "question" in captured_prompt_input
    assert captured_prompt_input["question"] == "What is PostgreSQL?"

Testing LlamaIndex RAG Pipelines
Setting Up LlamaIndex Tests
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.llms.mock import MockLLM
from llama_index.core.embeddings.mock_embed_model import MockEmbedding
@pytest.fixture
def llamaindex_test_engine():
    """Query engine over two in-memory docs, using mock LLM and embeddings."""
    corpus = [
        Document(
            text="LlamaIndex is a data framework for LLM applications.",
            metadata={"source": "llamaindex-docs", "page": 1}
        ),
        Document(
            text="LlamaIndex supports retrieval-augmented generation workflows.",
            metadata={"source": "llamaindex-docs", "page": 2}
        ),
    ]
    # Deterministic stand-ins keep the test fast and fully offline.
    stub_llm = MockLLM(max_tokens=256)
    stub_embeddings = MockEmbedding(embed_dim=256)
    # NOTE(review): some LlamaIndex versions ignore `llm=` on from_documents
    # (the LLM is set via Settings or as_query_engine) — confirm for the
    # pinned version.
    index = VectorStoreIndex.from_documents(
        corpus,
        llm=stub_llm,
        embed_model=stub_embeddings,
    )
    return index.as_query_engine(llm=stub_llm, similarity_top_k=2)
def test_llamaindex_query_engine_returns_response(llamaindex_test_engine):
response = llamaindex_test_engine.query("What is LlamaIndex?")
assert response is not None
assert str(response) != ""
def test_llamaindex_response_includes_source_nodes(llamaindex_test_engine):
response = llamaindex_test_engine.query("What does LlamaIndex support?")
assert hasattr(response, 'source_nodes')
assert len(response.source_nodes) > 0
for node in response.source_nodes:
assert node.score is not None
assert node.score >= 0.0
def test_llamaindex_retrieves_relevant_nodes(llamaindex_test_engine):
    # Relevance check: at least one retrieved node should match the query topic.
    response = llamaindex_test_engine.query("retrieval-augmented generation")
    source_texts = [n.node.text for n in response.source_nodes]
    assert any("retrieval" in t.lower() for t in source_texts), (
        "Retrieved nodes should include text about retrieval-augmented generation"
    )

Testing Error Handling in RAG Chains
def test_chain_handles_llm_rate_limit_gracefully(mock_vector_store):
    """A RateLimitError from the LLM should surface to the caller.

    Bug fix: the original referenced ``mock_vector_store`` without declaring
    it as a parameter, so pytest never injected the fixture and the test died
    with NameError. It also imported ``OutputParserException`` without using
    it; that import is dropped.
    """
    from openai import RateLimitError

    mock_llm = MagicMock()
    mock_llm.invoke.side_effect = RateLimitError(
        message="Rate limit exceeded",
        response=MagicMock(status_code=429),
        body={}
    )
    chain = build_rag_chain_with_llm(mock_vector_store, mock_llm)
    # Without retry/fallback wiring, the error should propagate unchanged.
    with pytest.raises(RateLimitError):
        chain.invoke("test query")
    # Or if your chain has retry/fallback:
    # response = chain.invoke("test query")
    # assert response.get("error") == "rate_limit"
def test_chain_handles_retriever_timeout():
    # Retriever double whose invoke always raises, simulating a vector-DB stall.
    mock_store = MagicMock()
    mock_store.as_retriever.return_value = MagicMock(
        invoke=MagicMock(side_effect=TimeoutError("Vector DB timeout"))
    )
    chain = build_rag_chain(mock_store)
    # Without retry/fallback wiring, the timeout should surface to the caller.
    with pytest.raises(TimeoutError):
        chain.invoke("test query")

End-to-End Integration Tests (Against Real Services)
For CI environments with real vector DB and LLM access:
@pytest.mark.integration
@pytest.mark.skipif(
    not os.getenv("OPENAI_API_KEY"),
    reason="Requires OPENAI_API_KEY environment variable"
)
def test_full_rag_chain_with_real_llm():
    """Real E2E test — runs against actual OpenAI API and vector DB."""
    # NOTE(review): requires `import os` and a `build_production_rag_chain`
    # definition — neither appears in the snippet's shown imports; confirm.
    chain = build_production_rag_chain()
    answer = chain.invoke("What are the benefits of using a vector database for semantic search?")
    assert isinstance(answer, str)
    assert len(answer) > 100  # substantial answer
    # Loose topical check — real LLM output is nondeterministic.
    assert any(word in answer.lower() for word in ["semantic", "similarity", "embedding", "vector"])
    # Should not be a refusal
    assert "cannot" not in answer.lower()[:50]

Key Takeaways
- Unit test components, integration test the assembled chain — both are necessary
- Use FakeListLLM and mock retrievers for deterministic, fast tests in CI
- Test the empty retrieval case explicitly — chains must handle no-context gracefully
- Verify that retrieved context is correctly formatted and passed to the prompt (test data flow, not just output)
- Use `@pytest.mark.integration` to separate fast unit/integration tests from slow real-API tests
- Test error handling: LLM rate limits, retriever timeouts, and empty results all need explicit test coverage