Testing Chunking Strategies for Document Retrieval Quality

Chunking is one of the most impactful and least-tested parts of a RAG pipeline. How you split documents into chunks directly determines retrieval quality: too small and chunks lose context; too large and they include irrelevant content that confuses the LLM; split at the wrong boundaries and answers that straddle a split point are never retrieved intact.

Most teams pick a chunk size, try it manually on a few examples, and move on. That's not testing. Testing chunking means measuring retrieval quality across a representative set of documents and queries, comparing strategies, and catching regressions when you change the approach.

The Chunking Quality Variables

Before writing tests, understand what you're measuring:

  • Chunk size — tokens per chunk (128, 256, 512, 1024)
  • Overlap — how many tokens from one chunk appear at the start of the next
  • Splitting strategy — fixed-size, sentence-based, paragraph-based, semantic, recursive
  • Metadata preservation — does each chunk retain source document ID, page number, section heading?

Each choice is a hypothesis. Tests validate that hypothesis against real documents and queries.
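
To keep experiments comparable, it helps to capture these variables in one config object so every test run names exactly which hypothesis it exercises. A minimal sketch (the ChunkingConfig name and fields are illustrative, not from any library):

from dataclasses import dataclass

@dataclass(frozen=True)
class ChunkingConfig:
    """One testable chunking hypothesis."""
    chunk_size: int          # tokens (or chars) per chunk
    overlap: int             # tokens shared between adjacent chunks
    strategy: str            # "fixed", "sentence", "paragraph", "semantic", "recursive"
    preserve_metadata: bool = True

# Each config becomes one row in your comparison matrix
CONFIGS = [
    ChunkingConfig(256, 0, "fixed"),
    ChunkingConfig(512, 50, "fixed"),
    ChunkingConfig(512, 50, "recursive"),
]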

Testing Fixed-Size Chunking

from langchain_text_splitters import CharacterTextSplitter

def chunk_document_fixed(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    # Split on spaces so splits can be merged up to chunk_size and the
    # requested overlap can actually be honored; with separator="\n\n",
    # a document without blank lines comes back as one oversized chunk
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separator=" ",
    )
    return splitter.split_text(text)

def test_chunks_do_not_exceed_max_size():
    document = "word " * 2000
    chunks = chunk_document_fixed(document, chunk_size=500, overlap=50)

    for i, chunk in enumerate(chunks):
        assert len(chunk) <= 550, (  # small headroom; a single long word can exceed chunk_size
            f"Chunk {i} has {len(chunk)} chars, expected <= 550"
        )

def test_chunk_count_is_reasonable():
    document = "word " * 1000  # ~5000 chars
    chunks = chunk_document_fixed(document, chunk_size=500, overlap=50)

    # With 500 char chunks and 50 overlap, expect roughly 10-12 chunks
    assert 8 <= len(chunks) <= 15, f"Unexpected chunk count: {len(chunks)}"

def test_overlap_preserves_boundary_content():
    """Verify that content at chunk boundaries appears in both adjacent chunks."""
    document = "\n\n".join([f"Paragraph {i}: " + "content " * 20 for i in range(10)])
    chunks = chunk_document_fixed(document, chunk_size=200, overlap=50)

    # Compare the last 40 chars of chunk N with the first 80 chars of chunk N+1;
    # with a 50-char overlap, some words should appear in both windows
    for i in range(len(chunks) - 1):
        end_of_chunk = chunks[i][-40:]
        start_of_next = chunks[i + 1][:80]

        # Some overlap should exist
        words_from_end = set(end_of_chunk.split())
        words_from_next = set(start_of_next.split())
        overlap_words = words_from_end & words_from_next

        assert len(overlap_words) > 0, (
            f"No overlap detected between chunk {i} and chunk {i+1}. "
            f"End of chunk {i}: '{end_of_chunk}'\n"
            f"Start of chunk {i+1}: '{start_of_next}'"
        )
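
Character counts are only a proxy; if your embedding model's budget is in tokens, measure tokens directly. A sketch using tiktoken, assuming the cl100k_base encoding (swap in the encoding for your model):

import tiktoken

def test_chunks_fit_token_budget():
    encoding = tiktoken.get_encoding("cl100k_base")
    document = "word " * 2000
    chunks = chunk_document_fixed(document, chunk_size=500, overlap=50)

    for i, chunk in enumerate(chunks):
        # 500 chars of short words is roughly 100-150 tokens under cl100k_base
        n_tokens = len(encoding.encode(chunk))
        assert n_tokens <= 256, f"Chunk {i} has {n_tokens} tokens, expected <= 256"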

Testing Semantic Chunking

Semantic chunking splits on meaning boundaries rather than character count. Test that semantically related content stays together:

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

def test_semantic_chunker_keeps_related_content_together():
    document = """
    Python is a high-level programming language known for its readability.
    It uses indentation to define code blocks.
    Python supports multiple programming paradigms including procedural and object-oriented.

    Machine learning is a subset of artificial intelligence.
    It enables systems to learn from data without explicit programming.
    Common ML libraries in Python include scikit-learn, TensorFlow, and PyTorch.

    Web development with Python often uses frameworks like Django and Flask.
    Django provides a full-stack solution with ORM, routing, and templates.
    Flask is a microframework suitable for simpler applications.
    """

    chunker = SemanticChunker(embeddings=OpenAIEmbeddings())
    chunks = chunker.split_text(document)

    # Python basics, ML, and web dev should ideally be in separate chunks
    # At minimum, ML content should not share a chunk with Python basics or web dev
    ml_chunk = next((c for c in chunks if "machine learning" in c.lower()), None)
    web_chunk = next((c for c in chunks if "django" in c.lower()), None)
    python_chunk = next((c for c in chunks if "indentation" in c.lower()), None)

    if ml_chunk and web_chunk:
        assert ml_chunk != web_chunk, "ML and web dev content should be in separate chunks"
    if ml_chunk and python_chunk:
        assert ml_chunk != python_chunk, "ML and Python basics should be in separate chunks"

def test_semantic_chunk_sizes_are_within_bounds():
    # Use sentence-delimited, varied text: SemanticChunker splits on sentence
    # boundaries, so identical or unpunctuated text yields no breakpoints
    long_document = " ".join(
        f"Sentence {i} discusses technical topic number {i % 7}." for i in range(500)
    )

    chunker = SemanticChunker(
        embeddings=OpenAIEmbeddings(),
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=95,
    )
    chunks = chunker.split_text(long_document)

    for chunk in chunks:
        token_estimate = len(chunk.split())
        assert token_estimate <= 1500, f"Semantic chunk too large: ~{token_estimate} tokens"
        assert token_estimate >= 10, f"Semantic chunk too small: ~{token_estimate} tokens"
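
SemanticChunker makes live embedding calls, which makes these tests slow, paid, and nondeterministic. For CI you can substitute a stub that implements LangChain's Embeddings interface. This keyword-bucket stub is a hypothetical illustration, not a real library class:

from langchain_core.embeddings import Embeddings

class KeywordStubEmbeddings(Embeddings):
    """Maps each text to a fixed vector per topic keyword, so chunk
    boundaries are deterministic and no API key is needed."""
    TOPICS = ["python", "machine learning", "django"]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        return [self.embed_query(t) for t in texts]

    def embed_query(self, text: str) -> list[float]:
        lower = text.lower()
        # Leading 1.0 avoids zero vectors, which break cosine distance
        return [1.0] + [1.0 if topic in lower else 0.0 for topic in self.TOPICS]

# Drop-in replacement for OpenAIEmbeddings in the tests above:
# chunker = SemanticChunker(embeddings=KeywordStubEmbeddings())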

Testing Retrieval Quality Per Chunking Strategy

The real test: does this chunking strategy allow the retriever to find the right content?

import pytest
from your_vector_store import VectorStore

@pytest.mark.parametrize("chunk_size,overlap", [
    (256, 0),
    (256, 50),
    (512, 0),
    (512, 100),
    (1024, 100),
])
def test_retrieval_quality_for_chunk_config(chunk_size, overlap, test_documents, eval_queries):
    """
    test_documents: list of (doc_id, full_text) pairs
    eval_queries: list of (query, expected_doc_id, expected_answer_snippet)
    """
    # Build fresh index with this chunking config
    store = VectorStore(collection=f"test-{chunk_size}-{overlap}")

    for doc_id, text in test_documents:
        chunks = chunk_document_fixed(text, chunk_size=chunk_size, overlap=overlap)
        store.upsert_chunks(doc_id, chunks)

    hits = 0
    for query, expected_doc_id, expected_snippet in eval_queries:
        results = store.search(query, top_k=5)

        # Does the expected document appear in top-5?
        doc_hit = any(r.source_doc_id == expected_doc_id for r in results)

        # Does any retrieved chunk contain the expected answer snippet?
        snippet_hit = any(expected_snippet.lower() in r.content.lower() for r in results)

        if doc_hit and snippet_hit:
            hits += 1

    recall = hits / len(eval_queries)
    print(f"chunk_size={chunk_size}, overlap={overlap}: recall={recall:.3f}")

    # Clean up before asserting so a failed assertion doesn't leak the collection
    store.delete_collection()

    # Minimum acceptable recall
    assert recall >= 0.75, (
        f"Chunk config (size={chunk_size}, overlap={overlap}) has recall {recall:.3f} < 0.75"
    )
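
The test above assumes test_documents and eval_queries pytest fixtures. A minimal sketch of what they might look like in conftest.py (the documents and queries here are illustrative; use a sample of your real corpus):

import pytest

@pytest.fixture
def test_documents():
    # (doc_id, full_text) pairs
    return [
        ("returns-policy", "Customers may return items within 30 days of delivery..."),
        ("shipping-guide", "Standard shipping takes 5-7 business days..."),
    ]

@pytest.fixture
def eval_queries():
    # (query, expected_doc_id, expected_answer_snippet)
    return [
        ("how long do I have to return an item", "returns-policy", "30 days"),
        ("when will my order arrive", "shipping-guide", "5-7 business days"),
    ]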

Testing Chunk Metadata Preservation

Each chunk must carry metadata linking it back to the source document:

from your_chunker import DocumentChunker  # placeholder for your own chunker module

def test_chunks_preserve_source_metadata():
    chunker = DocumentChunker(chunk_size=512, overlap=50)
    chunks = chunker.chunk_document(
        doc_id="manual-v2-chapter3",
        text="Chapter 3 content " * 100,
        metadata={
            "source": "user-manual-v2.pdf",
            "page": 42,
            "section": "Chapter 3",
        }
    )

    for chunk in chunks:
        assert chunk.metadata["source_doc_id"] == "manual-v2-chapter3"
        assert chunk.metadata["source"] == "user-manual-v2.pdf"
        assert chunk.metadata["page"] == 42
        assert "chunk_index" in chunk.metadata  # which chunk within the document

def test_chunk_index_is_sequential():
    chunker = DocumentChunker(chunk_size=200, overlap=20)
    chunks = chunker.chunk_document(doc_id="test-doc", text="word " * 500)

    indices = [c.metadata["chunk_index"] for c in chunks]
    assert indices == list(range(len(chunks))), "Chunk indices must be sequential starting from 0"
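
your_chunker above is a placeholder. For reference, a minimal DocumentChunker that satisfies these tests might look like this (a sketch, reusing chunk_document_fixed from earlier):

from dataclasses import dataclass, field

@dataclass
class Chunk:
    content: str
    metadata: dict = field(default_factory=dict)

class DocumentChunker:
    def __init__(self, chunk_size: int, overlap: int):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_document(self, doc_id: str, text: str, metadata: dict | None = None) -> list[Chunk]:
        texts = chunk_document_fixed(text, self.chunk_size, self.overlap)
        return [
            Chunk(content=t, metadata={
                **(metadata or {}),
                "source_doc_id": doc_id,
                "chunk_index": i,  # sequential position within the document
            })
            for i, t in enumerate(texts)
        ]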

Testing Cross-Boundary Questions

The hardest retrieval case: the answer spans a chunk boundary. Test this explicitly:

def test_overlap_helps_answer_boundary_spanning_questions():
    """
    The answer to a question is split across two chunks.
    Overlap should help one of the chunks contain enough context to answer.
    """
    # Craft a document where the key fact spans where a 100-char chunk would split:
    # ~90 chars of padding pushes the fact up to the first chunk boundary, so its
    # first half lands at the end of one chunk and its second half starts the next
    padding = "filler content " * 6  # ~90 chars
    key_fact = "The answer is forty-two. This was determined by the Deep Thought computer."

    document = padding + key_fact + padding * 5

    no_overlap_chunks = chunk_document_fixed(document, chunk_size=100, overlap=0)
    with_overlap_chunks = chunk_document_fixed(document, chunk_size=100, overlap=30)

    query_words = set("what is the answer deep thought".split())

    def chunks_covering_query(chunks):
        return [c for c in chunks if len(set(c.lower().split()) & query_words) >= 2]

    no_overlap_coverage = chunks_covering_query(no_overlap_chunks)
    with_overlap_coverage = chunks_covering_query(with_overlap_chunks)

    # Overlap should provide more coverage of boundary-spanning content
    assert len(with_overlap_coverage) >= len(no_overlap_coverage), (
        "Overlap did not improve boundary-spanning query coverage"
    )

Comparing Chunking Strategies with a Score Report

def compare_chunking_strategies(test_documents, eval_queries):
    strategies = {
        "fixed-256-0": {"size": 256, "overlap": 0, "type": "fixed"},
        "fixed-512-50": {"size": 512, "overlap": 50, "type": "fixed"},
        "fixed-1024-100": {"size": 1024, "overlap": 100, "type": "fixed"},
        "recursive-512": {"size": 512, "overlap": 50, "type": "recursive"},
    }

    results = {}
    for name, config in strategies.items():
        # measure_recall builds an index with this config and returns recall
        # over eval_queries (a sketch is given after this function)
        recall = measure_recall(config, test_documents, eval_queries)
        results[name] = recall

    print("\nChunking Strategy Comparison:")
    print("-" * 40)
    for name, recall in sorted(results.items(), key=lambda x: x[1], reverse=True):
        print(f"{name:30s}: {recall:.3f}")

    best_strategy = max(results, key=results.get)
    print(f"\nBest strategy: {best_strategy} (recall={results[best_strategy]:.3f})")
    return results
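
measure_recall is assumed above; here is one possible sketch that reuses the logic from the parametrized test, with VectorStore and chunk_document_fixed as defined earlier (the recursive branch uses LangChain's RecursiveCharacterTextSplitter):

from langchain_text_splitters import RecursiveCharacterTextSplitter

def measure_recall(config, test_documents, eval_queries, top_k=5):
    if config["type"] == "recursive":
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=config["size"], chunk_overlap=config["overlap"]
        )
        chunk_fn = splitter.split_text
    else:
        chunk_fn = lambda text: chunk_document_fixed(
            text, chunk_size=config["size"], overlap=config["overlap"]
        )

    store = VectorStore(collection=f"compare-{config['type']}-{config['size']}")
    try:
        for doc_id, text in test_documents:
            store.upsert_chunks(doc_id, chunk_fn(text))

        hits = 0
        for query, expected_doc_id, expected_snippet in eval_queries:
            results = store.search(query, top_k=top_k)
            doc_hit = any(r.source_doc_id == expected_doc_id for r in results)
            snippet_hit = any(expected_snippet.lower() in r.content.lower() for r in results)
            if doc_hit and snippet_hit:
                hits += 1
        return hits / len(eval_queries)
    finally:
        # Always drop the temporary collection, even if a query raises
        store.delete_collection()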

Key Takeaways

  • Test chunk sizes, overlaps, and strategies systematically — don't guess
  • Measure retrieval recall: what fraction of eval queries find the expected document in top-K?
  • Always test overlap: boundary-spanning questions need it, and you should quantify the improvement
  • Verify chunk metadata is preserved — source doc ID and chunk index are required for result attribution
  • Build a comparison harness to objectively pick the best chunking config for your document corpus
  • Rerun retrieval quality tests whenever you change the chunking strategy, even for "small" adjustments
