Vector Database Testing Guide: Embeddings, Similarity Search, and Accuracy
Vector databases power the retrieval layer of AI applications — semantic search, RAG pipelines, recommendation systems, and knowledge bases. But testing them is different from testing SQL databases. You can't assert on exact row matches; you're asserting on approximate similarity, ranking order, and threshold behavior.
This guide covers a comprehensive testing strategy for vector databases: what to test, how to structure tests, and practical examples for the major providers.
What Makes Vector Database Testing Different
A traditional database test might be: "Insert record A, query for it, expect exactly record A back." Vector database tests are fundamentally probabilistic:
- Embedding correctness: Does similar text produce similar vectors?
- Recall: What percentage of truly relevant results appear in the top-K?
- Precision: What fraction of returned results are actually relevant?
- Threshold sensitivity: At what similarity score does noise start appearing?
- Index configuration: Does changing the index algorithm affect result quality?
You need a ground-truth dataset to test against — known query-result pairs where you can measure whether the retrieval is working correctly.
Setting Up a Test Dataset
The foundation of vector database testing is a curated test dataset:
# tests/fixtures/test_corpus.py
import pytest
from dataclasses import dataclass
from typing import List
@dataclass
class TestDocument:
    """A corpus entry paired with the documents it should be similar to.

    The ``Test`` name prefix matches pytest's default class-collection
    pattern, so collection is switched off explicitly via ``__test__``.
    """

    # Deliberately un-annotated: a plain class attribute, not a dataclass
    # field. Without it pytest reports PytestCollectionWarning ("cannot
    # collect test class ... because it has a __init__ constructor").
    __test__ = False

    id: str  # Unique document ID
    text: str  # Raw text that gets embedded
    category: str  # Topic label for the document
    expected_similar: List[str]  # IDs of documents that should be similar
# Ground-truth corpus, written as (id, text, category, expected_similar) rows:
# three closely related pytest documents and two ML-deployment documents.
TEST_DOCUMENTS = [
    TestDocument(
        id=doc_id,
        text=text,
        category=category,
        expected_similar=similar_ids,
    )
    for doc_id, text, category, similar_ids in (
        (
            "doc-001",
            "Python unit testing with pytest and fixtures",
            "testing",
            ["doc-002", "doc-003"],
        ),
        (
            "doc-002",
            "Writing automated tests in Python using pytest framework",
            "testing",
            ["doc-001", "doc-003"],
        ),
        (
            "doc-003",
            "Test fixtures and parametrize in pytest",
            "testing",
            ["doc-001", "doc-002"],
        ),
        (
            "doc-004",
            "Machine learning model deployment with Docker",
            "ml-ops",
            ["doc-005"],
        ),
        (
            "doc-005",
            "Containerizing ML models for production deployment",
            "ml-ops",
            ["doc-004"],
        ),
    )
]
# Ground-truth queries. Every case stores its relevant IDs under the same
# "expected_top_3" key: test_recall_at_3 reads that key for each query, so a
# differently named key would raise KeyError there. A case may list fewer
# than three IDs when the corpus holds fewer relevant documents.
TEST_QUERIES = [
    {
        "query": "how to write tests in python",
        "expected_top_3": ["doc-001", "doc-002", "doc-003"],
        "should_not_include": ["doc-004", "doc-005"],
    },
    {
        "query": "deploy machine learning containers",
        "expected_top_3": ["doc-004", "doc-005"],
        "should_not_include": ["doc-001", "doc-002", "doc-003"],
    },
]


# Testing Embedding Correctness
Before testing retrieval, verify that your embedding model produces sensible vectors:
# tests/test_embeddings.py
import pytest
import numpy as np
from sentence_transformers import SentenceTransformer
@pytest.fixture(scope="session")
def embedding_model():
    """Provide a single "all-MiniLM-L6-v2" SentenceTransformer per test session."""
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between vectors *a* and *b*.

    Computed as dot(a, b) / (||a|| * ||b||) and converted to a Python float.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / magnitude_product)
class TestEmbeddingCorrectness:
    """Sanity checks on the embedding model itself, before any index is involved."""

    def test_similar_texts_have_high_cosine_similarity(self, embedding_model):
        """Two phrasings of the same topic must score above 0.8."""
        first = embedding_model.encode("Python unit testing with pytest")
        second = embedding_model.encode("Writing tests in Python using pytest framework")

        similarity = cosine_similarity(first, second)

        assert similarity > 0.8, f"Expected > 0.8, got {similarity}"

    def test_dissimilar_texts_have_low_cosine_similarity(self, embedding_model):
        """Texts about unrelated topics must score below 0.5."""
        first = embedding_model.encode("Python unit testing with pytest")
        second = embedding_model.encode("Deploy machine learning models with Docker")

        similarity = cosine_similarity(first, second)

        assert similarity < 0.5, f"Expected < 0.5, got {similarity}"

    def test_identical_texts_have_perfect_similarity(self, embedding_model):
        """Encoding the same text twice must give a similarity of ~1.0."""
        text = "Testing vector databases with Python"
        first = embedding_model.encode(text)
        second = embedding_model.encode(text)

        similarity = cosine_similarity(first, second)

        assert abs(similarity - 1.0) < 1e-5, f"Expected ~1.0, got {similarity}"

    def test_embedding_dimensions_consistent(self, embedding_model):
        """Short, long, and non-Latin inputs must all yield the same vector length."""
        texts = ["short", "a much longer piece of text", "中文文本"]

        dimensions = [len(embedding_model.encode(sample)) for sample in texts]

        assert len(set(dimensions)) == 1, f"Inconsistent dimensions: {dimensions}"

    def test_embedding_model_handles_edge_cases(self, embedding_model):
        """Degenerate inputs must encode without errors and without NaN values."""
        edge_cases = (
            "",  # Empty string
            " ",  # Whitespace only
            "a" * 10000,  # Very long text
            "!!!???###",  # Special characters only
        )
        for text in edge_cases:
            embedding = embedding_model.encode(text)
            assert embedding is not None
            assert not np.isnan(embedding).any(), f"NaN in embedding for: {repr(text)[:50]}"


# Testing Similarity Search Accuracy (Recall@K)
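Recall@K measures what percentage of truly relevant results appear in the top-K. The tests below rely on two shared fixtures: `embedding_model` (defined above) and `vector_db_client`, a client for your vector database that this guide does not define. Put both in `conftest.py` — pytest does not make fixtures defined inside one test module available to other test modules: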
# tests/test_similarity_search.py
import pytest
from tests.fixtures.test_corpus import TEST_DOCUMENTS, TEST_QUERIES
@pytest.fixture(scope="session")
def populated_index(vector_db_client, embedding_model):
    """Create and populate a test index, clean up after session.

    Yields:
        The name of the index, holding one vector per TEST_DOCUMENTS entry.
    """
    index_name = "test-recall"

    # Embed first so the index dimension is taken from the model's actual
    # output instead of a hard-coded 384 that breaks when the model changes.
    vectors = [
        {
            "id": doc.id,
            "values": embedding_model.encode(doc.text).tolist(),
            "metadata": {"text": doc.text, "category": doc.category},
        }
        for doc in TEST_DOCUMENTS
    ]
    dimension = len(vectors[0]["values"])

    vector_db_client.create_index(index_name, dimension=dimension)
    try:
        vector_db_client.upsert(index_name, vectors)
        yield index_name
    finally:
        # Also runs when upsert raises, so a failed setup never leaks the index.
        vector_db_client.delete_index(index_name)
class TestSimilaritySearchAccuracy:
def test_recall_at_3(self, populated_index, vector_db_client, embedding_model):
"""Top-3 results should contain the expected relevant documents."""
for test_case in TEST_QUERIES:
query_embedding = embedding_model.encode(test_case["query"])
results = vector_db_client.query(
populated_index,
vector=query_embedding.tolist(),
top_k=3,
)
result_ids = [r["id"] for r in results["matches"]]
expected = set(test_case["expected_top_3"])
actual = set(result_ids)
recall = len(expected & actual) / len(expected)
assert recall >= 0.67, (
f"Query '{test_case['query']}': Recall@3 = {recall:.2f}, "
f"expected {expected}, got {actual}"
)
def test_relevant_results_outrank_irrelevant(self, populated_index, vector_db_client, embedding_model):
"""Relevant documents should rank above irrelevant ones."""
for test_case in TEST_QUERIES:
if "should_not_include" not in test_case:
continue
query_embedding = embedding_model.encode(test_case["query"])
results = vector_db_client.query(
populated_index,
vector=query_embedding.tolist(),
top_k=5,
)
result_ids = [r["id"] for r in results["matches"]]
# Check none of the "should not include" IDs appear in top-3
top_3_ids = set(result_ids[:3])
should_not = set(test_case["should_not_include"])
contamination = top_3_ids & should_not
assert not contamination, (
f"Query '{test_case['query']}': Irrelevant docs in top-3: {contamination}"
)
def test_similarity_scores_are_ranked(self, populated_index, vector_db_client, embedding_model):
"""Results should be returned in descending similarity order."""
query_embedding = embedding_model.encode("python testing")
results = vector_db_client.query(
populated_index,
vector=query_embedding.tolist(),
top_k=5,
include_values=True,
)
scores = [r["score"] for r in results["matches"]]
assert scores == sorted(scores, reverse=True), "Results not in descending score order"
def test_similarity_threshold_filtering(self, populated_index, vector_db_client, embedding_model):
"""Low-similarity results should be excluded when threshold is set."""
query_embedding = embedding_model.encode("completely unrelated topic about cooking")
results = vector_db_client.query(
populated_index,
vector=query_embedding.tolist(),
top_k=10,
score_threshold=0.7, # High threshold
)
# Cooking query against testing/ml-ops docs should return few or no results above threshold
assert len(results.get("matches", [])) <= 2Testing Metadata Filtering
Vector databases support metadata filtering to narrow results before similarity search:
class TestMetadataFiltering:
    """Checks that metadata filters restrict which documents a query can return."""

    def test_category_filter_excludes_other_categories(
        self, populated_index, vector_db_client, embedding_model
    ):
        """Metadata filter should restrict results to matching documents only."""
        query_embedding = embedding_model.encode("automated testing")
        # Filter to ml-ops category only
        results = vector_db_client.query(
            populated_index,
            vector=query_embedding.tolist(),
            top_k=10,
            filter={"category": {"$eq": "ml-ops"}},
        )
        matches = results.get("matches", [])
        # The corpus contains ml-ops documents, so an empty result means the
        # filter is broken; without this check the loop below passes vacuously.
        assert matches, "Filter on category 'ml-ops' returned no matches"
        for match in matches:
            assert match["metadata"]["category"] == "ml-ops", (
                f"Expected ml-ops, got {match['metadata']['category']} for {match['id']}"
            )

    def test_filter_narrows_result_count(self, populated_index, vector_db_client, embedding_model):
        """Category filter should return fewer results than unfiltered query."""
        query_embedding = embedding_model.encode("automated testing")
        unfiltered = vector_db_client.query(
            populated_index, vector=query_embedding.tolist(), top_k=10
        )
        filtered = vector_db_client.query(
            populated_index,
            vector=query_embedding.tolist(),
            top_k=10,
            filter={"category": {"$eq": "testing"}},
        )
        # An empty filtered result would satisfy the comparison trivially.
        assert filtered["matches"], "Filter on category 'testing' returned no matches"
        assert len(filtered["matches"]) <= len(unfiltered["matches"])


# Testing Index Operations
Test that the database handles CRUD operations correctly:
class TestIndexOperations:
    """Upsert, update, and delete behaviour against a shared index.

    Each test removes the vectors it wrote, so stale data from an earlier
    run cannot make a later run pass by accident.

    NOTE(review): nothing in view creates "test-index"; presumably it is
    provisioned outside these tests — confirm.
    """

    INDEX_NAME = "test-index"

    def test_upsert_and_retrieve(self, vector_db_client, embedding_model):
        """Documents upserted should be retrievable by ID."""
        test_id = "test-upsert-001"
        embedding = embedding_model.encode("test upsert document").tolist()
        vector_db_client.upsert(self.INDEX_NAME, [{"id": test_id, "values": embedding}])
        try:
            result = vector_db_client.fetch(self.INDEX_NAME, ids=[test_id])
            assert test_id in result["vectors"]
        finally:
            # Clean up even when the assertion fails.
            vector_db_client.delete(self.INDEX_NAME, ids=[test_id])

    def test_update_existing_vector(self, vector_db_client, embedding_model):
        """Upserting an existing ID should update the vector."""
        doc_id = "test-update-001"
        original = embedding_model.encode("original text").tolist()
        updated = embedding_model.encode("completely different updated text").tolist()
        vector_db_client.upsert(self.INDEX_NAME, [{"id": doc_id, "values": original}])
        try:
            vector_db_client.upsert(self.INDEX_NAME, [{"id": doc_id, "values": updated}])
            result = vector_db_client.fetch(self.INDEX_NAME, ids=[doc_id])
            fetched_vector = result["vectors"][doc_id]["values"]
            # Vector should match updated, not original
            assert fetched_vector != original
            assert fetched_vector == pytest.approx(updated, abs=1e-6)
        finally:
            # Clean up even when the second upsert or an assertion fails.
            vector_db_client.delete(self.INDEX_NAME, ids=[doc_id])

    def test_delete_removes_vector(self, vector_db_client, embedding_model):
        """Deleted vectors should not appear in query results."""
        doc_id = "test-delete-001"
        text = "this document will be deleted"
        embedding = embedding_model.encode(text).tolist()
        vector_db_client.upsert(self.INDEX_NAME, [{"id": doc_id, "values": embedding}])
        vector_db_client.delete(self.INDEX_NAME, ids=[doc_id])
        # Query with the same text — deleted doc should not appear
        results = vector_db_client.query(self.INDEX_NAME, vector=embedding, top_k=5)
        result_ids = [r["id"] for r in results.get("matches", [])]
        assert doc_id not in result_ids


# Performance Testing
Latency and throughput matter for production use:
import time
import statistics
class TestSearchPerformance:
def test_query_latency_under_500ms(self, populated_index, vector_db_client, embedding_model):
"""Single query should return in under 500ms."""
query_embedding = embedding_model.encode("test query for latency measurement").tolist()
start = time.monotonic()
vector_db_client.query(populated_index, vector=query_embedding, top_k=10)
latency_ms = (time.monotonic() - start) * 1000
assert latency_ms < 500, f"Query latency {latency_ms:.0f}ms exceeds 500ms threshold"
def test_p95_latency_under_200ms(self, populated_index, vector_db_client, embedding_model):
"""95th percentile query latency should be under 200ms."""
query_embedding = embedding_model.encode("performance test query").tolist()
latencies = []
for _ in range(20):
start = time.monotonic()
vector_db_client.query(populated_index, vector=query_embedding, top_k=10)
latencies.append((time.monotonic() - start) * 1000)
p95 = sorted(latencies)[int(0.95 * len(latencies))]
assert p95 < 200, f"P95 latency {p95:.0f}ms exceeds 200ms threshold"Connecting to HelpMeTest
Vector database tests need to run continuously — embedding model updates, index configuration changes, and data drift all affect retrieval quality over time. HelpMeTest monitors your vector database test suite on a schedule, alerting you when recall drops or latency degrades without requiring manual test runs.
Summary
Testing vector databases requires a different mindset than relational database testing:
- Build a ground-truth dataset — known query/result pairs to measure against
- Test embedding correctness — similar texts, similar vectors; dissimilar texts, dissimilar vectors
- Measure Recall@K — what fraction of expected results appear in the top-K
- Test metadata filtering — ensure filters correctly narrow the result space
- Test index operations — upsert, update, and delete behavior
- Monitor latency — query P95 latency is a production quality signal
Quality vector database tests give you confidence that your similarity search works as intended — not just that the vectors are stored, but that users get the right answers.