Unit Testing Embeddings and Vector Similarity
Embeddings are the foundation of semantic search, RAG systems, and recommendation engines. When embedding behavior is wrong — whether because the model changed, the preprocessing changed, or the similarity calculation is buggy — everything downstream breaks silently. Search returns irrelevant results. RAG systems hallucinate. Recommendations get weird.
Testing embeddings requires thinking about what "correct" means for a vector. You can't do equality checks. Instead, you test relationships: semantically similar texts should have higher cosine similarity than semantically different texts, and those relationships should be stable across code changes.
What to Test for Embeddings
- Dimensionality — does the embedding have the expected number of dimensions?
- Normalization — is the vector unit-normalized (required if you score matches with a raw dot product, which only equals cosine similarity for unit vectors)?
- Semantic ordering — is `sim(A, B) > sim(A, C)` when B is semantically closer to A than C?
- Known anchors — specific (text1, text2) pairs should have similarity above/below known thresholds
- Stability — do the same inputs produce the same outputs deterministically?
- Preprocessing consistency — does whitespace, capitalization, or punctuation affect embeddings as expected?
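Every test in this article calls an embedding service, and real services are slow and billed per call. A practical pattern is to route test embeddings through a small cache. Below is a minimal sketch, assuming the `get_embedding` client used throughout this article; the `embed` wrapper name and cache size are illustrative, not part of any library:

```python
import functools

import numpy as np

from your_embedding_service import get_embedding

@functools.lru_cache(maxsize=2048)
def _embed_cached(text: str) -> tuple:
    # lru_cache needs hashable return values, so store the vector as a tuple
    return tuple(float(x) for x in get_embedding(text))

def embed(text: str) -> np.ndarray:
    """Embed `text`, reusing cached results for repeated test inputs."""
    # Note: the determinism tests below should call get_embedding directly,
    # otherwise the cache makes them pass trivially.
    return np.asarray(_embed_cached(text))
```

The tests below call `get_embedding` directly for clarity; in a large suite, substituting a cached wrapper like this can cut runtime and API cost substantially.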
Testing Embedding Dimensionality and Shape
```python
import numpy as np
import pytest

from your_embedding_service import get_embedding, get_embeddings_batch

def test_single_embedding_shape():
    embedding = get_embedding("Hello, world!")
    assert isinstance(embedding, np.ndarray)
    assert embedding.shape == (1536,)  # OpenAI text-embedding-3-small
    assert embedding.dtype in (np.float32, np.float64)

def test_batch_embedding_shape():
    texts = ["First text", "Second text", "Third text"]
    embeddings = get_embeddings_batch(texts)
    assert embeddings.shape == (3, 1536)

def test_empty_string_handling():
    with pytest.raises(ValueError, match="cannot embed empty string"):
        get_embedding("")

def test_very_long_text_truncation():
    long_text = "word " * 10000  # exceeds token limit
    # Should either truncate or raise a clear error, not silently produce wrong output
    try:
        embedding = get_embedding(long_text)
        assert embedding.shape == (1536,)  # if truncation is policy
    except ValueError as e:
        assert "too long" in str(e).lower()  # if error is policy
```

Testing Vector Normalization
Many vector stores score matches with a raw dot product, which equals cosine similarity only when vectors are unit-normalized. If your system normalizes vectors, test that normalization is applied correctly:
```python
def test_embedding_is_unit_normalized():
    embedding = get_embedding("Test sentence for normalization check")
    norm = np.linalg.norm(embedding)
    # Unit vector has norm = 1.0, allow small floating point error
    assert abs(norm - 1.0) < 1e-5, f"Embedding norm is {norm}, expected 1.0"

def test_batch_embeddings_all_normalized():
    texts = [
        "Short text",
        "A much longer text with many more words and more complex vocabulary",
        "123 numbers and $pecial ch@racters",
    ]
    embeddings = get_embeddings_batch(texts)
    norms = np.linalg.norm(embeddings, axis=1)
    for i, norm in enumerate(norms):
        assert abs(norm - 1.0) < 1e-5, f"Embedding {i} norm is {norm}"
```
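Unit norms are exactly what make the dot-product shortcut valid. A quick companion test can pin down that equivalence; this is a sketch with illustrative probe texts:

```python
def test_dot_product_matches_cosine_for_unit_vectors():
    emb_a = get_embedding("vector databases")
    emb_b = get_embedding("semantic search")
    dot = float(np.dot(emb_a, emb_b))
    cosine = float(np.dot(emb_a, emb_b) / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b)))
    # For unit-normalized vectors the two scores agree up to float error
    assert abs(dot - cosine) < 1e-5
```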
Testing Semantic Ordering (the Core Test)

The most important embedding test: semantically similar texts must be closer than semantically different texts.
```python
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Semantic ordering test cases: (anchor, similar, different)
SEMANTIC_ORDERING_CASES = [
    (
        "How do I fix a Python import error?",
        "Python ImportError: No module named 'requests'",  # similar: same topic
        "Best practices for baking sourdough bread",  # different: unrelated domain
    ),
    (
        "The president signed the bill into law",
        "The legislation was approved by the executive",  # similar: paraphrase
        "Quantum entanglement violates Bell inequalities",  # different: unrelated
    ),
    (
        "JavaScript async/await tutorial",
        "How to use promises and async functions in JS",  # similar: same topic
        "Photosynthesis converts sunlight to glucose",  # different: unrelated
    ),
]

@pytest.mark.parametrize("anchor,similar,different", SEMANTIC_ORDERING_CASES)
def test_semantic_ordering(anchor, similar, different):
    emb_anchor = get_embedding(anchor)
    emb_similar = get_embedding(similar)
    emb_different = get_embedding(different)
    sim_with_similar = cosine_similarity(emb_anchor, emb_similar)
    sim_with_different = cosine_similarity(emb_anchor, emb_different)
    assert sim_with_similar > sim_with_different, (
        f"Semantic ordering violated:\n"
        f"  Anchor: {anchor}\n"
        f"  Similar: {similar} (similarity={sim_with_similar:.3f})\n"
        f"  Different: {different} (similarity={sim_with_different:.3f})\n"
        f"  Expected: similar score > different score"
    )
```
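A strict `>` comparison can pass by a sliver and then flip after the next model update. If you want earlier warning, a variant that demands a minimum gap makes near-ties fail loudly; the 0.05 margin here is an assumption you should calibrate for your model:

```python
ORDERING_MARGIN = 0.05  # required gap between similar and different scores

@pytest.mark.parametrize("anchor,similar,different", SEMANTIC_ORDERING_CASES)
def test_semantic_ordering_with_margin(anchor, similar, different):
    emb_anchor = get_embedding(anchor)
    sim_with_similar = cosine_similarity(emb_anchor, get_embedding(similar))
    sim_with_different = cosine_similarity(emb_anchor, get_embedding(different))
    # Fail when the gap shrinks, not only when the ordering flips outright
    assert sim_with_similar - sim_with_different >= ORDERING_MARGIN
```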
Testing Known Similarity Thresholds

For critical (query, document) pairs, establish expected similarity ranges and test against them:
```python
# Known similarity anchors — update these when changing embedding models
SIMILARITY_ANCHORS = [
    {
        "text_a": "machine learning",
        "text_b": "deep learning",
        "min_similarity": 0.85,
        "description": "Near-synonyms in ML domain",
    },
    {
        "text_a": "car",
        "text_b": "automobile",
        "min_similarity": 0.90,
        "description": "Exact synonyms",
    },
    {
        "text_a": "Python programming language",
        "text_b": "Monty Python sketch comedy",
        "max_similarity": 0.60,
        "description": "Same word, different semantic domain",
    },
    {
        "text_a": "How to test REST APIs",
        "text_b": "How to bake chocolate cake",
        "max_similarity": 0.40,
        "description": "Completely different domains",
    },
]

@pytest.mark.parametrize("case", SIMILARITY_ANCHORS)
def test_known_similarity_thresholds(case):
    emb_a = get_embedding(case['text_a'])
    emb_b = get_embedding(case['text_b'])
    similarity = cosine_similarity(emb_a, emb_b)
    if 'min_similarity' in case:
        assert similarity >= case['min_similarity'], (
            f"{case['description']}: similarity {similarity:.3f} < min {case['min_similarity']}"
        )
    if 'max_similarity' in case:
        assert similarity <= case['max_similarity'], (
            f"{case['description']}: similarity {similarity:.3f} > max {case['max_similarity']}"
        )
```

Testing Embedding Determinism
The same input must always produce the same embedding:
```python
def test_embedding_determinism():
    text = "Determinism test: this text should always embed identically"
    embedding1 = get_embedding(text)
    embedding2 = get_embedding(text)
    embedding3 = get_embedding(text)
    np.testing.assert_array_equal(embedding1, embedding2)
    np.testing.assert_array_equal(embedding1, embedding3)

def test_batch_order_independence():
    texts = ["First text", "Second text", "Third text"]
    reversed_texts = list(reversed(texts))
    embeddings = get_embeddings_batch(texts)
    reversed_embeddings = get_embeddings_batch(reversed_texts)
    # Reverse the reversed results to compare in original order
    reversed_embeddings = reversed_embeddings[::-1]
    np.testing.assert_array_almost_equal(embeddings, reversed_embeddings, decimal=5)
```
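Bit-exact equality is a fair expectation for a local model with fixed weights. Some hosted embedding APIs, however, return slightly different floats for the same input across calls. If that describes your provider, relax the check to a tight similarity bound instead of deleting it; the threshold below is an assumption to calibrate:

```python
def test_embedding_stability_within_tolerance():
    text = "Stability test for providers that are not bit-exact"
    emb1 = get_embedding(text)
    emb2 = get_embedding(text)
    # Repeated calls should produce near-identical vectors
    assert cosine_similarity(emb1, emb2) > 0.9999
```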
Testing Preprocessing Effects

Understand and document how preprocessing affects your embeddings:
```python
def test_whitespace_normalization():
    text_clean = "Hello world"
    text_spaces = "Hello   world"  # multiple spaces
    text_newlines = "Hello\nworld"
    emb_clean = get_embedding(text_clean)
    emb_spaces = get_embedding(text_spaces)
    emb_newlines = get_embedding(text_newlines)
    # Check your system's policy: are these treated identically or differently?
    # Document whichever is true for your system
    sim_spaces = cosine_similarity(emb_clean, emb_spaces)
    sim_newlines = cosine_similarity(emb_clean, emb_newlines)
    # Most embedding models treat these as equivalent
    assert sim_spaces > 0.999, f"Extra spaces changed embedding significantly: {sim_spaces:.4f}"
    assert sim_newlines > 0.995, f"Newline changed embedding significantly: {sim_newlines:.4f}"

def test_case_sensitivity():
    lower = get_embedding("python programming")
    upper = get_embedding("PYTHON PROGRAMMING")
    mixed = get_embedding("Python Programming")
    sim_upper = cosine_similarity(lower, upper)
    sim_mixed = cosine_similarity(lower, mixed)
    # Most models are largely case-insensitive for content words
    assert sim_upper > 0.98, f"Case sensitivity too high: {sim_upper:.4f}"
    assert sim_mixed > 0.98, f"Case sensitivity too high: {sim_mixed:.4f}"
```
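Punctuation is the third preprocessing axis from the checklist above, and it tends to be less predictable than whitespace or case. Here is a sketch of the same documentation-style test; the 0.95 threshold is an assumption to calibrate against your model:

```python
def test_punctuation_sensitivity():
    plain = get_embedding("How do I reset my password")
    punctuated = get_embedding("How do I reset my password?!")
    sim = cosine_similarity(plain, punctuated)
    # Trailing punctuation should shift the embedding only slightly
    assert sim > 0.95, f"Punctuation changed embedding significantly: {sim:.4f}"
```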
Regression Testing Against Saved Embeddings

When you need to ensure an embedding model update doesn't change behavior:
```python
import json
from pathlib import Path

EMBEDDING_FIXTURES_PATH = Path("tests/fixtures/embedding_snapshots.json")

def test_embeddings_match_saved_snapshots():
    """Catch silent embedding model changes."""
    if not EMBEDDING_FIXTURES_PATH.exists():
        pytest.skip("No embedding snapshots found — run save_embedding_snapshots() first")
    with open(EMBEDDING_FIXTURES_PATH) as f:
        snapshots = json.load(f)
    for snap in snapshots:
        current = get_embedding(snap['text'])
        saved = np.array(snap['embedding'])
        similarity = cosine_similarity(current, saved)
        # If you updated the model intentionally, regenerate snapshots
        assert similarity > 0.999, (
            f"Embedding for '{snap['text'][:50]}...' changed significantly. "
            f"Cosine similarity with saved snapshot: {similarity:.4f}. "
            "If this is intentional (model update), regenerate snapshots."
        )

def save_embedding_snapshots():
    """Run this once to create baseline snapshots."""
    test_texts = [
        "software testing best practices",
        "how to write unit tests",
        "integration testing vs unit testing",
    ]
    snapshots = [
        {"text": t, "embedding": get_embedding(t).tolist()}
        for t in test_texts
    ]
    with open(EMBEDDING_FIXTURES_PATH, 'w') as f:
        json.dump(snapshots, f)
```
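To keep regeneration a deliberate act, one option is to expose the snapshot writer as a script entry point rather than a test, so it never runs by accident in CI. A minimal sketch, assuming the snapshot module can be executed directly:

```python
if __name__ == "__main__":
    # Run this module directly after an intentional model change
    save_embedding_snapshots()
    print(f"Wrote snapshots to {EMBEDDING_FIXTURES_PATH}")
```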
Performance Testing for Embedding Services

Embeddings in production often become latency bottlenecks:
```python
import time

def test_single_embedding_latency():
    start = time.perf_counter()
    get_embedding("Performance test sentence")
    elapsed_ms = (time.perf_counter() - start) * 1000
    assert elapsed_ms < 500, f"Single embedding took {elapsed_ms:.0f}ms, expected < 500ms"

def test_batch_embedding_throughput():
    texts = [f"Test sentence number {i}" for i in range(100)]
    start = time.perf_counter()
    get_embeddings_batch(texts)
    elapsed_ms = (time.perf_counter() - start) * 1000
    per_item_ms = elapsed_ms / len(texts)
    assert per_item_ms < 50, f"Batch throughput: {per_item_ms:.1f}ms/item, expected < 50ms/item"
```

Key Takeaways
- Test semantic ordering: similar texts must score higher than dissimilar texts for the same anchor
- Use known similarity anchors with min/max thresholds to catch model changes
- Test determinism: the same input must always produce the same embedding
- Test normalization if your cosine similarity assumes unit vectors
- Save embedding snapshots as regression baselines — catch silent model updates
- Test preprocessing edge cases: whitespace, case, punctuation, empty strings, very long texts