Testing Pinecone, Weaviate, and Qdrant Integrations

Testing Pinecone, Weaviate, and Qdrant Integrations

Vector databases are infrastructure. Like any database, they can be misconfigured, queried incorrectly, or updated in ways that corrupt your index. Unlike relational databases, the failure modes are subtle: an incorrect metadata filter silently narrows results, wrong namespace routing returns data from a different namespace or tenant, and stale vectors from deleted documents still appear in search results.

Testing your vector database integration is not optional if your application depends on semantic search or RAG.

What to Test for Vector Database Integrations

  1. Upsert correctness — vectors are stored with correct IDs, metadata, and values
  2. Query accuracy — similarity search returns correct top-K results for known vectors
  3. Namespace/collection isolation — queries don't bleed across namespaces or tenants
  4. Metadata filtering — filters correctly narrow results
  5. Delete propagation — deleted vectors no longer appear in results
  6. Index freshness — recently upserted vectors appear in subsequent queries

Testing Pinecone Integration

Setting Up a Test Index

Use a separate Pinecone index for tests, or a separate namespace within your development index:

import os
import time

import numpy as np
import pytest
from pinecone import Pinecone, ServerlessSpec

@pytest.fixture(scope="session")
def pinecone_test_index():
    """Provide a handle to the dedicated Pinecone test index for the session.

    The index named "test-index" is created (1536 dimensions, cosine metric,
    serverless on AWS us-east-1) when it is not already listed. The API key is
    read from the PINECONE_TEST_API_KEY environment variable.

    Yields:
        The index handle returned by ``pc.Index("test-index")``.

    Raises:
        KeyError: if PINECONE_TEST_API_KEY is not set.
    """
    pc = Pinecone(api_key=os.environ["PINECONE_TEST_API_KEY"])

    index_name = "test-index"
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )

    index = pc.Index(index_name)

    yield index

    # Final sweep after all tests. The suite writes to more than one namespace
    # ("test" and "test-ns-a"), so clearing only "test" would leave vectors
    # behind for the next session.
    test_namespaces = ("test", "test-ns-a", "test-ns-b")
    # Only touch namespaces the index currently reports.
    # NOTE(review): deleting from a namespace that does not exist is reported
    # to fail on serverless indexes — confirm for the client version in use.
    existing = index.describe_index_stats().namespaces or {}
    for namespace in test_namespaces:
        if namespace in existing:
            index.delete(delete_all=True, namespace=namespace)

@pytest.fixture(autouse=True)
def cleanup_test_namespace(pinecone_test_index):
    """Empty the "test" namespace once each test has finished.

    Declared autouse, so it wraps every test that can see this fixture
    without the test having to request it.
    """
    yield  # the test body runs here

    index = pinecone_test_index
    index.delete(namespace="test", delete_all=True)

Testing Upsert and Query

def make_random_vector(dim=1536):
    """Return a random unit-length vector as a plain list of floats.

    Args:
        dim: number of components in the vector.

    Returns:
        A list of ``dim`` Python floats, drawn from a standard normal
        distribution as float32 and scaled to Euclidean norm 1.
    """
    raw = np.random.randn(dim).astype(np.float32)
    length = np.linalg.norm(raw)
    unit = raw / length
    return unit.tolist()

def test_pinecone_upsert_and_query(pinecone_test_index):
    """Upsert one vector and check that querying with it returns that record.

    Checks the returned id, a near-1 similarity score, and that the stored
    metadata comes back with the match.
    """
    # Create a known vector
    known_vector = make_random_vector()

    pinecone_test_index.upsert(
        vectors=[
            {"id": "doc-1", "values": known_vector, "metadata": {"source": "test", "category": "A"}},
        ],
        namespace="test",
    )

    # Poll instead of sleeping for a fixed second: a single fixed wait is
    # either too short (flaky) or needlessly long. The bound keeps a genuine
    # failure from hanging the suite.
    deadline = time.monotonic() + 30
    while True:
        # Query with the same vector — should return itself as top result
        results = pinecone_test_index.query(
            vector=known_vector,
            top_k=1,
            namespace="test",
            include_metadata=True,
        )
        if results.matches or time.monotonic() >= deadline:
            break
        time.sleep(0.5)

    assert len(results.matches) == 1
    assert results.matches[0].id == "doc-1"
    assert results.matches[0].score > 0.999  # nearly identical vector
    assert results.matches[0].metadata["category"] == "A"

def test_pinecone_metadata_filter(pinecone_test_index):
    """Check that a metadata filter keeps category A and drops category B.

    Both vectors must be visible to an unfiltered query before the filtered
    query runs; otherwise "cat-b-1 is absent" could pass simply because it
    was not yet indexed.
    """
    vec_a = make_random_vector()
    vec_b = make_random_vector()

    pinecone_test_index.upsert(
        vectors=[
            {"id": "cat-a-1", "values": vec_a, "metadata": {"category": "A"}},
            {"id": "cat-b-1", "values": vec_b, "metadata": {"category": "B"}},
        ],
        namespace="test",
    )

    # Positive control: wait (bounded) until both records are queryable.
    expected_ids = {"cat-a-1", "cat-b-1"}
    deadline = time.monotonic() + 30
    while True:
        unfiltered = pinecone_test_index.query(
            vector=vec_a,
            top_k=10,
            namespace="test",
        )
        visible_ids = {m.id for m in unfiltered.matches}
        if expected_ids <= visible_ids or time.monotonic() >= deadline:
            break
        time.sleep(0.5)
    assert expected_ids <= visible_ids, "upserted vectors never became queryable"

    # Query with filter — should only return category A
    results = pinecone_test_index.query(
        vector=vec_a,
        top_k=10,
        namespace="test",
        filter={"category": {"$eq": "A"}},
        include_metadata=True,
    )

    returned_ids = [m.id for m in results.matches]
    assert "cat-a-1" in returned_ids
    assert "cat-b-1" not in returned_ids

def test_pinecone_namespace_isolation(pinecone_test_index):
    """Verify namespace A and namespace B don't share results.

    The vector is first confirmed to be queryable in its own namespace, so an
    empty result from the other namespace means isolation rather than "not
    indexed yet". The namespace written here is cleared afterwards because it
    is not the "test" namespace the per-test cleanup empties.
    """
    vec = make_random_vector()

    pinecone_test_index.upsert(
        vectors=[{"id": "shared-id", "values": vec, "metadata": {"ns": "ns-a"}}],
        namespace="test-ns-a",
    )

    try:
        # Positive control: bounded wait until the vector is visible in ns-a.
        deadline = time.monotonic() + 30
        while True:
            own = pinecone_test_index.query(
                vector=vec,
                top_k=5,
                namespace="test-ns-a",
            )
            if own.matches or time.monotonic() >= deadline:
                break
            time.sleep(0.5)
        assert "shared-id" in [m.id for m in own.matches]

        # Query in different namespace — should not find the vector
        results = pinecone_test_index.query(
            vector=vec,
            top_k=5,
            namespace="test-ns-b",  # different namespace
        )

        assert len(results.matches) == 0
    finally:
        # Runs on pass or fail so the vector never leaks into a later run.
        pinecone_test_index.delete(delete_all=True, namespace="test-ns-a")

def test_pinecone_delete_propagation(pinecone_test_index):
    """Check that a deleted vector stops appearing in query results.

    The vector must first be seen in results (so the later absence is
    meaningful), then it is deleted and the query is retried for a bounded
    time until it disappears.
    """
    vec = make_random_vector()
    pinecone_test_index.upsert(
        vectors=[{"id": "to-delete", "values": vec}],
        namespace="test",
    )

    def visible_ids():
        # IDs currently returned for the probe vector in the test namespace.
        results = pinecone_test_index.query(
            vector=vec, top_k=5, namespace="test"
        )
        return [m.id for m in results.matches]

    # Positive control: bounded wait until the upsert is queryable.
    deadline = time.monotonic() + 30
    returned_ids = visible_ids()
    while "to-delete" not in returned_ids and time.monotonic() < deadline:
        time.sleep(0.5)
        returned_ids = visible_ids()
    assert "to-delete" in returned_ids, "vector was never queryable before delete"

    pinecone_test_index.delete(ids=["to-delete"], namespace="test")

    # Bounded wait for the delete to take effect instead of a fixed sleep.
    deadline = time.monotonic() + 30
    returned_ids = visible_ids()
    while "to-delete" in returned_ids and time.monotonic() < deadline:
        time.sleep(0.5)
        returned_ids = visible_ids()

    assert "to-delete" not in returned_ids

Testing Weaviate Integration

Weaviate's schema-based approach requires testing collection (formerly "class") creation and object insertion:

import weaviate
import pytest

@pytest.fixture(scope="session")
def weaviate_client():
    """Connect to a local Weaviate and provide a "TestDocument" collection.

    Host and port come from WEAVIATE_TEST_HOST / WEAVIATE_TEST_PORT, falling
    back to localhost:8080. The collection is created without a vectorizer,
    so tests supply their own vectors, and is deleted at session end.

    Yields:
        The connected Weaviate client.
    """
    client = weaviate.connect_to_local(
        host=os.environ.get("WEAVIATE_TEST_HOST", "localhost"),
        port=int(os.environ.get("WEAVIATE_TEST_PORT", 8080)),
    )

    # try/finally so the connection is closed even when collection setup or
    # teardown raises.
    try:
        # Create test collection
        if not client.collections.exists("TestDocument"):
            client.collections.create(
                name="TestDocument",
                # Configure is taken from weaviate.classes.config, the same
                # module that provides Property and DataType below.
                vectorizer_config=weaviate.classes.config.Configure.Vectorizer.none(),
                properties=[
                    weaviate.classes.config.Property(name="content", data_type=weaviate.classes.config.DataType.TEXT),
                    weaviate.classes.config.Property(name="category", data_type=weaviate.classes.config.DataType.TEXT),
                ],
            )

        yield client

        client.collections.delete("TestDocument")
    finally:
        client.close()

def test_weaviate_insert_and_search(weaviate_client):
    """Insert one object with a known vector and find it by near-vector search."""
    documents = weaviate_client.collections.get("TestDocument")

    # Store an object under a vector we control.
    probe = make_random_vector()
    payload = {"content": "Test document content", "category": "tech"}
    inserted_id = documents.data.insert(properties=payload, vector=probe)

    # Ask for the single nearest neighbour of that same vector.
    wanted_metadata = weaviate.classes.query.MetadataQuery(certainty=True)
    response = documents.query.near_vector(
        near_vector=probe,
        limit=1,
        return_metadata=wanted_metadata,
    )

    hits = response.objects
    assert len(hits) == 1
    assert str(hits[0].uuid) == str(inserted_id)
    assert hits[0].metadata.certainty > 0.99

def test_weaviate_where_filter(weaviate_client):
    """Check that a property filter is what separates two identical vectors.

    Both objects are stored under the same vector, so only the category
    filter can exclude the "science" object from the results.
    """
    collection = weaviate_client.collections.get("TestDocument")

    vec_tech = make_random_vector()

    collection.data.insert(
        properties={"content": "Tech article", "category": "tech"},
        vector=vec_tech,
    )
    collection.data.insert(
        properties={"content": "Science article", "category": "science"},
        vector=vec_tech,  # same vector — only filter should distinguish
    )

    results = collection.query.near_vector(
        near_vector=vec_tech,
        limit=10,
        filters=weaviate.classes.query.Filter.by_property("category").equal("tech"),
        return_properties=["content", "category"],
    )

    categories = [obj.properties["category"] for obj in results.objects]
    # Without this, the checks below pass on an empty result, which would
    # hide a filter that matches nothing.
    assert categories, "filtered search returned no objects"
    assert all(c == "tech" for c in categories)
    assert "science" not in categories

Testing Qdrant Integration

Qdrant uses collections with explicit vector configuration:

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue

@pytest.fixture(scope="session")
def qdrant_client():
    """Connect to Qdrant and provide a "test_documents" collection.

    Host and port come from QDRANT_TEST_HOST / QDRANT_TEST_PORT, falling back
    to localhost:6333. The collection is created with 1536-dimensional cosine
    vectors when it does not exist, and is deleted at session end.

    Yields:
        A ``(client, collection_name)`` tuple.
    """
    client = QdrantClient(
        host=os.environ.get("QDRANT_TEST_HOST", "localhost"),
        port=int(os.environ.get("QDRANT_TEST_PORT", 6333)),
    )

    # try/finally so the client is released even when setup or teardown raises.
    try:
        collection_name = "test_documents"
        if not client.collection_exists(collection_name):
            client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
            )

        yield client, collection_name

        client.delete_collection(collection_name)
    finally:
        client.close()

def test_qdrant_upsert_and_search(qdrant_client):
    """Upsert one point and check that searching with its vector returns it."""
    client, collection = qdrant_client

    probe = make_random_vector()
    point = PointStruct(
        id=1,
        vector=probe,
        payload={"content": "Test document", "category": "A"},
    )
    client.upsert(collection_name=collection, points=[point])

    hits = client.search(
        collection_name=collection,
        query_vector=probe,
        limit=1,
        with_payload=True,
    )

    assert len(hits) == 1
    best = hits[0]
    assert best.id == 1
    assert best.score > 0.999
    assert best.payload["category"] == "A"

def test_qdrant_payload_filter(qdrant_client):
    """Check that a payload filter returns category A points and drops B.

    Point 12 carries the query vector and category A, so it must be in the
    filtered results; point 11 (category B) must not be.
    """
    client, collection = qdrant_client

    vec = make_random_vector()
    client.upsert(
        collection_name=collection,
        points=[
            PointStruct(id=10, vector=make_random_vector(), payload={"category": "A"}),
            PointStruct(id=11, vector=make_random_vector(), payload={"category": "B"}),
            PointStruct(id=12, vector=vec, payload={"category": "A"}),
        ],
    )

    results = client.search(
        collection_name=collection,
        query_vector=vec,
        limit=10,
        query_filter=Filter(
            must=[FieldCondition(key="category", match=MatchValue(value="A"))]
        ),
        with_payload=True,
    )

    ids = [r.id for r in results]
    # Positive check: without it, the assertions below pass on an empty
    # result, which would hide a filter that matches nothing.
    assert 12 in ids
    assert 11 not in ids  # category B should be filtered out
    assert all(r.payload["category"] == "A" for r in results)

Testing Your Application-Level Abstraction

If your application wraps the vector DB client in a service layer, test that abstraction:

from your_app.vector_store import VectorStore

def test_vector_store_add_and_search():
    """Add a document through the service layer and find it by a related query."""
    store = VectorStore(backend="qdrant", collection="test")

    added_id = store.add_document(
        text="PostgreSQL is an open-source relational database",
        metadata={"source": "docs", "doc_type": "database"},
    )

    hits = store.search(
        query="open source relational database",
        top_k=5,
        filter={"doc_type": "database"},
    )

    assert len(hits) > 0
    best = hits[0]
    assert best.doc_id == added_id
    assert best.score > 0.7
    assert best.metadata["source"] == "docs"

def test_vector_store_delete_removes_from_search():
    """Check that a deleted document no longer shows up in search results.

    The document is first confirmed to be searchable, so its absence after
    deletion is caused by the delete rather than by it never being indexed.
    """
    store = VectorStore(backend="qdrant", collection="test")
    doc_id = store.add_document("Temporary document to be deleted", metadata={})

    # Positive control: the document must be findable before it is deleted.
    before = store.search("Temporary document to be deleted", top_k=5)
    assert doc_id in [r.doc_id for r in before]

    store.delete_document(doc_id)

    results = store.search("Temporary document to be deleted", top_k=5)
    returned_ids = [r.doc_id for r in results]
    assert doc_id not in returned_ids

Key Takeaways

  • Always use a separate test index or namespace — never run tests against your production index
  • Test upsert-then-query immediately to catch indexing latency issues
  • Test metadata filters explicitly — they are a common source of silent bugs
  • Test namespace/collection isolation to catch multi-tenant routing errors
  • Test delete propagation — deleted vectors must not appear in subsequent queries
  • Test your application-layer abstraction, not just the raw client — that's where application bugs live

Read more