CockroachDB Testing: Distributed SQL, Jitter Tolerance, and Chaos Tests

CockroachDB Testing: Distributed SQL, Jitter Tolerance, and Chaos Tests

CockroachDB's distributed architecture introduces failure modes that standard database tests miss: transaction conflicts requiring retries, serializable isolation violations, and behavior under network partition. Your test suite needs to cover all three to have confidence in production.

Key Takeaways

Transaction retry logic is not optional in CockroachDB. The database can return a 40001 serialization failure error at any time. Application code that doesn't handle retries will fail silently in production under load.

Test under simulated network partition. CockroachDB is designed to survive node failures, but your application-level retry loops and timeout handling need to be tested. Use Toxiproxy to simulate partition, latency, and packet loss against a local cluster.

Use CockroachDB's CRDB-specific extensions intentionally. Functions like gen_random_uuid(), now() with high precision, and SHOW RANGES have specific semantics. Test that your queries use them correctly and that they behave the same in both single-node (test) and multi-node (production) configurations.

Running CockroachDB for Tests

CockroachDB ships an official Docker image that runs a single-node cluster — perfect for local development and CI. Single-node CRDB supports full SQL semantics, transactions, and most production features:

docker run -d \
  --name crdb-test \
  -p 26257:26257 \
  -p 8080:8080 \
  cockroachdb/cockroach:latest start-single-node \
  --insecure \
  --listen-addr=0.0.0.0

# Wait for it to be ready
docker <span class="hljs-built_in">exec crdb-test ./cockroach sql --insecure -e <span class="hljs-string">"SELECT 1"

Or use Docker Compose:

# docker-compose.yml
version: "3.8"
services:
  cockroachdb:
    image: cockroachdb/cockroach:latest
    command: start-single-node --insecure --listen-addr=0.0.0.0
    ports:
      - "26257:26257"
      - "8080:8080"
    healthcheck:
      test: ["CMD", "./cockroach", "node", "status", "--insecure"]
      interval: 5s
      timeout: 5s
      retries: 10

Connect with the PostgreSQL driver — CRDB is wire-compatible with Postgres:

// test/helpers/crdb.ts
import { Pool } from "pg";

export function createTestPool() {
  return new Pool({
    connectionString: "postgresql://root@localhost:26257/defaultdb?sslmode=disable",
    max: 20,
  });
}

export async function setupSchema(pool: Pool) {
  await pool.query(`
    CREATE TABLE IF NOT EXISTS accounts (
      id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
      owner_id UUID NOT NULL,
      balance DECIMAL(18, 2) NOT NULL DEFAULT 0,
      version INT NOT NULL DEFAULT 0,
      created_at TIMESTAMPTZ NOT NULL DEFAULT now()
    )
  `);
}

Testing Distributed Transactions

The canonical CockroachDB example is a fund transfer — a transaction that reads two rows, validates a balance, and updates both. This is exactly where serialization failures appear under concurrency:

// src/account-repository.ts
import { Pool, PoolClient } from "pg";

export class AccountRepository {
  constructor(private pool: Pool) {}

  async transfer(fromId: string, toId: string, amount: number): Promise<void> {
    const maxRetries = 5;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      const client = await this.pool.connect();
      try {
        await client.query("BEGIN");
        await client.query("SET TRANSACTION ISOLATION LEVEL SERIALIZABLE");

        const { rows: fromRows } = await client.query(
          "SELECT balance FROM accounts WHERE id = $1 FOR UPDATE",
          [fromId]
        );

        if (fromRows.length === 0) throw new Error("Source account not found");
        if (parseFloat(fromRows[0].balance) < amount) {
          throw new Error("Insufficient funds");
        }

        await client.query(
          "UPDATE accounts SET balance = balance - $1, version = version + 1 WHERE id = $2",
          [amount, fromId]
        );
        await client.query(
          "UPDATE accounts SET balance = balance + $1, version = version + 1 WHERE id = $2",
          [amount, toId]
        );

        await client.query("COMMIT");
        return;
      } catch (err: any) {
        await client.query("ROLLBACK");

        // 40001 = serialization failure, 40P01 = deadlock detected
        if (err.code === "40001" || err.code === "40P01") {
          if (attempt === maxRetries) throw err;
          // Exponential backoff with jitter
          await sleep(Math.pow(2, attempt) * 10 + Math.random() * 10);
          continue;
        }

        throw err;
      } finally {
        client.release();
      }
    }
  }
}

function sleep(ms: number) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

Now test the happy path and the retry path:

// test/account-repository.test.ts
import { Pool } from "pg";
import { AccountRepository } from "../src/account-repository";
import { createTestPool, setupSchema } from "./helpers/crdb";

let pool: Pool;
let repo: AccountRepository;

beforeAll(async () => {
  pool = createTestPool();
  await setupSchema(pool);
  repo = new AccountRepository(pool);
});

afterAll(() => pool.end());

afterEach(async () => {
  await pool.query("DELETE FROM accounts");
});

async function createAccount(balance: number): Promise<string> {
  const { rows } = await pool.query(
    "INSERT INTO accounts (owner_id, balance) VALUES (gen_random_uuid(), $1) RETURNING id",
    [balance]
  );
  return rows[0].id;
}

test("transfers funds between accounts", async () => {
  const alice = await createAccount(1000);
  const bob = await createAccount(500);

  await repo.transfer(alice, bob, 200);

  const { rows } = await pool.query(
    "SELECT id, balance FROM accounts WHERE id = ANY($1)",
    [[alice, bob]]
  );

  const aliceRow = rows.find((r) => r.id === alice);
  const bobRow = rows.find((r) => r.id === bob);

  expect(parseFloat(aliceRow.balance)).toBe(800);
  expect(parseFloat(bobRow.balance)).toBe(700);
});

test("rejects transfer when balance is insufficient", async () => {
  const alice = await createAccount(100);
  const bob = await createAccount(0);

  await expect(repo.transfer(alice, bob, 500)).rejects.toThrow("Insufficient funds");

  // Balances must be unchanged
  const { rows } = await pool.query(
    "SELECT id, balance FROM accounts WHERE id = ANY($1)",
    [[alice, bob]]
  );
  expect(parseFloat(rows.find((r) => r.id === alice).balance)).toBe(100);
  expect(parseFloat(rows.find((r) => r.id === bob).balance)).toBe(0);
});

test("concurrent transfers do not corrupt balances", async () => {
  const alice = await createAccount(10000);
  const bob = await createAccount(10000);

  // 20 concurrent transfers in both directions
  const transfers = [
    ...Array.from({ length: 10 }, () => repo.transfer(alice, bob, 100)),
    ...Array.from({ length: 10 }, () => repo.transfer(bob, alice, 100)),
  ];

  await Promise.allSettled(transfers); // some may fail, that's ok

  const { rows } = await pool.query(
    "SELECT id, balance FROM accounts WHERE id = ANY($1)",
    [[alice, bob]]
  );

  const total = rows.reduce((sum, r) => sum + parseFloat(r.balance), 0);
  // Total balance must be conserved regardless of which transfers succeeded
  expect(total).toBe(20000);
});

Testing Serializable Isolation

CockroachDB uses serializable isolation by default — the strongest isolation level. Verify that your application behavior matches serializable semantics:

test("serializable isolation prevents phantom reads", async () => {
  await pool.query(
    "INSERT INTO accounts (owner_id, balance) VALUES (gen_random_uuid(), 100)"
  );

  const client1 = await pool.connect();
  const client2 = await pool.connect();

  try {
    await client1.query("BEGIN ISOLATION LEVEL SERIALIZABLE");
    await client2.query("BEGIN ISOLATION LEVEL SERIALIZABLE");

    // Client 1 reads the count
    const { rows: before } = await client1.query("SELECT COUNT(*) FROM accounts");
    const countBefore = parseInt(before[0].count);

    // Client 2 inserts a new row and commits
    await client2.query(
      "INSERT INTO accounts (owner_id, balance) VALUES (gen_random_uuid(), 200)"
    );
    await client2.query("COMMIT");

    // Client 1 reads again — in serializable, should see same count
    // (or the transaction should fail with 40001)
    let sawPhantom = false;
    try {
      const { rows: after } = await client1.query("SELECT COUNT(*) FROM accounts");
      const countAfter = parseInt(after[0].count);
      sawPhantom = countAfter !== countBefore;
      await client1.query("COMMIT");
    } catch (err: any) {
      // 40001 is also correct — CockroachDB detected the conflict
      expect(err.code).toBe("40001");
      await client1.query("ROLLBACK");
      sawPhantom = false;
    }

    // Either no phantom read, OR transaction was aborted. Never a phantom that committed.
    expect(sawPhantom).toBe(false);
  } finally {
    client1.release();
    client2.release();
  }
});

Chaos Testing with Toxiproxy

Toxiproxy is a TCP proxy that can simulate network conditions. Use it to test how your application behaves when CockroachDB is slow or unreachable:

# Install Toxiproxy
brew install toxiproxy

<span class="hljs-comment"># Start it
toxiproxy-server &

<span class="hljs-comment"># Create a proxy in front of CockroachDB
toxiproxy-cli create crdb --listen localhost:26258 --upstream localhost:26257
// test/chaos/network-partition.test.ts
import axios from "axios";
import { Pool } from "pg";
import { AccountRepository } from "../../src/account-repository";

const TOXIPROXY_API = "http://localhost:8474";

async function addLatency(name: string, latencyMs: number) {
  await axios.post(`${TOXIPROXY_API}/proxies/${name}/toxics`, {
    name: "latency",
    type: "latency",
    attributes: { latency: latencyMs, jitter: latencyMs / 2 },
  });
}

async function addTimeout(name: string) {
  await axios.post(`${TOXIPROXY_API}/proxies/${name}/toxics`, {
    name: "timeout",
    type: "timeout",
    attributes: { timeout: 100 }, // 100ms timeout
  });
}

async function removeToxic(proxyName: string, toxicName: string) {
  await axios.delete(`${TOXIPROXY_API}/proxies/${proxyName}/toxics/${toxicName}`);
}

const chaosPool = new Pool({
  connectionString: "postgresql://root@localhost:26258/defaultdb?sslmode=disable",
  max: 5,
  connectionTimeoutMillis: 2000,
  statement_timeout: 5000,
});

test("transfer retries successfully after transient latency", async () => {
  const repo = new AccountRepository(chaosPool);

  // Seed data via direct connection (not through toxiproxy)
  const directPool = new Pool({
    connectionString: "postgresql://root@localhost:26257/defaultdb?sslmode=disable",
  });
  const { rows } = await directPool.query(
    "INSERT INTO accounts (owner_id, balance) VALUES (gen_random_uuid(), 5000), (gen_random_uuid(), 5000) RETURNING id"
  );
  const [alice, bob] = rows.map((r) => r.id);
  await directPool.end();

  // Add 500ms latency — enough to trigger read timeouts and retry
  await addLatency("crdb", 500);

  try {
    await repo.transfer(alice, bob, 100);
    // Transfer should eventually succeed through retries
  } finally {
    await removeToxic("crdb", "latency");
  }
});

test("application surfaces error after max retries under partition", async () => {
  const repo = new AccountRepository(chaosPool);

  // Full timeout — simulates partition
  await addTimeout("crdb");

  try {
    await expect(repo.transfer("any-id", "other-id", 100)).rejects.toThrow();
  } finally {
    await removeToxic("crdb", "timeout");
  }
});

CRDB-Specific SQL Edge Cases

CockroachDB implements PostgreSQL-compatible SQL but has some differences. Test these explicitly:

test("gen_random_uuid() produces valid UUIDs", async () => {
  const { rows } = await pool.query("SELECT gen_random_uuid() AS id");
  expect(rows[0].id).toMatch(
    /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i
  );
});

test("now() returns TIMESTAMPTZ consistent within a transaction", async () => {
  const client = await pool.connect();
  try {
    await client.query("BEGIN");
    const { rows: r1 } = await client.query("SELECT now() AS t");
    await sleep(50);
    const { rows: r2 } = await client.query("SELECT now() AS t");
    await client.query("COMMIT");

    // In CockroachDB, now() is frozen at transaction start time
    expect(r1[0].t.getTime()).toBe(r2[0].t.getTime());
  } finally {
    client.release();
  }
});

test("RETURNING clause works after INSERT", async () => {
  const { rows } = await pool.query(
    "INSERT INTO accounts (owner_id, balance) VALUES (gen_random_uuid(), $1) RETURNING id, balance",
    [999.99]
  );

  expect(rows[0].id).toBeDefined();
  expect(parseFloat(rows[0].balance)).toBe(999.99);
});

test("ON CONFLICT DO UPDATE works correctly", async () => {
  const ownerId = "00000000-0000-0000-0000-000000000001";

  await pool.query(
    "INSERT INTO accounts (id, owner_id, balance) VALUES ($1, $2, $3) ON CONFLICT (id) DO UPDATE SET balance = EXCLUDED.balance",
    ["test-id", ownerId, 100]
  );
  await pool.query(
    "INSERT INTO accounts (id, owner_id, balance) VALUES ($1, $2, $3) ON CONFLICT (id) DO UPDATE SET balance = EXCLUDED.balance",
    ["test-id", ownerId, 200]
  );

  const { rows } = await pool.query("SELECT balance FROM accounts WHERE id = $1", ["test-id"]);
  expect(parseFloat(rows[0].balance)).toBe(200);
});

Retry Logic Testing Without a Real Database

For unit tests that verify your retry wrapper works correctly, mock the pool to return 40001 errors:

// test/unit/retry-logic.test.ts
import { AccountRepository } from "../../src/account-repository";

function makeFailingPool(failTimes: number) {
  let calls = 0;
  return {
    connect: () =>
      Promise.resolve({
        query: async (sql: string) => {
          if (sql.startsWith("BEGIN") || sql.startsWith("ROLLBACK")) return;
          calls++;
          if (calls <= failTimes) {
            const err = new Error("could not serialize access") as any;
            err.code = "40001";
            throw err;
          }
          return { rows: [{ balance: "1000" }] };
        },
        release: () => {},
      }),
  };
}

test("retries on 40001 and succeeds on third attempt", async () => {
  const pool = makeFailingPool(2) as any;
  const repo = new AccountRepository(pool);

  // Should not throw — succeeds on attempt 3
  await expect(repo.transfer("a", "b", 100)).resolves.not.toThrow();
});

test("gives up after max retries", async () => {
  const pool = makeFailingPool(10) as any;
  const repo = new AccountRepository(pool);

  await expect(repo.transfer("a", "b", 100)).rejects.toMatchObject({ code: "40001" });
});

Go Testing Example

// account_test.go
package db_test

import (
    "database/sql"
    "testing"
    _ "github.com/lib/pq"
)

func TestTransferRetryOnSerializationFailure(t *testing.T) {
    db, err := sql.Open("postgres",
        "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
    if err != nil {
        t.Fatal(err)
    }
    defer db.Close()

    // Create test accounts
    var aliceID, bobID string
    db.QueryRow(`INSERT INTO accounts (owner_id, balance)
                 VALUES (gen_random_uuid(), 1000) RETURNING id`).Scan(&aliceID)
    db.QueryRow(`INSERT INTO accounts (owner_id, balance)
                 VALUES (gen_random_uuid(), 1000) RETURNING id`).Scan(&bobID)

    repo := &AccountRepository{db: db}
    if err := repo.Transfer(aliceID, bobID, 300); err != nil {
        t.Fatalf("expected transfer to succeed, got: %v", err)
    }

    var aliceBal, bobBal float64
    db.QueryRow(`SELECT balance FROM accounts WHERE id = $1`, aliceID).Scan(&aliceBal)
    db.QueryRow(`SELECT balance FROM accounts WHERE id = $1`, bobID).Scan(&bobBal)

    if aliceBal != 700 {
        t.Errorf("alice balance: got %v, want 700", aliceBal)
    }
    if bobBal != 1300 {
        t.Errorf("bob balance: got %v, want 1300", bobBal)
    }
}

HelpMeTest can run your CockroachDB integration tests automatically on every pull request — sign up free.

Read more