Introduction#
Semantic search finds documents based on meaning, not just keyword matches. A search for “container orchestration” returns results about Kubernetes even if those words don’t appear. The foundation is vector embeddings — numerical representations of text that capture semantic meaning. pgvector brings vector similarity search directly into PostgreSQL, allowing you to combine semantic search with traditional SQL filtering.
How Embeddings Work#
1
2
3
4
5
6
7
8
9
10
11
Text → Embedding Model → Vector (e.g., 1536 floats)
"Kubernetes manages containers" → [0.023, -0.156, 0.087, ...]
"Docker container scheduling" → [0.019, -0.149, 0.091, ...]
"PostgreSQL query optimization" → [-0.234, 0.412, -0.056, ...]
The first two vectors are close (similar meaning)
The third is far (different topic)
Similarity measured by cosine similarity or dot product
cosine_similarity([0.023, -0.156, ...], [0.019, -0.149, ...]) ≈ 0.98
pgvector Setup#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
-- Install pgvector extension
CREATE EXTENSION vector;

-- Create a table with embedding column
CREATE TABLE documents (
    id BIGSERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    content TEXT NOT NULL,
    -- NOTE(review): 1536 matches OpenAI text-embedding-3-small, but the Python
    -- helpers in this article default to voyage-3 (1024 dims) — the dimension
    -- here must match whichever embedding model you actually use; confirm.
    embedding vector(1536), -- OpenAI text-embedding-3-small dimension
    created_at TIMESTAMPTZ DEFAULT NOW(),
    author_id BIGINT
);

-- NOTE(review): in practice you create only ONE of the two indexes below;
-- both are shown here for comparison.

-- IVFFlat index: approximate nearest neighbor, good for recall/speed tradeoff
CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100); -- more lists = faster search, less recall

-- HNSW index (pgvector 0.5+): better recall, uses more memory
CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- Set search parameters (per-session tuning knobs)
SET ivfflat.probes = 10; -- number of lists to search (higher = better recall)
SET hnsw.ef_search = 64; -- HNSW search depth
Generating Embeddings#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import anthropic
import numpy as np
from typing import Union
# NOTE(review): this Anthropic client is not used by any helper below — the
# embedding functions call Voyage AI / OpenAI directly; consider removing it.
client = anthropic.Anthropic()
def embed_text(text: str, model: str = "voyage-3") -> list[float]:
    """Generate an embedding for a single text.

    Placeholder — wire this to your embedding provider. For Voyage AI:

        import voyageai
        vo = voyageai.Client()
        return vo.embed([text], model=model, input_type="document").embeddings[0]

    Args:
        text: The text to embed.
        model: Embedding model name passed to the provider.

    Raises:
        NotImplementedError: always, until a provider is configured.
    """
    # BUG FIX: the original assigned `client.beta.messages.batches` to a
    # variable here — dead code (the Anthropic API does not expose an
    # embeddings endpoint), so the misleading statement is removed.
    raise NotImplementedError("Configure your embedding provider")
def embed_batch(texts: list[str], model: str = "voyage-3") -> list[list[float]]:
    """Embed several texts with one Voyage AI call (cheaper than per-text calls)."""
    import voyageai

    voyage = voyageai.Client()
    response = voyage.embed(texts, model=model, input_type="document")
    return response.embeddings
def embed_query(text: str, model: str = "voyage-3") -> list[float]:
    """Embed a search query.

    Uses input_type='query' so Voyage applies its asymmetric query encoding,
    which pairs with documents embedded using input_type='document'.
    """
    import voyageai

    response = voyageai.Client().embed([text], model=model, input_type="query")
    return response.embeddings[0]
# Also works with OpenAI:
def embed_openai(text: str) -> list[float]:
    """Embed a single text with OpenAI's text-embedding-3-small model."""
    from openai import OpenAI

    # local renamed from `openai` — the original shadowed the module name
    api = OpenAI()
    result = api.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return result.data[0].embedding
Indexing Documents#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import psycopg2
from psycopg2.extras import execute_batch
def index_documents(documents: list[dict], batch_size: int = 100) -> None:
    """Embed and upsert documents into the `documents` table.

    Each document dict must provide "id", "title", and "content". Title and
    content are concatenated before embedding so the title contributes to the
    vector. Commits after every batch, so progress survives a mid-run crash.

    Args:
        documents: Rows to index.
        batch_size: Documents embedded and inserted per round trip.
    """
    conn = psycopg2.connect("postgresql://localhost/app")
    try:
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            texts = [f"{doc['title']}\n\n{doc['content']}" for doc in batch]
            embeddings = embed_batch(texts)
            rows = [
                (doc["id"], doc["title"], doc["content"], embedding)
                for doc, embedding in zip(batch, embeddings)
            ]
            with conn.cursor() as cur:
                execute_batch(cur, """
                    INSERT INTO documents (id, title, content, embedding)
                    VALUES (%s, %s, %s, %s::vector)
                    ON CONFLICT (id) DO UPDATE SET
                        embedding = EXCLUDED.embedding,
                        content = EXCLUDED.content
                """, rows)
            conn.commit()
            print(f"Indexed {min(start + batch_size, len(documents))}/{len(documents)}")
    finally:
        # BUG FIX: the original only closed the connection on the success
        # path, leaking it if embedding or the insert raised.
        conn.close()
Semantic Search#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def semantic_search(
    query: str,
    limit: int = 10,
    author_id: int | None = None,
    min_similarity: float = 0.7,
) -> list[dict]:
    """Find documents semantically similar to *query*.

    Args:
        query: Natural-language search text.
        limit: Maximum number of rows returned.
        author_id: If given, restrict results to that author.
        min_similarity: Minimum cosine similarity (0..1) for a row to match.

    Returns:
        Dicts with id, title, content, similarity — best match first.
    """
    query_embedding = embed_query(query)
    conn = psycopg2.connect("postgresql://localhost/app")
    try:
        with conn.cursor() as cur:
            # pgvector distance operators:
            #   <->  L2 distance (Euclidean)
            #   <=>  cosine distance (1 - cosine_similarity)
            #   <#>  negative inner product (for dot product similarity)
            sql = """
                SELECT
                    id,
                    title,
                    content,
                    1 - (embedding <=> %s::vector) AS similarity
                FROM documents
                WHERE 1 - (embedding <=> %s::vector) >= %s
            """
            params: list = [query_embedding, query_embedding, min_similarity]
            # BUG FIX: the original tested `if author_id:`, which silently
            # dropped the filter when author_id == 0.
            if author_id is not None:
                sql += " AND author_id = %s"
                params.append(author_id)
            sql += " ORDER BY embedding <=> %s::vector LIMIT %s"
            params.extend([query_embedding, limit])
            cur.execute(sql, params)
            columns = [desc[0] for desc in cur.description]
            return [dict(zip(columns, row)) for row in cur.fetchall()]
    finally:
        # BUG FIX: the original never closed the connection.
        conn.close()
# Usage — example call and the kind of output it produces
results = semantic_search("how to scale kubernetes horizontally")
for r in results:
    print(f"{r['similarity']:.3f} — {r['title']}")
# Example output (similarity score, then title):
# 0.934 — Kubernetes HPA and VPA
# 0.891 — Kubernetes Resource Limits and Requests
# 0.856 — Container Orchestration Best Practices
Hybrid Search (Semantic + Full-Text)#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def hybrid_search(
    query: str,
    limit: int = 10,
    semantic_weight: float = 0.7,
) -> list[dict]:
    """Combine vector similarity with PostgreSQL full-text ranking.

    Scores are blended as:
        semantic_weight * cosine_similarity + (1 - semantic_weight) * ts_rank_cd

    Note: ts_rank_cd is PostgreSQL's cover-density ranking, not true BM25 —
    the original docstring mislabeled it.

    Args:
        query: Natural-language search text (also used as the tsquery).
        limit: Maximum number of rows returned.
        semantic_weight: Weight (0..1) given to the vector-similarity score.

    Returns:
        Dicts with id, title, content, hybrid_score — best match first.
    """
    query_embedding = embed_query(query)
    conn = psycopg2.connect("postgresql://localhost/app")
    try:
        with conn.cursor() as cur:
            cur.execute("""
                WITH semantic AS (
                    SELECT
                        id,
                        1 - (embedding <=> %s::vector) AS semantic_score
                    FROM documents
                    ORDER BY embedding <=> %s::vector
                    LIMIT 50
                ),
                fulltext AS (
                    SELECT
                        id,
                        ts_rank_cd(
                            to_tsvector('english', title || ' ' || content),
                            plainto_tsquery('english', %s)
                        ) AS text_score
                    FROM documents
                    WHERE to_tsvector('english', title || ' ' || content)
                        @@ plainto_tsquery('english', %s)
                    -- BUG FIX: LIMIT without ORDER BY kept an arbitrary
                    -- subset; rank first so the best text matches survive.
                    ORDER BY text_score DESC
                    LIMIT 50
                )
                SELECT
                    d.id,
                    d.title,
                    d.content,
                    COALESCE(s.semantic_score, 0) * %s
                    + COALESCE(f.text_score, 0) * (1 - %s) AS hybrid_score
                FROM documents d
                LEFT JOIN semantic s ON d.id = s.id
                LEFT JOIN fulltext f ON d.id = f.id
                WHERE s.id IS NOT NULL OR f.id IS NOT NULL
                ORDER BY hybrid_score DESC
                LIMIT %s
            """, [
                query_embedding, query_embedding,
                query, query,
                semantic_weight, semantic_weight,
                limit,
            ])
            columns = [desc[0] for desc in cur.description]
            return [dict(zip(columns, row)) for row in cur.fetchall()]
    finally:
        # BUG FIX: the original never closed the connection.
        conn.close()
Chunking Long Documents#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def chunk_document(text: str, chunk_size: int = 512, overlap: int = 64) -> list[str]:
    """Split *text* into overlapping word-based chunks for embedding.

    Consecutive chunks share `overlap` words so context is preserved across
    chunk boundaries.

    Args:
        text: Document body; split on whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        Chunks in document order; [] for empty or whitespace-only text.

    Raises:
        ValueError: if chunk_size <= 0 or not 0 <= overlap < chunk_size.
            (BUG FIX: the original looped forever when overlap >= chunk_size,
            because the step `chunk_size - overlap` became non-positive.)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
def index_chunked_document(doc_id: int, title: str, content: str) -> None:
    """Chunk, embed, and store one document in the `document_chunks` table.

    The title is prepended to every chunk before embedding so each chunk's
    vector carries document-level context.

    Args:
        doc_id: Parent document id.
        title: Document title, prefixed onto each chunk for embedding.
        content: Full document body to be chunked.
    """
    chunks = chunk_document(content)
    embeddings = embed_batch([f"{title}\n\n{chunk}" for chunk in chunks])
    conn = psycopg2.connect("postgresql://localhost/app")
    try:
        with conn.cursor() as cur:
            # One row per chunk; chunk_index preserves document order.
            cur.executemany("""
                INSERT INTO document_chunks (doc_id, chunk_index, content, embedding)
                VALUES (%s, %s, %s, %s::vector)
            """, [
                (doc_id, i, chunk, emb)
                for i, (chunk, emb) in enumerate(zip(chunks, embeddings))
            ])
        conn.commit()
    finally:
        # BUG FIX: the original never closed the connection at all.
        conn.close()
Conclusion#
pgvector brings production-grade vector similarity search into the database you already run, eliminating the need for a separate vector database for most use cases. HNSW indexes provide excellent recall with millisecond-scale query times at millions of vectors. Hybrid search — combining cosine similarity with PostgreSQL’s full-text ranking — typically outperforms either approach alone. Chunk long documents with overlap to maintain context across boundaries. For datasets above roughly 10M vectors, or workloads requiring heavy real-time updates at scale, dedicated vector databases (Weaviate, Qdrant, Pinecone) offer more tuning options.