Introduction#
Semantic search finds documents based on meaning, not just keyword matches. A search for “container orchestration” returns results about Kubernetes even if those words don’t appear. The foundation is vector embeddings — numerical representations of text that capture semantic meaning. pgvector brings vector similarity search directly into PostgreSQL, allowing you to combine semantic search with traditional SQL filtering.
How Embeddings Work#
1
2
3
4
5
6
7
8
9
10
11
Text → Embedding Model → Vector (e.g., 1536 floats)
"Kubernetes manages containers" → [0.023, -0.156, 0.087, ...]
"Docker container scheduling" → [0.019, -0.149, 0.091, ...]
"PostgreSQL query optimization" → [-0.234, 0.412, -0.056, ...]
The first two vectors are close (similar meaning)
The third is far (different topic)
Similarity measured by cosine similarity or dot product
cosine_similarity([0.023, -0.156, ...], [0.019, -0.149, ...]) ≈ 0.98
pgvector Setup#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
-- Install pgvector extension
CREATE EXTENSION vector;

-- Create a table with embedding column
CREATE TABLE documents (
    id BIGSERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    content TEXT NOT NULL,
    -- NOTE(review): 1536 matches OpenAI text-embedding-3-small, but the Python
    -- helpers in this article default to voyage-3 (1024 dims) — the dimension
    -- here must match whichever embedding model you actually use; confirm.
    embedding vector(1536), -- OpenAI text-embedding-3-small dimension
    created_at TIMESTAMPTZ DEFAULT NOW(),
    author_id BIGINT
);

-- NOTE(review): in practice you create only ONE of the two indexes below;
-- both are shown here for comparison.

-- IVFFlat index: approximate nearest neighbor, good for recall/speed tradeoff
CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100); -- more lists = faster search, less recall

-- HNSW index (pgvector 0.5+): better recall, uses more memory
CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- Set search parameters (per-session tuning knobs)
SET ivfflat.probes = 10; -- number of lists to search (higher = better recall)
SET hnsw.ef_search = 64; -- HNSW search depth
Generating Embeddings#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import anthropic
import numpy as np
from typing import Union
# NOTE(review): this Anthropic client is not used by any helper below — the
# embedding functions call Voyage AI / OpenAI directly; consider removing it.
client = anthropic.Anthropic()
def embed_text(text: str, model: str = "voyage-3") -> list[float]:
    """Generate an embedding for a single text.

    Placeholder — wire this to your embedding provider. For Voyage AI:

        import voyageai
        vo = voyageai.Client()
        return vo.embed([text], model=model, input_type="document").embeddings[0]

    Args:
        text: The text to embed.
        model: Embedding model name passed to the provider.

    Raises:
        NotImplementedError: always, until a provider is configured.
    """
    # BUG FIX: the original assigned `client.beta.messages.batches` to a
    # variable here — dead code (the Anthropic API does not expose an
    # embeddings endpoint), so the misleading statement is removed.
    raise NotImplementedError("Configure your embedding provider")
def embed_batch(texts: list[str], model: str = "voyage-3") -> list[list[float]]:
    """Embed several texts with one Voyage AI call (cheaper than per-text calls)."""
    import voyageai

    voyage = voyageai.Client()
    response = voyage.embed(texts, model=model, input_type="document")
    return response.embeddings
def embed_query(text: str, model: str = "voyage-3") -> list[float]:
    """Embed a search query.

    Uses input_type='query' so Voyage applies its asymmetric query encoding,
    which pairs with documents embedded using input_type='document'.
    """
    import voyageai

    response = voyageai.Client().embed([text], model=model, input_type="query")
    return response.embeddings[0]
# Also works with OpenAI:
def embed_openai(text: str) -> list[float]:
    """Embed a single text with OpenAI's text-embedding-3-small model."""
    from openai import OpenAI

    # local renamed from `openai` — the original shadowed the module name
    api = OpenAI()
    result = api.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return result.data[0].embedding
Indexing Documents#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import psycopg2
from psycopg2.extras import execute_batch
def index_documents(documents: list[dict], batch_size: int = 100) -> None:
    """Embed and upsert documents into the `documents` table.

    Each document dict must provide "id", "title", and "content". Title and
    content are concatenated before embedding so the title contributes to the
    vector. Commits after every batch, so progress survives a mid-run crash.

    Args:
        documents: Rows to index.
        batch_size: Documents embedded and inserted per round trip.
    """
    conn = psycopg2.connect("postgresql://localhost/app")
    try:
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            texts = [f"{doc['title']}\n\n{doc['content']}" for doc in batch]
            embeddings = embed_batch(texts)
            rows = [
                (doc["id"], doc["title"], doc["content"], embedding)
                for doc, embedding in zip(batch, embeddings)
            ]
            with conn.cursor() as cur:
                execute_batch(cur, """
                    INSERT INTO documents (id, title, content, embedding)
                    VALUES (%s, %s, %s, %s::vector)
                    ON CONFLICT (id) DO UPDATE SET
                        embedding = EXCLUDED.embedding,
                        content = EXCLUDED.content
                """, rows)
            conn.commit()
            print(f"Indexed {min(start + batch_size, len(documents))}/{len(documents)}")
    finally:
        # BUG FIX: the original only closed the connection on the success
        # path, leaking it if embedding or the insert raised.
        conn.close()
Semantic Search#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def semantic_search(
    query: str,
    limit: int = 10,
    author_id: int | None = None,
    min_similarity: float = 0.7,
) -> list[dict]:
    """Find documents semantically similar to *query*.

    Args:
        query: Natural-language search text.
        limit: Maximum number of rows returned.
        author_id: If given, restrict results to that author.
        min_similarity: Minimum cosine similarity (0..1) for a row to match.

    Returns:
        Dicts with id, title, content, similarity — best match first.
    """
    query_embedding = embed_query(query)
    conn = psycopg2.connect("postgresql://localhost/app")
    try:
        with conn.cursor() as cur:
            # pgvector distance operators:
            #   <->  L2 distance (Euclidean)
            #   <=>  cosine distance (1 - cosine_similarity)
            #   <#>  negative inner product (for dot product similarity)
            sql = """
                SELECT
                    id,
                    title,
                    content,
                    1 - (embedding <=> %s::vector) AS similarity
                FROM documents
                WHERE 1 - (embedding <=> %s::vector) >= %s
            """
            params: list = [query_embedding, query_embedding, min_similarity]
            # BUG FIX: the original tested `if author_id:`, which silently
            # dropped the filter when author_id == 0.
            if author_id is not None:
                sql += " AND author_id = %s"
                params.append(author_id)
            sql += " ORDER BY embedding <=> %s::vector LIMIT %s"
            params.extend([query_embedding, limit])
            cur.execute(sql, params)
            columns = [desc[0] for desc in cur.description]
            return [dict(zip(columns, row)) for row in cur.fetchall()]
    finally:
        # BUG FIX: the original never closed the connection.
        conn.close()
# Usage — example call and the kind of output it produces
results = semantic_search("how to scale kubernetes horizontally")
for r in results:
    print(f"{r['similarity']:.3f} — {r['title']}")
# Example output (similarity score, then title):
# 0.934 — Kubernetes HPA and VPA
# 0.891 — Kubernetes Resource Limits and Requests
# 0.856 — Container Orchestration Best Practices
Hybrid Search (Semantic + Full-Text)#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def hybrid_search(
    query: str,
    limit: int = 10,
    semantic_weight: float = 0.7,
) -> list[dict]:
    """Combine vector similarity with PostgreSQL full-text ranking.

    Scores are blended as:
        semantic_weight * cosine_similarity + (1 - semantic_weight) * ts_rank_cd

    Note: ts_rank_cd is PostgreSQL's cover-density ranking, not true BM25 —
    the original docstring mislabeled it.

    Args:
        query: Natural-language search text (also used as the tsquery).
        limit: Maximum number of rows returned.
        semantic_weight: Weight (0..1) given to the vector-similarity score.

    Returns:
        Dicts with id, title, content, hybrid_score — best match first.
    """
    query_embedding = embed_query(query)
    conn = psycopg2.connect("postgresql://localhost/app")
    try:
        with conn.cursor() as cur:
            cur.execute("""
                WITH semantic AS (
                    SELECT
                        id,
                        1 - (embedding <=> %s::vector) AS semantic_score
                    FROM documents
                    ORDER BY embedding <=> %s::vector
                    LIMIT 50
                ),
                fulltext AS (
                    SELECT
                        id,
                        ts_rank_cd(
                            to_tsvector('english', title || ' ' || content),
                            plainto_tsquery('english', %s)
                        ) AS text_score
                    FROM documents
                    WHERE to_tsvector('english', title || ' ' || content)
                        @@ plainto_tsquery('english', %s)
                    -- BUG FIX: LIMIT without ORDER BY kept an arbitrary
                    -- subset; rank first so the best text matches survive.
                    ORDER BY text_score DESC
                    LIMIT 50
                )
                SELECT
                    d.id,
                    d.title,
                    d.content,
                    COALESCE(s.semantic_score, 0) * %s
                    + COALESCE(f.text_score, 0) * (1 - %s) AS hybrid_score
                FROM documents d
                LEFT JOIN semantic s ON d.id = s.id
                LEFT JOIN fulltext f ON d.id = f.id
                WHERE s.id IS NOT NULL OR f.id IS NOT NULL
                ORDER BY hybrid_score DESC
                LIMIT %s
            """, [
                query_embedding, query_embedding,
                query, query,
                semantic_weight, semantic_weight,
                limit,
            ])
            columns = [desc[0] for desc in cur.description]
            return [dict(zip(columns, row)) for row in cur.fetchall()]
    finally:
        # BUG FIX: the original never closed the connection.
        conn.close()
Chunking Long Documents#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def chunk_document(text: str, chunk_size: int = 512, overlap: int = 64) -> list[str]:
    """Split *text* into overlapping word-based chunks for embedding.

    Consecutive chunks share `overlap` words so context is preserved across
    chunk boundaries.

    Args:
        text: Document body; split on whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        Chunks in document order; [] for empty or whitespace-only text.

    Raises:
        ValueError: if chunk_size <= 0 or not 0 <= overlap < chunk_size.
            (BUG FIX: the original looped forever when overlap >= chunk_size,
            because the step `chunk_size - overlap` became non-positive.)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
def index_chunked_document(doc_id: int, title: str, content: str) -> None:
    """Chunk, embed, and store one document in the `document_chunks` table.

    The title is prepended to every chunk before embedding so each chunk's
    vector carries document-level context.

    Args:
        doc_id: Parent document id.
        title: Document title, prefixed onto each chunk for embedding.
        content: Full document body to be chunked.
    """
    chunks = chunk_document(content)
    embeddings = embed_batch([f"{title}\n\n{chunk}" for chunk in chunks])
    conn = psycopg2.connect("postgresql://localhost/app")
    try:
        with conn.cursor() as cur:
            # One row per chunk; chunk_index preserves document order.
            cur.executemany("""
                INSERT INTO document_chunks (doc_id, chunk_index, content, embedding)
                VALUES (%s, %s, %s, %s::vector)
            """, [
                (doc_id, i, chunk, emb)
                for i, (chunk, emb) in enumerate(zip(chunks, embeddings))
            ])
        conn.commit()
    finally:
        # BUG FIX: the original never closed the connection at all.
        conn.close()
Conclusion#
pgvector brings production-grade vector similarity search into the database you already run, eliminating the need for a separate vector database for most use cases. HNSW indexes provide excellent recall with millisecond-scale query times at millions of vectors. Hybrid search — combining cosine similarity with PostgreSQL’s full-text ranking — typically outperforms either approach alone. Chunk long documents with overlap to maintain context across boundaries. For datasets above roughly 10M vectors, or workloads requiring heavy real-time updates at scale, dedicated vector databases (Weaviate, Qdrant, Pinecone) offer more tuning options.