Elasticsearch Indexing and Search Internals

Introduction

Elasticsearch is a distributed search and analytics engine built on Apache Lucene. Understanding how it indexes and searches data helps you design schemas that perform well, troubleshoot slow queries, and avoid common pitfalls like mapping explosions and hot shards.

How Indexing Works

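Document → Index API → Shard selection (hash of the routing value, which defaults to _id) → In-memory buffer
→ Translog (WAL, written alongside the buffer for durability) → Refresh (every 1s by default): creates a new, searchable Lucene segment
→ Merge: consolidates small segments into larger ones in the background → Flush: commits segments to disk and truncates the translog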

from elasticsearch import Elasticsearch

# Client for the node listening on localhost:9200
es = Elasticsearch("http://localhost:9200")

# Index a single document under an explicit ID
keyboard = {
    "name": "Wireless Keyboard",
    "description": "Mechanical keyboard with RGB lighting",
    "price": 79.99,
    "category": "electronics",
    "tags": ["keyboard", "wireless", "mechanical"],
    "in_stock": True,
    "created_at": "2025-10-03T10:00:00Z",
}
es.index(index="products", id="prod-123", document=keyboard)

# Bulk indexing (10-100x faster than individual indexing)
from elasticsearch.helpers import bulk

def generate_docs(products):
    """Yield one bulk-helper action per product.

    Args:
        products: Iterable of product dicts. Each must contain an ``"id"``
            key, which becomes the document ``_id``.

    Yields:
        Action dicts with ``_index``, ``_id`` and ``_source`` keys. ``"id"``
        is left out of ``_source``: it is already carried by ``_id``, and the
        ``products`` mapping is ``dynamic: strict`` with no ``id`` property,
        so a source document containing it would be rejected.

    Raises:
        KeyError: If a product has no ``"id"`` key (raised while iterating).
    """
    for product in products:
        yield {
            "_index": "products",
            "_id": product["id"],
            # Build a new dict so the caller's product is not mutated
            "_source": {
                field: value for field, value in product.items() if field != "id"
            },
        }

# Feed the generated actions to the bulk helper
actions = generate_docs(product_list)
bulk(es, actions)

Index Mapping

Mappings define how fields are stored and indexed. Poor mappings cause poor performance.

# Create the index with explicit settings and an explicit mapping

# Custom analyzer: standard tokenizer, then lowercase, stop-word and stemming filters
product_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "stop", "porter_stem"],
}

index_settings = {
    "number_of_shards": 3,
    "number_of_replicas": 1,
    "refresh_interval": "1s",
    "analysis": {"analyzer": {"product_analyzer": product_analyzer}},
}

field_properties = {
    "name": {
        "type": "text",
        "analyzer": "product_analyzer",
        # name.keyword sub-field: exact match and sorting
        "fields": {"keyword": {"type": "keyword"}},
    },
    "description": {"type": "text", "analyzer": "product_analyzer"},
    "price": {"type": "float"},
    "category": {"type": "keyword"},  # exact match only, not analyzed
    "tags": {"type": "keyword"},
    "in_stock": {"type": "boolean"},
    "created_at": {"type": "date", "format": "strict_date_optional_time"},
}

index_mappings = {
    "properties": field_properties,
    "dynamic": "strict",  # reject documents with unknown fields
}

es.indices.create(
    index="products",
    body={"settings": index_settings, "mappings": index_mappings},
)

text vs keyword:

text: analyzed, tokenized, supports full-text search (match queries)
keyword: exact match only, used for filtering, sorting, aggregations

Searching

# Bool query: scored full-text match plus non-scoring filters, sorting and aggregations

# Scored part: search both text fields, with name weighted higher
text_query = {
    "multi_match": {
        "query": "wireless mechanical keyboard",
        "fields": ["name^3", "description"],  # name boosted 3x
        "type": "best_fields",
        "fuzziness": "AUTO",
    }
}

# Non-scoring conditions
filters = [
    {"term": {"in_stock": True}},
    {"range": {"price": {"gte": 20, "lte": 200}}},
    {"terms": {"category": ["electronics", "accessories"]}},
]

aggregations = {
    "categories": {"terms": {"field": "category", "size": 10}},
    "price_ranges": {
        "range": {
            "field": "price",
            "ranges": [
                {"to": 50},
                {"from": 50, "to": 100},
                {"from": 100},
            ],
        }
    },
}

search_body = {
    "query": {"bool": {"must": [text_query], "filter": filters}},
    "sort": [{"_score": "desc"}, {"price": "asc"}],
    "aggs": aggregations,
    "from": 0,
    "size": 20,
    "_source": ["name", "price", "category"],  # return only needed fields
}

resp = es.search(index="products", body=search_body)

Performance Tuning

# List indices with primary/replica counts, document counts and on-disk size
es.cat.indices(v=True, h=["index", "pri", "rep", "docs.count", "store.size"])

# Identify slow queries.
# Slow-log thresholds are dynamic index-level settings, so they are applied
# through the update-settings API rather than elasticsearch.yml (index-level
# settings are not accepted in the node configuration file since 5.x).
es.indices.put_settings(
    index="products",
    body={
        "index.search.slowlog.threshold.query.warn": "5s",
        "index.search.slowlog.threshold.query.info": "1s",
        "index.search.slowlog.threshold.fetch.warn": "1s",
    },
)

# Confirm the thresholds are in place. The slow-log entries themselves are
# written to the node's search slow log file, not returned by this call.
es.indices.get_settings(index="products")

# Force merge indexes that are no longer written to (reduces segment count).
# Merging down to one segment on an index that still receives writes produces
# very large segments that later automatic merges may not clean up.
es.indices.forcemerge(index="products-2025-*", max_num_segments=1)

# Refresh interval: increase before bulk indexing
es.indices.put_settings(
    index="products",
    body={"index": {"refresh_interval": "30s"}}
)
# ... run the bulk load here ...
# Reset after bulk load
es.indices.put_settings(
    index="products",
    body={"index": {"refresh_interval": "1s"}}
)

Common Mistakes

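Dynamic mapping: Elasticsearch creates mappings automatically for fields it has not seen before. If your documents contain thousands of unique field names (e.g., JSON with user-defined keys), every one becomes a mapped field and you get a mapping explosion: the cluster state bloats, and indexing starts failing once the index reaches index.mapping.total_fields.limit (1000 by default).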

# Mapping fragment (an entry under "properties"): use nested objects or the
# flattened type for fields whose keys are user-defined
"metadata": {
    "type": "flattened"  # whole object is stored as a single indexed field, so new keys cause no mapping explosion
}

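Deep pagination: from + size is expensive for deep pages: every shard must collect its top from + size hits, and the coordinating node merges them all only to discard everything before from. By default, requests where from + size exceeds 10,000 (index.max_result_window) are rejected.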

# Use search_after with a point in time (PIT) for deep pagination.
# _id is restricted from sorting (fielddata on _id is disabled by default in
# Elasticsearch 8.x), so it cannot serve as the tiebreaker. Within a PIT,
# _shard_doc is a unique, cheap tiebreaker, and the PIT keeps the result set
# consistent while paging.
pit_id = es.open_point_in_time(index="products", keep_alive="1m")["id"]

page_body = {
    "query": {"match_all": {}},
    "pit": {"id": pit_id, "keep_alive": "1m"},  # no index argument: the PIT fixes the target
    "sort": [{"created_at": "desc"}, {"_shard_doc": "asc"}],
    "size": 20,
}

# First page
resp = es.search(body=page_body)
hits = resp["hits"]["hits"]

# Next page: the cursor is the sort values of the last hit on the previous page
if hits:
    page_body["pit"]["id"] = resp["pit_id"]  # always continue with the most recent PIT id
    page_body["search_after"] = hits[-1]["sort"]
    resp = es.search(body=page_body)

# Release the PIT once paging is finished
es.close_point_in_time(body={"id": resp["pit_id"]})

Conclusion

Define explicit mappings with dynamic: strict to prevent mapping explosions. Use keyword for filtering, sorting, and aggregations, and text for full-text search. Put non-scoring conditions in the filter clause of a bool query (filters skip scoring and can be cached). Use search_after for deep pagination. Monitor the search slow log, raise refresh_interval during bulk loads, and force-merge only indexes that are no longer being written to.