Introduction#
Poor logging is one of the most common impediments to diagnosing production issues. Logs that are too verbose obscure signal in noise; logs that are too sparse leave you guessing. Unstructured logs are difficult to query. This post covers Python logging configuration and patterns for production systems.
Structured Logging#
Plain text logs are difficult to query at scale. Structured logs (JSON) integrate with log aggregators (Elasticsearch, Loki, CloudWatch Logs Insights).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
| import logging
import json
import sys
import traceback
from datetime import datetime, timezone
from typing import Any
class JSONFormatter(logging.Formatter):
    """Formatter that renders each log record as one JSON object per line.

    Emits a fixed envelope (timestamp, level, logger, message, source
    location) plus any user-supplied fields passed via ``extra=``.
    """

    # Attribute names present on every LogRecord *instance*.  Anything in
    # record.__dict__ that is NOT in this set must have arrived through
    # ``extra=`` and should be surfaced in the JSON output.
    # NOTE: checking against ``logging.LogRecord.__dict__`` (the class
    # dict) is wrong — it holds methods like ``getMessage``, not instance
    # attributes such as ``msg`` and ``args``, so every standard record
    # attribute would leak into the payload.
    _RESERVED = frozenset(
        logging.LogRecord("", 0, "", 0, "", (), None).__dict__
    ) | {"message", "asctime"}

    def format(self, record: logging.LogRecord) -> str:
        """Return *record* serialized as a single JSON line."""
        log_entry: dict[str, Any] = {
            # Use the record's own creation time so the timestamp is not
            # skewed by formatting latency (e.g. with queued handlers).
            "timestamp": datetime.fromtimestamp(
                record.created, tz=timezone.utc
            ).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
            "line": record.lineno,
        }
        # Include exception traceback if present
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        # Include extra fields added by the logger (via ``extra=``)
        for key, value in record.__dict__.items():
            if key not in self._RESERVED and not key.startswith("_"):
                log_entry[key] = value
        # ``default=str`` keeps logging robust when an extra field is not
        # natively JSON-serializable (e.g. Decimal, datetime, UUID).
        return json.dumps(log_entry, default=str)
def configure_logging(level: str = "INFO") -> None:
    """Install a JSON handler on the root logger at *level*.

    Replaces any existing root handlers so output goes to stdout only,
    and quiets chatty third-party loggers down to WARNING.
    """
    json_handler = logging.StreamHandler(sys.stdout)
    json_handler.setFormatter(JSONFormatter())

    root_logger = logging.getLogger()
    root_logger.setLevel(getattr(logging, level.upper()))
    root_logger.handlers = [json_handler]

    # Silence noisy third-party libraries
    for noisy in ("urllib3", "botocore"):
        logging.getLogger(noisy).setLevel(logging.WARNING)
|
Using structlog#
structlog is the de facto standard third-party library for structured logging in Python (it is not part of the standard library; install it with pip).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
| import structlog
import logging
# Configure structlog once at process start-up; processors run in order
# on every event dict before the final renderer emits it.
structlog.configure(
    processors=[
        # Fold in context bound via structlog.contextvars (request IDs etc.)
        structlog.contextvars.merge_contextvars,
        # Add the log level under the "level" key
        structlog.processors.add_log_level,
        # ISO-8601 timestamp under "timestamp"
        structlog.processors.TimeStamper(fmt="iso"),
        # Render stack info when stack_info=True is passed to a log call
        structlog.processors.StackInfoRenderer(),
        # Render exception info (exc_info) into the event dict
        structlog.processors.ExceptionRenderer(),
        # Final step: serialize the event dict as one JSON line
        structlog.processors.JSONRenderer(),
    ],
    # Filter by level in the bound logger itself (fast, no stdlib routing)
    wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
    context_class=dict,
    # Write rendered lines straight to stdout
    logger_factory=structlog.PrintLoggerFactory(),
)

log = structlog.get_logger()

# Basic usage: the first positional argument is the event name; keyword
# arguments become structured fields in the output.
log.info("order_created", order_id=123, user_id=42, total=99.99)
# {"event": "order_created", "order_id": 123, "user_id": 42, "total": 99.99, ...}

log.error("payment_failed", order_id=123, error_code="CARD_DECLINED")
|
Request Context Binding#
Bind request context (trace ID, user ID) once per request; it is automatically included in all subsequent log calls.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
| import structlog
from fastapi import FastAPI, Request
import uuid
app = FastAPI()


@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Bind request-scoped log context, log start/end, and always clean up."""
    # Honor an upstream X-Request-ID header; otherwise mint a fresh one.
    incoming = request.headers.get("X-Request-ID")

    # Bind context for all logs in this request
    structlog.contextvars.bind_contextvars(
        request_id=incoming or str(uuid.uuid4()),
        user_id=getattr(request.state, "user_id", None),
        method=request.method,
        path=request.url.path,
    )

    logger = structlog.get_logger()
    logger.info("request_started")
    try:
        response = await call_next(request)
    except Exception as exc:
        logger.error("request_failed", error=str(exc), exc_info=True)
        raise
    else:
        logger.info("request_completed", status_code=response.status_code)
        return response
    finally:
        # Drop bound context so it never leaks into the next request.
        structlog.contextvars.clear_contextvars()
|
Log Levels: When to Use Each#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
log = structlog.get_logger()

# DEBUG: detailed diagnostic info — disabled in production
log.debug("cache_lookup", key="user:42", hit=False)

# INFO: normal operations worth recording
log.info("user_logged_in", user_id=42)
log.info("payment_processed", order_id=123, amount=99.99)

# WARNING: unexpected but handled situation (no action needed yet)
log.warning("rate_limit_approaching", user_id=42, requests_remaining=5)
log.warning("config_value_missing", key="FEATURE_FLAG_X", using_default=True)

# ERROR: operation failed, requires attention
log.error("payment_failed", order_id=123, error="CARD_DECLINED", exc_info=True)

# CRITICAL: system is unusable
log.critical("database_unavailable", retries=5)
|
What to Log#
Always log:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# External calls (with duration)
import time

start = time.monotonic()  # monotonic clock: immune to wall-clock adjustments
result = await external_api.call()
duration_ms = (time.monotonic() - start) * 1000
log.info("external_api_called", service="payment", duration_ms=round(duration_ms, 2))

# State transitions
log.info("order_status_changed", order_id=123, from_status="pending", to_status="completed")

# Errors with context
try:
    process_order(order)
except Exception:
    # Log the identifying context plus full traceback, then re-raise so
    # the caller still observes the failure.
    log.error("order_processing_failed", order_id=order.id, exc_info=True)
    raise
|
Never log:
1
2
3
4
5
6
7
8
# NEVER log sensitive data (the WRONG lines below are deliberate anti-examples)
log.info("user_authenticated", password=password)  # WRONG
log.info("card_charged", card_number=card.number)  # WRONG
log.info("api_call", authorization_header=headers["Authorization"])  # WRONG

# Safe: log only non-sensitive identifiers
log.info("user_authenticated", user_id=user.id)
log.info("card_charged", card_last_four=card.number[-4:])
|
Log Sampling for High-Volume Events#
1
2
3
4
5
6
7
8
9
| import random
def should_sample(rate: float = 0.01) -> bool:
    """Decide whether to emit a sampled event.

    *rate* is the fraction of calls that return True (0.01 = log 1% of events).
    """
    roll = random.random()
    return roll < rate
# For very high-frequency events, sample to avoid log volume
# (sampling keeps a representative trickle instead of disabling the event)
if should_sample(0.001):  # 0.1% of requests
    log.debug("cache_hit", key=key)
|
Correlation with Distributed Tracing#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
| from opentelemetry import trace
log = structlog.get_logger()
def get_trace_context() -> dict:
    """Return the active span's trace/span IDs, or {} when nothing is recording."""
    current_span = trace.get_current_span()
    if not current_span.is_recording():
        return {}
    span_ctx = current_span.get_span_context()
    # Hex widths follow W3C Trace Context: 128-bit trace id, 64-bit span id.
    return {
        "trace_id": f"{span_ctx.trace_id:032x}",
        "span_id": f"{span_ctx.span_id:016x}",
    }
# Bind trace context to all logs
# NOTE(review): this binds the span IDs captured at *this* moment; for
# per-request correlation, call it inside request handling (e.g. the
# middleware above) so each request binds its own trace context — confirm.
structlog.contextvars.bind_contextvars(**get_trace_context())
|
Conclusion#
Use structured (JSON) logging with a library like structlog. Bind request context once per request — trace ID, user ID, request path. Use log levels correctly: INFO for normal operations, ERROR for failures, DEBUG for development. Never log passwords, tokens, or PII. Sample high-frequency debug logs rather than disabling them entirely.