Introduction#
Prompt engineering is the practice of structuring inputs to language models to reliably produce useful outputs. For backend engineers integrating LLMs into production systems, this means moving beyond trial-and-error and applying repeatable patterns: role definition, chain-of-thought, output structuring, and few-shot examples. This post covers the patterns that produce consistent, parseable results.
Fundamentals: What Makes a Good Prompt#
1
2
3
4
5
6
7
8
9
10
11
12
13
| A good prompt has:
1. Role/persona: tells the model who it is
2. Context: relevant background information
3. Task: specific, unambiguous instruction
4. Constraints: format, length, tone, scope
5. Examples (if needed): few-shot demonstrations
Bad prompt: "Summarize this document"
Good prompt:
Role: "You are a technical writer for a developer audience."
Context: "The following is a raw API specification."
Task: "Summarize it in 3 bullet points covering: purpose, inputs, outputs."
Constraints: "Each bullet must be under 20 words. Use present tense."
|
Basic Python Integration#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
| from anthropic import Anthropic
import json
client = Anthropic()


def prompt(system: str, user: str, model: str = "claude-opus-4-6") -> str:
    """Send one user message with a system prompt; return the reply text.

    Single-turn helper: no conversation history is kept between calls.
    """
    conversation = [{"role": "user", "content": user}]
    completion = client.messages.create(
        model=model,
        max_tokens=1024,
        system=system,
        messages=conversation,
    )
    # The first content block of the reply carries the text completion.
    return completion.content[0].text


# Example
result = prompt(
    system="You are a senior software engineer. Be concise and technical.",
    user="Explain the tradeoffs between PostgreSQL and MongoDB in 3 sentences.",
)
print(result)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
| import json
from pydantic import BaseModel
class CodeReview(BaseModel):
    """Structured result of an LLM code review."""

    issues: list[str]
    suggestions: list[str]
    severity: str  # "low" | "medium" | "high"
    approved: bool


def review_code(code: str) -> CodeReview:
    """Ask the model to review *code* and validate the reply as a CodeReview.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.
        pydantic.ValidationError: if the JSON does not match the schema.
    """
    system = """You are a senior code reviewer. You must respond with valid JSON only.
No explanation, no markdown, no code blocks. Just the JSON object.
Required schema:
{
"issues": ["string"],
"suggestions": ["string"],
"severity": "low|medium|high",
"approved": boolean
}"""
    user = f"Review this code:\n\n{code}"
    # Strip any accidental markdown wrapping.
    raw = prompt(system, user).strip()
    if raw.startswith("```"):
        # removeprefix/removesuffix also survive a single-line reply like
        # ```{...}``` — the naive split("\n", 1)[1] raises IndexError there.
        raw = raw.removeprefix("```json").removeprefix("```").strip()
        raw = raw.removesuffix("```").strip()
    data = json.loads(raw)
    return CodeReview(**data)


# Usage
review = review_code("""
def get_user(user_id):
query = f"SELECT * FROM users WHERE id = {user_id}"
return db.execute(query)
""")
print(review.severity)  # "high"
print(review.issues)    # ["SQL injection vulnerability on line 2"]
|
Chain-of-Thought for Complex Reasoning#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def analyze_incident(log_excerpt: str) -> dict:
    """Use CoT to reason through an incident before giving conclusions.

    Returns the parsed JSON dict (reasoning, root_cause, impact,
    immediate_mitigation, long_term_fix, severity).

    Raises json.JSONDecodeError if the reply is not valid JSON.
    """
    system = """You are an SRE analyzing production incidents.
Think through the problem step by step:
1. Identify what went wrong (symptoms)
2. Determine likely root cause
3. Assess customer impact
4. Recommend immediate mitigation
5. Suggest long-term fix
Format your response as JSON:
{
"reasoning": "step-by-step analysis",
"root_cause": "string",
"impact": "string",
"immediate_mitigation": "string",
"long_term_fix": "string",
"severity": "P1|P2|P3|P4"
}"""
    user = f"Analyze this incident log:\n\n{log_excerpt}"
    raw = prompt(system, user).strip()
    if raw.startswith("```"):
        # Tolerate accidental markdown fences; without this, a fenced
        # reply makes json.loads raise.
        raw = raw.removeprefix("```json").removeprefix("```").strip()
        raw = raw.removesuffix("```").strip()
    return json.loads(raw)


# Chain-of-thought prompts outperform direct prompts for multi-step reasoning
# The reasoning field captures the model's work before its conclusions
|
Few-Shot Examples#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
FEW_SHOT_EXAMPLES = """
Examples of commit message classification:
Input: "fix: resolve null pointer in UserService.getById"
Output: {"type": "fix", "scope": "UserService", "breaking": false}
Input: "feat!: remove deprecated v1 API endpoints"
Output: {"type": "feat", "scope": "api", "breaking": true}
Input: "chore: update dependencies"
Output: {"type": "chore", "scope": null, "breaking": false}
Input: "refactor: extract payment processing into PaymentService"
Output: {"type": "refactor", "scope": "PaymentService", "breaking": false}
"""


def classify_commit(message: str) -> dict:
    """Classify a git commit message into {"type", "scope", "breaking"}.

    Raises json.JSONDecodeError if the reply is not valid JSON.
    """
    system = f"""Classify git commit messages into structured data.
Return JSON only. No explanation.
{FEW_SHOT_EXAMPLES}"""
    user = f"Input: {message!r}"
    raw = prompt(system, user).strip()
    if raw.startswith("```"):
        # Tolerate accidental markdown fences so json.loads doesn't fail.
        raw = raw.removeprefix("```json").removeprefix("```").strip()
        raw = raw.removesuffix("```").strip()
    return json.loads(raw)


# Few-shot examples dramatically improve consistency for classification tasks
|
Retrieval-Augmented Generation (RAG)#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
| from anthropic import Anthropic
import numpy as np
client = Anthropic()
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two equal-length vectors, in [-1.0, 1.0].

    Returns 0.0 when either vector has zero norm, instead of dividing
    by zero (which would warn and produce nan).
    """
    va, vb = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    denom = float(np.linalg.norm(va) * np.linalg.norm(vb))
    if denom == 0.0:
        return 0.0
    return float(np.dot(va, vb) / denom)
class SimpleRAG:
    """Minimal RAG: embed documents, retrieve relevant chunks, inject into prompt."""

    # NOTE(review): the Anthropic Python SDK does not expose an
    # `embeddings` API, and "voyage-3" is a Voyage AI model — confirm
    # whether `client` here is meant to be a Voyage AI client instead.

    def __init__(self):
        # Parallel lists: _embeddings[i] is the embedding of _docs[i].
        self._docs: list[str] = []
        self._embeddings: list[list[float]] = []

    def add_document(self, text: str) -> None:
        # Embed the whole document as a single chunk (no splitting).
        response = client.embeddings.create(
            model="voyage-3",
            input=[text],
        )
        self._docs.append(text)
        self._embeddings.append(response.embeddings[0].embedding)

    def retrieve(self, query: str, top_k: int = 3) -> list[str]:
        # Embed the query, then brute-force score every stored document —
        # O(n) per query; fine for small corpora.
        response = client.embeddings.create(
            model="voyage-3",
            input=[query],
        )
        q_embedding = response.embeddings[0].embedding
        scores = [
            (cosine_similarity(q_embedding, doc_emb), doc)
            for doc_emb, doc in zip(self._embeddings, self._docs)
        ]
        # Tuple sort: score is the primary key; ties fall back to
        # comparing the document strings (reverse-lexicographic).
        scores.sort(reverse=True)
        return [doc for _, doc in scores[:top_k]]

    def answer(self, question: str) -> str:
        # Inject the retrieved chunks into the prompt, "---"-separated.
        relevant_docs = self.retrieve(question)
        context = "\n\n---\n\n".join(relevant_docs)
        system = """Answer questions using only the provided context.
If the answer is not in the context, say "I don't have information about that."
Be concise and cite the relevant part of the context."""
        user = f"Context:\n{context}\n\nQuestion: {question}"
        return prompt(system, user)
|
Retry with Validation#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
| import time
def prompt_with_retry(
    system: str,
    user: str,
    parse_fn,
    max_retries: int = 3,
) -> object:
    """Retry prompt until output passes validation.

    parse_fn: callable taking the raw model reply and returning the
    parsed value; it signals a bad reply by raising JSONDecodeError,
    ValueError, KeyError, or AssertionError.

    Raises ValueError after max_retries failed attempts.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            raw = prompt(system, user)
            return parse_fn(raw)
        # AssertionError is included because validators commonly use bare
        # asserts (as parse_classification does in this file); without it
        # a failed assert would escape the retry loop entirely.
        except (json.JSONDecodeError, ValueError, KeyError, AssertionError) as e:
            last_error = e
            if attempt < max_retries - 1:
                # Add error feedback to improve next attempt
                user = f"{user}\n\nPrevious attempt failed with: {e}\nPlease fix the output."
                time.sleep(0.5 * (2 ** attempt))  # exponential backoff
    raise ValueError(f"Failed after {max_retries} attempts: {last_error}")
def parse_classification(raw: str) -> dict:
    """Parse and validate an issue-classification reply.

    Raises ValueError rather than using `assert` — asserts are stripped
    under `python -O`, and ValueError is caught by the retry loop.
    """
    data = json.loads(raw.strip())
    if "category" not in data:
        raise ValueError("missing 'category'")
    if data["category"] not in {"bug", "feature", "question", "docs"}:
        raise ValueError("invalid category")
    return data
# Example: retries until parse_classification accepts the model's reply,
# feeding the parse error back into the prompt on each attempt.
result = prompt_with_retry(
    system="Classify GitHub issues. Return JSON: {\"category\": \"bug|feature|question|docs\"}",
    user="Issue: App crashes when uploading files larger than 10MB",
    parse_fn=parse_classification,
)
|
System Prompt Template#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
| from string import Template
# Reusable scaffold: role, context, constraints, and output format.
SYSTEM_TEMPLATE = Template("""
You are a $role for $company.
Context: $context
Constraints:
- $constraints
Output format: $output_format
""")


def build_system_prompt(
    role: str,
    company: str,
    context: str,
    constraints: list[str],
    output_format: str,
) -> str:
    """Render SYSTEM_TEMPLATE with the given fields.

    Each constraint becomes its own "- " bullet line.
    """
    fields = {
        "role": role,
        "company": company,
        "context": context,
        "constraints": "\n- ".join(constraints),
        "output_format": output_format,
    }
    return SYSTEM_TEMPLATE.substitute(fields)
|
Token Estimation and Cost Control#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
| # Approximate token count (Claude: ~4 chars per token)
def estimate_tokens(text: str) -> int:
    """Approximate token count (Claude: ~4 chars per token).

    Heuristic only — real tokenization varies; code-heavy or
    non-English text may be underestimated.
    """
    return len(text) // 4


def truncate_to_token_limit(text: str, max_tokens: int) -> str:
    """Truncate text so the result stays within ~max_tokens.

    The "[truncated]" marker is counted against the budget, so the
    result never exceeds max_tokens * 4 characters (the naive version
    appended the marker after cutting, overshooting the limit).
    """
    marker = "\n[truncated]"
    max_chars = max_tokens * 4
    if len(text) <= max_chars:
        return text
    # Guard against a budget smaller than the marker itself.
    keep = max(max_chars - len(marker), 0)
    return text[:keep] + marker


# For context window management:
# claude-opus-4-6: 200k token context window
# At $15/M input tokens, 200k context = $3 per call — use selectively
# Cost-aware routing: use cheaper model for simple tasks
def classify_issue(text: str) -> str:
    """Route a single-word classification to Haiku (fast, cheap)."""
    system = "Classify as: bug/feature/docs/question. Return the word only."
    # ~20x cheaper than Opus
    model = "claude-haiku-4-5-20251001"
    return prompt(system=system, user=text, model=model)


def explain_architecture(diagram: str) -> str:
    """Route open-ended architecture analysis to Opus (complex reasoning)."""
    system = "You are a principal engineer. Analyze this architecture."
    return prompt(system=system, user=diagram, model="claude-opus-4-6")
|
Conclusion#
Reliable LLM integration requires structured prompts, JSON output with schema validation, retry logic for parsing failures, and few-shot examples for classification tasks. Chain-of-thought prompts improve accuracy on multi-step reasoning. RAG grounds responses in your data and reduces hallucination on domain-specific questions. Cost and latency tradeoffs between models should be factored into routing decisions — not every request needs the most capable model.