Observability Refresher — Metrics, Tracing, Logging

Table of Contents

0. Setup & Environment — Full Local Stack

Every code example in this page runs against a local Docker Compose stack that mirrors production setups like Grafana Cloud, Datadog, or AWS Managed Prometheus. Stand it up once and keep it running as you work through each section.

What you get

Prometheus (metrics) + Grafana (dashboards) + Loki (logs) + Tempo (traces) + OpenTelemetry Collector + a live Python app generating all three signal types. This is the same architecture used by thousands of production teams, just scaled down to run on your laptop.

Config Files

Create a working directory and populate the config files below:

mkdir -p ~/observability-lab/config
cd ~/observability-lab

docker-compose.yml

version: "3.9"

networks:
  observability:
    driver: bridge

volumes:
  prometheus_data: {}
  grafana_data: {}
  loki_data: {}
  tempo_data: {}

services:
  # ── Metrics ──────────────────────────────────────────────────────────
  prometheus:
    image: prom/prometheus:v2.50.0
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=7d"
      - "--web.enable-lifecycle"
    networks:
      - observability
    restart: unless-stopped

  # ── Dashboards ───────────────────────────────────────────────────────
  grafana:
    image: grafana/grafana:10.3.1
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - observability
    depends_on:
      - prometheus
      - loki
      - tempo
    restart: unless-stopped

  # ── Log Aggregation ──────────────────────────────────────────────────
  loki:
    image: grafana/loki:2.9.4
    container_name: loki
    ports:
      - "3100:3100"
    volumes:
      - ./config/loki.yaml:/etc/loki/local-config.yaml:ro
      - loki_data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - observability
    restart: unless-stopped

  # ── Distributed Tracing ──────────────────────────────────────────────
  tempo:
    image: grafana/tempo:2.4.0
    container_name: tempo
    ports:
      - "3200:3200"
      - "4317"  # internal OTLP gRPC (used by otel-collector)
    volumes:
      - ./config/tempo.yaml:/etc/tempo.yaml:ro
      - tempo_data:/tmp/tempo
    command: -config.file=/etc/tempo.yaml
    networks:
      - observability
    restart: unless-stopped

  # ── Log Shipper ──────────────────────────────────────────────────────
  promtail:
    image: grafana/promtail:2.9.4
    container_name: promtail
    volumes:
      - ./config/promtail-config.yaml:/etc/promtail/config.yaml:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yaml
    networks:
      - observability
    depends_on:
      - loki
    restart: unless-stopped

  # ── OpenTelemetry Collector ──────────────────────────────────────────
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.95.0
    container_name: otel-collector
    ports:
      - "4317:4317"   # OTLP gRPC
      - "4318:4318"   # OTLP HTTP
      - "8888:8888"   # Collector self-metrics (Prometheus)
      - "8889:8889"   # Prometheus exporter for app metrics
    volumes:
      - ./config/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro
    networks:
      - observability
    depends_on:
      - prometheus
      - loki
      - tempo
    restart: unless-stopped

  # ── Sample Python App ────────────────────────────────────────────────
  sample-app:
    build:
      context: ./sample-app
      dockerfile: Dockerfile
    container_name: sample-app
    ports:
      - "8000:8000"
    environment:
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
      - OTEL_SERVICE_NAME=sample-app
      - OTEL_RESOURCE_ATTRIBUTES=deployment.environment=local,service.version=1.0.0
    networks:
      - observability
    depends_on:
      - otel-collector
    restart: unless-stopped

config/prometheus.yml

global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "alerts.yml"

scrape_configs:
  # Prometheus self-monitoring
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  # OTel Collector self-metrics
  - job_name: "otel-collector"
    static_configs:
      - targets: ["otel-collector:8888"]

  # App metrics exported by OTel Collector
  - job_name: "sample-app"
    static_configs:
      - targets: ["otel-collector:8889"]

  # Grafana self-metrics
  - job_name: "grafana"
    static_configs:
      - targets: ["grafana:3000"]

config/otel-collector-config.yaml

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  batch:
    timeout: 1s
    send_batch_size: 1024
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
  resource:
    attributes:
      - action: insert
        key: loki.resource.labels
        value: service.name, deployment.environment

exporters:
  # Send traces to Tempo
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  # Expose metrics for Prometheus to scrape
  prometheus:
    endpoint: "0.0.0.0:8889"
    namespace: app

  # Send logs to Loki
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    default_labels_enabled:
      exporter: false
      job: true

  # Debug output (disable in production)
  debug:
    verbosity: basic

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [otlp/tempo]
    metrics:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      processors: [memory_limiter, batch, resource]
      exporters: [loki]

config/loki.yaml

auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v12
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://localhost:9093

limits_config:
  reject_old_samples: false

config/tempo.yaml

server:
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317

ingester:
  max_block_duration: 5m

compactor:
  compaction:
    block_retention: 1h

storage:
  trace:
    backend: local
    wal:
      path: /tmp/tempo/wal
    local:
      path: /tmp/tempo/blocks

metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: local
  storage:
    path: /tmp/tempo/generator/wal
  processors: [service-graphs, span-metrics]

config/promtail-config.yaml

server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
        filters:
          - name: status
            values: ["running"]
    relabel_configs:
      - source_labels: [__meta_docker_container_name]
        target_label: container
      - source_labels: [__meta_docker_container_label_com_docker_compose_service]
        target_label: service

Sample App Files

mkdir -p ~/observability-lab/sample-app

Create sample-app/requirements.txt:

fastapi==0.110.0
uvicorn==0.27.1
opentelemetry-sdk==1.23.0
opentelemetry-api==1.23.0
opentelemetry-exporter-otlp-proto-grpc==1.23.0
opentelemetry-instrumentation-fastapi==0.44b0
opentelemetry-instrumentation-logging==0.44b0
prometheus-client==0.20.0
structlog==24.1.0

Create sample-app/app.py:

"""
Sample FastAPI app with full OpenTelemetry instrumentation.
Generates logs, metrics, and traces on every request.
"""
import random
import time
import structlog
from fastapi import FastAPI, HTTPException
from opentelemetry import trace, metrics
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.sdk.resources import Resource
import os

# ── Resource (common labels attached to all telemetry) ─────────────────
resource = Resource.create({
    "service.name": os.getenv("OTEL_SERVICE_NAME", "sample-app"),
    "deployment.environment": "local",
    "service.version": "1.0.0",
})

# ── Tracing setup ───────────────────────────────────────────────────────
otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True))
)
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer(__name__)

# ── Metrics setup ───────────────────────────────────────────────────────
metric_reader = PeriodicExportingMetricReader(
    OTLPMetricExporter(endpoint=otlp_endpoint, insecure=True),
    export_interval_millis=10000,
)
meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
metrics.set_meter_provider(meter_provider)
meter = metrics.get_meter(__name__)

# Custom instruments
request_counter = meter.create_counter(
    "http_requests_total",
    description="Total number of HTTP requests",
)
request_duration = meter.create_histogram(
    "http_request_duration_seconds",
    description="HTTP request duration in seconds",
    unit="s",
)
active_requests = meter.create_up_down_counter(
    "http_active_requests",
    description="Number of requests currently being processed",
)

# ── Structured logging setup ────────────────────────────────────────────
LoggingInstrumentor().instrument()  # Injects trace_id/span_id into log records

structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ],
    wrapper_class=structlog.make_filtering_bound_logger(20),  # INFO+
    context_class=dict,
    logger_factory=structlog.PrintLoggerFactory(),
)
log = structlog.get_logger()

# ── App ─────────────────────────────────────────────────────────────────
app = FastAPI(title="Observability Sample App")
FastAPIInstrumentor.instrument_app(app)


@app.get("/hello")
async def hello():
    """Returns a greeting. Generates a trace span and increments counters."""
    with tracer.start_as_current_span("process-hello") as span:
        span.set_attribute("app.greeting", "world")
        latency = random.uniform(0.01, 0.15)
        time.sleep(latency)
        request_counter.add(1, {"route": "/hello", "method": "GET", "status": "200"})
        request_duration.record(latency, {"route": "/hello"})
        log.info("hello request processed", latency_ms=round(latency * 1000, 2))
        return {"message": "Hello, Observability World!", "latency_ms": round(latency * 1000, 2)}


@app.get("/error")
async def error_endpoint():
    """Occasionally raises errors so you can observe error metrics and traces."""
    active_requests.add(1)
    try:
        if random.random() < 0.5:
            request_counter.add(1, {"route": "/error", "method": "GET", "status": "500"})
            log.error("simulated internal error", reason="random failure")
            raise HTTPException(status_code=500, detail="Simulated server error")
        request_counter.add(1, {"route": "/error", "method": "GET", "status": "200"})
        return {"status": "ok"}
    finally:
        active_requests.add(-1)


@app.get("/slow")
async def slow_endpoint():
    """Simulates a slow downstream call — useful for latency histograms."""
    with tracer.start_as_current_span("slow-operation") as span:
        with tracer.start_as_current_span("downstream-db-query") as child:
            latency = random.uniform(0.5, 2.0)
            child.set_attribute("db.system", "postgresql")
            child.set_attribute("db.statement", "SELECT * FROM events")
            time.sleep(latency)
        span.set_attribute("total_latency_ms", round(latency * 1000, 2))
        request_duration.record(latency, {"route": "/slow"})
        log.warning("slow request detected", latency_s=round(latency, 3), threshold_s=0.5)
        return {"latency_ms": round(latency * 1000, 2)}


@app.get("/health")
async def health():
    return {"status": "healthy"}

Create sample-app/Dockerfile:

FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

Starting the Stack

cd ~/observability-lab
docker compose up -d

# Verify all services are running
docker compose ps

# Expected: all 7 containers in "Up" state

Verification & Data Source Setup

Open each endpoint and confirm it responds:

# 1. Prometheus — check targets
open http://localhost:9090/targets
# All targets should show "UP"

# 2. Grafana — default login admin/admin
open http://localhost:3000

# 3. Send traffic to sample app
for i in {1..20}; do
  curl -s http://localhost:8000/hello > /dev/null
  curl -s http://localhost:8000/error > /dev/null || true
  curl -s http://localhost:8000/slow > /dev/null
done

In Grafana, add data sources (Connections > Data sources > Add):

Prometheus — URL: http://prometheus:9090
Loki — URL: http://loki:3100
Tempo — URL: http://tempo:3200 — enable "Trace to logs" linking to Loki

Enable trace-to-logs correlation in Tempo data source

In the Tempo data source settings, scroll to "Trace to logs". Set: Data source = Loki, Tags = service.name. This lets you click a trace span and jump directly to the correlated log lines.

# Tear everything down when done
docker compose down -v

1. The Three Pillars

Observability is the ability to understand the internal state of a system by examining its external outputs. The three primary output types — logs, metrics, and traces — each answer different questions. Mature observability requires all three working together.

Logs, Metrics, Traces — What Each Answers

Signal	What it answers	Cardinality	Cost	Best for
Logs	What happened and when, with full context	High (unbounded)	High at scale	Debugging, audit trails, error details
Metrics	How much / how fast / how often, over time	Low (predefined labels)	Low	Alerting, trending, dashboards
Traces	Where time was spent across a distributed request	High (per-request)	Medium (with sampling)	Latency debugging, dependency mapping

What Each Signal Tells You

Logs are timestamped, discrete event records. They capture rich context — stack traces, user IDs, query parameters, error messages — that metrics cannot represent. The cost is storage and query latency at high volume.

Metrics are numeric measurements sampled over time. A counter of HTTP requests, a gauge of memory usage, a histogram of response times. They are cheap, compress well, and are ideal for alerting because you can set thresholds on them.

Traces follow a request as it flows through multiple services, recording the time spent in each operation (called a span). A trace answers "why was this request slow?" in a way that neither logs nor metrics alone can answer.

Observability vs Monitoring

Monitoring is checking known failure modes: "alert if error rate > 5%." It requires you to predict what can go wrong. Observability is the ability to ask arbitrary questions about system behavior — including questions you did not predict you would need to ask. Observability enables monitoring, but goes further.

The key distinction

Monitoring: you define what to watch. Observability: you can explore anything. A highly observable system lets an on-call engineer investigate an anomaly they have never seen before without requiring a code deploy to add instrumentation.

Correlation: Tying the Three Pillars Together

The real power emerges when you correlate signals. A trace ID (generated per-request) can be embedded in logs and referenced in metrics labels, allowing you to jump from an alert to the relevant traces to the relevant log lines in a single workflow.

# A log line with trace context embedded (generated by our sample app):
{
  "event": "slow request detected",
  "level": "warning",
  "latency_s": 1.83,
  "threshold_s": 0.5,
  "timestamp": "2026-02-23T10:12:34.123Z",
  "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736",  # links to Tempo
  "span_id": "00f067aa0ba902b7"
}

Try it in Grafana: run curl http://localhost:8000/slow, then in Grafana Explore select the Loki data source and query {service="sample-app"} |= "slow". Click the trace ID link in the log line to jump to Tempo.

2. OpenTelemetry

CNCF v1.x stable

OpenTelemetry (OTel) is the vendor-neutral, CNCF-graduated standard for generating, collecting, and exporting telemetry data. It replaces a fragmented ecosystem of vendor SDKs (Datadog agent, Jaeger client, Zipkin client) with a single API that can send to any backend.

Architecture

The OTel architecture has three layers:

SDK — embedded in your application. Provides the API (tracer, meter, logger) and buffers telemetry before export.
Collector — a standalone agent/gateway that receives, processes, and exports telemetry. Decouples your app from backend specifics.
Backend — Prometheus, Grafana Loki, Grafana Tempo, Jaeger, Datadog, etc.

# Your app sends via OTLP to the Collector running in Docker
# The Collector fans out to Prometheus + Loki + Tempo

App (SDK) ──OTLP gRPC─→ OTel Collector ──→ Prometheus (metrics scrape)
                                        ──→ Loki (log push)
                                        ──→ Tempo (trace push)

Auto vs Manual Instrumentation

Auto-instrumentation uses monkey-patching or bytecode injection to instrument popular frameworks (FastAPI, Flask, Django, SQLAlchemy, requests, etc.) with zero code changes. It handles HTTP span creation, error recording, and context propagation automatically.

# Auto-instrumentation: one line instruments all FastAPI routes
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
FastAPIInstrumentor.instrument_app(app)
# Now every HTTP request automatically creates a trace span with:
#   http.method, http.route, http.status_code, http.url

Manual instrumentation lets you add business-level spans for operations the framework knows nothing about — database queries, cache lookups, third-party API calls, background jobs.

from opentelemetry import trace

tracer = trace.get_tracer(__name__)

def process_payment(order_id: str, amount: float):
    with tracer.start_as_current_span("process-payment") as span:
        # Span attributes add searchable dimensions to the trace
        span.set_attribute("order.id", order_id)
        span.set_attribute("payment.amount_cents", int(amount * 100))
        span.set_attribute("payment.currency", "USD")

        try:
            result = stripe_client.charge(amount)
            span.set_attribute("payment.status", "success")
            span.set_attribute("payment.charge_id", result.id)
            return result
        except stripe.CardError as e:
            # Record the exception in the span — visible in Tempo
            span.record_exception(e)
            span.set_status(trace.StatusCode.ERROR, str(e))
            raise

Spans, Attributes, and Events

Span — a named, timed operation. Has a start time, end time, status, and optional attributes.
Span attribute — a key-value pair on a span. Use semantic conventions: http.method, db.system, rpc.service.
Span event — a timestamped annotation within a span. Use for significant moments that are not sub-operations (e.g., "cache miss", "retry attempt 2").
Span link — a reference to a span in another trace. Used for async fan-out (e.g., a Kafka message links back to the producer's trace).

with tracer.start_as_current_span("batch-process") as span:
    span.set_attribute("batch.size", len(items))

    # Span event: a point-in-time annotation
    span.add_event("processing-started", {"worker_id": worker_id})

    for i, item in enumerate(items):
        if i % 100 == 0:
            span.add_event("checkpoint", {"processed": i})

    span.add_event("processing-complete", {"failed": error_count})

Context Propagation — W3C traceparent

For a trace to span multiple services, the trace context must travel with the request. The W3C traceparent header is the standard mechanism:

# Format: version-trace_id-parent_span_id-flags
traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
#            ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^ ^^
#            v  128-bit trace ID                   64-bit span ID    sampled

OTel SDKs inject and extract this header automatically when you use the instrumented HTTP clients. For message queues (Kafka, SQS), you inject the context into message headers at produce time and extract at consume time.

# Injecting context into Kafka message headers (manual)
from opentelemetry.propagate import inject

headers = {}
inject(headers)  # adds traceparent, tracestate headers
producer.produce(topic, value=payload, headers=headers)

Collector Pipeline Deep Dive

The Collector config is structured as receivers → processors → exporters pipelines. You can have multiple pipelines (one per signal type):

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, batch, tail_sampling]
      exporters: [otlp/tempo, debug]
    metrics:
      receivers: [otlp, prometheus]    # collect from both OTLP and Prometheus scrape
      processors: [memory_limiter, batch, filter/drop_internal]
      exporters: [prometheus, otlp/cloud]
    logs:
      receivers: [otlp, filelog]       # OTLP from app + file tailing
      processors: [memory_limiter, batch, resource, attributes]
      exporters: [loki]

Resource Attributes

Resource attributes describe the entity producing telemetry, not the individual request. They are attached to all signals from a given process. Semantic conventions define standard names:

Attribute	Example	Required?
`service.name`	payment-service	Yes
`service.version`	2.4.1	Recommended
`deployment.environment`	production	Recommended
`host.name`	ip-10-0-1-42	Auto-detected
`k8s.pod.name`	payment-7d9f6b-xkz2p	Auto-detected (k8s)

# Try it: query metrics with service.name label in Prometheus
# Open http://localhost:9090 and run:
app_http_requests_total{job="sample-app"}

# View in Grafana Explore → Tempo, filter by service.name = "sample-app"

3. Structured Logging

Structured logging means emitting logs as machine-parseable key-value records (typically JSON) rather than freeform text strings. This is the single highest-leverage logging improvement you can make.

Why Structured Beats Unstructured

Unstructured	Structured (JSON)
`ERROR: payment failed for user 12345 after 3 retries`	`{"level":"error","event":"payment_failed","user_id":12345,"retries":3}`
Requires regex to extract fields	Direct field-based filtering and aggregation
Brittle — a wording change breaks log parsers	Schema is stable; field names are constants
Cannot aggregate "count errors by user_id"	`count_over_time({app="api"} \| json \| level="error" [5m])`

Python structlog Example

import structlog
import logging

# Configure structlog with JSON output
structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,      # thread-local context
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.JSONRenderer(),
    ],
    wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
    context_class=dict,
    logger_factory=structlog.PrintLoggerFactory(),
)

log = structlog.get_logger()

# Basic structured log
log.info("user_login", user_id=12345, ip="203.0.113.1", method="oauth2")

# Bind context for all subsequent calls in this scope
bound_log = log.bind(request_id="req-abc123", service="auth")
bound_log.info("session_created", session_ttl_s=3600)
bound_log.warning("suspicious_activity", failed_attempts=5)

# Exception logging with full stack trace
try:
    process_payment(order_id="ord-789")
except Exception:
    log.exception("payment_processing_failed", order_id="ord-789", amount_cents=9999)

Log Levels — When to Use Each

Level	Use when	Production default?
DEBUG	Detailed execution flow, variable values. Expensive — disable in prod.	No
INFO	Normal business events: user logged in, job completed, request served.	Yes
WARNING	Unexpected state that the system recovered from: retry succeeded, fallback used, latency spike.	Yes
ERROR	A request or operation failed. Something needs attention.	Yes
CRITICAL	System-level failure that may require immediate intervention: DB unreachable, disk full.	Yes

Log level discipline

Do not log INFO for every loop iteration or internal function call — this will make your log system unaffordable at scale. Reserve DEBUG for verbose tracing you only enable when debugging a specific issue. A common rule: each production request should generate at most 2-5 INFO log lines under normal conditions.

Correlation IDs

Every log line should carry the trace ID and span ID of the current request. This is what enables the "alert → trace → logs" workflow:

# OpenTelemetry's LoggingInstrumentor injects these automatically
# The output JSON looks like:
{
  "event": "payment_processed",
  "level": "info",
  "amount_cents": 9999,
  "timestamp": "2026-02-23T10:15:00.123Z",
  "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736",
  "span_id": "00f067aa0ba902b7",
  "service.name": "payment-service",
  "deployment.environment": "production"
}

Log Aggregation Pipeline

# Our local pipeline:
# App (stdout JSON) → Docker → Promtail (reads Docker socket) → Loki → Grafana

# Query in Grafana Explore → Loki:
{container="sample-app"}                          # all logs from container
{container="sample-app"} |= "error"               # contains "error"
{container="sample-app"} | json | level="error"   # parse JSON, filter by field
{container="sample-app"} | json | latency_ms > 100 # numeric field filter

LogQL Queries

LogQL is Loki's query language. It has two forms: log queries (return log lines) and metric queries (derive metrics from log lines).

# Log query: stream selector + filter pipeline
{service="sample-app", container="sample-app"}
  | json                                    # parse JSON fields
  | level = "warning"                       # filter by level field
  | line_format "{{.timestamp}} {{.event}}" # reformat output

# Metric query: count errors per minute
rate({container="sample-app"} | json | level="error" [1m])

# Metric query: 99th percentile latency (requires numeric field)
quantile_over_time(0.99,
  {container="sample-app"}
    | json
    | unwrap latency_ms [5m]
) by (route)

# Try it in Grafana: Explore → Loki → paste this query:
{container="sample-app"} | json | level != "debug"

4. Metrics

Prometheus OpenMetrics

Metric Types

Type	Description	Example	Operations
Counter	Monotonically increasing. Never decreases (except on reset).	`http_requests_total`	`rate()`, `increase()`
Gauge	Can go up or down. Point-in-time measurement.	`memory_usage_bytes`	Direct value, `avg_over_time()`
Histogram	Samples observations into predefined buckets. Tracks sum and count.	`request_duration_seconds`	`histogram_quantile()`, `rate()`
Summary	Client-side quantile calculation. Accurate but cannot be aggregated across instances.	`rpc_duration_seconds`	Direct `{quantile="0.99"}`

Histogram vs Summary

Prefer histograms over summaries. Histograms are aggregatable across multiple pods: histogram_quantile(0.99, sum(rate(duration_bucket[5m])) by (le)). Summaries calculate quantiles client-side — you cannot aggregate them across instances, making them nearly useless in a multi-pod deployment.

Naming Conventions

# Format: [namespace_][subsystem_]name[_unit][_total|_bucket|_count|_sum]
# Use snake_case. End counters with _total. Include units.

http_requests_total                    # counter
http_request_duration_seconds          # histogram
http_request_duration_seconds_bucket   # (auto-generated by histogram)
http_request_duration_seconds_count    # (auto-generated)
http_request_duration_seconds_sum      # (auto-generated)
process_resident_memory_bytes          # gauge
cache_hits_total                       # counter
kafka_consumer_lag_offsets             # gauge (lag per partition)

Prometheus Exposition Format

# Text format served at /metrics endpoint
# HELP describes the metric; TYPE declares its kind

# HELP http_requests_total Total number of HTTP requests
# TYPE http_requests_total counter
http_requests_total{method="GET",route="/hello",status="200"} 142.0
http_requests_total{method="GET",route="/error",status="500"} 23.0

# HELP http_request_duration_seconds HTTP request duration in seconds
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{route="/hello",le="0.05"} 89.0
http_request_duration_seconds_bucket{route="/hello",le="0.1"} 130.0
http_request_duration_seconds_bucket{route="/hello",le="0.25"} 142.0
http_request_duration_seconds_bucket{route="/hello",le="+Inf"} 142.0
http_request_duration_seconds_count{route="/hello"} 142.0
http_request_duration_seconds_sum{route="/hello"} 11.43

Custom Metrics in Python

from prometheus_client import Counter, Gauge, Histogram, start_http_server

# Define metrics at module level (global singletons)
REQUEST_COUNT = Counter(
    "http_requests_total",
    "Total HTTP request count",
    ["method", "endpoint", "http_status"],  # label dimensions
)

ACTIVE_CONNECTIONS = Gauge(
    "active_websocket_connections",
    "Number of active WebSocket connections",
)

REQUEST_LATENCY = Histogram(
    "http_request_duration_seconds",
    "HTTP request latency in seconds",
    ["endpoint"],
    buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
)

# Usage
REQUEST_COUNT.labels(method="GET", endpoint="/hello", http_status=200).inc()
ACTIVE_CONNECTIONS.inc()
ACTIVE_CONNECTIONS.dec()

# Context manager measures duration automatically
with REQUEST_LATENCY.labels(endpoint="/payments").time():
    result = process_payment(order_id)

# Expose metrics endpoint (alternative to OTel push)
start_http_server(port=8001)  # Prometheus scrapes http://app:8001/metrics

PromQL

PromQL is Prometheus's query language. Open http://localhost:9090 and run these:

# Rate of requests per second over the last 5 minutes
rate(app_http_requests_total[5m])

# Total requests in the last hour by route
increase(app_http_requests_total[1h])

# Error rate as a percentage
rate(app_http_requests_total{status="500"}[5m])
  /
rate(app_http_requests_total[5m])
* 100

# 99th percentile latency (requires histogram)
histogram_quantile(0.99,
  sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le, route)
)

# Current memory usage across all pods
sum(container_memory_working_set_bytes{namespace="production"}) by (pod)

# CPU usage rate
rate(container_cpu_usage_seconds_total[5m])

Recording Rules

Recording rules pre-compute expensive queries and store results as new metrics. Use them for aggregations that are queried repeatedly (e.g., dashboard panels, alert expressions):

# config/prometheus-alerts.yml
groups:
  - name: recording_rules
    interval: 30s
    rules:
      # Pre-compute per-route 5m request rate
      - record: job:http_requests:rate5m
        expr: sum(rate(app_http_requests_total[5m])) by (job, route)

      # Pre-compute error ratio
      - record: job:http_error_ratio:rate5m
        expr: |
          sum(rate(app_http_requests_total{status=~"5.."}[5m])) by (job)
          /
          sum(rate(app_http_requests_total[5m])) by (job)

5. Distributed Tracing

Spans and Traces

A trace is the complete journey of a single request through your system. It consists of spans — individual units of work — arranged in a parent-child tree. The root span represents the entry point (e.g., HTTP request received); child spans represent downstream calls (database query, cache lookup, external API call).

# Trace waterfall view (conceptual):
[──────────────── HTTP GET /checkout (root span, 450ms) ─────────────────]
  [── auth-check (12ms) ──]
  [── cart-service.get_cart (80ms) ──────]
    [── postgres.SELECT orders (65ms) ──]
  [── inventory-service.reserve (180ms) ────────────────────]
    [── redis.GET sku:123 (3ms)]
    [── postgres.UPDATE inventory (170ms) ─────────────────]
  [── payment-service.charge (120ms) ────────────]

This waterfall immediately shows that postgres.UPDATE inventory is the bottleneck — something you cannot see from a single service's metrics alone.

Sampling Strategies

Tracing every request at high volume is expensive. Sampling controls what fraction of traces you keep.

Strategy	How it works	Pros	Cons
Head-based (probabilistic)	Decision made at trace root before any spans are recorded. Keep N% of traces.	Simple, low overhead	May discard interesting (slow/error) traces
Tail-based	Buffer entire trace, then decide based on outcome (was it an error? was it slow?)	Keeps all errors and slow traces	Requires buffering — memory overhead
Rate-limiting	Keep at most N traces per second per service	Cost predictable	Low-traffic services may be under-sampled

# OTel Collector tail-sampling processor
processors:
  tail_sampling:
    decision_wait: 10s        # wait this long after trace starts
    num_traces: 100000        # max traces in memory
    policies:
      # Always keep errors
      - name: keep-errors
        type: status_code
        status_code: {status_codes: [ERROR]}
      # Keep slow traces (>500ms)
      - name: keep-slow
        type: latency
        latency: {threshold_ms: 500}
      # Sample 5% of everything else
      - name: sample-remaining
        type: probabilistic
        probabilistic: {sampling_percentage: 5}

Manual Span Creation

from opentelemetry import trace
from opentelemetry.trace import SpanKind

tracer = trace.get_tracer("payment-service", "1.0.0")

async def charge_card(card_token: str, amount_cents: int):
    # SpanKind.CLIENT signals this span calls an external service
    with tracer.start_as_current_span(
        "stripe.charge",
        kind=SpanKind.CLIENT,
    ) as span:
        span.set_attribute("rpc.system", "http")
        span.set_attribute("rpc.service", "Stripe")
        span.set_attribute("rpc.method", "PaymentIntents.create")
        span.set_attribute("payment.amount_cents", amount_cents)
        span.set_attribute("payment.currency", "usd")

        response = await stripe.PaymentIntent.create(
            amount=amount_cents,
            currency="usd",
            payment_method=card_token,
        )

        span.set_attribute("payment.intent_id", response.id)
        span.set_attribute("payment.status", response.status)
        return response

# Try it: generate traces and view in Tempo
for i in {1..10}; do curl -s http://localhost:8000/slow; done

# In Grafana: Explore → Tempo → Search
# Filter: service.name = "sample-app"
# You'll see waterfall views with parent-child spans

6. Alerting

Alert Design Philosophy

Good alerts are actionable (a human can do something about them), symptom-based (user-visible impact, not internal causes), and rare enough to be taken seriously. Alert fatigue — the condition where on-call engineers stop responding because alerts are too frequent or too noisy — is one of the most common reliability failures in engineering organizations.

Alert on symptoms, not causes

Bad: "CPU usage > 80%" — a symptom of something, but doesn't tell you if users are impacted. Good: "error rate > 1% for 5 minutes" — directly measures user impact. A high CPU with zero errors is not a problem worth waking someone up at 3am.

Prometheus Alerting Rules

# config/alerts.yml (referenced in prometheus.yml rule_files)
groups:
  - name: sample-app-alerts
    rules:
      # Alert when error rate exceeds 1% for 5 minutes
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(app_http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(app_http_requests_total[5m]))
          ) > 0.01
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
          runbook_url: "https://wiki.internal/runbooks/high-error-rate"

      # Alert when p99 latency exceeds 1 second
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le, route)
          ) > 1.0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "p99 latency exceeds 1s on {{ $labels.route }}"
          description: "p99 is {{ $value | humanizeDuration }}"

      # Alert when service is down (no data = firing)
      - alert: ServiceDown
        expr: up{job="sample-app"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.job }} is down"

Severity Levels

Severity	Impact	Response time	Action
P1 / Critical	Complete service outage, data loss, security breach	Immediate (24/7 page)	Wake up on-call, incident declared
P2 / High	Significant degradation, major feature broken	15 min (business hours page)	Engage on-call engineer
P3 / Warning	Minor degradation, affecting small % of users	Next business hour	Ticket created, triaged in standup
P4 / Info	No user impact, proactive notification	End of day	Slack notification only

SLO-Based Alerting

Rather than alerting on raw thresholds, alert on how fast you are burning through your error budget (covered in depth in Section 8). The key concept is burn rate: at what multiple of normal are errors occurring?

# Burn rate alert: if errors are burning budget 14x faster than allowed
# (i.e., in 1 hour you'd exhaust a week's budget), page immediately
- alert: ErrorBudgetBurnRateFast
  expr: |
    (
      sum(rate(app_http_requests_total{status=~"5.."}[1h]))
      /
      sum(rate(app_http_requests_total[1h]))
    ) > 14 * 0.001  # 14x burn rate with 0.1% error budget
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Error budget burning fast — {{ $value | humanizePercentage }} error rate"

7. Dashboards

RED and USE Methods

Two mental models tell you exactly which panels belong on every dashboard:

RED (for services) — Rate, Errors, Duration. Applies to any system that handles requests.
USE (for resources) — Utilization, Saturation, Errors. Applies to any resource (CPU, memory, disk, connection pool).

Method	Applies to	Panel examples
Rate	Services	Requests/sec, events/sec, transactions/min
Errors	Services	Error rate %, 5xx count, failed payments/min
Duration	Services	p50/p95/p99 latency, Apdex score
Utilization	Resources	CPU %, memory %, disk %, connection pool %
Saturation	Resources	Queue depth, wait time, thread pool queue length
Errors	Resources	Disk errors/sec, network drops, OOM events

Key PromQL Queries for Dashboard Panels

# Paste these into Grafana: Dashboards -> New -> Add visualization -> Prometheus

# 1. Request rate (requests/sec) — Stat or Time Series panel
sum(rate(app_http_requests_total[5m])) by (route)

# 2. Error rate percentage — Gauge panel with thresholds at 1% / 5%
(
  sum(rate(app_http_requests_total{status=~"5.."}[5m]))
  /
  sum(rate(app_http_requests_total[5m]))
) * 100

# 3. p50 / p95 / p99 latency — Time Series panel with 3 queries
histogram_quantile(0.50, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.95, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.99, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))

# 4. Active requests — Stat panel
sum(app_http_active_requests)

Dashboard as Code

Storing dashboards as JSON (or using Grafonnet) prevents configuration drift and makes code reviews possible. Export any dashboard from Grafana: Dashboard settings → JSON Model → Copy.

{
  "title": "Sample App — RED Dashboard",
  "uid": "sample-app-red",
  "schemaVersion": 38,
  "panels": [
    {
      "type": "stat",
      "title": "Request Rate",
      "targets": [{
        "expr": "sum(rate(app_http_requests_total[5m]))",
        "legendFormat": "req/s"
      }],
      "fieldConfig": {
        "defaults": {
          "unit": "reqps",
          "thresholds": {
            "steps": [
              {"color": "green", "value": null},
              {"color": "yellow", "value": 100},
              {"color": "red", "value": 500}
            ]
          }
        }
      }
    }
  ]
}

Anti-Patterns to Avoid

Vanity dashboards — panels showing metrics that no one acts on (e.g., "total requests all time"). Every panel should answer an operational question.
Too many panels — a dashboard with 40 panels is unreadable. Start with 6-8 panels covering RED/USE. Add more only when you have a proven need.
No thresholds — panels without color thresholds force the viewer to interpret raw numbers. Add green/yellow/red thresholds so the panel communicates health at a glance.
Hardcoded time ranges — use template variables so dashboards can be filtered by environment, service, or pod.

8. SLIs, SLOs, and Error Budgets

SLOs (Service Level Objectives) translate reliability into a mathematical budget. They align engineering decisions — feature velocity vs reliability — around a shared, quantifiable target.

Definitions

SLI (Service Level Indicator) — the metric you measure. Must be something users care about. Examples: request success rate, latency at p99, data freshness.
SLO (Service Level Objective) — the target you set for the SLI. "99.9% of requests succeed in under 500ms, measured over a 30-day rolling window."
SLA (Service Level Agreement) — a contractual commitment with financial penalties. Set SLAs above your internal SLOs to create a safety buffer.
Error budget — 1 - SLO. If SLO = 99.9%, error budget = 0.1% = 43.8 minutes of downtime per month.

Worked Example

# SLI: proportion of successful requests in a 30-day window
# SLO: 99.9% success rate

# Error budget calculation:
total_requests_per_month = 50_000_000
slo = 0.999
error_budget_requests = total_requests_per_month * (1 - slo)
# = 50,000 allowed failures per month

# If you've had 30,000 failures in 20 days:
remaining_budget = error_budget_requests - 30_000  # = 20,000 failures remaining
budget_consumed_percent = 30_000 / error_budget_requests * 100  # = 60%

# Decision: you've consumed 60% of budget in 66% of the window.
# Slightly ahead of pace — proceed with features but watch closely.

Measuring SLOs in PromQL

# SLI: success rate over 30 days (rolling window)
(
  sum(increase(app_http_requests_total{status!~"5.."}[30d]))
  /
  sum(increase(app_http_requests_total[30d]))
)

# Error budget remaining (as fraction)
1 - (
  (1 - sum(increase(app_http_requests_total{status!~"5.."}[30d]))
         / sum(increase(app_http_requests_total[30d])))
  / (1 - 0.999)
)

Burn Rate Alerts

A burn rate of 1 means you are consuming error budget exactly as fast as the window allows. A burn rate of 14 means you will exhaust the entire monthly budget in ~2 days.

# Multi-window burn rate alerting (Google SRE recommendation)
- alert: HighErrorBudgetBurn
  expr: |
    (sum(rate(app_http_requests_total{status=~"5.."}[1h]))
     / sum(rate(app_http_requests_total[1h])))
    > 14 * 0.001
    AND
    (sum(rate(app_http_requests_total{status=~"5.."}[5m]))
     / sum(rate(app_http_requests_total[5m])))
    > 14 * 0.001
  for: 2m
  labels:
    severity: critical

How SLOs Drive Engineering Decisions

When the error budget is healthy, teams ship faster (accepting more risk).
When the error budget is nearly exhausted, the team pauses feature work and focuses on reliability.
This removes subjectivity from the "features vs reliability" debate — the math drives the decision.

9. Observability for Microservices

Golden Signals per Service

Google SRE introduced the four golden signals. Each service should expose all four:

Latency — response time, distinguishing successful vs failed requests
Traffic — requests/sec, events/sec — demand on the system
Errors — rate of failed requests (explicit HTTP 5xx, implicit timeouts)
Saturation — how full the service is (connection pool, thread pool, queue depth)

Service Dependency Maps

Tempo's service graph feature (enabled by the metrics_generator block in tempo.yaml) automatically builds a dependency map from trace data. In Grafana: Explore → Tempo → Service Graph. You will see nodes for each service and edges labeled with request rate and error rate.

# Generate traffic then view service graph in Grafana
for i in {1..50}; do
  curl -s http://localhost:8000/hello > /dev/null
  curl -s http://localhost:8000/slow > /dev/null
done
# Grafana: Explore -> Tempo -> Service Graph tab

Cross-Service Context Propagation

# Service A: inject trace context into outgoing HTTP call
import httpx
from opentelemetry.propagate import inject

async def call_inventory_service(sku_id: str):
    headers = {"Content-Type": "application/json"}
    inject(headers)  # adds traceparent + tracestate headers
    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"http://inventory-service/sku/{sku_id}",
            headers=headers,
        )
    return response.json()

# Service B: FastAPIInstrumentor extracts traceparent automatically.
# The incoming span is linked as a child of Service A's span.

Common Failure Patterns in Traces

Retry storms — a span with multiple child spans retrying the same operation. Each retry adds delay, making the parent span very long.
Cascading failures — one slow service causes all callers to queue, exhausting connection pools. Traces show all parallel calls backed up behind one bottleneck.
N+1 queries — a loop in application code issues one DB query per item instead of one batch query. Shows as N identical db.query child spans under a single service span.

# N+1 anti-pattern visible as 100 sequential DB spans in a trace
orders = db.execute("SELECT * FROM orders LIMIT 100").fetchall()
for order in orders:
    # Separate query per order: 100 queries total
    user = db.execute("SELECT * FROM users WHERE id = ?", order.user_id).fetchone()

# Fix: single query, one DB span
orders_with_users = db.execute("""
    SELECT o.*, u.name, u.email
    FROM orders o JOIN users u ON o.user_id = u.id
    LIMIT 100
""").fetchall()

10. Observability for Data Engineering

Airflow Spark Kafka dbt

Pipeline Observability Metrics

Data pipelines have different SLIs than request-serving systems. The core questions are: Did the job run? Did it process the expected volume? Is the data fresh? Is the data correct?

Metric	Type	SLO example
Job duration	Histogram	p95 < 30 min
Records processed	Counter	Alert if < 95% of expected volume
Data freshness (lag)	Gauge	Source table updated within 1 hour
Failed records / null rate	Gauge	< 0.1% null in non-nullable fields
Job failure rate	Counter	0 failures in critical path per day

Airflow Metrics and Alerting

Airflow exposes StatsD metrics that can be forwarded to Prometheus via statsd_exporter:

# airflow.cfg metrics section
[metrics]
statsd_on = True
statsd_host = localhost
statsd_port = 8125
statsd_prefix = airflow

# Key Airflow metrics after statsd_exporter translation:
# airflow_dagrun_duration_success{dag_id="..."}
# airflow_task_instance_created_<state>
# airflow_executor_running_tasks

# Alert on DAG failure
- alert: AirflowDAGFailed
  expr: increase(airflow_dagrun_duration_failed_total[1h]) > 0
  labels:
    severity: high
  annotations:
    summary: "DAG {{ $labels.dag_id }} failed"

# Alert on stale DAG (did not run on schedule)
- alert: AirflowDAGNotRunning
  expr: time() - airflow_dag_last_run_timestamp{dag_id="daily_etl"} > 5400
  labels:
    severity: warning
  annotations:
    summary: "daily_etl has not run in 90 minutes"

Kafka Consumer Lag Monitoring

Consumer lag — the difference between the latest offset and the committed offset — is the most important Kafka health metric. Growing lag means consumers cannot keep up with producers.

# Using kminion which exports Prometheus metrics
# Key metric: kminion_kafka_consumer_group_topic_partition_lag

# PromQL: total lag across all partitions for a consumer group
sum(kminion_kafka_consumer_group_topic_partition_lag{
  consumer_group="order-processor"
}) by (topic)

# Alert on high lag
- alert: KafkaConsumerLagHigh
  expr: |
    sum(kminion_kafka_consumer_group_topic_partition_lag{
      consumer_group="order-processor"
    }) > 10000
  for: 5m
  annotations:
    summary: "Kafka consumer lag is {{ $value }} messages"

Data Quality as Metrics

# Emit dbt test results as Prometheus metrics
import json
from prometheus_client import Gauge

DBT_TEST_PASS = Gauge("dbt_test_passed_total", "Passing dbt tests", ["model"])
DBT_TEST_FAIL = Gauge("dbt_test_failed_total", "Failing dbt tests", ["model"])

with open("target/run_results.json") as f:
    results = json.load(f)

for result in results["results"]:
    model = result["unique_id"].split(".")[2]
    if result["status"] == "pass":
        DBT_TEST_PASS.labels(model=model).inc()
    elif result["status"] == "fail":
        DBT_TEST_FAIL.labels(model=model).inc()
        log.error("dbt_test_failed", model=model, test=result["unique_id"])

Freshness SLOs

# Alert: events table not updated in 1 hour
- alert: DataFreshnessViolation
  expr: sql_table_age_seconds{table_name="events"} > 3600
  for: 5m
  labels:
    severity: high
  annotations:
    summary: "events table is {{ $value | humanizeDuration }} stale"

# Schema drift detection: alert on unexpected column additions/removals
# Track column count as a gauge; alert if it changes unexpectedly
- alert: SchemaDriftDetected
  expr: delta(sql_table_column_count{table="events"}[1h]) != 0
  annotations:
    summary: "Schema changed on table events"

11. Observability for MLOps

MLflow Prometheus PyTorch

Model Inference Monitoring

from prometheus_client import Counter, Histogram, Gauge
import time

PREDICTION_COUNTER = Counter(
    "ml_predictions_total",
    "Total predictions made",
    ["model_name", "model_version", "status"],
)
PREDICTION_LATENCY = Histogram(
    "ml_prediction_duration_seconds",
    "Time to generate a prediction",
    ["model_name"],
    buckets=[0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0],
)
PREDICTION_CONFIDENCE = Histogram(
    "ml_prediction_confidence",
    "Distribution of model confidence scores",
    ["model_name"],
    buckets=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0],
)

class MonitoredModelServer:
    def __init__(self, model, model_name: str, model_version: str):
        self.model = model
        self.model_name = model_name
        self.model_version = model_version

    def predict(self, features):
        start = time.time()
        try:
            result = self.model.predict(features)
            confidence = float(result.max())
            PREDICTION_COUNTER.labels(
                model_name=self.model_name,
                model_version=self.model_version,
                status="success",
            ).inc()
            PREDICTION_CONFIDENCE.labels(model_name=self.model_name).observe(confidence)
            return result
        except Exception:
            PREDICTION_COUNTER.labels(
                model_name=self.model_name,
                model_version=self.model_version,
                status="error",
            ).inc()
            raise
        finally:
            PREDICTION_LATENCY.labels(model_name=self.model_name).observe(time.time() - start)

Data Drift Detection

Model accuracy degrades when incoming feature distributions shift away from the training distribution (data drift). Detect it by tracking feature statistics over time:

from scipy.stats import ks_2samp
from prometheus_client import Gauge

FEATURE_DRIFT_SCORE = Gauge(
    "ml_feature_drift_ks_statistic",
    "Kolmogorov-Smirnov drift score vs training distribution",
    ["model_name", "feature_name"],
)

class DriftDetector:
    def __init__(self, training_stats: dict):
        self.training_stats = training_stats

    def check_drift(self, model_name: str, current_batch: dict):
        for feature_name, production_values in current_batch.items():
            if feature_name not in self.training_stats:
                continue
            # KS statistic: 0 = identical distributions, 1 = completely different
            statistic, p_value = ks_2samp(
                self.training_stats[feature_name],
                production_values,
            )
            FEATURE_DRIFT_SCORE.labels(
                model_name=model_name,
                feature_name=feature_name,
            ).set(statistic)
            if p_value < 0.05:
                log.warning(
                    "feature_drift_detected",
                    model=model_name,
                    feature=feature_name,
                    ks_statistic=round(statistic, 4),
                    p_value=round(p_value, 6),
                )

Training Pipeline Observability

import mlflow
import time
from prometheus_client import Gauge, push_to_gateway

GPU_UTILIZATION = Gauge("training_gpu_utilization_percent", "GPU utilization", ["gpu_id"])
TRAINING_LOSS = Gauge("training_loss_current", "Current training loss", ["model", "split"])
EPOCH_DURATION = Gauge("training_epoch_duration_seconds", "Duration of last epoch", ["model"])

def train_epoch(model, dataloader, optimizer, epoch: int, run_name: str):
    epoch_start = time.time()
    total_loss = 0.0

    for batch_idx, (inputs, targets) in enumerate(dataloader):
        if batch_idx % 10 == 0:
            # Track GPU utilization every 10 batches (requires pynvml)
            # GPU_UTILIZATION.labels(gpu_id="0").set(get_gpu_util())
            pass

        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    epoch_secs = time.time() - epoch_start

    # Dual-write: Prometheus push gateway + MLflow for experiment tracking
    TRAINING_LOSS.labels(model=run_name, split="train").set(avg_loss)
    EPOCH_DURATION.labels(model=run_name).set(epoch_secs)
    push_to_gateway("localhost:9091", job="training")

    mlflow.log_metrics({
        "train_loss": avg_loss,
        "epoch_duration_s": epoch_secs,
    }, step=epoch)

A/B Test Metrics

from prometheus_client import Counter

VARIANT_METRIC = Counter(
    "ml_ab_test_outcome_total",
    "A/B test outcomes",
    ["experiment_id", "variant", "outcome"],
)

def serve_prediction(user_id: str, experiment_id: str):
    # Deterministic assignment by hash — same user always gets same variant
    variant = "treatment" if hash(user_id) % 100 < 50 else "control"
    model = treatment_model if variant == "treatment" else control_model

    with tracer.start_as_current_span("ab-prediction") as span:
        span.set_attribute("experiment.id", experiment_id)
        span.set_attribute("experiment.variant", variant)
        result = model.predict(get_features(user_id))

    return {"prediction": result, "variant": variant}

def record_outcome(experiment_id: str, variant: str, outcome: str):
    VARIANT_METRIC.labels(
        experiment_id=experiment_id,
        variant=variant,
        outcome=outcome,  # "click", "convert", "dismiss"
    ).inc()

12. Log Analysis & Querying

Grafana Loki + LogQL Deep Dive

LogQL queries consist of a stream selector (fast label index lookup) plus an optional filter pipeline (scans log content). Run all of these in Grafana Explore → Loki:

# ── Basic queries ──────────────────────────────────────────────────────
{container="sample-app"}
{container="sample-app"} |= "error"
{container="sample-app"} != "health"
{container="sample-app"} |~ "latency_ms.*[0-9]{3,}"

# ── JSON parsing ───────────────────────────────────────────────────────
{container="sample-app"} | json | level="warning"
{container="sample-app"} | json | latency_ms > 100
{container="sample-app"} | pattern `<_> latency=<latency>ms <_>`

# ── Metric queries ─────────────────────────────────────────────────────
# Request rate
rate({container="sample-app"} [1m])

# Error count per minute
count_over_time({container="sample-app"} | json | level="error" [1m])

# p99 latency from log field
quantile_over_time(0.99,
  {container="sample-app"}
    | json
    | unwrap latency_ms
    | __error__="" [5m]
) by (route)

# Log volume in bytes/sec (cost tracking)
bytes_rate({container="sample-app"} [5m])

Loki vs Elasticsearch/Kibana

	Loki (Grafana)	Elasticsearch + Kibana
Indexing	Only indexes labels. Log content not indexed.	Full-text indexes all fields.
Cost	Very cheap. Object storage + minimal compute.	Expensive. Large JVM heap, lots of SSD.
Query speed	Fast on labels. Slow on content (must scan).	Fast on all indexed fields.
Best for	High-volume application logs with well-defined labels.	Security logs (SIEM), complex free-text search.

When to Use Each Signal for Debugging

Question	Best signal	Why
Is my service healthy right now?	Metrics	Low latency, always available, easy to alert on
Why did this specific request fail?	Traces + Logs	Traces show where, logs show what
Which service is the bottleneck?	Traces	Waterfall view shows latency breakdown
Did this error affect other users?	Metrics + Logs	Metrics for rate, logs for user-specific details
What did the user do before the error?	Logs	Event sequence with full context
Is there a trend in errors over the past week?	Metrics	Long-retention, efficient time-series storage

13. Infrastructure Observability

node_exporter cAdvisor kube-state-metrics

Node Exporter — Host Metrics

# Add to docker-compose.yml for host metrics
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
    networks:
      - observability

# Key node_exporter PromQL queries

# CPU utilization %
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# Memory available %
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100

# Disk usage %
(node_filesystem_size_bytes - node_filesystem_avail_bytes)
  / node_filesystem_size_bytes * 100

# Network traffic bytes/sec
rate(node_network_receive_bytes_total{device!="lo"}[5m])
rate(node_network_transmit_bytes_total{device!="lo"}[5m])

# System load
node_load1    # 1-minute
node_load5    # 5-minute
node_load15   # 15-minute

Container Metrics — cAdvisor

# Add to docker-compose.yml
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: cadvisor
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - observability

# cAdvisor PromQL queries

# Container CPU usage rate
rate(container_cpu_usage_seconds_total{name="sample-app"}[5m]) * 100

# Container memory
container_memory_working_set_bytes{name="sample-app"}

# Network I/O
rate(container_network_receive_bytes_total{name="sample-app"}[5m])

# OOM kills (container killed for exceeding memory limit)
increase(container_oom_events_total[5m])

Kubernetes Metrics

# kube-state-metrics exposes cluster state as Prometheus metrics

# Pods not in Running or Succeeded state
kube_pod_status_phase{phase!="Running", phase!="Succeeded"} == 1

# Deployment rollout health
kube_deployment_status_replicas_available
  / kube_deployment_spec_replicas

# Resource requests vs limits by namespace
sum(kube_pod_container_resource_requests{resource="cpu"}) by (namespace)

# PVC capacity remaining %
(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) * 100

# Alert: pod crash loop
- alert: PodCrashLooping
  expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
  for: 5m
  annotations:
    summary: "Pod {{ $labels.pod }} is crash-looping"

Pre-built Grafana dashboards — import by ID

1860 — Node Exporter Full (host metrics)
893 — Docker and system monitoring (cAdvisor)
15661 — Kubernetes / Views / Global
3662 — Prometheus 2.0 Stats (self-monitoring)

In Grafana: Dashboards → Import → enter dashboard ID.

14. Incident Response & Debugging

The Debugging Workflow

Alert fires — Which service? What symptom (errors, latency, saturation)?
Dashboard — Open the service RED dashboard. Confirm the symptom. Note when it started. Check if it's one route/pod or systemic.
Traces — Grafana Explore → Tempo, filter by service.name and status=error or sort by duration. Find a representative bad trace.
Logs — Click the trace ID in Tempo to jump to correlated Loki logs. Read the full error message and stack trace.
Infrastructure — If logs show no application error, check host/container metrics. Was there a deploy? A memory spike? Disk full?
Root cause + fix — Document the timeline as you go.

# Rapid triage against the local stack

# 1. Current error rate in Prometheus
curl -s 'http://localhost:9090/api/v1/query?query=rate(app_http_requests_total%7Bstatus%3D~%225..%22%7D%5B5m%5D)'

# 2. Find recent errors in Loki (Grafana Explore -> Loki):
{container="sample-app"} | json | level="error"
  | line_format "{{.timestamp}} {{.event}} {{.trace_id}}"

# 3. Jump to trace in Tempo
# Grafana Explore -> Tempo -> TraceID tab: paste trace_id from log line

# 4. Container health
docker stats --no-stream sample-app

MTTD and MTTR

MTTD (Mean Time To Detect) — time from incident start to alert firing. Reduce by lowering alert for: durations and improving SLI coverage.
MTTR (Mean Time To Resolve) — time from detection to resolution. Reduce with better runbooks, dashboards, and trace-to-log correlation. Distributed tracing directly reduces MTTR by accelerating root cause identification.

Blameless Post-Mortem Template

## Incident Summary
- Date/time: 2026-02-23 14:30 UTC
- Duration: 47 minutes
- Impact: 12% of /checkout requests returned 500 (~8,400 failures)
- Severity: P2

## Timeline
- 14:30 — Alert: HighErrorRate fired
- 14:32 — On-call acknowledged; opened service dashboard
- 14:35 — Traces showed errors in postgres.UPDATE orders span (timeout)
- 14:38 — Logs: "too many connections" in DB span
- 14:41 — Root cause: connection pool exhausted after deploy at 14:15
- 14:47 — Mitigated by rolling back connection pool config
- 15:17 — Full resolution confirmed

## Root Cause
Deploy at 14:15 reduced connection pool max from 20 to 5.
At 14:30 traffic peak, pool was exhausted causing timeouts.

## Action Items
- Add connection pool exhaustion alert: @alice by 2026-03-01
- Add pool utilization panel to service dashboard: @bob by 2026-02-28
- Require load test for connection pool changes: @team

15. Cost & Scaling

High Cardinality — The Number One Pitfall

Never use user_id, order_id, or trace_id as a Prometheus label

With 1 million users, http_requests_total{user_id="..."} creates 1 million distinct time series. Prometheus will run out of memory. This is called a cardinality explosion. Use low-cardinality labels only: route, status_code, region, service. For per-user data, use logs or traces.

# Bad: cardinality = number of users (potentially millions of series)
REQUEST_COUNT.labels(user_id=user_id, route="/checkout").inc()

# Good: low-cardinality labels only
REQUEST_COUNT.labels(
    route="/checkout",
    status=str(response.status_code),
    region="us-east-1",
).inc()

# For per-user data: structured log line instead
log.info("checkout_request",
    user_id=user_id,           # lives in log body, not metric label
    amount_cents=amount,
    trace_id=get_trace_id(),
)

Retention Policies

Signal	Hot retention	Cold retention
Metrics (raw)	15-30 days	Downsampled to 1h resolution, kept 1 year (Thanos/Cortex)
Logs	7-30 days (fast SSD)	90-365 days on object storage (S3/GCS)
Traces	7-14 days	Deleted (high volume, hard to compress efficiently)

Reducing Trace Volume via Sampling

# At 10,000 req/s with 100% sampling: ~1 GB/hour of trace data
# With 1% head sampling:             ~10 MB/hour
# With tail sampling (errors+slow):  ~50 MB/hour

# Production recommendation:
# - 100% sample errors (tail-based)
# - 100% sample p99 latency outliers (tail-based)
# - 1% probabilistic for healthy traces (head-based)

# OTel Collector tail sampling config
processors:
  tail_sampling:
    decision_wait: 10s
    num_traces: 100000
    policies:
      - name: keep-errors
        type: status_code
        status_code: {status_codes: [ERROR]}
      - name: keep-slow
        type: latency
        latency: {threshold_ms: 500}
      - name: sample-rest
        type: probabilistic
        probabilistic: {sampling_percentage: 1}

Log Level Filtering in Production

import logging
import os

LOG_LEVEL = "WARNING" if os.getenv("ENV") == "production" else "INFO"
logging.basicConfig(level=getattr(logging, LOG_LEVEL))

# Suppress noisy third-party libraries selectively
logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
# Keep your own app at INFO or DEBUG as needed
logging.getLogger("myapp").setLevel(logging.DEBUG)

Self-Hosted vs SaaS Comparison

	Self-hosted OSS	Grafana Cloud	Datadog	New Relic
Metrics cost	Infra only (low)	$8/1k active series/mo	~$18-25/host/mo	Included in platform fee
Logs cost	Storage only	$0.50/GB ingested	$0.10/GB ingested	$0.25/GB ingested
Traces cost	Storage only	$0.35/GB ingested	$1.70/million spans	$0.25/GB ingested
Ops complexity	High — you run it	Low — managed	None	None
Features	Full control	Good, growing	Best-in-class	Strong APM
Best for	Cost-sensitive, infra-experienced team	OSS stack + managed reliability	Enterprise, high budget	Mid-size, APM focus

Quick Reference: PromQL Cheat Sheet

# ── Selectors ─────────────────────────────────────────────────────────
metric_name                             # current value
metric_name{label="value"}             # exact label match
metric_name{label=~"val1|val2"}        # regex match
metric_name{label!="value"}            # exclude
metric_name[5m]                         # range vector

# ── Functions ──────────────────────────────────────────────────────────
rate(counter[5m])                       # per-second rate (counter-safe)
increase(counter[1h])                   # total increase over window
irate(counter[5m])                      # instantaneous rate (2 samples)
delta(gauge[5m])                        # change in gauge
avg_over_time(gauge[5m])               # time average
histogram_quantile(0.99, rate(hist_bucket[5m]))

# ── Aggregations ───────────────────────────────────────────────────────
sum(metric) by (label)
avg(metric) without (pod)
topk(5, sum(rate(req[5m])) by (svc))
count(metric{status="500"})

# ── Binary operations ──────────────────────────────────────────────────
metric_a / metric_b
metric_a / on(service) metric_b        # explicit label matching
metric_a unless metric_b               # a where b has no value

# ── Common patterns ────────────────────────────────────────────────────
# Error ratio
sum(rate(req{status=~"5.."}[5m])) / sum(rate(req[5m]))

# Memory used %
(total - available) / total * 100

# Apdex score
(
  sum(rate(duration_bucket{le="0.3"}[5m]))
  + sum(rate(duration_bucket{le="1.2"}[5m])) / 2
) / sum(rate(duration_count[5m]))

Quick Reference: LogQL Cheat Sheet

# ── Stream selectors ──────────────────────────────────────────────────
{app="myapp"}
{app=~"myapp.*"}
{app="myapp", env="prod"}

# ── Line filters ───────────────────────────────────────────────────────
|= "error"                             # contains
!= "health"                            # does not contain
|~ "error|exception"                   # regex
!~ "debug|trace"                       # regex negate

# ── Parsers ────────────────────────────────────────────────────────────
| json
| logfmt
| pattern `<method> <path> <status>`
| regexp `(?P<ip>\d+\.\d+\.\d+\.\d+)`

# ── Label filters (post-parse) ─────────────────────────────────────────
| level="error"
| status_code >= 500
| duration > 1s

# ── Output formatting ──────────────────────────────────────────────────
| line_format "{{.level}} {{.msg}}"
| label_format level=severity

# ── Metric queries ─────────────────────────────────────────────────────
rate({app="myapp"} [5m])
count_over_time({app="myapp"} [5m])
bytes_rate({app="myapp"} [5m])
quantile_over_time(0.99,
  {app="myapp"} | json | unwrap latency_ms [5m]
) by (service)

Grafana Dashboard JSON — Sample App RED (import-ready)

Import in Grafana: Dashboards → Import → paste JSON → Load.

{
  "title": "Sample App RED",
  "uid": "sample-app-red-v1",
  "schemaVersion": 38,
  "time": {"from": "now-15m", "to": "now"},
  "refresh": "10s",
  "panels": [
    {
      "id": 1, "type": "stat", "title": "Request Rate",
      "gridPos": {"x":0,"y":0,"w":6,"h":4},
      "targets": [{"expr":"sum(rate(app_http_requests_total[5m]))","legendFormat":"req/s"}],
      "fieldConfig": {"defaults": {"unit": "reqps",
        "thresholds": {"steps": [
          {"color":"green","value":null},
          {"color":"yellow","value":50},
          {"color":"red","value":200}
        ]}}}
    },
    {
      "id": 2, "type": "stat", "title": "Error Rate",
      "gridPos": {"x":6,"y":0,"w":6,"h":4},
      "targets": [{"expr":"sum(rate(app_http_requests_total{status=~\"5..\"}[5m])) / sum(rate(app_http_requests_total[5m])) * 100","legendFormat":"error %"}],
      "fieldConfig": {"defaults": {"unit": "percent",
        "thresholds": {"steps": [
          {"color":"green","value":null},
          {"color":"yellow","value":1},
          {"color":"red","value":5}
        ]}}}
    },
    {
      "id": 3, "type": "timeseries", "title": "Latency Percentiles",
      "gridPos": {"x":0,"y":4,"w":12,"h":8},
      "targets": [
        {"expr":"histogram_quantile(0.50, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))","legendFormat":"p50"},
        {"expr":"histogram_quantile(0.95, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))","legendFormat":"p95"},
        {"expr":"histogram_quantile(0.99, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))","legendFormat":"p99"}
      ],
      "fieldConfig": {"defaults": {"unit": "s"}}
    }
  ]
}