Observability Refresher
Logs, metrics, traces, alerting, and dashboards for software engineering, data engineering, and MLOps
Table of Contents
0. Setup & Environment — Full Local Stack
Every code example in this page runs against a local Docker Compose stack that mirrors production setups like Grafana Cloud, Datadog, or AWS Managed Prometheus. Stand it up once and keep it running as you work through each section.
Config Files
Create a working directory and populate the config files below:
mkdir -p ~/observability-lab/config
cd ~/observability-lab
docker-compose.yml
# NOTE(review): the top-level `version` key is obsolete under Compose v2 and
# only produces a warning; kept here for older docker-compose compatibility.
version: "3.9"

# Single bridge network so every service can reach the others by service name.
networks:
  observability:
    driver: bridge

# Named volumes persist data across `docker compose down` (without -v).
volumes:
  prometheus_data: {}
  grafana_data: {}
  loki_data: {}
  tempo_data: {}

services:
  # ── Metrics ──────────────────────────────────────────────────────────
  prometheus:
    image: prom/prometheus:v2.50.0
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      # NOTE(review): prometheus.yml lists rule file "alerts.yml", which is
      # NOT mounted here. Before enabling rule_files, create config/alerts.yml
      # (section 6) and add:
      #   - ./config/alerts.yml:/etc/prometheus/alerts.yml:ro
      # Prometheus refuses to start when a listed rule file is missing.
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=7d"
      # Enables POST /-/reload so config changes apply without a restart.
      - "--web.enable-lifecycle"
    networks:
      - observability
    restart: unless-stopped

  # ── Dashboards ───────────────────────────────────────────────────────
  grafana:
    image: grafana/grafana:10.3.1
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
    environment:
      # Lab-only credentials — never ship defaults like these to production.
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - observability
    depends_on:
      - prometheus
      - loki
      - tempo
    restart: unless-stopped

  # ── Log Aggregation ──────────────────────────────────────────────────
  loki:
    image: grafana/loki:2.9.4
    container_name: loki
    ports:
      - "3100:3100"
    volumes:
      - ./config/loki.yaml:/etc/loki/local-config.yaml:ro
      - loki_data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - observability
    restart: unless-stopped

  # ── Distributed Tracing ──────────────────────────────────────────────
  tempo:
    image: grafana/tempo:2.4.0
    container_name: tempo
    ports:
      - "3200:3200"
      # Container-port-only entry: 4317 is reachable from other services on
      # the network but is NOT bound to host port 4317 (otel-collector owns
      # that host port below).
      - "4317" # internal OTLP gRPC (used by otel-collector)
    volumes:
      - ./config/tempo.yaml:/etc/tempo.yaml:ro
      - tempo_data:/tmp/tempo
    command: -config.file=/etc/tempo.yaml
    networks:
      - observability
    restart: unless-stopped

  # ── Log Shipper ──────────────────────────────────────────────────────
  promtail:
    image: grafana/promtail:2.9.4
    container_name: promtail
    volumes:
      - ./config/promtail-config.yaml:/etc/promtail/config.yaml:ro
      # Read-only Docker socket: needed for container discovery + log tailing.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yaml
    networks:
      - observability
    depends_on:
      - loki
    restart: unless-stopped

  # ── OpenTelemetry Collector ──────────────────────────────────────────
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.95.0
    container_name: otel-collector
    ports:
      - "4317:4317" # OTLP gRPC
      - "4318:4318" # OTLP HTTP
      - "8888:8888" # Collector self-metrics (Prometheus)
      - "8889:8889" # Prometheus exporter for app metrics
    volumes:
      - ./config/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro
    networks:
      - observability
    depends_on:
      - prometheus
      - loki
      - tempo
    restart: unless-stopped

  # ── Sample Python App ────────────────────────────────────────────────
  sample-app:
    build:
      context: ./sample-app
      dockerfile: Dockerfile
    container_name: sample-app
    ports:
      - "8000:8000"
    environment:
      # gRPC OTLP endpoint — matches the collector's 4317 receiver.
      - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
      - OTEL_SERVICE_NAME=sample-app
      - OTEL_RESOURCE_ATTRIBUTES=deployment.environment=local,service.version=1.0.0
    networks:
      - observability
    depends_on:
      - otel-collector
    restart: unless-stopped
config/prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often rules would be evaluated

# FIX(review): the original listed `- "alerts.yml"` here, but docker-compose
# mounts only prometheus.yml into the container. Prometheus fails to start
# when an explicitly listed rule file is missing, so the whole stack
# crash-loops on first boot. Keep this commented out until you have:
#   1. created config/alerts.yml (section 6), and
#   2. added `- ./config/alerts.yml:/etc/prometheus/alerts.yml:ro`
#      to the prometheus service volumes in docker-compose.yml.
# Then uncomment and reload (curl -X POST localhost:9090/-/reload).
# rule_files:
#   - "alerts.yml"

scrape_configs:
  # Prometheus self-monitoring
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  # OTel Collector self-metrics
  - job_name: "otel-collector"
    static_configs:
      - targets: ["otel-collector:8888"]

  # App metrics exported by OTel Collector
  - job_name: "sample-app"
    static_configs:
      - targets: ["otel-collector:8889"]

  # Grafana self-metrics (Grafana serves /metrics on its HTTP port)
  - job_name: "grafana"
    static_configs:
      - targets: ["grafana:3000"]
config/otel-collector-config.yaml
# Collector pipeline: receive OTLP from the app, fan out to Tempo (traces),
# Prometheus (metrics, pull) and Loki (logs, push).
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  # Batch telemetry in memory before export to cut outbound request count.
  batch:
    timeout: 1s
    send_batch_size: 1024
  # Drops/refuses data as memory approaches the cap. Must be the FIRST
  # processor in every pipeline (and is, in the `service` section below).
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
  # Hint consumed by the Loki exporter: promote these resource attributes
  # to Loki stream labels.
  resource:
    attributes:
      - action: insert
        key: loki.resource.labels
        value: service.name, deployment.environment

exporters:
  # Send traces to Tempo
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true  # plaintext gRPC — fine inside the compose network only
  # Expose metrics for Prometheus to scrape
  prometheus:
    endpoint: "0.0.0.0:8889"
    namespace: app  # metric names are prefixed "app_"
  # Send logs to Loki
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    default_labels_enabled:
      exporter: false
      job: true
  # Debug output (disable in production)
  # NOTE(review): defined but not referenced by any pipeline below — add it
  # to a pipeline's `exporters:` list to see its output.
  debug:
    verbosity: basic

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [otlp/tempo]
    metrics:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [prometheus]
    logs:
      # `resource` runs only here: the label hint matters just for Loki.
      receivers: [otlp]
      processors: [memory_limiter, batch, resource]
      exporters: [loki]
config/loki.yaml
# Single-binary Loki with filesystem storage — sized for a local lab only.
auth_enabled: false  # no multi-tenancy; every push lands in one tenant

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1  # single instance — no replication
  ring:
    kvstore:
      store: inmemory  # ring state lost on restart; fine for one instance

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      # NOTE(review): current Loki docs pair the tsdb store with schema v13;
      # v12 is accepted here but confirm against the release notes for 2.9.x.
      schema: v12
      index:
        prefix: index_
        period: 24h

ruler:
  # No Alertmanager runs in this compose stack — ruler-fired alerts go
  # nowhere until one is added at this address.
  alertmanager_url: http://localhost:9093

limits_config:
  reject_old_samples: false  # lab setting; production typically rejects old samples
config/tempo.yaml
server:
http_listen_port: 3200
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
ingester:
max_block_duration: 5m
compactor:
compaction:
block_retention: 1h
storage:
trace:
backend: local
wal:
path: /tmp/tempo/wal
local:
path: /tmp/tempo/blocks
metrics_generator:
registry:
external_labels:
source: tempo
cluster: local
storage:
path: /tmp/tempo/generator/wal
processors: [service-graphs, span-metrics]
config/promtail-config.yaml
server:
  http_listen_port: 9080
  grpc_listen_port: 0  # 0 disables the gRPC listener

# Tracks the read offset per log stream so a promtail restart does not
# re-ship logs already sent to Loki.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Discover containers through the Docker socket and tail their stdout/stderr.
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
        filters:
          - name: status
            values: ["running"]  # skip stopped containers
    relabel_configs:
      # Expose the Docker container name as label `container` ...
      - source_labels: [__meta_docker_container_name]
        target_label: container
      # ... and the docker-compose service name as label `service`.
      - source_labels: [__meta_docker_container_label_com_docker_compose_service]
        target_label: service
Sample App Files
mkdir -p ~/observability-lab/sample-app
Create sample-app/requirements.txt:
# Web framework + ASGI server
fastapi==0.110.0
uvicorn==0.27.1
# OpenTelemetry SDK/API and the OTLP-over-gRPC exporter
opentelemetry-sdk==1.23.0
opentelemetry-api==1.23.0
opentelemetry-exporter-otlp-proto-grpc==1.23.0
# Auto-instrumentation packages (0.44b0 is the instrumentation release
# that pairs with SDK 1.23.0 — keep these versions in lockstep)
opentelemetry-instrumentation-fastapi==0.44b0
opentelemetry-instrumentation-logging==0.44b0
# Pull-based /metrics endpoint (alternative to OTLP push) + JSON logging
prometheus-client==0.20.0
structlog==24.1.0
Create sample-app/app.py:
"""
Sample FastAPI app with full OpenTelemetry instrumentation.
Generates logs, metrics, and traces on every request.
"""
import asyncio
import os
import random
import time

import structlog
from fastapi import FastAPI, HTTPException
from opentelemetry import trace, metrics
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
# ── Resource (common labels attached to all telemetry) ─────────────────
resource = Resource.create({
    "service.name": os.getenv("OTEL_SERVICE_NAME", "sample-app"),
    "deployment.environment": "local",
    "service.version": "1.0.0",
})

# ── Tracing setup ───────────────────────────────────────────────────────
# Endpoint is injected by docker-compose (http://otel-collector:4317); the
# localhost fallback lets the app run outside Docker for local debugging.
otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(
    # Batches finished spans in-process; insecure=True → plaintext gRPC.
    BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True))
)
trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer(__name__)

# ── Metrics setup ───────────────────────────────────────────────────────
metric_reader = PeriodicExportingMetricReader(
    OTLPMetricExporter(endpoint=otlp_endpoint, insecure=True),
    export_interval_millis=10000,  # push every 10 s
)
meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
metrics.set_meter_provider(meter_provider)
meter = metrics.get_meter(__name__)

# Custom instruments
request_counter = meter.create_counter(
    "http_requests_total",
    description="Total number of HTTP requests",
)
request_duration = meter.create_histogram(
    "http_request_duration_seconds",
    description="HTTP request duration in seconds",
    unit="s",
)
active_requests = meter.create_up_down_counter(
    "http_active_requests",
    description="Number of requests currently being processed",
)

# ── Structured logging setup ────────────────────────────────────────────
LoggingInstrumentor().instrument()  # Injects trace_id/span_id into log records
structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        # One JSON object per line → directly parseable with LogQL `| json`.
        structlog.processors.JSONRenderer(),
    ],
    wrapper_class=structlog.make_filtering_bound_logger(20), # INFO+
    context_class=dict,
    # Print to stdout so Docker (and therefore Promtail) captures the lines.
    logger_factory=structlog.PrintLoggerFactory(),
)
log = structlog.get_logger()

# ── App ─────────────────────────────────────────────────────────────────
app = FastAPI(title="Observability Sample App")
# Auto-creates a server span (http.method, http.route, ...) for every request.
FastAPIInstrumentor.instrument_app(app)
@app.get("/hello")
async def hello():
    """Return a greeting after a short simulated delay.

    Emits one child span, increments the request counter, records the
    latency histogram, and writes one structured log line per call.
    """
    with tracer.start_as_current_span("process-hello") as span:
        span.set_attribute("app.greeting", "world")
        latency = random.uniform(0.01, 0.15)
        # FIX(review): the original used time.sleep() here. A blocking sleep
        # inside an `async def` endpoint stalls the whole event loop, freezing
        # every other in-flight request; asyncio.sleep yields control instead.
        await asyncio.sleep(latency)
        request_counter.add(1, {"route": "/hello", "method": "GET", "status": "200"})
        request_duration.record(latency, {"route": "/hello"})
        log.info("hello request processed", latency_ms=round(latency * 1000, 2))
        return {"message": "Hello, Observability World!", "latency_ms": round(latency * 1000, 2)}
@app.get("/error")
async def error_endpoint():
    """Occasionally raises errors so you can observe error metrics and traces."""
    active_requests.add(1)
    try:
        should_fail = random.random() < 0.5
        if not should_fail:
            request_counter.add(1, {"route": "/error", "method": "GET", "status": "200"})
            return {"status": "ok"}
        # ~50% of calls: record the failure, log it, then surface an HTTP 500.
        request_counter.add(1, {"route": "/error", "method": "GET", "status": "500"})
        log.error("simulated internal error", reason="random failure")
        raise HTTPException(status_code=500, detail="Simulated server error")
    finally:
        # Gauge is decremented on success AND failure paths.
        active_requests.add(-1)
@app.get("/slow")
async def slow_endpoint():
    """Simulate a slow downstream call — useful for latency histograms.

    Produces a parent span with a nested "db query" child span, records the
    latency histogram, and logs a warning for every call.
    """
    with tracer.start_as_current_span("slow-operation") as span:
        with tracer.start_as_current_span("downstream-db-query") as child:
            latency = random.uniform(0.5, 2.0)
            child.set_attribute("db.system", "postgresql")
            child.set_attribute("db.statement", "SELECT * FROM events")
            # FIX(review): time.sleep() in an async endpoint blocks the event
            # loop for up to 2 s per request; asyncio.sleep yields instead.
            await asyncio.sleep(latency)
        span.set_attribute("total_latency_ms", round(latency * 1000, 2))
        request_duration.record(latency, {"route": "/slow"})
        log.warning("slow request detected", latency_s=round(latency, 3), threshold_s=0.5)
        return {"latency_ms": round(latency * 1000, 2)}
@app.get("/health")
async def health():
    """Liveness endpoint: always reports a static healthy status."""
    payload = {"status": "healthy"}
    return payload
Create sample-app/Dockerfile:
# Slim base keeps the image small; 3.12 matches the pinned dependency set.
FROM python:3.12-slim

WORKDIR /app

# Copy and install requirements first so the dependency layer is cached
# and rebuilt only when requirements.txt changes, not on every app.py edit.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

# FIX(review): without this, Python block-buffers stdout when not attached
# to a TTY, so the app's JSON log lines reach `docker logs` (and therefore
# Promtail/Loki) late or not at all on crash.
ENV PYTHONUNBUFFERED=1

# Documents the port published by docker-compose.
EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
Starting the Stack
cd ~/observability-lab
docker compose up -d
# Verify all services are running
docker compose ps
# Expected: all 7 containers in "Up" state
Verification & Data Source Setup
Open each endpoint and confirm it responds:
# 1. Prometheus — check targets
# NOTE: `open` is macOS-specific — on Linux use `xdg-open`, or just paste
# the URL into a browser.
open http://localhost:9090/targets
# All targets should show "UP"

# 2. Grafana — default login admin/admin
open http://localhost:3000

# 3. Send traffic to sample app
# (`|| true` is defensive: curl without -f exits 0 even on HTTP 500,
#  but this keeps the loop alive under `set -e` shells too)
for i in {1..20}; do
  curl -s http://localhost:8000/hello > /dev/null
  curl -s http://localhost:8000/error > /dev/null || true
  curl -s http://localhost:8000/slow > /dev/null
done
In Grafana, add data sources (Connections > Data sources > Add):
- Prometheus — URL: http://prometheus:9090
- Loki — URL: http://loki:3100
- Tempo — URL: http://tempo:3200 — enable "Trace to logs" linking to Loki, mapped on the service.name tag. This lets you click a trace span and jump directly to the correlated log lines.
# Tear everything down when done
# WARNING: -v also removes the named volumes, permanently deleting all
# collected metrics, logs, and traces. Drop -v to keep the data.
docker compose down -v
1. The Three Pillars
Observability is the ability to understand the internal state of a system by examining its external outputs. The three primary output types — logs, metrics, and traces — each answer different questions. Mature observability requires all three working together.
Logs, Metrics, Traces — What Each Answers
| Signal | What it answers | Cardinality | Cost | Best for |
|---|---|---|---|---|
| Logs | What happened and when, with full context | High (unbounded) | High at scale | Debugging, audit trails, error details |
| Metrics | How much / how fast / how often, over time | Low (predefined labels) | Low | Alerting, trending, dashboards |
| Traces | Where time was spent across a distributed request | High (per-request) | Medium (with sampling) | Latency debugging, dependency mapping |
What Each Signal Tells You
Logs are timestamped, discrete event records. They capture rich context — stack traces, user IDs, query parameters, error messages — that metrics cannot represent. The cost is storage and query latency at high volume.
Metrics are numeric measurements sampled over time. A counter of HTTP requests, a gauge of memory usage, a histogram of response times. They are cheap, compress well, and are ideal for alerting because you can set thresholds on them.
Traces follow a request as it flows through multiple services, recording the time spent in each operation (called a span). A trace answers "why was this request slow?" in a way that neither logs nor metrics alone can answer.
Observability vs Monitoring
Monitoring is checking known failure modes: "alert if error rate > 5%." It requires you to predict what can go wrong. Observability is the ability to ask arbitrary questions about system behavior — including questions you did not predict you would need to ask. Observability enables monitoring, but goes further.
Correlation: Tying the Three Pillars Together
The real power emerges when you correlate signals. A trace ID (generated per-request) can be embedded in logs and referenced in metrics labels, allowing you to jump from an alert to the relevant traces to the relevant log lines in a single workflow.
# A log line with trace context embedded (generated by our sample app):
{
"event": "slow request detected",
"level": "warning",
"latency_s": 1.83,
"threshold_s": 0.5,
"timestamp": "2026-02-23T10:12:34.123Z",
"trace_id": "4bf92f3577b34da6a3ce929d0e0e4736", # links to Tempo
"span_id": "00f067aa0ba902b7"
}
Try it in Grafana: run curl http://localhost:8000/slow, then in Grafana Explore select the Loki data source and query {service="sample-app"} |= "slow". Click the trace ID link in the log line to jump to Tempo.
2. OpenTelemetry
CNCF v1.x stable
OpenTelemetry (OTel) is the vendor-neutral, CNCF-graduated standard for generating, collecting, and exporting telemetry data. It replaces a fragmented ecosystem of vendor SDKs (Datadog agent, Jaeger client, Zipkin client) with a single API that can send to any backend.
Architecture
The OTel architecture has three layers:
- SDK — embedded in your application. Provides the API (tracer, meter, logger) and buffers telemetry before export.
- Collector — a standalone agent/gateway that receives, processes, and exports telemetry. Decouples your app from backend specifics.
- Backend — Prometheus, Grafana Loki, Grafana Tempo, Jaeger, Datadog, etc.
# Your app sends via OTLP to the Collector running in Docker
# The Collector fans out to Prometheus + Loki + Tempo
App (SDK) ──OTLP gRPC─→ OTel Collector ──→ Prometheus (metrics scrape)
──→ Loki (log push)
──→ Tempo (trace push)
Auto vs Manual Instrumentation
Auto-instrumentation uses monkey-patching or bytecode injection to instrument popular frameworks (FastAPI, Flask, Django, SQLAlchemy, requests, etc.) with zero code changes. It handles HTTP span creation, error recording, and context propagation automatically.
# Auto-instrumentation: one line instruments all FastAPI routes
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
FastAPIInstrumentor.instrument_app(app)
# Now every HTTP request automatically creates a trace span with:
# http.method, http.route, http.status_code, http.url
Manual instrumentation lets you add business-level spans for operations the framework knows nothing about — database queries, cache lookups, third-party API calls, background jobs.
from opentelemetry import trace

tracer = trace.get_tracer(__name__)

def process_payment(order_id: str, amount: float):
    """Charge a payment inside a manually-created span.

    Illustrative snippet: `stripe_client` and `stripe` are assumed to be
    imported/configured elsewhere in the real service.
    """
    with tracer.start_as_current_span("process-payment") as span:
        # Span attributes add searchable dimensions to the trace
        span.set_attribute("order.id", order_id)
        span.set_attribute("payment.amount_cents", int(amount * 100))
        span.set_attribute("payment.currency", "USD")
        try:
            result = stripe_client.charge(amount)
            span.set_attribute("payment.status", "success")
            span.set_attribute("payment.charge_id", result.id)
            return result
        except stripe.CardError as e:
            # Record the exception in the span — visible in Tempo
            span.record_exception(e)
            # Mark the span itself as failed so it is filterable by status.
            span.set_status(trace.StatusCode.ERROR, str(e))
            raise
Spans, Attributes, and Events
- Span — a named, timed operation. Has a start time, end time, status, and optional attributes.
- Span attribute — a key-value pair on a span. Use semantic conventions: http.method, db.system, rpc.service.
- Span event — a timestamped annotation within a span. Use for significant moments that are not sub-operations (e.g., "cache miss", "retry attempt 2").
- Span link — a reference to a span in another trace. Used for async fan-out (e.g., a Kafka message links back to the producer's trace).
with tracer.start_as_current_span("batch-process") as span:
span.set_attribute("batch.size", len(items))
# Span event: a point-in-time annotation
span.add_event("processing-started", {"worker_id": worker_id})
for i, item in enumerate(items):
if i % 100 == 0:
span.add_event("checkpoint", {"processed": i})
span.add_event("processing-complete", {"failed": error_count})
Context Propagation — W3C traceparent
For a trace to span multiple services, the trace context must travel with the request. The W3C traceparent header is the standard mechanism:
# Format: version-trace_id-parent_span_id-flags
traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
# Fields, left to right: version ("00") — 128-bit trace ID (hex) —
# 64-bit parent span ID (hex) — trace flags ("01" = sampled)
OTel SDKs inject and extract this header automatically when you use the instrumented HTTP clients. For message queues (Kafka, SQS), you inject the context into message headers at produce time and extract at consume time.
# Injecting context into Kafka message headers (manual)
from opentelemetry.propagate import inject
headers = {}
inject(headers) # adds traceparent, tracestate headers
producer.produce(topic, value=payload, headers=headers)
Collector Pipeline Deep Dive
The Collector config is structured as receivers → processors → exporters pipelines. You can have multiple pipelines (one per signal type):
service:
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch, tail_sampling]
exporters: [otlp/tempo, debug]
metrics:
receivers: [otlp, prometheus] # collect from both OTLP and Prometheus scrape
processors: [memory_limiter, batch, filter/drop_internal]
exporters: [prometheus, otlp/cloud]
logs:
receivers: [otlp, filelog] # OTLP from app + file tailing
processors: [memory_limiter, batch, resource, attributes]
exporters: [loki]
Resource Attributes
Resource attributes describe the entity producing telemetry, not the individual request. They are attached to all signals from a given process. Semantic conventions define standard names:
| Attribute | Example | Required? |
|---|---|---|
service.name | payment-service | Yes |
service.version | 2.4.1 | Recommended |
deployment.environment | production | Recommended |
host.name | ip-10-0-1-42 | Auto-detected |
k8s.pod.name | payment-7d9f6b-xkz2p | Auto-detected (k8s) |
# Try it: query metrics with service.name label in Prometheus
# Open http://localhost:9090 and run:
app_http_requests_total{job="sample-app"}
# View in Grafana Explore → Tempo, filter by service.name = "sample-app"
3. Structured Logging
Structured logging means emitting logs as machine-parseable key-value records (typically JSON) rather than freeform text strings. This is the single highest-leverage logging improvement you can make.
Why Structured Beats Unstructured
| Unstructured | Structured (JSON) |
|---|---|
ERROR: payment failed for user 12345 after 3 retries |
{"level":"error","event":"payment_failed","user_id":12345,"retries":3} |
| Requires regex to extract fields | Direct field-based filtering and aggregation |
| Brittle — a wording change breaks log parsers | Schema is stable; field names are constants |
| Cannot aggregate "count errors by user_id" | count_over_time({app="api"} | json | level="error" [5m]) |
Python structlog Example
import structlog
import logging

# Configure structlog with JSON output
structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,  # thread-local context
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.JSONRenderer(),  # final step: one JSON object per line
    ],
    # Filter below INFO cheaply, before processors run.
    wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
    context_class=dict,
    logger_factory=structlog.PrintLoggerFactory(),  # write to stdout
)
log = structlog.get_logger()

# Basic structured log
log.info("user_login", user_id=12345, ip="203.0.113.1", method="oauth2")

# Bind context for all subsequent calls in this scope
bound_log = log.bind(request_id="req-abc123", service="auth")
bound_log.info("session_created", session_ttl_s=3600)
bound_log.warning("suspicious_activity", failed_attempts=5)

# Exception logging with full stack trace
# (process_payment is assumed to be defined elsewhere in the real service)
try:
    process_payment(order_id="ord-789")
except Exception:
    # .exception() logs at ERROR level and attaches the current traceback.
    log.exception("payment_processing_failed", order_id="ord-789", amount_cents=9999)
Log Levels — When to Use Each
| Level | Use when | Production default? |
|---|---|---|
| DEBUG | Detailed execution flow, variable values. Expensive — disable in prod. | No |
| INFO | Normal business events: user logged in, job completed, request served. | Yes |
| WARNING | Unexpected state that the system recovered from: retry succeeded, fallback used, latency spike. | Yes |
| ERROR | A request or operation failed. Something needs attention. | Yes |
| CRITICAL | System-level failure that may require immediate intervention: DB unreachable, disk full. | Yes |
Correlation IDs
Every log line should carry the trace ID and span ID of the current request. This is what enables the "alert → trace → logs" workflow:
# OpenTelemetry's LoggingInstrumentor injects these automatically
# The output JSON looks like:
{
"event": "payment_processed",
"level": "info",
"amount_cents": 9999,
"timestamp": "2026-02-23T10:15:00.123Z",
"trace_id": "4bf92f3577b34da6a3ce929d0e0e4736",
"span_id": "00f067aa0ba902b7",
"service.name": "payment-service",
"deployment.environment": "production"
}
Log Aggregation Pipeline
# Our local pipeline:
# App (stdout JSON) → Docker → Promtail (reads Docker socket) → Loki → Grafana
# Query in Grafana Explore → Loki:
{container="sample-app"} # all logs from container
{container="sample-app"} |= "error" # contains "error"
{container="sample-app"} | json | level="error" # parse JSON, filter by field
{container="sample-app"} | json | latency_ms > 100 # numeric field filter
LogQL Queries
LogQL is Loki's query language. It has two forms: log queries (return log lines) and metric queries (derive metrics from log lines).
# Log query: stream selector + filter pipeline
{service="sample-app", container="sample-app"}
| json # parse JSON fields
| level = "warning" # filter by level field
| line_format "{{.timestamp}} {{.event}}" # reformat output
# Metric query: count errors per minute
rate({container="sample-app"} | json | level="error" [1m])
# Metric query: 99th percentile latency (requires numeric field)
quantile_over_time(0.99,
{container="sample-app"}
| json
| unwrap latency_ms [5m]
) by (route)
# Try it in Grafana: Explore → Loki → paste this query:
{container="sample-app"} | json | level != "debug"
4. Metrics
Prometheus OpenMetrics
Metric Types
| Type | Description | Example | Operations |
|---|---|---|---|
| Counter | Monotonically increasing. Never decreases (except on reset). | http_requests_total |
rate(), increase() |
| Gauge | Can go up or down. Point-in-time measurement. | memory_usage_bytes |
Direct value, avg_over_time() |
| Histogram | Samples observations into predefined buckets. Tracks sum and count. | request_duration_seconds |
histogram_quantile(), rate() |
| Summary | Client-side quantile calculation. Accurate but cannot be aggregated across instances. | rpc_duration_seconds |
Direct {quantile="0.99"} |
Prefer histograms over summaries: histogram buckets can be aggregated across instances and quantiles computed server-side with histogram_quantile(0.99, sum(rate(duration_bucket[5m])) by (le)). Summaries calculate quantiles client-side — you cannot aggregate them across instances, making them nearly useless in a multi-pod deployment.
Naming Conventions
# Format: [namespace_][subsystem_]name[_unit][_total|_bucket|_count|_sum]
# Use snake_case. End counters with _total. Include units.
http_requests_total # counter
http_request_duration_seconds # histogram
http_request_duration_seconds_bucket # (auto-generated by histogram)
http_request_duration_seconds_count # (auto-generated)
http_request_duration_seconds_sum # (auto-generated)
process_resident_memory_bytes # gauge
cache_hits_total # counter
kafka_consumer_lag_offsets # gauge (lag per partition)
Prometheus Exposition Format
# Text format served at /metrics endpoint
# HELP describes the metric; TYPE declares its kind
# HELP http_requests_total Total number of HTTP requests
# TYPE http_requests_total counter
http_requests_total{method="GET",route="/hello",status="200"} 142.0
http_requests_total{method="GET",route="/error",status="500"} 23.0
# HELP http_request_duration_seconds HTTP request duration in seconds
# TYPE http_request_duration_seconds histogram
http_request_duration_seconds_bucket{route="/hello",le="0.05"} 89.0
http_request_duration_seconds_bucket{route="/hello",le="0.1"} 130.0
http_request_duration_seconds_bucket{route="/hello",le="0.25"} 142.0
http_request_duration_seconds_bucket{route="/hello",le="+Inf"} 142.0
http_request_duration_seconds_count{route="/hello"} 142.0
http_request_duration_seconds_sum{route="/hello"} 11.43
Custom Metrics in Python
from prometheus_client import Counter, Gauge, Histogram, start_http_server

# Define metrics at module level (global singletons)
REQUEST_COUNT = Counter(
    "http_requests_total",
    "Total HTTP request count",
    ["method", "endpoint", "http_status"],  # label dimensions
)
ACTIVE_CONNECTIONS = Gauge(
    "active_websocket_connections",
    "Number of active WebSocket connections",
)
REQUEST_LATENCY = Histogram(
    "http_request_duration_seconds",
    "HTTP request latency in seconds",
    ["endpoint"],
    # Explicit bucket edges tuned for sub-second web latencies.
    buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
)

# Usage
REQUEST_COUNT.labels(method="GET", endpoint="/hello", http_status=200).inc()
ACTIVE_CONNECTIONS.inc()
ACTIVE_CONNECTIONS.dec()

# Context manager measures duration automatically
# (process_payment / order_id are assumed to be defined elsewhere)
with REQUEST_LATENCY.labels(endpoint="/payments").time():
    result = process_payment(order_id)

# Expose metrics endpoint (alternative to OTel push)
start_http_server(port=8001)  # Prometheus scrapes http://app:8001/metrics
PromQL
PromQL is Prometheus's query language. Open http://localhost:9090 and run these:
# Rate of requests per second over the last 5 minutes
rate(app_http_requests_total[5m])

# Total requests in the last hour by route
sum(increase(app_http_requests_total[1h])) by (route)

# Error rate as a percentage.
# FIX(review): the naive form rate(x{status="500"}[5m]) / rate(x[5m]) is
# wrong — PromQL one-to-one vector matching requires identical label sets,
# so each status="500" series only matches *itself* on the right-hand side
# and the ratio is always 100%. Aggregate both sides first:
sum(rate(app_http_requests_total{status="500"}[5m]))
/
sum(rate(app_http_requests_total[5m]))
* 100

# 99th percentile latency (requires histogram)
histogram_quantile(0.99,
  sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le, route)
)

# Current memory usage across all pods
sum(container_memory_working_set_bytes{namespace="production"}) by (pod)

# CPU usage rate
rate(container_cpu_usage_seconds_total[5m])
Recording Rules
Recording rules pre-compute expensive queries and store results as new metrics. Use them for aggregations that are queried repeatedly (e.g., dashboard panels, alert expressions):
# config/alerts.yml — NOTE(review): the original comment said
# "prometheus-alerts.yml", but prometheus.yml's rule_files loads "alerts.yml"
# (the same file used for alerting rules in section 6). Recording rules and
# alerting rules can share one rule file; make sure it is mounted into the
# Prometheus container.
groups:
  - name: recording_rules
    interval: 30s  # evaluate this group every 30s (overrides the global interval)
    rules:
      # Pre-compute per-route 5m request rate
      - record: job:http_requests:rate5m
        expr: sum(rate(app_http_requests_total[5m])) by (job, route)
      # Pre-compute error ratio
      - record: job:http_error_ratio:rate5m
        expr: |
          sum(rate(app_http_requests_total{status=~"5.."}[5m])) by (job)
          /
          sum(rate(app_http_requests_total[5m])) by (job)
5. Distributed Tracing
Spans and Traces
A trace is the complete journey of a single request through your system. It consists of spans — individual units of work — arranged in a parent-child tree. The root span represents the entry point (e.g., HTTP request received); child spans represent downstream calls (database query, cache lookup, external API call).
# Trace waterfall view (conceptual):
[──────────────── HTTP GET /checkout (root span, 450ms) ─────────────────]
[── auth-check (12ms) ──]
[── cart-service.get_cart (80ms) ──────]
[── postgres.SELECT orders (65ms) ──]
[── inventory-service.reserve (180ms) ────────────────────]
[── redis.GET sku:123 (3ms)]
[── postgres.UPDATE inventory (170ms) ─────────────────]
[── payment-service.charge (120ms) ────────────]
This waterfall immediately shows that postgres.UPDATE inventory is the bottleneck — something you cannot see from a single service's metrics alone.
Sampling Strategies
Tracing every request at high volume is expensive. Sampling controls what fraction of traces you keep.
| Strategy | How it works | Pros | Cons |
|---|---|---|---|
| Head-based (probabilistic) | Decision made at trace root before any spans are recorded. Keep N% of traces. | Simple, low overhead | May discard interesting (slow/error) traces |
| Tail-based | Buffer entire trace, then decide based on outcome (was it an error? was it slow?) | Keeps all errors and slow traces | Requires buffering — memory overhead |
| Rate-limiting | Keep at most N traces per second per service | Cost predictable | Low-traffic services may be under-sampled |
# OTel Collector tail-sampling processor
processors:
tail_sampling:
decision_wait: 10s # wait this long after trace starts
num_traces: 100000 # max traces in memory
policies:
# Always keep errors
- name: keep-errors
type: status_code
status_code: {status_codes: [ERROR]}
# Keep slow traces (>500ms)
- name: keep-slow
type: latency
latency: {threshold_ms: 500}
# Sample 5% of everything else
- name: sample-remaining
type: probabilistic
probabilistic: {sampling_percentage: 5}
Manual Span Creation
from opentelemetry import trace
from opentelemetry.trace import SpanKind

tracer = trace.get_tracer("payment-service", "1.0.0")

async def charge_card(card_token: str, amount_cents: int):
    """Charge a card, wrapping the outbound call in a CLIENT-kind span.

    Illustrative snippet: `stripe` is assumed to be imported and configured
    elsewhere in the real service.
    """
    # SpanKind.CLIENT signals this span calls an external service
    with tracer.start_as_current_span(
        "stripe.charge",
        kind=SpanKind.CLIENT,
    ) as span:
        # Semantic-convention attributes describing the remote call
        span.set_attribute("rpc.system", "http")
        span.set_attribute("rpc.service", "Stripe")
        span.set_attribute("rpc.method", "PaymentIntents.create")
        span.set_attribute("payment.amount_cents", amount_cents)
        span.set_attribute("payment.currency", "usd")
        response = await stripe.PaymentIntent.create(
            amount=amount_cents,
            currency="usd",
            payment_method=card_token,
        )
        # Record the outcome so traces are searchable by intent/status.
        span.set_attribute("payment.intent_id", response.id)
        span.set_attribute("payment.status", response.status)
        return response
# Try it: generate traces and view in Tempo
for i in {1..10}; do curl -s http://localhost:8000/slow; done
# In Grafana: Explore → Tempo → Search
# Filter: service.name = "sample-app"
# You'll see waterfall views with parent-child spans
6. Alerting
Alert Design Philosophy
Good alerts are actionable (a human can do something about them), symptom-based (user-visible impact, not internal causes), and rare enough to be taken seriously. Alert fatigue — the condition where on-call engineers stop responding because alerts are too frequent or too noisy — is one of the most common reliability failures in engineering organizations.
Prometheus Alerting Rules
# config/alerts.yml (referenced in prometheus.yml rule_files)
groups:
- name: sample-app-alerts
rules:
# Alert when error rate exceeds 1% for 5 minutes
- alert: HighErrorRate
expr: |
(
sum(rate(app_http_requests_total{status=~"5.."}[5m]))
/
sum(rate(app_http_requests_total[5m]))
) > 0.01
for: 5m
labels:
severity: warning
team: backend
annotations:
summary: "High error rate on {{ $labels.job }}"
description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
runbook_url: "https://wiki.internal/runbooks/high-error-rate"
# Alert when p99 latency exceeds 1 second
- alert: HighLatencyP99
expr: |
histogram_quantile(0.99,
sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le, route)
) > 1.0
for: 3m
labels:
severity: warning
annotations:
summary: "p99 latency exceeds 1s on {{ $labels.route }}"
description: "p99 is {{ $value | humanizeDuration }}"
# Alert when the scrape target is down (up == 0 means the scrape failed).
# Note: if the target is removed from config entirely, "up" goes absent
# and this alert will NOT fire — pair with absent() for that case.
- alert: ServiceDown
expr: up{job="sample-app"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "{{ $labels.job }} is down"
Severity Levels
| Severity | Impact | Response time | Action |
|---|---|---|---|
| P1 / Critical | Complete service outage, data loss, security breach | Immediate (24/7 page) | Wake up on-call, incident declared |
| P2 / High | Significant degradation, major feature broken | 15 min (business hours page) | Engage on-call engineer |
| P3 / Warning | Minor degradation, affecting small % of users | Next business hour | Ticket created, triaged in standup |
| P4 / Info | No user impact, proactive notification | End of day | Slack notification only |
SLO-Based Alerting
Rather than alerting on raw thresholds, alert on how fast you are burning through your error budget (covered in depth in Section 8). The key concept is burn rate: at what multiple of normal are errors occurring?
# Burn rate alert: if errors are burning budget 14x faster than allowed
# (at that pace, a 30-day budget is exhausted in ~2 days), page immediately
- alert: ErrorBudgetBurnRateFast
expr: |
(
sum(rate(app_http_requests_total{status=~"5.."}[1h]))
/
sum(rate(app_http_requests_total[1h]))
) > 14 * 0.001 # 14x burn rate with 0.1% error budget
for: 5m
labels:
severity: critical
annotations:
summary: "Error budget burning fast — {{ $value | humanizePercentage }} error rate"
7. Dashboards
RED and USE Methods
Two mental models tell you exactly which panels belong on every dashboard:
- RED (for services) — Rate, Errors, Duration. Applies to any system that handles requests.
- USE (for resources) — Utilization, Saturation, Errors. Applies to any resource (CPU, memory, disk, connection pool).
| Method | Applies to | Panel examples |
|---|---|---|
| Rate | Services | Requests/sec, events/sec, transactions/min |
| Errors | Services | Error rate %, 5xx count, failed payments/min |
| Duration | Services | p50/p95/p99 latency, Apdex score |
| Utilization | Resources | CPU %, memory %, disk %, connection pool % |
| Saturation | Resources | Queue depth, wait time, thread pool queue length |
| Errors | Resources | Disk errors/sec, network drops, OOM events |
Key PromQL Queries for Dashboard Panels
# Paste these into Grafana: Dashboards -> New -> Add visualization -> Prometheus
# 1. Request rate (requests/sec) — Stat or Time Series panel
sum(rate(app_http_requests_total[5m])) by (route)
# 2. Error rate percentage — Gauge panel with thresholds at 1% / 5%
(
sum(rate(app_http_requests_total{status=~"5.."}[5m]))
/
sum(rate(app_http_requests_total[5m]))
) * 100
# 3. p50 / p95 / p99 latency — Time Series panel with 3 queries
histogram_quantile(0.50, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.95, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.99, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))
# 4. Active requests — Stat panel
sum(app_http_active_requests)
Dashboard as Code
Storing dashboards as JSON (or using Grafonnet) prevents configuration drift and makes code reviews possible. Export any dashboard from Grafana: Dashboard settings → JSON Model → Copy.
{
"title": "Sample App — RED Dashboard",
"uid": "sample-app-red",
"schemaVersion": 38,
"panels": [
{
"type": "stat",
"title": "Request Rate",
"targets": [{
"expr": "sum(rate(app_http_requests_total[5m]))",
"legendFormat": "req/s"
}],
"fieldConfig": {
"defaults": {
"unit": "reqps",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 100},
{"color": "red", "value": 500}
]
}
}
}
}
]
}
Anti-Patterns to Avoid
- Vanity dashboards — panels showing metrics that no one acts on (e.g., "total requests all time"). Every panel should answer an operational question.
- Too many panels — a dashboard with 40 panels is unreadable. Start with 6-8 panels covering RED/USE. Add more only when you have a proven need.
- No thresholds — panels without color thresholds force the viewer to interpret raw numbers. Add green/yellow/red thresholds so the panel communicates health at a glance.
- Hardcoded time ranges — use template variables so dashboards can be filtered by environment, service, or pod.
8. SLIs, SLOs, and Error Budgets
SLOs (Service Level Objectives) translate reliability into a mathematical budget. They align engineering decisions — feature velocity vs reliability — around a shared, quantifiable target.
Definitions
- SLI (Service Level Indicator) — the metric you measure. Must be something users care about. Examples: request success rate, latency at p99, data freshness.
- SLO (Service Level Objective) — the target you set for the SLI. "99.9% of requests succeed in under 500ms, measured over a 30-day rolling window."
- SLA (Service Level Agreement) — a contractual commitment with financial penalties. Set SLAs above your internal SLOs to create a safety buffer.
- Error budget — 1 − SLO. If SLO = 99.9%, error budget = 0.1% ≈ 43.8 minutes of downtime per month.
Worked Example
# SLI: proportion of successful requests in a 30-day window
# SLO target: 99.9% success rate
slo = 0.999
total_requests_per_month = 50_000_000

# Error budget: the absolute number of requests allowed to fail this window.
error_budget_requests = total_requests_per_month * (1 - slo)
# = 50,000 allowed failures per month

# Observed: 30,000 failures after 20 of 30 days.
failures_so_far = 30_000
remaining_budget = error_budget_requests - failures_so_far  # = 20,000 failures remaining
budget_consumed_percent = failures_so_far / error_budget_requests * 100  # = 60%

# Decision: 60% of the budget is gone in ~66% of the window.
# Slightly ahead of pace — keep shipping features but watch closely.
Measuring SLOs in PromQL
# SLI: success rate over 30 days (rolling window)
(
sum(increase(app_http_requests_total{status!~"5.."}[30d]))
/
sum(increase(app_http_requests_total[30d]))
)
# Error budget remaining (as fraction)
1 - (
(1 - sum(increase(app_http_requests_total{status!~"5.."}[30d]))
/ sum(increase(app_http_requests_total[30d])))
/ (1 - 0.999)
)
Burn Rate Alerts
A burn rate of 1 means you are consuming error budget exactly as fast as the window allows. A burn rate of 14 means you will exhaust the entire monthly budget in ~2 days.
# Multi-window burn rate alerting (Google SRE recommendation)
- alert: HighErrorBudgetBurn
expr: |
(sum(rate(app_http_requests_total{status=~"5.."}[1h]))
/ sum(rate(app_http_requests_total[1h])))
> 14 * 0.001
AND
(sum(rate(app_http_requests_total{status=~"5.."}[5m]))
/ sum(rate(app_http_requests_total[5m])))
> 14 * 0.001
for: 2m
labels:
severity: critical
How SLOs Drive Engineering Decisions
- When the error budget is healthy, teams ship faster (accepting more risk).
- When the error budget is nearly exhausted, the team pauses feature work and focuses on reliability.
- This removes subjectivity from the "features vs reliability" debate — the math drives the decision.
9. Observability for Microservices
Golden Signals per Service
Google SRE introduced the four golden signals. Each service should expose all four:
- Latency — response time, distinguishing successful vs failed requests
- Traffic — requests/sec, events/sec — demand on the system
- Errors — rate of failed requests (explicit HTTP 5xx, implicit timeouts)
- Saturation — how full the service is (connection pool, thread pool, queue depth)
Service Dependency Maps
Tempo's service graph feature (enabled by the metrics_generator block in tempo.yaml) automatically builds a dependency map from trace data. In Grafana: Explore → Tempo → Service Graph. You will see nodes for each service and edges labeled with request rate and error rate.
# Generate traffic then view service graph in Grafana
for i in {1..50}; do
curl -s http://localhost:8000/hello > /dev/null
curl -s http://localhost:8000/slow > /dev/null
done
# Grafana: Explore -> Tempo -> Service Graph tab
Cross-Service Context Propagation
# Service A: inject trace context into the outgoing HTTP call
import httpx
from opentelemetry.propagate import inject


async def call_inventory_service(sku_id: str):
    """GET one SKU from inventory-service, propagating the active trace context."""
    headers = {"Content-Type": "application/json"}
    # inject() adds the W3C traceparent + tracestate headers for the current span.
    inject(headers)
    async with httpx.AsyncClient() as client:
        resp = await client.get(
            f"http://inventory-service/sku/{sku_id}",
            headers=headers,
        )
        return resp.json()
# Service B: FastAPIInstrumentor extracts traceparent automatically.
# The incoming span is linked as a child of Service A's span.
Common Failure Patterns in Traces
- Retry storms — a span with multiple child spans retrying the same operation. Each retry adds delay, making the parent span very long.
- Cascading failures — one slow service causes all callers to queue, exhausting connection pools. Traces show all parallel calls backed up behind one bottleneck.
- N+1 queries — a loop in application code issues one DB query per item instead of one batch query. Shows as N identical `db.query` child spans under a single service span.
# N+1 anti-pattern: visible in a trace as ~100 sequential, identical DB spans
# under a single service span. NOTE(review): `db` is assumed to be a
# DB-API-style connection (execute/fetchall) from the surrounding app — confirm.
orders = db.execute("SELECT * FROM orders LIMIT 100").fetchall()
for order in orders:
    # Separate query per order: 100 round-trips total, each its own span
    user = db.execute("SELECT * FROM users WHERE id = ?", order.user_id).fetchone()

# Fix: single JOIN query -> one DB span, one round-trip
orders_with_users = db.execute("""
SELECT o.*, u.name, u.email
FROM orders o JOIN users u ON o.user_id = u.id
LIMIT 100
""").fetchall()
10. Observability for Data Engineering
Airflow Spark Kafka dbt
Pipeline Observability Metrics
Data pipelines have different SLIs than request-serving systems. The core questions are: Did the job run? Did it process the expected volume? Is the data fresh? Is the data correct?
| Metric | Type | SLO example |
|---|---|---|
| Job duration | Histogram | p95 < 30 min |
| Records processed | Counter | Alert if < 95% of expected volume |
| Data freshness (lag) | Gauge | Source table updated within 1 hour |
| Failed records / null rate | Gauge | < 0.1% null in non-nullable fields |
| Job failure rate | Counter | 0 failures in critical path per day |
Airflow Metrics and Alerting
Airflow exposes StatsD metrics that can be forwarded to Prometheus via statsd_exporter:
# airflow.cfg metrics section
[metrics]
statsd_on = True
statsd_host = localhost
statsd_port = 8125
statsd_prefix = airflow
# Key Airflow metrics after statsd_exporter translation:
# airflow_dagrun_duration_success{dag_id="..."}
# airflow_task_instance_created_<state>
# airflow_executor_running_tasks
# Alert on DAG failure
- alert: AirflowDAGFailed
expr: increase(airflow_dagrun_duration_failed_total[1h]) > 0
labels:
severity: high
annotations:
summary: "DAG {{ $labels.dag_id }} failed"
# Alert on stale DAG (did not run on schedule)
- alert: AirflowDAGNotRunning
expr: time() - airflow_dag_last_run_timestamp{dag_id="daily_etl"} > 5400
labels:
severity: warning
annotations:
summary: "daily_etl has not run in 90 minutes"
Kafka Consumer Lag Monitoring
Consumer lag — the difference between the latest offset and the committed offset — is the most important Kafka health metric. Growing lag means consumers cannot keep up with producers.
# Using kminion which exports Prometheus metrics
# Key metric: kminion_kafka_consumer_group_topic_partition_lag
# PromQL: total lag across all partitions for a consumer group
sum(kminion_kafka_consumer_group_topic_partition_lag{
consumer_group="order-processor"
}) by (topic)
# Alert on high lag
- alert: KafkaConsumerLagHigh
expr: |
sum(kminion_kafka_consumer_group_topic_partition_lag{
consumer_group="order-processor"
}) > 10000
for: 5m
annotations:
summary: "Kafka consumer lag is {{ $value }} messages"
Data Quality as Metrics
# Emit dbt test results as Prometheus metrics
import json
from prometheus_client import Gauge

# NOTE: the `_total` suffix is conventionally reserved for counters; these are
# gauges (they restart at 0 on every run of this script). Names kept as-is for
# compatibility with any existing dashboards built on them.
DBT_TEST_PASS = Gauge("dbt_test_passed_total", "Passing dbt tests", ["model"])
DBT_TEST_FAIL = Gauge("dbt_test_failed_total", "Failing dbt tests", ["model"])

with open("target/run_results.json") as f:
    results = json.load(f)

for result in results["results"]:
    # unique_id looks like "test.<project>.<test_name>.<hash>", so index 2 is
    # the test segment — TODO confirm against your dbt version's artifact schema.
    model = result["unique_id"].split(".")[2]
    if result["status"] == "pass":
        DBT_TEST_PASS.labels(model=model).inc()
    elif result["status"] in ("fail", "error"):
        # An errored test did not pass — count it as a failure rather than
        # silently dropping it (the original handled only "fail").
        DBT_TEST_FAIL.labels(model=model).inc()
        # NOTE(review): `log` is presumably the structlog logger configured
        # earlier in this guide — confirm it is in scope where this runs.
        log.error("dbt_test_failed", model=model, test=result["unique_id"])
Freshness SLOs
# Alert: events table not updated in 1 hour
- alert: DataFreshnessViolation
expr: sql_table_age_seconds{table_name="events"} > 3600
for: 5m
labels:
severity: high
annotations:
summary: "events table is {{ $value | humanizeDuration }} stale"
# Schema drift detection: alert on unexpected column additions/removals
# Track column count as a gauge; alert if it changes unexpectedly
- alert: SchemaDriftDetected
expr: delta(sql_table_column_count{table="events"}[1h]) != 0
annotations:
summary: "Schema changed on table events"
11. Observability for MLOps
MLflow Prometheus PyTorch
Model Inference Monitoring
from prometheus_client import Counter, Histogram, Gauge
import time

PREDICTION_COUNTER = Counter(
    "ml_predictions_total",
    "Total predictions made",
    ["model_name", "model_version", "status"],
)
PREDICTION_LATENCY = Histogram(
    "ml_prediction_duration_seconds",
    "Time to generate a prediction",
    ["model_name"],
    buckets=[0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0],
)
PREDICTION_CONFIDENCE = Histogram(
    "ml_prediction_confidence",
    "Distribution of model confidence scores",
    ["model_name"],
    buckets=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0],
)


class MonitoredModelServer:
    """Wraps a model so every predict() emits count, latency, and confidence metrics."""

    def __init__(self, model, model_name: str, model_version: str):
        self.model = model
        self.model_name = model_name
        self.model_version = model_version

    def predict(self, features):
        """Run inference, recording outcome and latency on both success and failure.

        Assumes `model.predict` returns an array-like whose `.max()` is the top
        confidence score — TODO confirm for non-classifier models.
        """
        # perf_counter is monotonic: unlike time.time(), it cannot jump
        # backwards/forwards under NTP adjustment, so durations stay correct.
        start = time.perf_counter()
        try:
            result = self.model.predict(features)
            confidence = float(result.max())
            PREDICTION_COUNTER.labels(
                model_name=self.model_name,
                model_version=self.model_version,
                status="success",
            ).inc()
            PREDICTION_CONFIDENCE.labels(model_name=self.model_name).observe(confidence)
            return result
        except Exception:
            PREDICTION_COUNTER.labels(
                model_name=self.model_name,
                model_version=self.model_version,
                status="error",
            ).inc()
            raise
        finally:
            # finally ensures latency is observed on the error path too
            PREDICTION_LATENCY.labels(model_name=self.model_name).observe(
                time.perf_counter() - start
            )
Data Drift Detection
Model accuracy degrades when incoming feature distributions shift away from the training distribution (data drift). Detect it by tracking feature statistics over time:
from scipy.stats import ks_2samp
from prometheus_client import Gauge

FEATURE_DRIFT_SCORE = Gauge(
    "ml_feature_drift_ks_statistic",
    "Kolmogorov-Smirnov drift score vs training distribution",
    ["model_name", "feature_name"],
)


class DriftDetector:
    """Scores production feature batches against stored training-time samples."""

    def __init__(self, training_stats: dict):
        # feature_name -> sample of training-time values for that feature.
        self.training_stats = training_stats

    def check_drift(self, model_name: str, current_batch: dict, alpha: float = 0.05):
        """Export a KS drift score per feature and warn on significant drift.

        alpha: significance cutoff for the KS-test p-value. Defaults to 0.05,
        matching the previously hard-coded threshold, but is now tunable to
        trade false alarms against sensitivity.
        """
        for feature_name, production_values in current_batch.items():
            # Skip features that have no training baseline to compare against.
            if feature_name not in self.training_stats:
                continue
            # KS statistic: 0 = identical distributions, 1 = completely different
            statistic, p_value = ks_2samp(
                self.training_stats[feature_name],
                production_values,
            )
            FEATURE_DRIFT_SCORE.labels(
                model_name=model_name,
                feature_name=feature_name,
            ).set(statistic)
            if p_value < alpha:
                # NOTE(review): `log` is presumably the structured logger
                # configured earlier in this guide — confirm it is in scope.
                log.warning(
                    "feature_drift_detected",
                    model=model_name,
                    feature=feature_name,
                    ks_statistic=round(statistic, 4),
                    p_value=round(p_value, 6),
                )
Training Pipeline Observability
import mlflow
import time
from prometheus_client import Gauge, push_to_gateway

# Gauges are pushed to a Pushgateway because training is a batch job that may
# exit before Prometheus would get a chance to scrape it.
GPU_UTILIZATION = Gauge("training_gpu_utilization_percent", "GPU utilization", ["gpu_id"])
TRAINING_LOSS = Gauge("training_loss_current", "Current training loss", ["model", "split"])
EPOCH_DURATION = Gauge("training_epoch_duration_seconds", "Duration of last epoch", ["model"])


def train_epoch(model, dataloader, optimizer, epoch: int, run_name: str):
    """Run one training epoch and emit metrics to Pushgateway and MLflow.

    NOTE(review): `criterion` is a free variable — presumably the loss function
    defined at module scope in the full training script; confirm before reuse.
    """
    epoch_start = time.time()
    total_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        if batch_idx % 10 == 0:
            # Track GPU utilization every 10 batches (requires pynvml)
            # GPU_UTILIZATION.labels(gpu_id="0").set(get_gpu_util())
            pass
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        # .item() extracts the scalar loss without keeping the autograd graph alive
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    epoch_secs = time.time() - epoch_start
    # Dual-write: Prometheus push gateway (ops alerting/dashboards) + MLflow
    # (experiment tracking, per-epoch history via step=epoch).
    TRAINING_LOSS.labels(model=run_name, split="train").set(avg_loss)
    EPOCH_DURATION.labels(model=run_name).set(epoch_secs)
    push_to_gateway("localhost:9091", job="training")
    mlflow.log_metrics({
        "train_loss": avg_loss,
        "epoch_duration_s": epoch_secs,
    }, step=epoch)
A/B Test Metrics
import hashlib

from prometheus_client import Counter

VARIANT_METRIC = Counter(
    "ml_ab_test_outcome_total",
    "A/B test outcomes",
    ["experiment_id", "variant", "outcome"],
)


def assign_variant(user_id: str) -> str:
    """Deterministically assign a user to a 50/50 treatment/control split.

    Uses a stable cryptographic hash. The built-in hash() on str is salted per
    process (PYTHONHASHSEED), so it would assign the SAME user a DIFFERENT
    variant after every restart — silently corrupting the experiment.
    """
    bucket = int(hashlib.sha256(user_id.encode("utf-8")).hexdigest(), 16) % 100
    return "treatment" if bucket < 50 else "control"


def serve_prediction(user_id: str, experiment_id: str):
    """Serve a prediction from the user's assigned variant, tagged in the trace."""
    # Deterministic assignment — same user always gets the same variant,
    # across processes and restarts.
    variant = assign_variant(user_id)
    model = treatment_model if variant == "treatment" else control_model
    with tracer.start_as_current_span("ab-prediction") as span:
        span.set_attribute("experiment.id", experiment_id)
        span.set_attribute("experiment.variant", variant)
        result = model.predict(get_features(user_id))
        return {"prediction": result, "variant": variant}


def record_outcome(experiment_id: str, variant: str, outcome: str):
    """Record a downstream outcome ("click", "convert", "dismiss") for a variant."""
    VARIANT_METRIC.labels(
        experiment_id=experiment_id,
        variant=variant,
        outcome=outcome,
    ).inc()
12. Log Analysis & Querying
Grafana Loki + LogQL Deep Dive
LogQL queries consist of a stream selector (fast label index lookup) plus an optional filter pipeline (scans log content). Run all of these in Grafana Explore → Loki:
# ── Basic queries ──────────────────────────────────────────────────────
{container="sample-app"}
{container="sample-app"} |= "error"
{container="sample-app"} != "health"
{container="sample-app"} |~ "latency_ms.*[0-9]{3,}"
# ── JSON parsing ───────────────────────────────────────────────────────
{container="sample-app"} | json | level="warning"
{container="sample-app"} | json | latency_ms > 100
{container="sample-app"} | pattern `<_> latency=<latency>ms <_>`
# ── Metric queries ─────────────────────────────────────────────────────
# Request rate
rate({container="sample-app"} [1m])
# Error count per minute
count_over_time({container="sample-app"} | json | level="error" [1m])
# p99 latency from log field
quantile_over_time(0.99,
{container="sample-app"}
| json
| unwrap latency_ms
| __error__="" [5m]
) by (route)
# Log volume in bytes/sec (cost tracking)
bytes_rate({container="sample-app"} [5m])
Loki vs Elasticsearch/Kibana
| | Loki (Grafana) | Elasticsearch + Kibana |
|---|---|---|
| Indexing | Only indexes labels. Log content not indexed. | Full-text indexes all fields. |
| Cost | Very cheap. Object storage + minimal compute. | Expensive. Large JVM heap, lots of SSD. |
| Query speed | Fast on labels. Slow on content (must scan). | Fast on all indexed fields. |
| Best for | High-volume application logs with well-defined labels. | Security logs (SIEM), complex free-text search. |
When to Use Each Signal for Debugging
| Question | Best signal | Why |
|---|---|---|
| Is my service healthy right now? | Metrics | Low latency, always available, easy to alert on |
| Why did this specific request fail? | Traces + Logs | Traces show where, logs show what |
| Which service is the bottleneck? | Traces | Waterfall view shows latency breakdown |
| Did this error affect other users? | Metrics + Logs | Metrics for rate, logs for user-specific details |
| What did the user do before the error? | Logs | Event sequence with full context |
| Is there a trend in errors over the past week? | Metrics | Long-retention, efficient time-series storage |
13. Infrastructure Observability
node_exporter cAdvisor kube-state-metrics
Node Exporter — Host Metrics
# Add to docker-compose.yml for host metrics
node-exporter:
image: prom/node-exporter:v1.7.0
container_name: node-exporter
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
networks:
- observability
# Key node_exporter PromQL queries
# CPU utilization %
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory available %
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100
# Disk usage %
(node_filesystem_size_bytes - node_filesystem_avail_bytes)
/ node_filesystem_size_bytes * 100
# Network traffic bytes/sec
rate(node_network_receive_bytes_total{device!="lo"}[5m])
rate(node_network_transmit_bytes_total{device!="lo"}[5m])
# System load
node_load1 # 1-minute
node_load5 # 5-minute
node_load15 # 15-minute
Container Metrics — cAdvisor
# Add to docker-compose.yml
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.2
container_name: cadvisor
ports:
- "8080:8080"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
networks:
- observability
# cAdvisor PromQL queries
# Container CPU usage rate
rate(container_cpu_usage_seconds_total{name="sample-app"}[5m]) * 100
# Container memory
container_memory_working_set_bytes{name="sample-app"}
# Network I/O
rate(container_network_receive_bytes_total{name="sample-app"}[5m])
# OOM kills (container killed for exceeding memory limit)
increase(container_oom_events_total[5m])
Kubernetes Metrics
# kube-state-metrics exposes cluster state as Prometheus metrics
# Pods not in Running or Succeeded state
kube_pod_status_phase{phase!="Running", phase!="Succeeded"} == 1
# Deployment rollout health
kube_deployment_status_replicas_available
/ kube_deployment_spec_replicas
# Resource requests vs limits by namespace
sum(kube_pod_container_resource_requests{resource="cpu"}) by (namespace)
# PVC capacity remaining %
(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) * 100
# Alert: pod crash loop
- alert: PodCrashLooping
expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
for: 5m
annotations:
summary: "Pod {{ $labels.pod }} is crash-looping"
Recommended community Grafana dashboards (import by ID):
- 1860 — Node Exporter Full (host metrics)
- 893 — Docker and system monitoring (cAdvisor)
- 15661 — Kubernetes / Views / Global
- 3662 — Prometheus 2.0 Stats (self-monitoring)
14. Incident Response & Debugging
The Debugging Workflow
- Alert fires — Which service? What symptom (errors, latency, saturation)?
- Dashboard — Open the service RED dashboard. Confirm the symptom. Note when it started. Check if it's one route/pod or systemic.
- Traces — Grafana Explore → Tempo, filter by `service.name` and `status=error`, or sort by duration. Find a representative bad trace.
- Logs — Click the trace ID in Tempo to jump to correlated Loki logs. Read the full error message and stack trace.
- Infrastructure — If logs show no application error, check host/container metrics. Was there a deploy? A memory spike? Disk full?
- Root cause + fix — Document the timeline as you go.
# Rapid triage against the local stack
# 1. Current error rate in Prometheus
curl -s 'http://localhost:9090/api/v1/query?query=rate(app_http_requests_total%7Bstatus%3D~%225..%22%7D%5B5m%5D)'
# 2. Find recent errors in Loki (Grafana Explore -> Loki):
{container="sample-app"} | json | level="error"
| line_format "{{.timestamp}} {{.event}} {{.trace_id}}"
# 3. Jump to trace in Tempo
# Grafana Explore -> Tempo -> TraceID tab: paste trace_id from log line
# 4. Container health
docker stats --no-stream sample-app
MTTD and MTTR
- MTTD (Mean Time To Detect) — time from incident start to alert firing. Reduce by lowering alert `for:` durations and improving SLI coverage.
- MTTR (Mean Time To Resolve) — time from detection to resolution. Reduce with better runbooks, dashboards, and trace-to-log correlation. Distributed tracing directly reduces MTTR by accelerating root cause identification.
Blameless Post-Mortem Template
## Incident Summary
- Date/time: 2026-02-23 14:30 UTC
- Duration: 47 minutes
- Impact: 12% of /checkout requests returned 500 (~8,400 failures)
- Severity: P2
## Timeline
- 14:30 — Alert: HighErrorRate fired
- 14:32 — On-call acknowledged; opened service dashboard
- 14:35 — Traces showed errors in postgres.UPDATE orders span (timeout)
- 14:38 — Logs: "too many connections" in DB span
- 14:41 — Root cause: connection pool exhausted after deploy at 14:15
- 14:47 — Mitigated by rolling back connection pool config
- 15:17 — Full resolution confirmed
## Root Cause
Deploy at 14:15 reduced connection pool max from 20 to 5.
At 14:30 traffic peak, pool was exhausted causing timeouts.
## Action Items
- Add connection pool exhaustion alert: @alice by 2026-03-01
- Add pool utilization panel to service dashboard: @bob by 2026-02-28
- Require load test for connection pool changes: @team
15. Cost & Scaling
High Cardinality — The Number One Pitfall
With one million users, http_requests_total{user_id="..."} creates one million distinct time series — Prometheus will run out of memory. This is called a cardinality explosion. Use low-cardinality labels only: route, status_code, region, service. For per-user data, use logs or traces.
# Bad: cardinality = number of users (potentially millions of series)
REQUEST_COUNT.labels(user_id=user_id, route="/checkout").inc()
# Good: low-cardinality labels only
REQUEST_COUNT.labels(
route="/checkout",
status=str(response.status_code),
region="us-east-1",
).inc()
# For per-user data: structured log line instead
log.info("checkout_request",
user_id=user_id, # lives in log body, not metric label
amount_cents=amount,
trace_id=get_trace_id(),
)
Retention Policies
| Signal | Hot retention | Cold retention |
|---|---|---|
| Metrics (raw) | 15-30 days | Downsampled to 1h resolution, kept 1 year (Thanos/Cortex) |
| Logs | 7-30 days (fast SSD) | 90-365 days on object storage (S3/GCS) |
| Traces | 7-14 days | Deleted (high volume, hard to compress efficiently) |
Reducing Trace Volume via Sampling
# At 10,000 req/s with 100% sampling: ~1 GB/hour of trace data
# With 1% head sampling: ~10 MB/hour
# With tail sampling (errors+slow): ~50 MB/hour
# Production recommendation:
# - 100% sample errors (tail-based)
# - 100% sample p99 latency outliers (tail-based)
# - 1% probabilistic for healthy traces (head-based)
# OTel Collector tail sampling config
processors:
tail_sampling:
decision_wait: 10s
num_traces: 100000
policies:
- name: keep-errors
type: status_code
status_code: {status_codes: [ERROR]}
- name: keep-slow
type: latency
latency: {threshold_ms: 500}
- name: sample-rest
type: probabilistic
probabilistic: {sampling_percentage: 1}
Log Level Filtering in Production
import logging
import os

# WARNING in production keeps log volume (and ingestion cost) down;
# INFO everywhere else for easier local debugging.
LOG_LEVEL = "WARNING" if os.getenv("ENV") == "production" else "INFO"
logging.basicConfig(level=getattr(logging, LOG_LEVEL))

# Quiet the chattiest third-party libraries without touching our own loggers.
for noisy_logger in ("uvicorn.access", "httpx", "sqlalchemy.engine"):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)

# The application's own logger stays verbose regardless of environment.
logging.getLogger("myapp").setLevel(logging.DEBUG)
Self-Hosted vs SaaS Comparison
| | Self-hosted OSS | Grafana Cloud | Datadog | New Relic |
|---|---|---|---|---|
| Metrics cost | Infra only (low) | $8/1k active series/mo | ~$18-25/host/mo | Included in platform fee |
| Logs cost | Storage only | $0.50/GB ingested | $0.10/GB ingested | $0.25/GB ingested |
| Traces cost | Storage only | $0.35/GB ingested | $1.70/million spans | $0.25/GB ingested |
| Ops complexity | High — you run it | Low — managed | None | None |
| Features | Full control | Good, growing | Best-in-class | Strong APM |
| Best for | Cost-sensitive, infra-experienced team | OSS stack + managed reliability | Enterprise, high budget | Mid-size, APM focus |
Quick Reference: PromQL Cheat Sheet
# ── Selectors ─────────────────────────────────────────────────────────
metric_name # current value
metric_name{label="value"} # exact label match
metric_name{label=~"val1|val2"} # regex match
metric_name{label!="value"} # exclude
metric_name[5m] # range vector
# ── Functions ──────────────────────────────────────────────────────────
rate(counter[5m]) # per-second rate (counter-safe)
increase(counter[1h]) # total increase over window
irate(counter[5m]) # instantaneous rate (2 samples)
delta(gauge[5m]) # change in gauge
avg_over_time(gauge[5m]) # time average
histogram_quantile(0.99, rate(hist_bucket[5m]))
# ── Aggregations ───────────────────────────────────────────────────────
sum(metric) by (label)
avg(metric) without (pod)
topk(5, sum(rate(req[5m])) by (svc))
count(metric{status="500"})
# ── Binary operations ──────────────────────────────────────────────────
metric_a / metric_b
metric_a / on(service) metric_b # explicit label matching
metric_a unless metric_b # a where b has no value
# ── Common patterns ────────────────────────────────────────────────────
# Error ratio
sum(rate(req{status=~"5.."}[5m])) / sum(rate(req[5m]))
# Memory used % (substitute your exporter's total/available metrics)
(total - available) / total * 100
# Apdex score = (satisfied + tolerating/2) / total.
# Histogram buckets are cumulative, so le="1.2" already includes le="0.3":
# (S + T) / 2 equals satisfied + tolerating/2 — divide the whole sum by 2.
(
sum(rate(duration_bucket{le="0.3"}[5m]))
+ sum(rate(duration_bucket{le="1.2"}[5m]))
) / 2 / sum(rate(duration_count[5m]))
Quick Reference: LogQL Cheat Sheet
# ── Stream selectors ──────────────────────────────────────────────────
{app="myapp"}
{app=~"myapp.*"}
{app="myapp", env="prod"}
# ── Line filters ───────────────────────────────────────────────────────
# (each filter/parser below is appended to a stream selector,
#  e.g. {app="myapp"} |= "error")
|= "error" # contains
!= "health" # does not contain
|~ "error|exception" # regex
!~ "debug|trace" # regex negate
# ── Parsers ────────────────────────────────────────────────────────────
| json
| logfmt
| pattern `<method> <path> <status>`
| regexp `(?P<ip>\d+\.\d+\.\d+\.\d+)`
# ── Label filters (post-parse) ─────────────────────────────────────────
| level="error"
| status_code >= 500
| duration > 1s
# ── Output formatting ──────────────────────────────────────────────────
| line_format "{{.level}} {{.msg}}"
| label_format level=severity
# ── Metric queries ─────────────────────────────────────────────────────
rate({app="myapp"} [5m])
count_over_time({app="myapp"} [5m])
bytes_rate({app="myapp"} [5m])
# unwrap promotes a numeric label (here latency_ms, extracted by `json`)
# to the sample value so quantile_over_time can aggregate it
quantile_over_time(0.99,
{app="myapp"} | json | unwrap latency_ms [5m]
) by (service)
Grafana Dashboard JSON — Sample App RED (import-ready)
Import in Grafana: Dashboards → Import → paste JSON → Load.
{
"title": "Sample App RED",
"uid": "sample-app-red-v1",
"schemaVersion": 38,
"time": {"from": "now-15m", "to": "now"},
"refresh": "10s",
"panels": [
{
"id": 1, "type": "stat", "title": "Request Rate",
"gridPos": {"x":0,"y":0,"w":6,"h":4},
"targets": [{"expr":"sum(rate(app_http_requests_total[5m]))","legendFormat":"req/s"}],
"fieldConfig": {"defaults": {"unit": "reqps",
"thresholds": {"steps": [
{"color":"green","value":null},
{"color":"yellow","value":50},
{"color":"red","value":200}
]}}}
},
{
"id": 2, "type": "stat", "title": "Error Rate",
"gridPos": {"x":6,"y":0,"w":6,"h":4},
"targets": [{"expr":"sum(rate(app_http_requests_total{status=~\"5..\"}[5m])) / sum(rate(app_http_requests_total[5m])) * 100","legendFormat":"error %"}],
"fieldConfig": {"defaults": {"unit": "percent",
"thresholds": {"steps": [
{"color":"green","value":null},
{"color":"yellow","value":1},
{"color":"red","value":5}
]}}}
},
{
"id": 3, "type": "timeseries", "title": "Latency Percentiles",
"gridPos": {"x":0,"y":4,"w":12,"h":8},
"targets": [
{"expr":"histogram_quantile(0.50, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))","legendFormat":"p50"},
{"expr":"histogram_quantile(0.95, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))","legendFormat":"p95"},
{"expr":"histogram_quantile(0.99, sum(rate(app_http_request_duration_seconds_bucket[5m])) by (le))","legendFormat":"p99"}
],
"fieldConfig": {"defaults": {"unit": "s"}}
}
]
}