Monitoring Guide
Track and monitor TTL cleanup activity for optimal performance.
Quick Status Check
-- One-stop monitoring query
SELECT
table_name,
active,
expire_after_seconds / 3600.0 AS expire_hours,
last_run,
time_since_last_run,
rows_deleted_last_run,
total_rows_deleted
FROM ttl_summary()
ORDER BY table_name;
Worker Health Monitoring
Check Worker Status
-- Is the worker running?
SELECT * FROM ttl_worker_status();
Expected output when healthy:
worker_pid | application_name | state | backend_start | database_name
-----------+-------------------+-------+---------------------+---------------
12345 | TTL Worker DB... | idle | 2026-01-03 02:00... | mydb
Worker Uptime
SELECT
worker_pid,
NOW() - backend_start AS uptime,
state
FROM ttl_worker_status();
Detect Missing Worker
-- Alert if no worker running
DO $$
BEGIN
IF NOT EXISTS (SELECT 1 FROM ttl_worker_status()) THEN
RAISE WARNING 'TTL worker is not running!';
END IF;
END $$;
Cleanup Activity Monitoring
Recent Cleanup Activity
SELECT
table_name,
last_run,
time_since_last_run,
rows_deleted_last_run
FROM ttl_summary()
WHERE last_run > NOW() - INTERVAL '1 hour'
ORDER BY last_run DESC;
Tables with Overdue Cleanup
-- Find tables where cleanup hasn't run recently
SELECT
table_name,
time_since_last_run,
active
FROM ttl_summary()
WHERE active = true
AND (last_run IS NULL OR last_run < NOW() - INTERVAL '5 minutes')
ORDER BY last_run NULLS FIRST;
High-Volume Deletion Tracking
SELECT
table_name,
rows_deleted_last_run,
total_rows_deleted,
last_run
FROM ttl_summary()
WHERE rows_deleted_last_run > 10000
ORDER BY rows_deleted_last_run DESC;
Performance Monitoring
Deletion Rate Analysis
SELECT
table_name,
rows_deleted_last_run,
batch_size,
CASE
WHEN rows_deleted_last_run >= batch_size
THEN 'May need larger batch'
WHEN rows_deleted_last_run < batch_size * 0.1
THEN 'Batch size OK'
ELSE 'Optimal'
END AS batch_assessment
FROM ttl_summary()
WHERE active = true;
Average Deletion Per Day
SELECT
table_name,
total_rows_deleted,
EXTRACT(EPOCH FROM (NOW() -created_at)) / 86400 AS days_active,
ROUND(total_rows_deleted / NULLIF(EXTRACT(EPOCH FROM (NOW() - created_at)) / 86400, 0)) AS avg_per_day
FROM ttl_index_table
WHERE created_at < NOW() - INTERVAL '1 day';
Alerting Queries
Critical: Worker Down
SELECT
CASE
WHEN COUNT(*) = 0 THEN 'CRITICAL: TTL worker not running'
ELSE 'OK: Worker running'
END AS status
FROM ttl_worker_status();
Warning: Cleanup Lag
SELECT
table_name,
time_since_last_run
FROM ttl_summary()
WHERE active = true
AND time_since_last_run > INTERVAL '10 minutes'
ORDER BY time_since_last_run DESC;
Info: High Deletion Rate
SELECT
table_name,
rows_deleted_last_run
FROM ttl_summary()
WHERE rows_deleted_last_run > 50000;
Dashboard Queries
Summary Dashboard
SELECT
COUNT(*) FILTER (WHERE active = true) AS active_ttl_indexes,
COUNT(*) AS total_ttl_indexes,
SUM(total_rows_deleted) AS total_deletions,
MAX(last_run) AS most_recent_cleanup
FROM ttl_index_table;
Per-Table Dashboard
SELECT
table_name,
expire_after_seconds || 's' AS ttl,
CASE WHEN active THEN '✓' ELSE '✗' END AS active,
COALESCE(rows_deleted_last_run::TEXT, '-') AS last_run_deletions,
COALESCE(time_since_last_run::TEXT, 'Never') AS last_cleanup
FROM ttl_summary()
ORDER BY table_name;
Integration with Monitoring Tools
Prometheus/Grafana Metrics
Create a view for metrics export:
CREATE OR REPLACE VIEW ttl_metrics AS
SELECT
table_name AS table,
expire_after_seconds,
CASE WHEN active THEN 1 ELSE 0 END AS is_active,
COALESCE(rows_deleted_last_run, 0) AS rows_deleted_last,
COALESCE(total_rows_deleted, 0) AS rows_deleted_total,
EXTRACT(EPOCH FROM COALESCE(time_since_last_run, INTERVAL '0')) AS seconds_since_last_run
FROM ttl_summary();
Datadog/New Relic Integration
-- Export TTL statistics as JSON
SELECT json_agg(json_build_object(
'table', table_name,
'active', active,
'rows_deleted_last_run', rows_deleted_last_run,
'total_rows_deleted', total_rows_deleted,
'last_run_timestamp', EXTRACT(EPOCH FROM last_run)
)) AS ttl_stats
FROM ttl_summary();
Logging and Audit
Enable Detailed Logging
-- Enable debug logging
ALTER SYSTEM SET log_min_messages = 'debug1';
SELECT pg_reload_conf();
-- Check PostgreSQL logs
-- tail -f /var/log/postgresql/postgresql-*.log | grep TTL
Track Configuration Changes
-- View current vs default configuration
SELECT
name,
setting AS current_value,
boot_val AS default_value,
source
FROM pg_settings
WHERE name LIKE 'pg_ttl_index%';
Automated Monitoring Scripts
Daily Health Check (SQL)
-- Save as daily_ttl_check.sql
DO $$
DECLARE
worker_count INTEGER;
stale_count INTEGER;
BEGIN
-- Check worker
SELECT COUNT(*) INTO worker_count FROM ttl_worker_status();
IF worker_count = 0 THEN
RAISE WARNING 'TTL worker is not running';
END IF;
-- Check stale cleanups
SELECT COUNT(*) INTO stale_count
FROM ttl_summary()
WHERE active = true
AND time_since_last_run > INTERVAL '10 minutes';
IF stale_count > 0 THEN
RAISE WARNING '% tables have stale cleanups', stale_count;
END IF;
RAISE NOTICE 'Health check complete. Worker: %, Stale tables: %',
worker_count, stale_count;
END $$;
Monitoring Functions
-- Create custom monitoring function
CREATE OR REPLACE FUNCTION ttl_health_check()
RETURNS TABLE(
check_name TEXT,
status TEXT,
details TEXT
) AS $$
BEGIN
-- Worker status
RETURN QUERY
SELECT
'Worker Status'::TEXT,
CASE WHEN EXISTS(SELECT 1 FROM ttl_worker_status())
THEN 'OK' ELSE 'FAIL' END,
COALESCE((SELECT COUNT(*)::TEXT FROM ttl_worker_status()), '0') || ' workers';
-- Active tables
RETURN QUERY
SELECT
'Active TTL Tables'::TEXT,
'INFO'::TEXT,
COUNT(*)::TEXT || ' tables'
FROM ttl_index_table
WHERE active = true;
-- Stale cleanups
RETURN QUERY
SELECT
'Stale Cleanups'::TEXT,
CASE WHEN COUNT(*) = 0 THEN 'OK' ELSE 'WARN' END,
COUNT(*)::TEXT || ' tables'
FROM ttl_summary()
WHERE active = true
AND time_since_last_run > INTERVAL '10 minutes';
END;
$$ LANGUAGE plpgsql;
-- Use it
SELECT * FROM ttl_health_check();
Best Practices
- Monitor worker uptime - Restart after PostgreSQL restarts
- Check cleanup lag - Ensure cleanup runs regularly
- Track deletion rates - Adjust batch sizes accordingly
- Log configuration changes - Document TTL modifications
- Set up alerts - Proactive issue detection
See Also
- Configuration Guide - Tuning parameters
- Troubleshooting - Common issues
- API Reference - Monitoring functions