alert-management

aj-geddes

Updated Today

14 views

Other · general

About

This skill helps developers implement comprehensive alert management systems using PagerDuty. It provides capabilities for setting up alert routing, managing on-call schedules, and creating escalation policies. Use it when integrating alerting systems or coordinating incident response workflows.

Documentation

Alert Management

Overview

Design and implement sophisticated alert management systems with PagerDuty integration, escalation policies, alert routing, and incident coordination.

When to Use

Setting up alert routing
Managing on-call schedules
Coordinating incident response
Creating escalation policies
Integrating alerting systems

Instructions

1. PagerDuty Client Integration

// pagerduty-client.js
const axios = require('axios');

/**
 * Small client for PagerDuty.
 *
 * Events (trigger/resolve) are posted to the Events API v2 endpoint stored in
 * `eventUrl`; services, escalation policies and incidents go through an axios
 * instance bound to the REST API base URL with the token in the
 * `Authorization` header.
 */
class PagerDutyClient {
  /**
   * @param {string} apiToken - REST API token placed in the `Authorization` header.
   */
  constructor(apiToken) {
    this.apiToken = apiToken;
    this.baseUrl = 'https://api.pagerduty.com';
    this.eventUrl = 'https://events.pagerduty.com/v2/enqueue';

    this.client = axios.create({
      baseURL: this.baseUrl,
      headers: {
        'Authorization': `Token token=${apiToken}`,
        'Accept': 'application/vnd.pagerduty+json;version=2'
      }
    });
  }

  /**
   * Send an event to the Events API.
   *
   * @param {object} config
   * @param {string} config.routingKey - Integration routing key.
   * @param {string} [config.eventAction='trigger'] - Event action to send.
   * @param {string} [config.dedupKey] - Defaults to a timestamp-based key, so
   *   events without an explicit key are never merged with each other.
   * @param {string} config.summary
   * @param {string} [config.severity='error']
   * @param {string} [config.source='Monitoring System']
   * @param {string} [config.component]
   * @param {object} [config.customDetails={}]
   * @returns {Promise<object>} The response body returned by the Events API.
   * @throws Rethrows the request error after logging it.
   */
  async triggerEvent(config) {
    const event = {
      routing_key: config.routingKey,
      event_action: config.eventAction || 'trigger',
      dedup_key: config.dedupKey || `event-${Date.now()}`,
      payload: {
        summary: config.summary,
        timestamp: new Date().toISOString(),
        severity: config.severity || 'error',
        source: config.source || 'Monitoring System',
        component: config.component,
        custom_details: config.customDetails || {}
      }
    };

    try {
      const response = await axios.post(this.eventUrl, event);
      return response.data;
    } catch (error) {
      console.error('Failed to trigger PagerDuty event:', error);
      throw error;
    }
  }

  /**
   * Send a `resolve` event for a previously triggered alert.
   *
   * @param {string} dedupKey - Key of the alert to resolve.
   * @param {string} [routingKey=process.env.PAGERDUTY_ROUTING_KEY] - Routing key
   *   of the integration the alert was triggered on. Optional so existing
   *   single-argument callers keep using the environment variable; pass it when
   *   the alert was triggered with a different `routingKey`.
   * @returns {Promise<object>} The full axios response (not just its body).
   * @throws Rethrows the request error after logging it.
   */
  async resolveEvent(dedupKey, routingKey = process.env.PAGERDUTY_ROUTING_KEY) {
    const event = {
      routing_key: routingKey,
      event_action: 'resolve',
      dedup_key: dedupKey
    };

    try {
      return await axios.post(this.eventUrl, event);
    } catch (error) {
      console.error('Failed to resolve event:', error);
      throw error;
    }
  }

  /**
   * @returns {Promise<Array>} The `services` array from `GET /services`.
   */
  async getServices() {
    const response = await this.client.get('/services');
    return response.data.services;
  }

  /**
   * @returns {Promise<Array>} The `escalation_policies` array from
   *   `GET /escalation_policies`.
   */
  async getEscalationPolicies() {
    const response = await this.client.get('/escalation_policies');
    return response.data.escalation_policies;
  }

  /**
   * Create an incident through the REST API.
   *
   * @param {object} config
   * @param {string} config.title
   * @param {string} config.serviceId
   * @param {string} [config.escalationPolicyId] - Only sent when provided.
   * @param {string} [config.details='']
   * @returns {Promise<object>} The created incident (`response.data.incident`).
   * @throws Rethrows the request error after logging it.
   */
  async createIncident(config) {
    const incident = {
      type: 'incident',
      title: config.title,
      service: {
        id: config.serviceId,
        type: 'service_reference'
      },
      body: {
        type: 'incident_body',
        details: config.details || ''
      }
    };

    // Sending a reference without an id is invalid, so only attach the
    // escalation policy when the caller actually supplied one.
    if (config.escalationPolicyId) {
      incident.escalation_policy = {
        id: config.escalationPolicyId,
        type: 'escalation_policy_reference'
      };
    }

    try {
      // The REST API expects the incident wrapped in an `incident` envelope,
      // mirroring the `incident` key read from the response below.
      const response = await this.client.post('/incidents', { incident }, {
        headers: { 'From': process.env.PAGERDUTY_EMAIL }
      });
      return response.data.incident;
    } catch (error) {
      console.error('Failed to create incident:', error);
      throw error;
    }
  }

  /**
   * Set the status of one incident via the bulk "manage incidents" endpoint.
   *
   * The `{ incidents: [...] }` request body and the `incidents` array in the
   * response belong to `PUT /incidents`; the per-incident URL uses a different
   * (singular) envelope, so the bulk URL is used here.
   *
   * @param {string} incidentId
   * @param {string} status - e.g. 'acknowledged' or 'resolved'.
   * @returns {Promise<object>} The first incident in the response.
   */
  async updateIncidentStatus(incidentId, status) {
    const response = await this.client.put(
      '/incidents',
      {
        incidents: [{
          id: incidentId,
          type: 'incident_reference',
          status
        }]
      },
      { headers: { 'From': process.env.PAGERDUTY_EMAIL } }
    );
    return response.data.incidents[0];
  }

  /**
   * Mark an incident as acknowledged.
   *
   * @param {string} incidentId
   * @param {string} [userId] - Currently unused; kept so existing callers do
   *   not break. The acting user is taken from the `From` header
   *   (`PAGERDUTY_EMAIL`).
   * @returns {Promise<object>} The updated incident.
   * @throws Rethrows the request error after logging it.
   */
  async acknowledgeIncident(incidentId, userId) {
    try {
      return await this.updateIncidentStatus(incidentId, 'acknowledged');
    } catch (error) {
      console.error('Failed to acknowledge:', error);
      throw error;
    }
  }

  /**
   * Mark an incident as resolved.
   *
   * @param {string} incidentId
   * @returns {Promise<object>} The updated incident.
   * @throws Rethrows the request error after logging it.
   */
  async resolveIncident(incidentId) {
    try {
      return await this.updateIncidentStatus(incidentId, 'resolved');
    } catch (error) {
      console.error('Failed to resolve:', error);
      throw error;
    }
  }
}

module.exports = PagerDutyClient;

2. Alertmanager Configuration

# /etc/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  # NOTE(review): the ${...} values in this file are literal strings as far as
  # YAML is concerned. Confirm that something renders them (e.g. envsubst in the
  # container entrypoint) before Alertmanager loads the file, or switch to the
  # corresponding *_file secret options.
  slack_api_url: '${SLACK_WEBHOOK_URL}'

templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Root route: anything not claimed by a child route goes to 'default'.
route:
  receiver: 'default'
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 4h

  # Child routes are evaluated top to bottom; evaluation stops at the first
  # match unless that route sets `continue: true`.
  routes:
    # Critical alerts page immediately (no group_wait) and keep matching so a
    # team-specific route below can also receive them.
    - match:
        severity: critical
      receiver: pagerduty
      continue: true
      group_wait: 0s

    # Warnings go to Slack. `continue: true` lets payment-service warnings fall
    # through to the payment-team route below; without it they would stop here
    # and the payment team would never see them.
    - match:
        severity: warning
      receiver: slack
      continue: true

    # Everything from the payment service also goes to the payment team.
    - match:
        service: payment-service
      receiver: payment-team
      group_wait: 30s

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: '${PAGERDUTY_SERVICE_KEY}'
        description: '{{ .GroupLabels.alertname }}'

  - name: 'slack'
    slack_configs:
      - channel: '#alerts'
        title: 'Warning: {{ .GroupLabels.alertname }}'

  # The payment team is both paged and notified in its own Slack channel.
  - name: 'payment-team'
    pagerduty_configs:
      - service_key: '${PAYMENT_PAGERDUTY_KEY}'
    slack_configs:
      - channel: '#payment-alerts'

# Mute a warning while a critical alert with the same alertname and service is
# firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'service']

3. Alert Handler Middleware

// alert-handler.js
const PagerDutyClient = require('./pagerduty-client');

const pdClient = new PagerDutyClient(process.env.PAGERDUTY_API_TOKEN);

/**
 * Sends alerts to PagerDuty through the module-level `pdClient`, with a local
 * time-window suppression of repeated triggers and a Slack fallback when the
 * PagerDuty call fails.
 */
class AlertHandler {
  constructor() {
    // dedupKey -> { timestamp } of the last trigger that was actually sent.
    this.alertCache = new Map();
    this.deduplicationWindow = 300000; // 5 minutes
  }

  /**
   * @param {string} dedupKey
   * @returns {boolean} `true` when no trigger for this key was recorded within
   *   the deduplication window.
   */
  shouldSendAlert(dedupKey) {
    const cacheEntry = this.alertCache.get(dedupKey);

    if (!cacheEntry) return true;

    const timeSinceLastAlert = Date.now() - cacheEntry.timestamp;
    return timeSinceLastAlert >= this.deduplicationWindow;
  }

  /**
   * Remember that a trigger for `dedupKey` was just sent.
   *
   * Entries older than the deduplication window can no longer suppress
   * anything, so they are dropped here to keep the cache from growing without
   * bound (every alert sent without an explicit key gets a unique one).
   *
   * @param {string} dedupKey
   */
  recordAlert(dedupKey) {
    const now = Date.now();

    for (const [key, entry] of this.alertCache) {
      if (now - entry.timestamp >= this.deduplicationWindow) {
        this.alertCache.delete(key);
      }
    }

    this.alertCache.set(dedupKey, { timestamp: now });
  }

  /**
   * Map a numeric value onto a severity name.
   *
   * @param {number} value
   * @param {{critical: number, warning: number}} thresholds
   * @returns {'critical'|'warning'|'info'}
   */
  determineSeverity(value, thresholds) {
    if (value >= thresholds.critical) return 'critical';
    if (value >= thresholds.warning) return 'warning';
    return 'info';
  }

  /**
   * Send an event to PagerDuty.
   *
   * Only `trigger` events are subject to local suppression; other actions
   * (e.g. `resolve`, `acknowledge`) reuse the trigger's dedup key by design and
   * must always be delivered.
   *
   * @param {object} config - Same fields as the event built below, plus
   *   `alertName`. Without `dedupKey` a unique timestamp-based key is used.
   * @returns {Promise<object|undefined>} The PagerDuty response, or `undefined`
   *   when the alert was suppressed or the PagerDuty call failed (in which case
   *   a Slack message is attempted instead).
   */
  async sendAlert(config) {
    const dedupKey = config.dedupKey || `alert-${config.alertName}-${Date.now()}`;
    const eventAction = config.eventAction || 'trigger';
    const isTrigger = eventAction === 'trigger';

    try {
      if (isTrigger && !this.shouldSendAlert(dedupKey)) {
        console.log('Alert recently sent, skipping');
        return;
      }

      const event = {
        routingKey: config.routingKey,
        eventAction: eventAction,
        dedupKey: dedupKey,
        summary: config.summary,
        severity: config.severity,
        source: config.source || 'Monitoring System',
        component: config.component,
        customDetails: {
          ...config.customDetails,
          alertName: config.alertName,
          timestamp: new Date().toISOString()
        }
      };

      const result = await pdClient.triggerEvent(event);

      if (isTrigger) {
        this.recordAlert(dedupKey);
      } else if (eventAction === 'resolve') {
        // A resolved alert may legitimately fire again right away.
        this.alertCache.delete(dedupKey);
      }

      console.log('Alert sent', {
        alertName: config.alertName,
        severity: config.severity
      });

      return result;
    } catch (error) {
      // Best effort: PagerDuty failures are logged and downgraded to Slack
      // rather than propagated to the caller.
      console.error('Failed to send alert:', error);
      await this.sendSlackAlert(config);
    }
  }

  /**
   * Post the alert to the Slack webhook in `SLACK_WEBHOOK_URL`.
   * Failures are logged and not rethrown.
   *
   * @param {object} config - Uses `summary`, `severity`, `component` and
   *   `customDetails.description`.
   */
  async sendSlackAlert(config) {
    const axios = require('axios');
    const webhookUrl = process.env.SLACK_WEBHOOK_URL;

    const message = {
      color: config.severity === 'critical' ? 'danger' : 'warning',
      title: config.summary,
      text: config.customDetails?.description || '',
      fields: [
        { title: 'Severity', value: config.severity, short: true },
        { title: 'Component', value: config.component, short: true }
      ]
    };

    try {
      await axios.post(webhookUrl, { attachments: [message] });
    } catch (error) {
      console.error('Failed to send Slack alert:', error);
    }
  }

  /**
   * Resolve the PagerDuty alert identified by `dedupKey`.
   * Failures are logged and not rethrown.
   *
   * @param {string} dedupKey
   */
  async resolveAlert(dedupKey) {
    try {
      await pdClient.resolveEvent(dedupKey);
      // Forget the key so a re-fire after resolution is not suppressed.
      this.alertCache.delete(dedupKey);
      console.log('Alert resolved');
    } catch (error) {
      console.error('Failed to resolve alert:', error);
    }
  }
}

module.exports = new AlertHandler();

4. Alert Routing Engine

// alert-router.js
/**
 * Priority-ordered rule table: an alert is handed to the handler of the
 * highest-priority rule whose condition accepts it, or to `defaultHandler`
 * when no rule matches.
 */
class AlertRouter {
  constructor() {
    // Always kept sorted by descending priority (see addRoute).
    this.routes = [];
  }

  /**
   * Register a rule and restore the descending-priority order.
   *
   * @param {object} rule
   * @param {number} [rule.priority] - Falsy values are stored as 0.
   * @param {Function} rule.condition - Called with the alert; truthy = match.
   * @param {Function} rule.handler - Called with `(alert, escalation)`.
   * @param {*} [rule.escalation] - Passed through to the handler untouched.
   */
  addRoute(rule) {
    const entry = {
      priority: rule.priority || 0,
      condition: rule.condition,
      handler: rule.handler,
      escalation: rule.escalation
    };

    this.routes.push(entry);
    this.routes.sort((left, right) => right.priority - left.priority);
  }

  /**
   * Dispatch an alert to the first matching rule.
   *
   * @param {object} alert
   * @returns {Promise<*>} Whatever the chosen handler resolves to.
   */
  async route(alert) {
    const winner = this.routes.find((candidate) => candidate.condition(alert));

    if (winner === undefined) {
      return this.defaultHandler(alert);
    }

    return await winner.handler(alert, winner.escalation);
  }

  /**
   * Fallback used when no rule matches; logs the alert name.
   *
   * @param {object} alert
   * @returns {Promise<{routed: boolean, handler: string}>}
   */
  async defaultHandler(alert) {
    console.log('Routing to default handler:', alert.name);

    const outcome = { routed: true, handler: 'default' };
    return outcome;
  }
}

// Usage
const router = new AlertRouter();

// Example rules. Each handler only logs where the alert would go and returns a
// small descriptor of the destination.
[
  {
    // Critical database alerts: DBA team, escalate immediately.
    priority: 100,
    condition(alert) {
      return alert.severity === 'critical' && alert.component === 'database';
    },
    async handler(alert) {
      console.log('Routing critical database alert to DBA team');
      return { team: 'dba', escalation: 'immediate' };
    }
  },
  {
    // Anything from the payment service: payment team and its policy.
    priority: 90,
    condition(alert) {
      return alert.component === 'payment-service';
    },
    async handler(alert) {
      console.log('Routing to payment team');
      return { team: 'payment', escalation: 'payment-policy' };
    }
  },
  {
    // Remaining warnings: Slack only.
    priority: 10,
    condition(alert) {
      return alert.severity === 'warning';
    },
    async handler(alert) {
      console.log('Routing warning to Slack');
      return { handler: 'slack-only' };
    }
  }
].forEach((rule) => {
  router.addRoute(rule);
});

module.exports = router;

5. Docker Compose Alert Stack

# docker-compose.yml
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    # One entry per ${...} placeholder used in alertmanager.yml.
    # NOTE(review): confirm the container actually substitutes these into the
    # mounted config file; passing them as environment alone may not be enough.
    environment:
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
      PAGERDUTY_SERVICE_KEY: ${PAGERDUTY_SERVICE_KEY}
      # Referenced by the 'payment-team' receiver in alertmanager.yml.
      PAYMENT_PAGERDUTY_KEY: ${PAYMENT_PAGERDUTY_KEY}
    depends_on:
      - prometheus

  alert-handler:
    build: .
    # One entry per process.env variable read by pagerduty-client.js and
    # alert-handler.js.
    environment:
      PAGERDUTY_API_TOKEN: ${PAGERDUTY_API_TOKEN}
      # Read by PagerDutyClient.resolveEvent as the default routing key.
      PAGERDUTY_ROUTING_KEY: ${PAGERDUTY_ROUTING_KEY}
      # Sent as the `From` header on incident create/acknowledge/resolve calls.
      PAGERDUTY_EMAIL: ${PAGERDUTY_EMAIL}
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
    ports:
      - "3000:3000"
    depends_on:
      - alertmanager

Best Practices

✅ DO

Set appropriate thresholds
Implement alert deduplication
Use clear alert names
Include runbook links
Configure escalation properly
Test alert rules
Monitor alert quality
Set repeat intervals
Track alert metrics
Document alert meanings

❌ DON'T

Alert on every anomaly
Ignore alert fatigue
Set thresholds arbitrarily
Skip runbooks
Alert without action
Disable alerts in production
Use vague alert names
Forget escalation policies
Re-alert too frequently

Alert Severity Levels

Critical: Immediate action required, customer impact
Warning: Investigation needed, potential issues
Info: Informational, no action required

Key Metrics

Alert volume
Resolution time
False positive rate
Escalation frequency
MTTD (Mean Time to Detection)
MTTR (Mean Time to Resolution)

Quick Install

/plugin add https://github.com/aj-geddes/useful-ai-prompts/tree/main/alert-management

Copy and paste this command into Claude Code to install this skill.

GitHub Repository

aj-geddes/useful-ai-prompts

Path: skills/alert-management

Related Skills

subagent-driven-development

Development

This skill executes implementation plans by dispatching a fresh subagent for each independent task, with code review between tasks. It enables fast iteration while maintaining quality gates through this review process. Use it when working on mostly independent tasks within the same session to ensure continuous progress with built-in quality checks.

View skill

algorithmic-art

executing-plans

Design

Use the executing-plans skill when you have a complete implementation plan to execute in controlled batches with review checkpoints. It loads and critically reviews the plan, then executes tasks in small batches (default 3 tasks) while reporting progress between each batch for architect review. This ensures systematic implementation with built-in quality control checkpoints.

View skill

cost-optimization

Other

This Claude Skill helps developers optimize cloud costs through resource rightsizing, tagging strategies, and spending analysis. It provides a framework for reducing cloud expenses and implementing cost governance across AWS, Azure, and GCP. Use it when you need to analyze infrastructure costs, right-size resources, or meet budget constraints.

View skill