server-scripts/check-resource-usage.sh at main · amieiro/server-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#!/bin/bash

################################################################################
# DESCRIPTION:
#   Monitors system resource usage and sends alerts when thresholds are exceeded.
#   Currently monitors disk space usage. Future metrics are documented below.
#
# USAGE:
#   sudo ./check-resource-usage.sh [no-webhook]
#
# PARAMETERS:
#   1. no-webhook: Optional flag to disable webhook notifications. Results
#      are still printed to the terminal and appended to the log file.
#
# FEATURES:
#   - Disk space monitoring with WARNING and CRITICAL thresholds
#   - Cooldown period to prevent alert fatigue
#   - Logging to /var/log/check-resource-usage.log
#   - Webhook notifications via Slack
#
# FUTURE METRICS (To be implemented):
#   - CPU usage monitoring
#   - Memory (RAM) usage monitoring
#   - Disk I/O monitoring
#   - Network usage monitoring
#   - Load average monitoring
#   - Process count monitoring
#   - Swap usage monitoring
#   - System uptime tracking
#   - Temperature monitoring (if applicable)
################################################################################

# --- Load Common Functions (includes auto-update) ---
# Resolve the directory containing this script so that sibling files are
# found regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
. "${SCRIPT_DIR}/common-functions.sh"

# --- Load Configuration ---
CONFIG_FILE="${SCRIPT_DIR}/config.sh"

# Stop right away when config.sh is absent (or is not a regular file).
[[ -f "$CONFIG_FILE" ]] || {
    printf '%s\n' "Error: Configuration file not found at $CONFIG_FILE"
    printf '%s\n' "Please copy config.sh.example to config.sh and configure it."
    exit 1
}

. "$CONFIG_FILE"

# --- Initialization ---
# Refuse to continue unless the effective user ID is 0 (root).
if (( EUID != 0 )); then
    printf '%s\n' "This script must be run as root (use sudo)."
    exit 1
fi

# Runtime settings: webhook delivery is on unless disabled on the command line.
SEND_WEBHOOK=true
STATE_FILE="${SCRIPT_DIR}/.check-resource-usage-state"
LOG_FILE="/var/log/check-resource-usage.log"

# --- Argument Parsing ---
# Only "no-webhook" is recognised; every other argument is silently ignored.
for arg in "$@"; do
    case "$arg" in
        no-webhook)
            SEND_WEBHOOK=false
            ;;
    esac
done

# --- Logging Function ---
# Prefixes the given text with a "[YYYY-MM-DD HH:MM:SS]" timestamp, prints it
# to stdout and appends the same line to $LOG_FILE.
# Arguments: $1 - text to log
log_message() {
    local text="$1"
    local stamp
    stamp="$(date '+%Y-%m-%d %H:%M:%S')"
    printf '[%s] %s\n' "$stamp" "$text" | tee -a "$LOG_FILE"
}

# --- Cooldown Management ---
#######################################
# Decides whether an alert for a partition/severity pair may be sent now,
# based on the timestamp of the last alert recorded for that pair.
# Globals:
#   CHECK_RESOURCE_USAGE_ALERT_COOLDOWN_MINUTES (read) - cooldown length
#   STATE_FILE (read; created if it does not exist) - "key=epoch" lines
# Arguments:
#   $1 - partition
#   $2 - severity
# Outputs:
#   Logs a suppression notice via log_message while the cooldown is active.
# Returns:
#   0 if the alert should be sent, 1 if it is suppressed by the cooldown.
#######################################
should_send_alert() {
    local partition="$1"
    local severity="$2"
    local cooldown_minutes="$CHECK_RESOURCE_USAGE_ALERT_COOLDOWN_MINUTES"
    local key="${partition}_${severity}"
    local line
    local last_alert_time=""

    # Create state file if it doesn't exist (best effort)
    touch "$STATE_FILE" 2>/dev/null || true

    # Look the key up as a literal prefix, not as a regular expression, so
    # that characters such as '.' in a path cannot match other entries.
    # If the key appears more than once, the last line wins.
    if [[ -r "$STATE_FILE" ]]; then
        while IFS= read -r line || [[ -n "$line" ]]; do
            if [[ "$line" == "${key}="* ]]; then
                last_alert_time="${line#"${key}="}"
            fi
        done < "$STATE_FILE"
    fi

    # No previous alert, or an unusable (non-numeric) timestamp: send it.
    # Failing open keeps a corrupted state file from silencing alerts.
    if [[ ! "$last_alert_time" =~ ^[0-9]+$ ]]; then
        return 0
    fi

    local current_time
    current_time=$(date +%s)
    local cooldown_seconds=$((cooldown_minutes * 60))
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    local time_diff=$((current_time - 10#$last_alert_time))

    if (( time_diff >= cooldown_seconds )); then
        # Cooldown period has passed
        return 0
    fi

    # Still in cooldown period
    local remaining_minutes=$(( (cooldown_seconds - time_diff) / 60 ))
    log_message "Alert suppressed for ${partition} (${severity}): cooldown active (${remaining_minutes} minutes remaining)"
    return 1
}

#######################################
# Records the current time as the last-alert timestamp for a
# partition/severity pair, replacing any earlier entry for the same pair.
# Globals:
#   STATE_FILE (read and rewritten) - "key=epoch" lines
# Arguments:
#   $1 - partition
#   $2 - severity
# Returns:
#   Non-zero if the temporary file cannot be written or moved into place.
#######################################
record_alert() {
    local partition="$1"
    local severity="$2"
    local key="${partition}_${severity}"
    local tmp_file="${STATE_FILE}.tmp"
    local line
    local current_time
    current_time=$(date +%s)

    # Copy every entry except this key's, comparing the key as a literal
    # prefix: a regex match would also drop entries such as "/mnt/aXb" when
    # recording "/mnt/a.b". Then append the fresh timestamp.
    {
        if [[ -r "$STATE_FILE" ]]; then
            while IFS= read -r line || [[ -n "$line" ]]; do
                [[ "$line" == "${key}="* ]] || printf '%s\n' "$line"
            done < "$STATE_FILE"
        fi
        printf '%s\n' "${key}=${current_time}"
    } > "$tmp_file" || return 1

    # Replace the state file in one step so readers never see a partial file.
    mv -- "$tmp_file" "$STATE_FILE"
}

# --- Disk Usage Monitoring ---
#######################################
# Checks each configured partition against the WARNING and CRITICAL
# thresholds and raises an alert for every partition at or above one of
# them, unless should_send_alert reports that the cooldown is still active.
# Globals:
#   CHECK_RESOURCE_USAGE_PARTITIONS (read) - whitespace-separated paths
#   CHECK_RESOURCE_USAGE_DISK_WARNING_THRESHOLD (read) - integer percentage
#   CHECK_RESOURCE_USAGE_DISK_CRITICAL_THRESHOLD (read) - integer percentage
#   SEND_WEBHOOK (read) - "true" to deliver alerts through send_disk_alert
# Arguments:
#   None
# Outputs:
#   Progress, per-partition usage and a final summary via log_message.
#######################################
check_disk_usage() {
    local partitions="$CHECK_RESOURCE_USAGE_PARTITIONS"
    local warning_threshold="$CHECK_RESOURCE_USAGE_DISK_WARNING_THRESHOLD"
    local critical_threshold="$CHECK_RESOURCE_USAGE_DISK_CRITICAL_THRESHOLD"
    local alerts_sent=0
    local partition df_fields usage size used avail
    local severity color emoji disk_info

    log_message "Starting disk usage check"

    # Intentionally unquoted: the setting is a whitespace-separated list.
    for partition in $partitions; do
        # Query df once per partition. -P keeps each filesystem on a single
        # line (long device names are otherwise wrapped onto a second line,
        # which would shift the columns read from row 2).
        df_fields=$(df -hP "$partition" 2>/dev/null \
            | awk 'NR==2 {gsub(/%/, "", $5); print $5, $2, $3, $4}')
        usage="" size="" used="" avail=""
        read -r usage size used avail <<< "$df_fields"

        # Covers both "df failed" (empty) and non-numeric values such as "-".
        if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
            log_message "Warning: Could not read usage for partition $partition"
            continue
        fi

        log_message "Partition $partition: ${usage}% used"

        # Determine severity
        if [ "$usage" -ge "$critical_threshold" ]; then
            severity="CRITICAL"
            color="#E01E5A"  # Red
            emoji="🚨"
        elif [ "$usage" -ge "$warning_threshold" ]; then
            severity="WARNING"
            color="#E8A317"  # Amber
            emoji="⚠️"
        else
            # Usage is within acceptable range
            continue
        fi

        # Check if we should send an alert (cooldown check)
        if should_send_alert "$partition" "$severity"; then
            log_message "Alert triggered: $partition at ${usage}% ($severity)"

            disk_info="Size: ${size}, Used: ${used}, Available: ${avail}"

            # Send individual alert via webhook
            if [ "$SEND_WEBHOOK" = true ]; then
                send_disk_alert "$partition" "$usage" "$severity" "$color" "$emoji" "$disk_info"
            fi

            # NOTE(review): the alert is recorded (and counted) even when
            # SEND_WEBHOOK is false, so a manual "no-webhook" run starts the
            # cooldown for later runs too - confirm this is intended.
            record_alert "$partition" "$severity"
            # Plain assignment: ((alerts_sent++)) returns status 1 when the
            # counter was 0, which would abort the script under `set -e`.
            alerts_sent=$((alerts_sent + 1))
        fi
    done

    if [ "$alerts_sent" -eq 0 ]; then
        log_message "No disk alerts triggered (all partitions within thresholds)"
    else
        log_message "Sent $alerts_sent disk usage alert(s)"
    fi
}

# --- Send Alert via Webhook ---
# Escapes a value for use inside a JSON string literal: backslashes, double
# quotes and newlines are escaped; the result is written to stdout.
_json_escape() {
    local value="$1"
    value=${value//\\/\\\\}
    value=${value//\"/\\\"}
    value=${value//$'\n'/\\n}
    printf '%s' "$value"
}

#######################################
# Posts a disk-space alert as a JSON payload to the configured webhook.
# Globals:
#   CHECK_RESOURCE_USAGE_PING_USERS (read) - whitespace-separated user IDs
#   CHECK_RESOURCE_USAGE_SERVER_NAME (read)
#   CHECK_RESOURCE_USAGE_SLACK_WEBHOOK_URL (read)
# Arguments:
#   $1 - partition
#   $2 - usage percentage (without the % sign)
#   $3 - severity; "CRITICAL" adds an "Immediate action required" line
#   $4 - attachment color
#   $5 - emoji placed before the message text
#   $6 - human-readable disk size summary
# Outputs:
#   Logs the delivery result via log_message.
# Returns:
#   0 when curl succeeded, 1 when curl failed or the server answered with
#   an HTTP error status.
#######################################
send_disk_alert() {
    local partition="$1"
    local usage="$2"
    local severity="$3"
    local color="$4"
    local emoji="$5"
    local disk_info="$6"

    # Escape free-form values before they are spliced into the JSON text, so
    # a quote or backslash in them cannot produce an invalid payload.
    local json_partition json_disk_info json_server
    json_partition=$(_json_escape "$partition")
    json_disk_info=$(_json_escape "$disk_info")
    json_server=$(_json_escape "$CHECK_RESOURCE_USAGE_SERVER_NAME")

    # Build user mentions (list is intentionally word-split)
    local mentions=""
    local user
    for user in $CHECK_RESOURCE_USAGE_PING_USERS; do
        mentions+="<@$(_json_escape "$user")> "
    done

    # The literal "\n" sequences below are JSON newline escapes and are
    # added after escaping so they reach the payload unchanged.
    local title="${severity}: Disk space alert on ${json_partition}"
    local details="Partition *${json_partition}* is at *${usage}% capacity*\n${json_disk_info}"

    if [ "$severity" = "CRITICAL" ]; then
        details+="\n\n⚠️ *Immediate action required!*\nAttention: $mentions"
    else
        details+="\n\nAttention: $mentions"
    fi

    # Create JSON payload
    local payload
    payload=$(cat <<EOF
{
  "text": "$emoji *Resource Alert - $json_server*",
  "attachments": [
    {
      "color": "$color",
      "title": "$title",
      "text": "$details",
      "footer": "Check executed on: $(date '+%Y-%m-%d %H:%M:%S')"
    }
  ]
}
EOF
)

    # Send to webhook. --fail turns HTTP error responses into a non-zero
    # exit status so a rejected request is not logged as delivered.
    local curl_status=0
    curl -s --fail -X POST -H 'Content-type: application/json' --data "$payload" "$CHECK_RESOURCE_USAGE_SLACK_WEBHOOK_URL" > /dev/null || curl_status=$?

    if (( curl_status != 0 )); then
        log_message "Webhook notification failed for $partition ($severity): curl exit code $curl_status"
        return 1
    fi

    log_message "Webhook notification sent for $partition ($severity)"
}

# =============================================================================
# FUTURE MONITORING FUNCTIONS (To be implemented)
# =============================================================================

# --- CPU Usage Monitoring ---
# check_cpu_usage() {
#     # Use: top, mpstat, or /proc/stat
#     # Monitor: Overall CPU percentage, per-core usage
#     # Thresholds: WARNING at 80%, CRITICAL at 95%
#     # Alert if sustained high usage over time period (e.g., 5 minutes)
#     log_message "CPU monitoring not yet implemented"
# }

# --- Memory Usage Monitoring ---
# check_memory_usage() {
#     # Use: free -m, /proc/meminfo
#     # Monitor: Used RAM percentage, available memory
#     # Thresholds: WARNING at 85%, CRITICAL at 95%
#     # Consider buffers/cache vs actual application memory
#     log_message "Memory monitoring not yet implemented"
# }

# --- Disk I/O Monitoring ---
# check_disk_io() {
#     # Use: iostat, /proc/diskstats
#     # Monitor: Read/write speeds, queue length, I/O wait percentage
#     # Thresholds: Based on baseline performance metrics
#     # Detect sustained high I/O wait times
#     log_message "Disk I/O monitoring not yet implemented"
# }

# --- Network Usage Monitoring ---
# check_network_usage() {
#     # Use: ifstat, /proc/net/dev, vnstat
#     # Monitor: Incoming/outgoing traffic rates
#     # Thresholds: Spike detection, bandwidth saturation
#     # Track per-interface statistics
#     log_message "Network monitoring not yet implemented"
# }

# --- Load Average Monitoring ---
# check_load_average() {
#     # Use: uptime, /proc/loadavg
#     # Monitor: 1, 5, and 15 minute load averages
#     # Thresholds: Relative to number of CPU cores
#     # Alert if load > (cores * threshold_multiplier)
#     log_message "Load average monitoring not yet implemented"
# }

# --- Process Count Monitoring ---
# check_process_count() {
#     # Use: ps, /proc
#     # Monitor: Total running processes, zombie processes
#     # Thresholds: Absolute count or unusual spike detection
#     # Identify process leaks or fork bombs
#     log_message "Process count monitoring not yet implemented"
# }

# --- Swap Usage Monitoring ---
# check_swap_usage() {
#     # Use: free -m, /proc/swaps
#     # Monitor: Swap space usage percentage
#     # Thresholds: WARNING at 50%, CRITICAL at 80%
#     # High swap usage indicates memory pressure
#     log_message "Swap usage monitoring not yet implemented"
# }

# --- System Uptime Tracking ---
# check_system_uptime() {
#     # Use: uptime, /proc/uptime
#     # Monitor: Days since last reboot
#     # Alert: Informational for maintenance planning
#     # No critical thresholds typically needed
#     log_message "Uptime tracking not yet implemented"
# }

# --- Temperature Monitoring ---
# check_temperature() {
#     # Use: sensors (lm-sensors package), /sys/class/thermal
#     # Monitor: CPU temperature, system temperature
#     # Thresholds: WARNING at 75°C, CRITICAL at 85°C
#     # Hardware-specific, may not be available on VMs
#     log_message "Temperature monitoring not yet implemented"
# }

# =============================================================================
# MAIN EXECUTION
# =============================================================================

# Logs the given title framed above and below by a separator rule.
_log_banner() {
    local rule="========================================="
    log_message "$rule"
    log_message "$1"
    log_message "$rule"
}

_log_banner "Resource usage check started"

# Execute monitoring checks
check_disk_usage

# Future checks (uncomment when implemented):
# check_cpu_usage
# check_memory_usage
# check_disk_io
# check_network_usage
# check_load_average
# check_process_count
# check_swap_usage
# check_system_uptime
# check_temperature

_log_banner "Resource usage check completed"

# With webhooks disabled, finish with a short pointer to the log file.
if [[ "$SEND_WEBHOOK" == false ]]; then
    printf '\n'
    printf '%s\n' "--- RESOURCE USAGE REPORT ($CHECK_RESOURCE_USAGE_SERVER_NAME) ---"
    printf '%s\n' "Check completed. See $LOG_FILE for details."
    printf '%s\n' "-----------------------------------------------------------"
fi