secubox-openwrt/package/secubox/secubox-app-watchdog/files/usr/sbin/watchdogctl
CyberMind-FR 1bbd345cee refactor(luci): Mass KissTheme UI rework across all LuCI apps
Convert 90+ LuCI view files from legacy cbi-button-* classes to
KissTheme kiss-btn-* classes for consistent dark theme styling.

Pattern conversions applied:
- cbi-button-positive → kiss-btn-green
- cbi-button-negative/remove → kiss-btn-red
- cbi-button-apply → kiss-btn-cyan
- cbi-button-action → kiss-btn-blue
- cbi-button (plain) → kiss-btn

Also replaced hardcoded colors (#080, #c00, #888, etc.) with
CSS variables (--kiss-green, --kiss-red, --kiss-muted, etc.)
for proper dark theme compatibility.

Apps updated include: ai-gateway, auth-guardian, bandwidth-manager,
cloner, config-advisor, crowdsec-dashboard, dns-provider, exposure,
glances, haproxy, hexojs, iot-guard, jellyfin, ksm-manager,
mac-guardian, magicmirror2, master-link, meshname-dns, metablogizer,
metabolizer, mqtt-bridge, netdata-dashboard, picobrew, routes-status,
secubox-admin, secubox-mirror, secubox-p2p, secubox-security-threats,
service-registry, simplex, streamlit, system-hub, tor-shield,
traffic-shaper, vhost-manager, vortex-dns, vortex-firewall,
webradio, wireguard-dashboard, zigbee2mqtt, zkp, and more.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-12 11:09:34 +01:00

716 lines
19 KiB
Bash

#!/bin/sh
# SecuBox Watchdog Control
# Service health monitoring and auto-recovery
. /lib/functions.sh
# UCI package name passed to config_load by load_config() and cmd_status().
CONFIG_NAME="watchdog"
# Default log path; load_config() overwrites it with the 'log_file' option
# of the 'main' section when the configuration is loaded.
LOG_FILE="/var/log/watchdog.log"
# Directory holding the per-item "<key>.alert" timestamp files that
# should_alert()/clear_alert() use for the alert cooldown.
ALERT_STATE_DIR="/tmp/watchdog"
# Printed by the 'version' command and in the usage banner.
VERSION="1.0.0"
# Colors for terminal output (ANSI escape sequences; they are expanded by
# 'echo -e' and by being embedded in printf format strings).
RED='\033[0;31m'
GREEN='\033[0;32m'
# NOTE(review): YELLOW is not referenced in the visible part of this file.
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
# Reset sequence ("no color") that terminates a colored span.
NC='\033[0m'
# Load watchdog settings from UCI.
#
# Loads the "$CONFIG_NAME" package, then fills the globals LOG_FILE,
# ALERT_COOLDOWN and MAX_LOG_LINES from its 'main' section (each with the
# fallback given below) and finally makes sure the alert state directory
# exists.
#
# Returns: the exit status of the final mkdir.
load_config() {
    config_load "$CONFIG_NAME"

    # <target variable> <section> <option> <default>
    config_get LOG_FILE       main log_file       '/var/log/watchdog.log'
    config_get ALERT_COOLDOWN main alert_cooldown '300'
    config_get MAX_LOG_LINES  main max_log_lines  '1000'

    mkdir -p "$ALERT_STATE_DIR"
}
# Logging
#
# Append one "[timestamp] [LEVEL] message" line to $LOG_FILE and trim the
# file to its newest half once it grows beyond the line limit.
#
# Arguments: $1 - level tag (e.g. INFO, ALERT, OK)
#            $2 - message text
# Globals:   LOG_FILE (read), MAX_LOG_LINES (read; falls back to 1000 when
#            unset or not a plain non-negative integer, so logging also works
#            before load_config has run)
log_msg() {
    local level="$1"
    local msg="$2"
    local timestamp max_lines lines

    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] [%s] %s\n' "$timestamp" "$level" "$msg" >> "$LOG_FILE"

    # A missing or malformed limit must not break the numeric test below.
    max_lines="${MAX_LOG_LINES:-1000}"
    case "$max_lines" in
        ''|*[!0-9]*) max_lines=1000 ;;
    esac

    # Some wc implementations pad the count with blanks; keep only the digits.
    lines=$(wc -l < "$LOG_FILE" 2>/dev/null) || lines=0
    lines="${lines##*[!0-9]}"
    [ -n "$lines" ] || lines=0

    # Rotate log if too large: keep the newest half. Only replace the log
    # when the trimmed copy was actually written.
    if [ "$lines" -gt "$max_lines" ]; then
        if tail -n "$((max_lines / 2))" "$LOG_FILE" > "$LOG_FILE.tmp"; then
            mv "$LOG_FILE.tmp" "$LOG_FILE"
        else
            rm -f "$LOG_FILE.tmp"
        fi
    fi
}
# Record an alert: write it to the log with level ALERT and echo a
# red-tagged copy to stderr.
# Arguments: $1 - alert text
log_alert() {
    local alert_text="$1"

    log_msg "ALERT" "$alert_text"
    echo -e "${RED}SPUNK ALERT${NC} - $alert_text" >&2
}
# Write an informational message ($1) to the log with level INFO.
log_info() { log_msg "INFO" "$1"; }
# Write a success/recovery message ($1) to the log with level OK.
log_ok() { log_msg "OK" "$1"; }
# Check if we should alert (cooldown)
#
# Decides whether an alert for the given key may be emitted now, and if so
# records the current time as the key's last alert.
#
# Arguments: $1 - alert key (e.g. "container_<name>")
# Globals:   ALERT_STATE_DIR (read), ALERT_COOLDOWN (read; seconds, falls
#            back to 300 when unset or not a plain non-negative integer)
# Returns:   0 when the caller should alert, 1 while still inside the
#            cooldown window of the previous alert.
should_alert() {
    local service="$1"
    local state_file="$ALERT_STATE_DIR/$service.alert"
    local cooldown="${ALERT_COOLDOWN:-300}"
    local last_alert now

    case "$cooldown" in
        ''|*[!0-9]*) cooldown=300 ;;
    esac

    now=$(date +%s)

    if [ -f "$state_file" ]; then
        last_alert=$(cat "$state_file" 2>/dev/null)
        case "$last_alert" in
            ''|*[!0-9]*)
                # Empty or corrupt state: behave as if no alert was recorded
                # instead of feeding garbage to the arithmetic below.
                ;;
            *)
                [ "$((now - last_alert))" -lt "$cooldown" ] && return 1
                ;;
        esac
    fi

    echo "$now" > "$state_file"
    return 0
}
# Forget the last-alert timestamp of a key so the next failure alerts
# immediately.
# Arguments: $1 - alert key (same key that was given to should_alert)
clear_alert() {
    local key="$1"

    rm -f "$ALERT_STATE_DIR/$key.alert"
}
# Check LXC container status
#
# Callback-style check for one 'container' config section: reads 'enabled'
# and 'name', queries lxc-info and alerts (subject to the cooldown) when the
# container is not in state RUNNING.
#
# Arguments: $1 - UCI section id
# Returns:   0 when the section is disabled or the container is RUNNING,
#            1 otherwise.
# NOTE(review): no caller of this function is visible in this file;
# run_check carries its own copy of this logic.
check_container() {
    local section="$1"
    local enabled name state

    config_get enabled "$section" enabled '0'
    [ "$enabled" = "1" ] || return 0

    config_get name "$section" name

    # The second field of the "State:" line is the state keyword.
    state=$(lxc-info -n "$name" 2>/dev/null | awk '/State:/ {print $2}')

    if [ "$state" = "RUNNING" ]; then
        clear_alert "container_$name"
        return 0
    fi

    if should_alert "container_$name"; then
        # lxc-info prints nothing usable when it fails (e.g. the container
        # is not defined); say so instead of logging a dangling "is ".
        log_alert "Container $name is ${state:-UNKNOWN}"
    fi
    return 1
}
# Restart LXC container
#
# Stops and starts the container named by the section's 'name' option,
# optionally starts an init.d service inside it ('start_service' = 1 and a
# non-empty 'service_name'), then verifies the container state.
#
# Arguments: $1 - UCI section id of the container
# Returns:   0 when lxc-info reports RUNNING afterwards, 1 otherwise.
restart_container() {
    local section="$1"
    local ct_name want_service svc_name ct_state

    config_get ct_name "$section" name
    config_get want_service "$section" start_service '0'
    config_get svc_name "$section" service_name ''

    log_info "Restarting container: $ct_name"

    # Stop first (errors are ignored: it may already be stopped), then start.
    lxc-stop -n "$ct_name" 2>/dev/null
    sleep 1
    lxc-start -n "$ct_name" 2>/dev/null
    sleep 2

    # Optionally bring up the service inside, after a further short delay.
    if [ "$want_service" = "1" ] && [ -n "$svc_name" ]; then
        sleep 2
        lxc-attach -n "$ct_name" -- /etc/init.d/"$svc_name" start 2>/dev/null
        log_info "Started $svc_name service inside $ct_name"
    fi

    # Verify the outcome.
    ct_state=$(lxc-info -n "$ct_name" 2>/dev/null | awk '/State:/ {print $2}')
    if [ "$ct_state" != "RUNNING" ]; then
        log_alert "Container $ct_name failed to start"
        return 1
    fi

    log_ok "Container $ct_name recovered successfully"
    return 0
}
# Check host service status
#
# Callback-style check for one 'service' config section: looks for a
# process matching the section's 'process' option with pgrep and alerts
# (subject to the cooldown) when none is found.
#
# Arguments: $1 - UCI section id
# Returns:   0 when the section is disabled or a matching process exists,
#            1 otherwise.
check_service() {
    local section="$1"
    local enabled name process critical

    config_get enabled "$section" enabled '0'
    if [ "$enabled" != "1" ]; then
        return 0
    fi

    config_get name "$section" name
    config_get process "$section" process
    config_get critical "$section" critical '0'

    if ! pgrep "$process" >/dev/null 2>&1; then
        if should_alert "service_$name"; then
            log_alert "Service $name is not running"
        fi
        return 1
    fi

    clear_alert "service_$name"
    return 0
}
# Restart host service
#
# Runs "<init_script> restart" for the given 'service' section and checks
# with pgrep, after a short pause, that the process is back.
#
# Arguments: $1 - UCI section id of the service
# Returns:   0 when the process is found after the restart; 1 when the init
#            script is missing/not executable or the process did not appear.
restart_service() {
    local section="$1"
    local name process init_script

    config_get name "$section" name
    config_get process "$section" process
    config_get init_script "$section" init_script

    log_info "Restarting service: $name"

    # Nothing can be done without an executable init script.
    if [ ! -x "$init_script" ]; then
        log_alert "No init script found for $name"
        return 1
    fi

    "$init_script" restart 2>/dev/null
    sleep 2

    if ! pgrep "$process" >/dev/null 2>&1; then
        log_alert "Service $name failed to restart"
        return 1
    fi

    log_ok "Service $name recovered successfully"
    return 0
}
# Check mountpoint
#
# Callback-style check for one 'mountpoint' config section: the configured
# 'path' must be a mountpoint, a directory and readable; otherwise an alert
# is raised (subject to the cooldown).
#
# Arguments: $1 - UCI section id
# Returns:   0 when the section is disabled or the path passes all three
#            tests, 1 otherwise.
check_mountpoint() {
    local section="$1"
    local enabled name path critical

    config_get enabled "$section" enabled '0'
    if [ "$enabled" != "1" ]; then
        return 0
    fi

    config_get name "$section" name
    config_get path "$section" path
    config_get critical "$section" critical '0'

    # Mounted, a directory, and readable - all three are required.
    if ! { mountpoint -q "$path" 2>/dev/null && [ -d "$path" ] && [ -r "$path" ]; }; then
        if should_alert "mountpoint_$name"; then
            log_alert "Mountpoint $name ($path) is not accessible"
        fi
        return 1
    fi

    clear_alert "mountpoint_$name"
    return 0
}
# Remount a mountpoint
#
# Issues "mount -o remount" for the section's 'path' and re-runs the same
# accessibility test that check_mountpoint uses.
#
# Arguments: $1 - UCI section id of the mountpoint
# Returns:   0 when the path is a readable, mounted directory afterwards,
#            1 otherwise.
remount_mountpoint() {
    local section="$1"
    local name path

    config_get name "$section" name
    config_get path "$section" path

    log_info "Attempting to remount: $path"

    # Try to remount; the verdict comes from the test below, not from
    # mount's own exit status.
    mount -o remount "$path" 2>/dev/null
    sleep 1

    if ! { mountpoint -q "$path" 2>/dev/null && [ -d "$path" ] && [ -r "$path" ]; }; then
        log_alert "Mountpoint $path failed to remount - manual intervention required"
        return 1
    fi

    log_ok "Mountpoint $path recovered"
    return 0
}
# Check HTTPS endpoint
#
# Callback-style check for one 'endpoint' config section: requests
# https://127.0.0.1/ with the section's 'host' as Host header (certificate
# verification disabled via -k) and compares the HTTP status code with the
# space-separated 'expected_codes' list (default "200").
#
# Arguments: $1 - UCI section id
# Returns:   0 when the section is disabled or the code is expected,
#            1 otherwise (an alert is raised, subject to the cooldown).
check_endpoint() {
    local section="$1"
    local enabled name host expected_codes
    local code expected match=0

    config_get enabled "$section" enabled '0'
    [ "$enabled" = "1" ] || return 0

    config_get name "$section" name
    config_get host "$section" host
    config_get expected_codes "$section" expected_codes '200'

    # Bound the connection attempt (same limit as get_endpoint_status) so an
    # unresponsive listener cannot stall the whole check cycle.
    code=$(curl -sk -o /dev/null -w "%{http_code}" -H "Host: $host" \
        --connect-timeout 5 https://127.0.0.1/ 2>/dev/null)

    # expected_codes is intentionally unquoted: it is a word list.
    for expected in $expected_codes; do
        if [ "$code" = "$expected" ]; then
            match=1
            break
        fi
    done

    if [ "$match" = "1" ]; then
        clear_alert "endpoint_$host"
        return 0
    fi

    if should_alert "endpoint_$host"; then
        log_alert "Endpoint $name ($host) returned HTTP $code"
    fi
    return 1
}
# Run single check cycle
#
# Checks every enabled 'container', 'service' and 'endpoint' section and,
# when auto-recovery is requested, restarts the containers and services
# that were found down. 'mountpoint' sections are not part of this cycle.
#
# Arguments: $1 - "1" to restart failed containers/services (default 0)
# Returns:   0 when every check passed, 1 when at least one failed. The
#            value reflects the state seen BEFORE any recovery attempt.
#
# Failed section ids are collected in local variables (UCI section ids
# contain no whitespace, so a space-separated list is safe) rather than in
# fixed-name files under /tmp. The callbacks below run in this shell and
# update run_check's locals directly.
run_check() {
    local auto_recover="${1:-0}"
    local failed=0
    local failed_containers=""
    local failed_services=""
    local section

    # Check containers and record failures
    check_and_record_container() {
        local section="$1"
        local enabled name state

        config_get enabled "$section" enabled '0'
        [ "$enabled" = "1" ] || return 0
        config_get name "$section" name

        state=$(lxc-info -n "$name" 2>/dev/null | awk '/State:/ {print $2}')
        if [ "$state" = "RUNNING" ]; then
            clear_alert "container_$name"
        else
            if should_alert "container_$name"; then
                # Empty state means lxc-info gave no answer at all.
                log_alert "Container $name is ${state:-UNKNOWN}"
            fi
            failed_containers="$failed_containers $section"
            failed=1
        fi
        return 0
    }
    config_foreach check_and_record_container container

    # Check services and record failures
    check_and_record_service() {
        local section="$1"
        local enabled name process

        config_get enabled "$section" enabled '0'
        [ "$enabled" = "1" ] || return 0
        config_get name "$section" name
        config_get process "$section" process

        if pgrep "$process" >/dev/null 2>&1; then
            clear_alert "service_$name"
        else
            if should_alert "service_$name"; then
                log_alert "Service $name is not running"
            fi
            failed_services="$failed_services $section"
            failed=1
        fi
        return 0
    }
    config_foreach check_and_record_service service

    # Check endpoints. The failure is recorded inside the callback because
    # the status of config_foreach itself only reflects the last section it
    # visited, which would hide a failing endpoint followed by a healthy one.
    check_and_record_endpoint() {
        check_endpoint "$1" || failed=1
        return 0
    }
    config_foreach check_and_record_endpoint endpoint

    # Auto-recovery if enabled
    if [ "$auto_recover" = "1" ]; then
        for section in $failed_containers; do
            restart_container "$section"
        done
        for section in $failed_services; do
            restart_service "$section"
        done
    fi

    return $failed
}
# Get status of a single container
#
# Arguments: $1 - container name
# Outputs:   "running:<pid>" when lxc-info reports state RUNNING (pid is
#            the second field of its "PID:" line), otherwise "stopped:0".
get_container_status() {
    local name="$1"
    local ct_state ct_pid

    ct_state=$(lxc-info -n "$name" 2>/dev/null | awk '/State:/ {print $2}')
    ct_pid=$(lxc-info -n "$name" 2>/dev/null | awk '/PID:/ {print $2}')

    case "$ct_state" in
        RUNNING) echo "running:$ct_pid" ;;
        *)       echo "stopped:0" ;;
    esac
}
# Get status of a single service
#
# Arguments: $1 - process pattern handed to pgrep
# Outputs:   "running:<pid>" using the first PID pgrep prints, or
#            "stopped:0" when pgrep prints nothing.
get_service_status() {
    local process="$1"
    local first_pid

    first_pid=$(pgrep "$process" 2>/dev/null | head -n 1)

    if [ -z "$first_pid" ]; then
        echo "stopped:0"
    else
        echo "running:$first_pid"
    fi
}
# Get endpoint status
#
# Requests https://127.0.0.1/ with the given Host header (certificate
# verification disabled, 5 s connect timeout) and prints the HTTP status
# code that curl reports.
#
# Arguments: $1 - value for the Host header
# Outputs:   the status code followed by a newline
get_endpoint_status() {
    local host="$1"
    local http_code

    http_code=$(curl -sk -o /dev/null -w "%{http_code}" -H "Host: $host" \
        --connect-timeout 5 https://127.0.0.1/ 2>/dev/null)

    echo "$http_code"
}
# Show status command
#
# Prints a colored, human-readable report with three sections (LXC
# containers, host services, HTTPS endpoints), one line per enabled
# config section.
cmd_status() {
    load_config

    echo ""
    echo -e "${CYAN}SecuBox Watchdog Status${NC}"
    echo "========================"
    echo ""

    # --- containers -------------------------------------------------------
    echo -e "${CYAN}LXC Containers:${NC}"
    echo "---------------"
    config_load "$CONFIG_NAME"

    container_status() {
        local section="$1"
        local enabled name report run_state run_pid

        config_get enabled "$section" enabled '0'
        [ "$enabled" = "1" ] || return 0
        config_get name "$section" name

        # Helper output has the form "<state>:<pid>".
        report=$(get_container_status "$name")
        run_state="${report%%:*}"
        run_pid="${report#*:}"

        case "$run_state" in
            running)
                printf " %-20s ${GREEN}RUNNING${NC} (PID: %s)\n" "$name:" "$run_pid"
                ;;
            *)
                printf " %-20s ${RED}STOPPED${NC}\n" "$name:"
                ;;
        esac
    }
    config_foreach container_status container

    # --- host services ----------------------------------------------------
    echo ""
    echo -e "${CYAN}Host Services:${NC}"
    echo "--------------"

    service_status() {
        local section="$1"
        local enabled name process report run_state run_pid

        config_get enabled "$section" enabled '0'
        [ "$enabled" = "1" ] || return 0
        config_get name "$section" name
        config_get process "$section" process

        report=$(get_service_status "$process")
        run_state="${report%%:*}"
        run_pid="${report#*:}"

        case "$run_state" in
            running)
                printf " %-20s ${GREEN}RUNNING${NC} (PID: %s)\n" "$name:" "$run_pid"
                ;;
            *)
                printf " %-20s ${RED}NOT RUNNING${NC}\n" "$name:"
                ;;
        esac
    }
    config_foreach service_status service

    # --- endpoints --------------------------------------------------------
    echo ""
    echo -e "${CYAN}HTTPS Endpoints:${NC}"
    echo "----------------"

    endpoint_status() {
        local section="$1"
        local enabled name host expected_codes

        config_get enabled "$section" enabled '0'
        [ "$enabled" = "1" ] || return 0
        config_get name "$section" name
        config_get host "$section" host
        config_get expected_codes "$section" expected_codes '200'

        local code=$(get_endpoint_status "$host")
        local match=0

        # Green when the code is in the expected list, red otherwise.
        for expected in $expected_codes; do
            if [ "$code" = "$expected" ]; then
                match=1
                break
            fi
        done

        case "$match" in
            1) printf " %-25s ${GREEN}HTTP %s${NC}\n" "$host:" "$code" ;;
            *) printf " %-25s ${RED}HTTP %s${NC}\n" "$host:" "$code" ;;
        esac
    }
    config_foreach endpoint_status endpoint

    echo ""
}
# Print $1 as a JSON-safe non-negative integer: anything that is not a
# plain digit string becomes 0 and leading zeros are stripped (JSON numbers
# may not carry them, and curl reports "000" when no response was received).
_wd_json_int() {
    local v="$1"

    case "$v" in
        ''|*[!0-9]*) v=0 ;;
    esac
    while [ "${#v}" -gt 1 ] && [ "${v#0}" != "$v" ]; do
        v="${v#0}"
    done
    printf '%s' "$v"
}

# Print $1 with backslashes and double quotes escaped for use inside a JSON
# string literal. Control characters are not handled.
_wd_json_str() {
    printf '%s' "$1" | sed 's/\\/\\\\/g; s/"/\\"/g'
}

# JSON status output for RPCD
#
# Prints one JSON object with the arrays "containers", "services" and
# "endpoints", one element per enabled config section:
#   containers: name, state, pid, critical
#   services:   name, process, state, pid, critical
#   endpoints:  name, host, code, healthy
# Numeric fields always hold valid JSON integers and string fields are
# escaped, so the document stays parseable even when a helper returns an
# empty pid or curl reports "000".
cmd_json_status() {
    load_config

    local json='{"containers":['
    local first=1

    container_json() {
        local section="$1"
        local enabled name critical result state pid

        config_get enabled "$section" enabled '0'
        [ "$enabled" = "1" ] || return 0
        config_get name "$section" name
        config_get critical "$section" critical '0'

        # Helper output has the form "<state>:<pid>".
        result=$(get_container_status "$name")
        state="${result%%:*}"
        pid="${result#*:}"

        [ "$first" = "0" ] && json="$json,"
        first=0
        json="$json{\"name\":\"$(_wd_json_str "$name")\",\"state\":\"$(_wd_json_str "$state")\",\"pid\":$(_wd_json_int "$pid"),\"critical\":$(_wd_json_int "$critical")}"
        return 0
    }
    config_foreach container_json container

    json="$json],\"services\":["
    first=1

    service_json() {
        local section="$1"
        local enabled name process critical result state pid

        config_get enabled "$section" enabled '0'
        [ "$enabled" = "1" ] || return 0
        config_get name "$section" name
        config_get process "$section" process
        config_get critical "$section" critical '0'

        result=$(get_service_status "$process")
        state="${result%%:*}"
        pid="${result#*:}"

        [ "$first" = "0" ] && json="$json,"
        first=0
        json="$json{\"name\":\"$(_wd_json_str "$name")\",\"process\":\"$(_wd_json_str "$process")\",\"state\":\"$(_wd_json_str "$state")\",\"pid\":$(_wd_json_int "$pid"),\"critical\":$(_wd_json_int "$critical")}"
        return 0
    }
    config_foreach service_json service

    json="$json],\"endpoints\":["
    first=1

    endpoint_json() {
        local section="$1"
        local enabled name host expected_codes code expected
        local healthy=0

        config_get enabled "$section" enabled '0'
        [ "$enabled" = "1" ] || return 0
        config_get name "$section" name
        config_get host "$section" host
        config_get expected_codes "$section" expected_codes '200'

        code=$(get_endpoint_status "$host")

        # Compare the raw code; only the emitted value is normalized.
        for expected in $expected_codes; do
            if [ "$code" = "$expected" ]; then
                healthy=1
                break
            fi
        done

        [ "$first" = "0" ] && json="$json,"
        first=0
        json="$json{\"name\":\"$(_wd_json_str "$name")\",\"host\":\"$(_wd_json_str "$host")\",\"code\":$(_wd_json_int "$code"),\"healthy\":$healthy}"
        return 0
    }
    config_foreach endpoint_json endpoint

    json="$json]}"
    echo "$json"
}
# Check command (single cycle)
#
# Runs one check cycle and prints a one-line verdict.
#
# Arguments: $1 - "1" to enable auto-recovery (default 0)
# Returns:   0 when run_check reported no failures, 1 otherwise.
cmd_check() {
    load_config
    local auto_recover="${1:-0}"

    echo "Running health check..."

    if ! run_check "$auto_recover"; then
        echo -e "${RED}Issues detected${NC}"
        return 1
    fi

    echo -e "${GREEN}All systems healthy${NC}"
    return 0
}
# Watch command (continuous loop)
#
# Runs a check cycle with auto-recovery, sleeps for the 'interval' option
# of the 'main' section (whole seconds, default 60) and repeats forever.
# An interval that is not a positive integer is replaced by 60: otherwise
# sleep would fail or return at once and the loop would spin without pause,
# restarting services back to back.
cmd_watch() {
    local interval

    load_config
    config_get interval main interval '60'

    case "$interval" in
        ''|*[!0-9]*) interval=0 ;;
    esac
    if [ "$interval" -le 0 ]; then
        log_info "Invalid watchdog interval in configuration, using 60s"
        interval=60
    fi

    echo "Starting watchdog (interval: ${interval}s)..."
    log_info "Watchdog started (interval: ${interval}s)"

    while true; do
        run_check 1
        sleep "$interval"
    done
}
# Restart a specific container
#
# Stops and starts the named container, starts the configured in-container
# service for every 'container' section whose 'name' matches, and reports
# the resulting state on stdout.
#
# Arguments: $1 - container name (required)
# Returns:   1 when the name is missing or the container is not RUNNING
#            afterwards; otherwise the status of the final message.
cmd_restart_container() {
    local name="$1"

    if [ -z "$name" ]; then
        echo "Usage: watchdogctl restart-container <name>" && return 1
    fi

    load_config

    echo "Restarting container: $name"
    lxc-stop -n "$name" 2>/dev/null
    sleep 1
    lxc-start -n "$name" 2>/dev/null
    sleep 2

    # Check for service start: config_foreach callback, acts only on
    # sections describing the requested container ($name comes from the
    # enclosing call).
    start_service() {
        local section="$1"
        local sect_name inner_service want_start

        config_get sect_name "$section" name
        if [ "$sect_name" != "$name" ]; then
            return 0
        fi

        config_get want_start "$section" start_service '0'
        config_get inner_service "$section" service_name ''

        if [ "$want_start" = "1" ] && [ -n "$inner_service" ]; then
            sleep 2
            lxc-attach -n "$name" -- /etc/init.d/"$inner_service" start 2>/dev/null
            echo "Started $inner_service service inside container"
        fi
    }
    config_foreach start_service container

    local final_state
    final_state=$(lxc-info -n "$name" 2>/dev/null | awk '/State:/ {print $2}')

    if [ "$final_state" != "RUNNING" ]; then
        echo -e "${RED}Container $name failed to start${NC}"
        return 1
    fi
    echo -e "${GREEN}Container $name is now running${NC}"
}
# Restart a specific service
#
# Looks up the 'service' section(s) whose 'name' matches, runs
# "<init_script> restart" and verifies with pgrep that the process is back.
#
# Arguments: $1 - service name (required)
# Returns:   0 when every matching service is running after the restart;
#            1 when the name is missing, no section matches, the init
#            script is missing/not executable, or the process did not come
#            back. Problems that prevent a restart are reported on stderr.
cmd_restart_service() {
    local name="$1"
    [ -z "$name" ] && echo "Usage: watchdogctl restart-service <name>" && return 1

    load_config

    # Updated by the callback below, which runs in this shell.
    local found=0
    local rc=0

    do_restart() {
        local section="$1"
        local s_name init_script process

        config_get s_name "$section" name
        [ "$s_name" = "$name" ] || return 0
        found=1

        config_get init_script "$section" init_script
        config_get process "$section" process

        if [ ! -x "$init_script" ]; then
            echo -e "${RED}No init script found for $name${NC}" >&2
            rc=1
            return 0
        fi

        echo "Restarting service: $name"
        "$init_script" restart 2>/dev/null
        sleep 2

        if pgrep "$process" >/dev/null 2>&1; then
            echo -e "${GREEN}Service $name is now running${NC}"
        else
            echo -e "${RED}Service $name failed to start${NC}"
            rc=1
        fi
        return 0
    }
    config_foreach do_restart service

    if [ "$found" = "0" ]; then
        echo -e "${RED}Service $name is not configured${NC}" >&2
        return 1
    fi
    return $rc
}
# Show logs
#
# Prints the last N lines of the watchdog log.
#
# Arguments: $1 - number of lines (default 50; anything that is not a
#                 plain non-negative integer also falls back to 50)
# The configuration is loaded first so that a 'log_file' set in the 'main'
# section is honoured; without it only the built-in default path would be
# read, which is not where log_msg writes once the option is configured.
cmd_logs() {
    local lines="${1:-50}"

    case "$lines" in
        ''|*[!0-9]*) lines=50 ;;
    esac

    load_config

    if [ -f "$LOG_FILE" ]; then
        tail -n "$lines" "$LOG_FILE"
    else
        echo "No log file found"
    fi
}
# Clear logs
#
# Truncates the watchdog log and removes every "*.alert" cooldown file from
# the alert state directory.
# The configuration is loaded first so that the log file actually in use
# (the 'log_file' option of the 'main' section) is the one truncated rather
# than the built-in default path.
cmd_clear_logs() {
    load_config

    > "$LOG_FILE"

    # Never let an empty directory variable turn the glob into "/*.alert".
    if [ -n "$ALERT_STATE_DIR" ]; then
        rm -f "$ALERT_STATE_DIR"/*.alert
    fi

    echo "Logs cleared"
}
# Main
#
# Command dispatcher: $1 selects the sub-command, $2 is handed to the
# sub-commands that take an argument. Any other (or missing) command prints
# the usage text. The script's exit status is that of the selected command.
main() {
    case "$1" in
        status)            cmd_status ;;
        json-status)       cmd_json_status ;;
        check)             cmd_check 0 ;;
        check-recover)     cmd_check 1 ;;
        watch)             cmd_watch ;;
        restart-container) cmd_restart_container "$2" ;;
        restart-service)   cmd_restart_service "$2" ;;
        logs)              cmd_logs "$2" ;;
        clear-logs)        cmd_clear_logs ;;
        version)
            echo "watchdogctl version $VERSION"
            ;;
        *)
            # One output line per argument.
            printf '%s\n' \
                "SecuBox Watchdog Control v$VERSION" \
                "" \
                "Usage: watchdogctl <command> [options]" \
                "" \
                "Commands:" \
                " status Show current status of all monitored services" \
                " json-status Output status as JSON (for RPCD)" \
                " check Run single health check (no auto-recovery)" \
                " check-recover Run single health check with auto-recovery" \
                " watch Start continuous monitoring loop" \
                " restart-container Restart a specific container" \
                " restart-service Restart a specific service" \
                " logs [N] Show last N log lines (default: 50)" \
                " clear-logs Clear all logs and alert states" \
                " version Show version"
            ;;
    esac
}

main "$@"