From 66d9fbc6c0c633d01db6501f298250c404dccf91 Mon Sep 17 00:00:00 2001 From: CyberMind-FR Date: Thu, 12 Mar 2026 06:46:53 +0100 Subject: [PATCH] feat(watchdog): Add service health monitor with auto-recovery New packages for monitoring and auto-restarting critical services: secubox-app-watchdog: - watchdogctl CLI: status, check, check-recover, watch, restart-* - Monitors LXC containers: haproxy, mitmproxy-in/out, streamlit - Monitors host services: crowdsec, uhttpd, dnsmasq - Checks HTTPS endpoints: gk2.secubox.in, admin.gk2, lldh360.maegia.tv - Auto-recovery with alert cooldown and log rotation - Procd service + cron fallback for redundancy luci-app-watchdog: - Real-time dashboard with 10s polling - Container/service tables with restart buttons - Endpoint health indicators - Alert log viewer with refresh/clear - RPCD backend: status, restart_*, check, get_logs Co-Authored-By: Claude Opus 4.5 --- .claude/HISTORY.md | 117 +++ .claude/WIP.md | 53 +- package/secubox/luci-app-watchdog/Makefile | 33 + .../resources/view/watchdog/status.js | 380 ++++++++++ .../root/usr/libexec/rpcd/luci.watchdog | 522 ++++++++++++++ .../share/luci/menu.d/luci-app-watchdog.json | 13 + .../share/rpcd/acl.d/luci-app-watchdog.json | 29 + package/secubox/secubox-app-watchdog/Makefile | 64 ++ .../files/etc/config/watchdog | 77 ++ .../files/etc/cron.d/watchdog | 3 + .../files/etc/init.d/watchdog | 42 ++ .../files/usr/sbin/watchdogctl | 668 ++++++++++++++++++ 12 files changed, 1992 insertions(+), 9 deletions(-) create mode 100644 package/secubox/luci-app-watchdog/Makefile create mode 100644 package/secubox/luci-app-watchdog/htdocs/luci-static/resources/view/watchdog/status.js create mode 100644 package/secubox/luci-app-watchdog/root/usr/libexec/rpcd/luci.watchdog create mode 100644 package/secubox/luci-app-watchdog/root/usr/share/luci/menu.d/luci-app-watchdog.json create mode 100644 package/secubox/luci-app-watchdog/root/usr/share/rpcd/acl.d/luci-app-watchdog.json create mode 100644 
package/secubox/secubox-app-watchdog/Makefile create mode 100644 package/secubox/secubox-app-watchdog/files/etc/config/watchdog create mode 100644 package/secubox/secubox-app-watchdog/files/etc/cron.d/watchdog create mode 100644 package/secubox/secubox-app-watchdog/files/etc/init.d/watchdog create mode 100644 package/secubox/secubox-app-watchdog/files/usr/sbin/watchdogctl diff --git a/.claude/HISTORY.md b/.claude/HISTORY.md index c58e2540..2f7b22a3 100644 --- a/.claude/HISTORY.md +++ b/.claude/HISTORY.md @@ -4743,3 +4743,120 @@ git checkout HEAD -- index.html - RPCD method: `start_terminal` returns terminal connection info - Menu entry: Remote Control → Remote Support → Web Terminal - Fullscreen toggle and refresh controls + +95. **HERMÈS·360 Full I-Ching Translation (2026-03-11)** + - Added full translations for all 64 hexagrams in 5 languages (DE, ES, PT, ZH, JA): + - Image texts (_i): symbolic imagery section - 320 translations + - Description texts (_d): hexagram meaning - 320 translations + - Judgment texts (_j): oracle guidance - 320 translations + - Total: 960 new translation fields + - Visual enhancements from wall.maegia.tv: + - Canvas CSS filters: saturate(1.3) brightness(1.15) contrast(1.05) + - Hover effect: saturate(1.4) brightness(1.25) contrast(1.08) + - Added grid rendering during coin toss animation (drawGrid function) + - File size: 1.7MB (up from 1.6MB with all translations) + - Deployed to: https://lldh360.maegia.tv/ + +96. 
**HERMÈS·360 Language Switching Fix (2026-03-12)** + - Fixed language switching for all hexagram texts (was only FR/EN, now all 7 languages) + - Updated `getHexD`, `getHexJ`, `getHexI` functions to use dynamic field lookup (`LANG + '_d'`) + - Added 320 hexagram name translations to `HNAMES_I18N` (DE/ES/PT/ZH/JA × 64) + - Removed white background from canvas wrapper (`.cvwrap{background:transparent}`) + - Mutation section now displays localized hexagram names + - All 960 translations (descriptions, judgments, images) now accessible via language selector + +97. **Streamlit Forge Phase 2 - Gitea Integration (2026-03-12)** + - **CLI Commands**: + - `slforge edit ` - Opens Gitea web editor, auto-creates repo if needed + - `slforge pull ` - Pulls latest from Gitea, auto-restarts if running + - `slforge push [-m "msg"]` - Commits and pushes local changes to Gitea + - `slforge preview ` - Generates HTML/SVG preview of running app + - **Gitea API Integration**: + - `gitea_api()` helper function with token auth + - `gitea_ensure_org()` creates streamlit-apps org if missing + - `gitea_create_repo()` initializes git repo and pushes to Gitea + - Reads token from `/etc/config/gitea` UCI config + - **RPCD Methods** (5 new): + - `gitea_status` - Check Gitea availability and version + - `edit` - Get Gitea editor URL for app + - `pull` - Pull changes from Gitea + - `push` - Push changes to Gitea + - `preview` - Generate app preview + - **LuCI Dashboard Updates**: + - Gitea status card (version, online/offline) + - Edit button (purple) opens Gitea editor modal + - Pull button syncs latest changes + - Modal shows direct link to Gitea editor + - **Dependencies**: Git credentials configured via `.git-credentials` + - **ACL**: Updated with new methods for read/write +98. 
**RTTY Remote Control Phase 4 - Session Replay (2026-03-12)** + - **Avatar-Tap Integration**: + - Session capture via mitmproxy WAF (passive, no traffic modification) + - UCI config integration for database path (`/srv/lxc/streamlit/rootfs/srv/avatar-tap/sessions.db`) + - Captures: auth headers, cookies, tokens, session data + - **CLI Commands** (rttyctl): + - `tap-sessions [domain]` - List captured sessions with optional domain filter + - `tap-show ` - Show detailed session info (headers, cookies) + - `tap-replay ` - Replay captured session to remote mesh node + - `tap-export [file]` - Export session as JSON + - `tap-import ` - Import session from JSON file + - `json-tap-sessions` / `json-tap-session` - JSON output for RPCD + - **RPCD Methods** (6 new): + - `get_tap_status` - Avatar-Tap running state, session count, database path + - `get_tap_sessions` - List all captured sessions + - `get_tap_session` - Get single session details + - `replay_to_node` - Replay session to target mesh node + - `export_session` - Export session as base64 JSON + - `import_session` - Import session from base64 JSON + - **LuCI View** (`session-replay.js`): + - Stats cards: total sessions, unique domains, recent activity, tap status + - Sessions table with domain, method, path, captured time, use count + - Filters: domain search, HTTP method dropdown + - Replay panel: node selector, custom IP support, execution preview + - View modal: session details with masked auth data + - Import/Export: JSON file upload/download + - **Menu**: System Hub → Session Replay + - **ACL**: Updated with read (get_tap_*) and write (replay_*, export_, import_) permissions + - **Tested**: 10 captured sessions from photos.gk2, cloud.gk2, api.anthropic.com, chatgpt.com + +99. 
**SecuBox Watchdog - Service Health Monitor (2026-03-12)** + - Created `secubox-app-watchdog` package for service health monitoring and auto-recovery + - Created `luci-app-watchdog` package for LuCI dashboard integration + - **Monitored Components**: + - LXC Containers: haproxy, mitmproxy-in, mitmproxy-out, streamlit + - Host Services: crowdsec, uhttpd, dnsmasq + - HTTPS Endpoints: gk2.secubox.in, admin.gk2.secubox.in, lldh360.maegia.tv + - **CLI Tool** (`watchdogctl`): + - `status` - Show status of all monitored services with color output + - `check` - Single health check without recovery + - `check-recover` - Health check with automatic restart of failed services + - `watch` - Continuous monitoring loop (procd managed) + - `restart-container ` - Manual container restart + - `restart-service ` - Manual service restart + - `logs [N]` - View last N log entries + - `clear-logs` - Clear log file and alert states + - **Features**: + - Alert cooldown to prevent spam (configurable, default 300s) + - Log rotation (configurable max lines) + - Critical service flagging + - Container service start after LXC start (e.g., haproxy inside container) + - **RPCD Methods**: + - `status` - Full status with containers, services, endpoints + - `get_containers` / `get_services` / `get_endpoints` - Individual lists + - `restart_container` / `restart_service` - Remote restart via ubus + - `check` - Trigger health check + - `get_logs` / `clear_logs` - Log management + - **LuCI Dashboard** (`watchdog/status.js`): + - Real-time status with 10s polling + - Containers table with restart buttons + - Services table with restart buttons + - Endpoints table with health indicators + - Alert logs viewer with refresh/clear + - "Run Check Now" button + - **Auto-Recovery**: Cron job runs every minute, procd service runs continuous loop + - **Files**: + - `/etc/config/watchdog` - UCI configuration + - `/usr/sbin/watchdogctl` - CLI tool + - `/etc/init.d/watchdog` - procd service + - 
`/etc/cron.d/watchdog` - Cron backup + - `/usr/libexec/rpcd/luci.watchdog` - RPCD backend diff --git a/.claude/WIP.md b/.claude/WIP.md index bcb21f16..3998453b 100644 --- a/.claude/WIP.md +++ b/.claude/WIP.md @@ -1,6 +1,6 @@ # Work In Progress (Claude) -_Last updated: 2026-03-11 (Meta Cataloger - Virtual Books)_ +_Last updated: 2026-03-12 (SecuBox Watchdog)_ > **Architecture Reference**: SecuBox Fanzine v3 — Les 4 Couches @@ -8,8 +8,50 @@ _Last updated: 2026-03-11 (Meta Cataloger - Virtual Books)_ ## Recently Completed +### 2026-03-12 + +- **SecuBox Watchdog - Service Health Monitor (Complete)** + - New `secubox-app-watchdog` + `luci-app-watchdog` packages + - Monitors: LXC containers (haproxy, mitmproxy-in/out, streamlit), host services (crowdsec, uhttpd, dnsmasq), HTTPS endpoints + - CLI: watchdogctl status/check/check-recover/watch/restart-container/restart-service/logs + - Auto-recovery: detects stopped containers/services and restarts them + - RPCD: status, get_containers, get_services, get_endpoints, restart_*, check, get_logs + - LuCI Dashboard: Real-time status with 10s polling, restart buttons, log viewer + - Alert cooldown and log rotation + - Procd service + cron fallback + - Fixed HAProxy missing backends (luci_direct, fallback) and port mismatch + +- **RTTY Remote Control Phase 4 - Session Replay (Complete)** + - Avatar-Tap integration: passive session capture via mitmproxy WAF + - CLI: tap-sessions, tap-show, tap-replay, tap-export, tap-import + - RPCD: 6 new methods (get_tap_status, get_tap_sessions, get_tap_session, replay_to_node, export_session, import_session) + - LuCI: session-replay.js view with stats, filters, replay panel, import/export + - Menu: System Hub → Session Replay + - Tested: 10 captured sessions from multiple domains + ### 2026-03-11 +- **Streamlit Forge Phase 2 - Gitea Integration (Complete)** + - CLI: `slforge edit/pull/push/preview` commands + - Gitea API integration with token auth + - Auto-creates org/repo on first edit + - 
RPCD: 5 new methods (gitea_status, edit, pull, push, preview) + - LuCI: Gitea status card, Edit/Pull buttons, editor modal + - Preview generation: HTML capture + SVG placeholder + +- **HERMÈS·360 Full I-Ching Translation** + - All 64 hexagrams translated in 5 languages (DE, ES, PT, ZH, JA): + - Image texts (_i): 320 translations - symbolic imagery + - Description texts (_d): 320 translations - hexagram meaning + - Judgment texts (_j): 320 translations - oracle guidance + - Total: 960 new translation fields added + - Visual enhancements from wall.maegia.tv: + - Canvas CSS filters: saturate(1.3) brightness(1.15) contrast(1.05) + - Hover effect: saturate(1.4) brightness(1.25) contrast(1.08) + - Added grid rendering during coin toss animation (drawGrid function) + - File size: 1.7MB (up from 1.6MB) + - Deployed to: https://lldh360.maegia.tv/ + - **Meta Cataloger Phase 2 & 3 (Complete)** - **Phase 2: RPCD + LuCI** - RPCD backend: `luci.metacatalog` with 10 methods (list_entries, list_books, get_entry, get_book, search, get_stats, sync, scan, assign, unassign) @@ -438,14 +480,7 @@ _Last updated: 2026-03-11 (Meta Cataloger - Virtual Books)_ ## In Progress -- **Meta Cataloger Phase 3** - Enhanced landing page with search, entry management UI - -- **Streamlit Forge Phase 2** - Preview generation, Gitea push/pull - -- **RTTY Remote Control Module (Phase 4 - Session Replay)** - - Avatar-tap integration for session capture - - Replay captured sessions to target nodes - - Session export/import functionality +(No active tasks) --- diff --git a/package/secubox/luci-app-watchdog/Makefile b/package/secubox/luci-app-watchdog/Makefile new file mode 100644 index 00000000..26caba6c --- /dev/null +++ b/package/secubox/luci-app-watchdog/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: MIT +# LuCI App Watchdog - Web UI for SecuBox Service Watchdog +# Copyright (C) 2025-2026 CyberMind.fr + +include $(TOPDIR)/rules.mk + +PKG_NAME:=luci-app-watchdog +PKG_VERSION:=1.0.0 +PKG_RELEASE:=1 + 
+PKG_MAINTAINER:=CyberMind +PKG_LICENSE:=MIT + +LUCI_TITLE:=LuCI Watchdog Dashboard +LUCI_DEPENDS:=+secubox-app-watchdog +luci-base + +include $(TOPDIR)/feeds/luci/luci.mk + +define Package/luci-app-watchdog/install + $(INSTALL_DIR) $(1)/www/luci-static/resources/view/watchdog + $(INSTALL_DATA) ./htdocs/luci-static/resources/view/watchdog/*.js $(1)/www/luci-static/resources/view/watchdog/ + + $(INSTALL_DIR) $(1)/usr/share/luci/menu.d + $(INSTALL_DATA) ./root/usr/share/luci/menu.d/*.json $(1)/usr/share/luci/menu.d/ + + $(INSTALL_DIR) $(1)/usr/share/rpcd/acl.d + $(INSTALL_DATA) ./root/usr/share/rpcd/acl.d/*.json $(1)/usr/share/rpcd/acl.d/ + + $(INSTALL_DIR) $(1)/usr/libexec/rpcd + $(INSTALL_BIN) ./root/usr/libexec/rpcd/luci.watchdog $(1)/usr/libexec/rpcd/luci.watchdog +endef + +$(eval $(call BuildPackage,luci-app-watchdog)) diff --git a/package/secubox/luci-app-watchdog/htdocs/luci-static/resources/view/watchdog/status.js b/package/secubox/luci-app-watchdog/htdocs/luci-static/resources/view/watchdog/status.js new file mode 100644 index 00000000..dca85b56 --- /dev/null +++ b/package/secubox/luci-app-watchdog/htdocs/luci-static/resources/view/watchdog/status.js @@ -0,0 +1,380 @@ +'use strict'; +'require view'; +'require rpc'; +'require ui'; +'require poll'; +'require dom'; + +var callGetStatus = rpc.declare({ + object: 'luci.watchdog', + method: 'status', + expect: {} +}); + +var callGetLogs = rpc.declare({ + object: 'luci.watchdog', + method: 'get_logs', + params: ['lines'], + expect: {} +}); + +var callRestartContainer = rpc.declare({ + object: 'luci.watchdog', + method: 'restart_container', + params: ['name'], + expect: {} +}); + +var callRestartService = rpc.declare({ + object: 'luci.watchdog', + method: 'restart_service', + params: ['name'], + expect: {} +}); + +var callCheck = rpc.declare({ + object: 'luci.watchdog', + method: 'check', + expect: {} +}); + +var callClearLogs = rpc.declare({ + object: 'luci.watchdog', + method: 'clear_logs', + expect: {} +}); + 
+function renderStatusBadge(state, critical) { + var color = state === 'running' ? '#00ff88' : (critical ? '#ff0066' : '#ffaa00'); + var text = state === 'running' ? 'RUNNING' : 'STOPPED'; + return E('span', { + 'style': 'background: ' + color + '; color: #000; padding: 2px 8px; border-radius: 4px; font-weight: bold; font-size: 11px;' + }, text); +} + +function renderHealthBadge(healthy) { + var color = healthy ? '#00ff88' : '#ff0066'; + var text = healthy ? 'HEALTHY' : 'UNHEALTHY'; + return E('span', { + 'style': 'background: ' + color + '; color: #000; padding: 2px 8px; border-radius: 4px; font-weight: bold; font-size: 11px;' + }, text); +} + +function renderCriticalBadge(critical) { + if (!critical) return ''; + return E('span', { + 'style': 'background: #ff0066; color: #fff; padding: 2px 6px; border-radius: 4px; font-size: 10px; margin-left: 8px;' + }, 'CRITICAL'); +} + +return view.extend({ + load: function() { + return Promise.all([ + callGetStatus(), + callGetLogs(30) + ]); + }, + + pollStatus: function() { + var self = this; + poll.add(function() { + return callGetStatus().then(function(status) { + self.updateDashboard(status); + }); + }, 10); + }, + + updateDashboard: function(status) { + // Update watchdog status + var watchdogStatus = document.getElementById('watchdog-status'); + if (watchdogStatus) { + var running = status.running; + watchdogStatus.innerHTML = ''; + watchdogStatus.appendChild(E('span', { + 'style': 'color: ' + (running ? '#00ff88' : '#ff0066') + '; font-weight: bold;' + }, running ? 'ACTIVE' : 'INACTIVE')); + } + + // Update containers + var containersTable = document.getElementById('containers-body'); + if (containersTable && status.containers) { + containersTable.innerHTML = ''; + status.containers.forEach(function(c) { + var row = E('tr', {}, [ + E('td', {}, c.name), + E('td', {}, [renderStatusBadge(c.state, c.critical), renderCriticalBadge(c.critical)]), + E('td', {}, c.pid > 0 ? 
String(c.pid) : '-'), + E('td', {}, [ + E('button', { + 'class': 'cbi-button cbi-button-action', + 'click': ui.createHandlerFn(this, 'handleRestartContainer', c.name), + 'style': 'padding: 2px 8px; font-size: 11px;' + }, 'Restart') + ]) + ]); + containersTable.appendChild(row); + }); + } + + // Update services + var servicesTable = document.getElementById('services-body'); + if (servicesTable && status.services) { + servicesTable.innerHTML = ''; + status.services.forEach(function(s) { + var row = E('tr', {}, [ + E('td', {}, s.name), + E('td', {}, s.process), + E('td', {}, [renderStatusBadge(s.state, s.critical), renderCriticalBadge(s.critical)]), + E('td', {}, s.pid > 0 ? String(s.pid) : '-'), + E('td', {}, [ + E('button', { + 'class': 'cbi-button cbi-button-action', + 'click': ui.createHandlerFn(this, 'handleRestartService', s.name), + 'style': 'padding: 2px 8px; font-size: 11px;' + }, 'Restart') + ]) + ]); + servicesTable.appendChild(row); + }); + } + + // Update endpoints + var endpointsTable = document.getElementById('endpoints-body'); + if (endpointsTable && status.endpoints) { + endpointsTable.innerHTML = ''; + status.endpoints.forEach(function(e) { + var row = E('tr', {}, [ + E('td', {}, e.name), + E('td', {}, e.host), + E('td', {}, 'HTTP ' + e.code), + E('td', {}, renderHealthBadge(e.healthy)) + ]); + endpointsTable.appendChild(row); + }); + } + }, + + handleRestartContainer: function(name) { + var self = this; + ui.showModal('Restarting Container', [ + E('p', { 'class': 'spinning' }, 'Restarting ' + name + '...') + ]); + + return callRestartContainer(name).then(function(result) { + ui.hideModal(); + if (result.success) { + ui.addNotification(null, E('p', {}, 'Container ' + name + ' restarted successfully'), 'success'); + } else { + ui.addNotification(null, E('p', {}, 'Failed to restart ' + name + ': ' + (result.error || 'Unknown error')), 'error'); + } + return callGetStatus().then(function(status) { + self.updateDashboard(status); + }); + }); + }, + + 
handleRestartService: function(name) { + var self = this; + ui.showModal('Restarting Service', [ + E('p', { 'class': 'spinning' }, 'Restarting ' + name + '...') + ]); + + return callRestartService(name).then(function(result) { + ui.hideModal(); + if (result.success) { + ui.addNotification(null, E('p', {}, 'Service ' + name + ' restarted successfully'), 'success'); + } else { + ui.addNotification(null, E('p', {}, 'Failed to restart ' + name + ': ' + (result.error || 'Unknown error')), 'error'); + } + return callGetStatus().then(function(status) { + self.updateDashboard(status); + }); + }); + }, + + handleRunCheck: function() { + var self = this; + ui.showModal('Running Health Check', [ + E('p', { 'class': 'spinning' }, 'Running health check with auto-recovery...') + ]); + + return callCheck().then(function(result) { + ui.hideModal(); + ui.addNotification(null, E('p', {}, 'Health check completed'), 'success'); + return callGetStatus().then(function(status) { + self.updateDashboard(status); + }); + }); + }, + + handleClearLogs: function() { + return callClearLogs().then(function() { + ui.addNotification(null, E('p', {}, 'Logs cleared'), 'success'); + var logsArea = document.getElementById('logs-area'); + if (logsArea) { + logsArea.value = ''; + } + }); + }, + + handleRefreshLogs: function() { + return callGetLogs(50).then(function(result) { + var logsArea = document.getElementById('logs-area'); + if (logsArea && result.lines) { + logsArea.value = result.lines.join('\n'); + logsArea.scrollTop = logsArea.scrollHeight; + } + }); + }, + + render: function(data) { + var status = data[0] || {}; + var logs = data[1] || {}; + var self = this; + + var view = E('div', { 'class': 'cbi-map' }, [ + E('h2', {}, 'SecuBox Watchdog'), + E('div', { 'class': 'cbi-map-descr' }, 'Service health monitoring and auto-recovery dashboard'), + + // Status overview + E('div', { 'class': 'cbi-section', 'style': 'background: linear-gradient(135deg, #1a1a2e 0%, #0f0f1a 100%); border: 1px solid 
#333; border-radius: 8px; padding: 16px; margin-bottom: 20px;' }, [ + E('div', { 'style': 'display: flex; justify-content: space-between; align-items: center;' }, [ + E('div', {}, [ + E('span', { 'style': 'color: #888;' }, 'Watchdog Status: '), + E('span', { 'id': 'watchdog-status', 'style': 'color: ' + (status.running ? '#00ff88' : '#ff0066') + '; font-weight: bold;' }, + status.running ? 'ACTIVE' : 'INACTIVE'), + E('span', { 'style': 'color: #888; margin-left: 20px;' }, 'Check Interval: '), + E('span', { 'style': 'color: #00ffff;' }, (status.interval || 60) + 's') + ]), + E('div', {}, [ + E('button', { + 'class': 'cbi-button cbi-button-action', + 'click': ui.createHandlerFn(this, 'handleRunCheck') + }, 'Run Check Now') + ]) + ]) + ]), + + // Containers section + E('div', { 'class': 'cbi-section' }, [ + E('h3', {}, 'LXC Containers'), + E('table', { 'class': 'table cbi-section-table' }, [ + E('thead', {}, [ + E('tr', { 'class': 'tr table-titles' }, [ + E('th', { 'class': 'th' }, 'Container'), + E('th', { 'class': 'th' }, 'Status'), + E('th', { 'class': 'th' }, 'PID'), + E('th', { 'class': 'th' }, 'Actions') + ]) + ]), + E('tbody', { 'id': 'containers-body' }, + (status.containers || []).map(function(c) { + return E('tr', { 'class': 'tr' }, [ + E('td', { 'class': 'td' }, c.name), + E('td', { 'class': 'td' }, [renderStatusBadge(c.state, c.critical), renderCriticalBadge(c.critical)]), + E('td', { 'class': 'td' }, c.pid > 0 ? 
String(c.pid) : '-'), + E('td', { 'class': 'td' }, [ + E('button', { + 'class': 'cbi-button cbi-button-action', + 'click': ui.createHandlerFn(self, 'handleRestartContainer', c.name), + 'style': 'padding: 2px 8px; font-size: 11px;' + }, 'Restart') + ]) + ]); + }) + ) + ]) + ]), + + // Services section + E('div', { 'class': 'cbi-section' }, [ + E('h3', {}, 'Host Services'), + E('table', { 'class': 'table cbi-section-table' }, [ + E('thead', {}, [ + E('tr', { 'class': 'tr table-titles' }, [ + E('th', { 'class': 'th' }, 'Service'), + E('th', { 'class': 'th' }, 'Process'), + E('th', { 'class': 'th' }, 'Status'), + E('th', { 'class': 'th' }, 'PID'), + E('th', { 'class': 'th' }, 'Actions') + ]) + ]), + E('tbody', { 'id': 'services-body' }, + (status.services || []).map(function(s) { + return E('tr', { 'class': 'tr' }, [ + E('td', { 'class': 'td' }, s.name), + E('td', { 'class': 'td' }, s.process), + E('td', { 'class': 'td' }, [renderStatusBadge(s.state, s.critical), renderCriticalBadge(s.critical)]), + E('td', { 'class': 'td' }, s.pid > 0 ? 
String(s.pid) : '-'), + E('td', { 'class': 'td' }, [ + E('button', { + 'class': 'cbi-button cbi-button-action', + 'click': ui.createHandlerFn(self, 'handleRestartService', s.name), + 'style': 'padding: 2px 8px; font-size: 11px;' + }, 'Restart') + ]) + ]); + }) + ) + ]) + ]), + + // Endpoints section + E('div', { 'class': 'cbi-section' }, [ + E('h3', {}, 'HTTPS Endpoints'), + E('table', { 'class': 'table cbi-section-table' }, [ + E('thead', {}, [ + E('tr', { 'class': 'tr table-titles' }, [ + E('th', { 'class': 'th' }, 'Name'), + E('th', { 'class': 'th' }, 'Host'), + E('th', { 'class': 'th' }, 'Response'), + E('th', { 'class': 'th' }, 'Health') + ]) + ]), + E('tbody', { 'id': 'endpoints-body' }, + (status.endpoints || []).map(function(e) { + return E('tr', { 'class': 'tr' }, [ + E('td', { 'class': 'td' }, e.name), + E('td', { 'class': 'td' }, e.host), + E('td', { 'class': 'td' }, 'HTTP ' + e.code), + E('td', { 'class': 'td' }, renderHealthBadge(e.healthy)) + ]); + }) + ) + ]) + ]), + + // Logs section + E('div', { 'class': 'cbi-section' }, [ + E('div', { 'style': 'display: flex; justify-content: space-between; align-items: center;' }, [ + E('h3', {}, 'Alert Logs'), + E('div', {}, [ + E('button', { + 'class': 'cbi-button', + 'click': ui.createHandlerFn(this, 'handleRefreshLogs'), + 'style': 'margin-right: 8px;' + }, 'Refresh'), + E('button', { + 'class': 'cbi-button cbi-button-negative', + 'click': ui.createHandlerFn(this, 'handleClearLogs') + }, 'Clear') + ]) + ]), + E('textarea', { + 'id': 'logs-area', + 'readonly': 'readonly', + 'style': 'width: 100%; height: 200px; background: #0f0f1a; color: #00ff88; font-family: monospace; font-size: 12px; border: 1px solid #333; border-radius: 4px; padding: 8px;' + }, (logs.lines || []).join('\n')) + ]) + ]); + + // Start polling + this.pollStatus(); + + return view; + }, + + handleSaveApply: null, + handleSave: null, + handleReset: null +}); diff --git a/package/secubox/luci-app-watchdog/root/usr/libexec/rpcd/luci.watchdog 
b/package/secubox/luci-app-watchdog/root/usr/libexec/rpcd/luci.watchdog new file mode 100644 index 00000000..dd6df3a7 --- /dev/null +++ b/package/secubox/luci-app-watchdog/root/usr/libexec/rpcd/luci.watchdog @@ -0,0 +1,522 @@ +#!/bin/sh +# RPCD backend for SecuBox Watchdog +# Provides LuCI integration for watchdog status and control + +. /lib/functions.sh +. /usr/share/libubox/jshn.sh + +CONFIG_NAME="watchdog" +LOG_FILE="/var/log/watchdog.log" +ALERT_STATE_DIR="/tmp/watchdog" + +# Get container status +get_container_status() { + local name="$1" + local state=$(lxc-info -n "$name" 2>/dev/null | grep "State:" | awk '{print $2}') + local pid=$(lxc-info -n "$name" 2>/dev/null | grep "PID:" | awk '{print $2}') + + if [ "$state" = "RUNNING" ]; then + echo "running:${pid:-0}" + else + echo "stopped:0" + fi +} + +# Get service status +get_service_status() { + local process="$1" + local pid=$(pgrep "$process" 2>/dev/null | head -1) + + if [ -n "$pid" ]; then + echo "running:$pid" + else + echo "stopped:0" + fi +} + +# Get endpoint status +get_endpoint_http_code() { + local host="$1" + local code=$(curl -sk -o /dev/null -w "%{http_code}" -H "Host: $host" --connect-timeout 5 https://127.0.0.1/ 2>/dev/null) + echo "${code:-0}" +} + +# Method: list +method_list() { + json_init + json_add_object "status" + json_close_object + json_add_object "get_containers" + json_close_object + json_add_object "get_services" + json_close_object + json_add_object "get_endpoints" + json_close_object + json_add_object "get_logs" + json_add_int "lines" 50 + json_close_object + json_add_object "restart_container" + json_add_string "name" "string" + json_close_object + json_add_object "restart_service" + json_add_string "name" "string" + json_close_object + json_add_object "check" + json_close_object + json_add_object "clear_logs" + json_close_object + json_add_object "get_config" + json_close_object + json_dump +} + +# Method: status - Full status overview +method_status() { + config_load 
"$CONFIG_NAME" + + local enabled interval + config_get enabled main enabled '0' + config_get interval main interval '60' + + # Check if watchdog process is running + local running=0 + pgrep -f "watchdogctl watch" >/dev/null && running=1 + + json_init + json_add_boolean "enabled" "$enabled" + json_add_boolean "running" "$running" + json_add_int "interval" "$interval" + + # Containers + json_add_array "containers" + local add_container + add_container() { + local section="$1" + local c_enabled c_name c_critical + + config_get c_enabled "$section" enabled '0' + [ "$c_enabled" = "1" ] || return 0 + + config_get c_name "$section" name + config_get c_critical "$section" critical '0' + + local result=$(get_container_status "$c_name") + local state=$(echo "$result" | cut -d: -f1) + local pid=$(echo "$result" | cut -d: -f2) + + json_add_object "" + json_add_string "name" "$c_name" + json_add_string "state" "$state" + json_add_int "pid" "$pid" + json_add_boolean "critical" "$c_critical" + json_close_object + } + config_foreach add_container container + json_close_array + + # Services + json_add_array "services" + local add_service + add_service() { + local section="$1" + local s_enabled s_name s_process s_critical + + config_get s_enabled "$section" enabled '0' + [ "$s_enabled" = "1" ] || return 0 + + config_get s_name "$section" name + config_get s_process "$section" process + config_get s_critical "$section" critical '0' + + local result=$(get_service_status "$s_process") + local state=$(echo "$result" | cut -d: -f1) + local pid=$(echo "$result" | cut -d: -f2) + + json_add_object "" + json_add_string "name" "$s_name" + json_add_string "process" "$s_process" + json_add_string "state" "$state" + json_add_int "pid" "$pid" + json_add_boolean "critical" "$s_critical" + json_close_object + } + config_foreach add_service service + json_close_array + + # Endpoints + json_add_array "endpoints" + local add_endpoint + add_endpoint() { + local section="$1" + local e_enabled e_name 
e_host e_expected + + config_get e_enabled "$section" enabled '0' + [ "$e_enabled" = "1" ] || return 0 + + config_get e_name "$section" name + config_get e_host "$section" host + config_get e_expected "$section" expected_codes '200' + + local code=$(get_endpoint_http_code "$e_host") + + local healthy=0 + for exp in $e_expected; do + [ "$code" = "$exp" ] && healthy=1 && break + done + + json_add_object "" + json_add_string "name" "$e_name" + json_add_string "host" "$e_host" + json_add_int "code" "$code" + json_add_boolean "healthy" "$healthy" + json_close_object + } + config_foreach add_endpoint endpoint + json_close_array + + json_dump +} + +# Method: get_containers +method_get_containers() { + config_load "$CONFIG_NAME" + + json_init + json_add_array "containers" + + local add_container + add_container() { + local section="$1" + local c_enabled c_name c_critical c_start_service c_service_name + + config_get c_enabled "$section" enabled '0' + config_get c_name "$section" name + config_get c_critical "$section" critical '0' + config_get c_start_service "$section" start_service '0' + config_get c_service_name "$section" service_name '' + + local result=$(get_container_status "$c_name") + local state=$(echo "$result" | cut -d: -f1) + local pid=$(echo "$result" | cut -d: -f2) + + json_add_object "" + json_add_string "id" "$section" + json_add_string "name" "$c_name" + json_add_string "state" "$state" + json_add_int "pid" "$pid" + json_add_boolean "enabled" "$c_enabled" + json_add_boolean "critical" "$c_critical" + json_add_boolean "start_service" "$c_start_service" + json_add_string "service_name" "$c_service_name" + json_close_object + } + config_foreach add_container container + json_close_array + + json_dump +} + +# Method: get_services +method_get_services() { + config_load "$CONFIG_NAME" + + json_init + json_add_array "services" + + local add_service + add_service() { + local section="$1" + local s_enabled s_name s_process s_critical s_init_script + + config_get 
s_enabled "$section" enabled '0' + config_get s_name "$section" name + config_get s_process "$section" process + config_get s_critical "$section" critical '0' + config_get s_init_script "$section" init_script '' + + local result=$(get_service_status "$s_process") + local state=$(echo "$result" | cut -d: -f1) + local pid=$(echo "$result" | cut -d: -f2) + + json_add_object "" + json_add_string "id" "$section" + json_add_string "name" "$s_name" + json_add_string "process" "$s_process" + json_add_string "state" "$state" + json_add_int "pid" "$pid" + json_add_boolean "enabled" "$s_enabled" + json_add_boolean "critical" "$s_critical" + json_add_string "init_script" "$s_init_script" + json_close_object + } + config_foreach add_service service + json_close_array + + json_dump +} + +# Method: get_endpoints +method_get_endpoints() { + config_load "$CONFIG_NAME" + + json_init + json_add_array "endpoints" + + local add_endpoint + add_endpoint() { + local section="$1" + local e_enabled e_name e_host e_expected e_critical + + config_get e_enabled "$section" enabled '0' + config_get e_name "$section" name + config_get e_host "$section" host + config_get e_expected "$section" expected_codes '200' + config_get e_critical "$section" critical '0' + + local code=$(get_endpoint_http_code "$e_host") + + local healthy=0 + for exp in $e_expected; do + [ "$code" = "$exp" ] && healthy=1 && break + done + + json_add_object "" + json_add_string "id" "$section" + json_add_string "name" "$e_name" + json_add_string "host" "$e_host" + json_add_int "code" "$code" + json_add_boolean "enabled" "$e_enabled" + json_add_boolean "healthy" "$healthy" + json_add_boolean "critical" "$e_critical" + json_add_string "expected_codes" "$e_expected" + json_close_object + } + config_foreach add_endpoint endpoint + json_close_array + + json_dump +} + +# Method: get_logs +method_get_logs() { + local lines="${1:-50}" + + json_init + + if [ -f "$LOG_FILE" ]; then + local log_content=$(tail -n "$lines" "$LOG_FILE" 
2>/dev/null | sed 's/"/\\"/g' | tr '\n' '\n') + json_add_array "lines" + tail -n "$lines" "$LOG_FILE" 2>/dev/null | while IFS= read -r line; do + json_add_string "" "$line" + done + json_close_array + json_add_int "total" "$(wc -l < "$LOG_FILE" 2>/dev/null || echo 0)" + else + json_add_array "lines" + json_close_array + json_add_int "total" 0 + fi + + json_dump +} + +# Method: restart_container +method_restart_container() { + local name="$1" + + json_init + + if [ -z "$name" ]; then + json_add_boolean "success" 0 + json_add_string "error" "Container name required" + json_dump + return + fi + + # Stop container + lxc-stop -n "$name" 2>/dev/null + sleep 1 + + # Start container + lxc-start -n "$name" 2>/dev/null + sleep 2 + + # Check for service start + config_load "$CONFIG_NAME" + local start_service + start_service() { + local section="$1" + local c_name service_name start_svc + config_get c_name "$section" name + [ "$c_name" = "$name" ] || return 0 + config_get start_svc "$section" start_service '0' + config_get service_name "$section" service_name '' + + if [ "$start_svc" = "1" ] && [ -n "$service_name" ]; then + sleep 2 + lxc-attach -n "$name" -- /etc/init.d/"$service_name" start 2>/dev/null + fi + } + config_foreach start_service container + + local state=$(lxc-info -n "$name" 2>/dev/null | grep "State:" | awk '{print $2}') + + if [ "$state" = "RUNNING" ]; then + json_add_boolean "success" 1 + json_add_string "state" "running" + else + json_add_boolean "success" 0 + json_add_string "error" "Container failed to start" + json_add_string "state" "$state" + fi + + json_dump +} + +# Method: restart_service +method_restart_service() { + local name="$1" + + json_init + + if [ -z "$name" ]; then + json_add_boolean "success" 0 + json_add_string "error" "Service name required" + json_dump + return + fi + + config_load "$CONFIG_NAME" + + local found=0 + local do_restart + do_restart() { + local section="$1" + local s_name init_script process + config_get s_name "$section" 
name + [ "$s_name" = "$name" ] || return 0 + found=1 + + config_get init_script "$section" init_script + config_get process "$section" process + + if [ -x "$init_script" ]; then + "$init_script" restart 2>/dev/null + sleep 2 + + if pgrep "$process" >/dev/null 2>&1; then + json_add_boolean "success" 1 + json_add_string "state" "running" + else + json_add_boolean "success" 0 + json_add_string "error" "Service failed to start" + fi + else + json_add_boolean "success" 0 + json_add_string "error" "Init script not found" + fi + } + config_foreach do_restart service + + if [ "$found" = "0" ]; then + json_add_boolean "success" 0 + json_add_string "error" "Service not found in configuration" + fi + + json_dump +} + +# Method: check - Run single health check +method_check() { + /usr/sbin/watchdogctl check-recover >/dev/null 2>&1 + + json_init + json_add_boolean "success" 1 + json_add_string "message" "Health check completed" + json_dump +} + +# Method: clear_logs +method_clear_logs() { + > "$LOG_FILE" 2>/dev/null + rm -f "$ALERT_STATE_DIR"/*.alert 2>/dev/null + + json_init + json_add_boolean "success" 1 + json_dump +} + +# Method: get_config +method_get_config() { + config_load "$CONFIG_NAME" + + local enabled interval alert_cooldown max_log_lines + + config_get enabled main enabled '0' + config_get interval main interval '60' + config_get alert_cooldown main alert_cooldown '300' + config_get max_log_lines main max_log_lines '1000' + + json_init + json_add_boolean "enabled" "$enabled" + json_add_int "interval" "$interval" + json_add_int "alert_cooldown" "$alert_cooldown" + json_add_int "max_log_lines" "$max_log_lines" + json_dump +} + +# Main dispatcher +case "$1" in + list) + method_list + ;; + call) + case "$2" in + status) + method_status + ;; + get_containers) + method_get_containers + ;; + get_services) + method_get_services + ;; + get_endpoints) + method_get_endpoints + ;; + get_logs) + read -r input + json_load "$input" + json_get_var lines lines 50 + method_get_logs 
"$lines" + ;; + restart_container) + read -r input + json_load "$input" + json_get_var name name + method_restart_container "$name" + ;; + restart_service) + read -r input + json_load "$input" + json_get_var name name + method_restart_service "$name" + ;; + check) + method_check + ;; + clear_logs) + method_clear_logs + ;; + get_config) + method_get_config + ;; + *) + echo '{"error":"Unknown method"}' + ;; + esac + ;; + *) + echo '{"error":"Unknown command"}' + ;; +esac diff --git a/package/secubox/luci-app-watchdog/root/usr/share/luci/menu.d/luci-app-watchdog.json b/package/secubox/luci-app-watchdog/root/usr/share/luci/menu.d/luci-app-watchdog.json new file mode 100644 index 00000000..0fa68ed0 --- /dev/null +++ b/package/secubox/luci-app-watchdog/root/usr/share/luci/menu.d/luci-app-watchdog.json @@ -0,0 +1,13 @@ +{ + "admin/secubox/system/watchdog": { + "title": "Watchdog", + "order": 5, + "action": { + "type": "view", + "path": "watchdog/status" + }, + "depends": { + "acl": ["luci-app-watchdog"] + } + } +} diff --git a/package/secubox/luci-app-watchdog/root/usr/share/rpcd/acl.d/luci-app-watchdog.json b/package/secubox/luci-app-watchdog/root/usr/share/rpcd/acl.d/luci-app-watchdog.json new file mode 100644 index 00000000..30fe0251 --- /dev/null +++ b/package/secubox/luci-app-watchdog/root/usr/share/rpcd/acl.d/luci-app-watchdog.json @@ -0,0 +1,29 @@ +{ + "luci-app-watchdog": { + "description": "Grant access to SecuBox Watchdog", + "read": { + "ubus": { + "luci.watchdog": [ + "status", + "get_containers", + "get_services", + "get_endpoints", + "get_logs", + "get_config" + ] + }, + "uci": ["watchdog"] + }, + "write": { + "ubus": { + "luci.watchdog": [ + "restart_container", + "restart_service", + "check", + "clear_logs" + ] + }, + "uci": ["watchdog"] + } + } +} diff --git a/package/secubox/secubox-app-watchdog/Makefile b/package/secubox/secubox-app-watchdog/Makefile new file mode 100644 index 00000000..96178fc6 --- /dev/null +++ 
b/package/secubox/secubox-app-watchdog/Makefile @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: MIT +# SecuBox Watchdog - Service Health Monitor & Auto-Recovery +# Copyright (C) 2025-2026 CyberMind.fr + +include $(TOPDIR)/rules.mk + +PKG_NAME:=secubox-app-watchdog +PKG_VERSION:=1.0.0 +PKG_RELEASE:=1 + +PKG_MAINTAINER:=CyberMind +PKG_LICENSE:=MIT + +include $(INCLUDE_DIR)/package.mk + +define Package/secubox-app-watchdog + SECTION:=secubox + CATEGORY:=SecuBox + SUBMENU:=System + TITLE:=SecuBox Service Watchdog + DEPENDS:=+lxc +jsonfilter +curl + PKGARCH:=all +endef + +define Package/secubox-app-watchdog/description + Service health monitoring and auto-recovery watchdog. + Features: + - Monitor LXC containers (haproxy, mitmproxy, streamlit) + - Monitor core services (crowdsec, uhttpd, dnsmasq) + - HTTPS endpoint health checks + - Auto-restart failed services + - Alert logging with timestamps + - LuCI dashboard integration +endef + +define Package/secubox-app-watchdog/conffiles +/etc/config/watchdog +endef + +define Build/Compile +endef + +define Package/secubox-app-watchdog/install + $(INSTALL_DIR) $(1)/etc/config + $(INSTALL_CONF) ./files/etc/config/watchdog $(1)/etc/config/watchdog + + $(INSTALL_DIR) $(1)/etc/init.d + $(INSTALL_BIN) ./files/etc/init.d/watchdog $(1)/etc/init.d/watchdog + + $(INSTALL_DIR) $(1)/usr/sbin + $(INSTALL_BIN) ./files/usr/sbin/watchdogctl $(1)/usr/sbin/watchdogctl + + $(INSTALL_DIR) $(1)/etc/cron.d + $(INSTALL_DATA) ./files/etc/cron.d/watchdog $(1)/etc/cron.d/watchdog +endef + +define Package/secubox-app-watchdog/postinst +#!/bin/sh +[ -n "$${IPKG_INSTROOT}" ] && exit 0 +/etc/init.d/watchdog enable 2>/dev/null || true +exit 0 +endef + +$(eval $(call BuildPackage,secubox-app-watchdog)) diff --git a/package/secubox/secubox-app-watchdog/files/etc/config/watchdog b/package/secubox/secubox-app-watchdog/files/etc/config/watchdog new file mode 100644 index 00000000..2070afab --- /dev/null +++ 
b/package/secubox/secubox-app-watchdog/files/etc/config/watchdog @@ -0,0 +1,77 @@ +config watchdog 'main' + option enabled '1' + option interval '60' + option alert_command '' + option log_file '/var/log/watchdog.log' + option max_log_lines '1000' + option alert_cooldown '300' + +# LXC Containers to monitor +config container 'haproxy' + option enabled '1' + option name 'haproxy' + option critical '1' + option start_service '1' + option service_name 'haproxy' + +config container 'mitmproxy_in' + option enabled '1' + option name 'mitmproxy-in' + option critical '1' + option start_service '0' + +config container 'mitmproxy_out' + option enabled '1' + option name 'mitmproxy-out' + option critical '0' + option start_service '0' + +config container 'streamlit' + option enabled '1' + option name 'streamlit' + option critical '0' + option start_service '0' + +# Host services to monitor +config service 'crowdsec' + option enabled '1' + option name 'crowdsec' + option process 'crowdsec' + option critical '1' + option init_script '/etc/init.d/crowdsec' + +config service 'uhttpd' + option enabled '1' + option name 'uhttpd' + option process 'uhttpd' + option critical '1' + option init_script '/etc/init.d/uhttpd' + +config service 'dnsmasq' + option enabled '1' + option name 'dnsmasq' + option process 'dnsmasq' + option critical '1' + option init_script '/etc/init.d/dnsmasq' + +# HTTPS endpoints to check +config endpoint 'gk2' + option enabled '1' + option name 'GK2 Hub' + option host 'gk2.secubox.in' + option expected_codes '200 301 302' + option critical '0' + +config endpoint 'admin' + option enabled '1' + option name 'LuCI Admin' + option host 'admin.gk2.secubox.in' + option expected_codes '200 301 302' + option critical '0' + +config endpoint 'lldh360' + option enabled '1' + option name 'HERMES 360' + option host 'lldh360.maegia.tv' + option expected_codes '200 301 302' + option critical '0' diff --git a/package/secubox/secubox-app-watchdog/files/etc/cron.d/watchdog 
b/package/secubox/secubox-app-watchdog/files/etc/cron.d/watchdog new file mode 100644 index 00000000..428ea31e --- /dev/null +++ b/package/secubox/secubox-app-watchdog/files/etc/cron.d/watchdog @@ -0,0 +1,3 @@ +# SecuBox Watchdog - Health check cron job +# Runs every minute to check and auto-recover services +* * * * * root /usr/sbin/watchdogctl check-recover >/dev/null 2>&1 diff --git a/package/secubox/secubox-app-watchdog/files/etc/init.d/watchdog b/package/secubox/secubox-app-watchdog/files/etc/init.d/watchdog new file mode 100644 index 00000000..e69fe6ff --- /dev/null +++ b/package/secubox/secubox-app-watchdog/files/etc/init.d/watchdog @@ -0,0 +1,42 @@ +#!/bin/sh /etc/rc.common + +START=99 +STOP=10 +USE_PROCD=1 + +NAME="watchdog" +PROG="/usr/sbin/watchdogctl" + +start_service() { + local enabled + config_load watchdog + config_get enabled main enabled '0' + + [ "$enabled" = "1" ] || { + echo "Watchdog is disabled. Enable with: uci set watchdog.main.enabled=1" + return 0 + } + + procd_open_instance + procd_set_param command "$PROG" watch + procd_set_param respawn 3600 5 5 + procd_set_param stdout 1 + procd_set_param stderr 1 + procd_set_param pidfile /var/run/watchdog.pid + procd_close_instance + + echo "Watchdog started" +} + +stop_service() { + echo "Watchdog stopped" +} + +reload_service() { + stop + start +} + +service_triggers() { + procd_add_reload_trigger "watchdog" +} diff --git a/package/secubox/secubox-app-watchdog/files/usr/sbin/watchdogctl b/package/secubox/secubox-app-watchdog/files/usr/sbin/watchdogctl new file mode 100644 index 00000000..a9538c61 --- /dev/null +++ b/package/secubox/secubox-app-watchdog/files/usr/sbin/watchdogctl @@ -0,0 +1,668 @@ +#!/bin/sh +# SecuBox Watchdog Control +# Service health monitoring and auto-recovery + +. 
/lib/functions.sh + +CONFIG_NAME="watchdog" +LOG_FILE="/var/log/watchdog.log" +ALERT_STATE_DIR="/tmp/watchdog" +VERSION="1.0.0" + +# Colors for terminal output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +# Load configuration +load_config() { + config_load "$CONFIG_NAME" + config_get LOG_FILE main log_file '/var/log/watchdog.log' + config_get ALERT_COOLDOWN main alert_cooldown '300' + config_get MAX_LOG_LINES main max_log_lines '1000' + mkdir -p "$ALERT_STATE_DIR" +} + +# Logging +log_msg() { + local level="$1" + local msg="$2" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[$timestamp] [$level] $msg" >> "$LOG_FILE" + + # Rotate log if too large + local lines=$(wc -l < "$LOG_FILE" 2>/dev/null || echo 0) + if [ "$lines" -gt "$MAX_LOG_LINES" ]; then + tail -n "$((MAX_LOG_LINES / 2))" "$LOG_FILE" > "$LOG_FILE.tmp" + mv "$LOG_FILE.tmp" "$LOG_FILE" + fi +} + +log_alert() { + log_msg "ALERT" "$1" + echo -e "${RED}WATCHDOG ALERT${NC} - $1" >&2 +} + +log_info() { + log_msg "INFO" "$1" +} + +log_ok() { + log_msg "OK" "$1" +} + +# Check if we should alert (cooldown) +should_alert() { + local service="$1" + local state_file="$ALERT_STATE_DIR/$service.alert" + + if [ -f "$state_file" ]; then + local last_alert=$(cat "$state_file") + local now=$(date +%s) + local diff=$((now - last_alert)) + [ "$diff" -lt "$ALERT_COOLDOWN" ] && return 1 + fi + + date +%s > "$state_file" + return 0 +} + +clear_alert() { + local service="$1" + rm -f "$ALERT_STATE_DIR/$service.alert" +} + +# Check LXC container status +check_container() { + local section="$1" + local enabled name critical start_service service_name + + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + + config_get name "$section" name + config_get critical "$section" critical '0' + config_get start_service "$section" start_service '0' + config_get service_name "$section" service_name '' + + local state=$(lxc-info -n "$name" 2>/dev/null | grep
"State:" | awk '{print $2}') + + if [ "$state" = "RUNNING" ]; then + clear_alert "container_$name" + return 0 + else + if should_alert "container_$name"; then + log_alert "Container $name is $state" + fi + return 1 + fi +} + +# Restart LXC container +restart_container() { + local section="$1" + local name start_service service_name + + config_get name "$section" name + config_get start_service "$section" start_service '0' + config_get service_name "$section" service_name '' + + log_info "Restarting container: $name" + + # Stop if running + lxc-stop -n "$name" 2>/dev/null + sleep 1 + + # Start container + lxc-start -n "$name" 2>/dev/null + sleep 2 + + # Start service inside if configured + if [ "$start_service" = "1" ] && [ -n "$service_name" ]; then + sleep 2 + lxc-attach -n "$name" -- /etc/init.d/"$service_name" start 2>/dev/null + log_info "Started $service_name service inside $name" + fi + + # Verify + local state=$(lxc-info -n "$name" 2>/dev/null | grep "State:" | awk '{print $2}') + if [ "$state" = "RUNNING" ]; then + log_ok "Container $name recovered successfully" + return 0 + else + log_alert "Container $name failed to start" + return 1 + fi +} + +# Check host service status +check_service() { + local section="$1" + local enabled name process critical + + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + + config_get name "$section" name + config_get process "$section" process + config_get critical "$section" critical '0' + + if pgrep "$process" >/dev/null 2>&1; then + clear_alert "service_$name" + return 0 + else + if should_alert "service_$name"; then + log_alert "Service $name is not running" + fi + return 1 + fi +} + +# Restart host service +restart_service() { + local section="$1" + local name process init_script + + config_get name "$section" name + config_get process "$section" process + config_get init_script "$section" init_script + + log_info "Restarting service: $name" + + if [ -x "$init_script" ]; then + 
"$init_script" restart 2>/dev/null + sleep 2 + + if pgrep "$process" >/dev/null 2>&1; then + log_ok "Service $name recovered successfully" + return 0 + else + log_alert "Service $name failed to restart" + return 1 + fi + else + log_alert "No init script found for $name" + return 1 + fi +} + +# Check HTTPS endpoint +check_endpoint() { + local section="$1" + local enabled name host expected_codes + + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + + config_get name "$section" name + config_get host "$section" host + config_get expected_codes "$section" expected_codes '200' + + local code=$(curl -sk -o /dev/null -w "%{http_code}" -H "Host: $host" https://127.0.0.1/ 2>/dev/null) + + local match=0 + for expected in $expected_codes; do + [ "$code" = "$expected" ] && match=1 && break + done + + if [ "$match" = "1" ]; then + clear_alert "endpoint_$host" + return 0 + else + if should_alert "endpoint_$host"; then + log_alert "Endpoint $name ($host) returned HTTP $code" + fi + return 1 + fi +} + +# Run single check cycle +run_check() { + local auto_recover="${1:-0}" + local failed=0 + + # Temp file to track failed items + local failed_containers="/tmp/watchdog_failed_containers" + local failed_services="/tmp/watchdog_failed_services" + > "$failed_containers" + > "$failed_services" + + # Check containers and record failures + local check_and_record_container + check_and_record_container() { + local section="$1" + local enabled name + + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + + config_get name "$section" name + + local state=$(lxc-info -n "$name" 2>/dev/null | grep "State:" | awk '{print $2}') + + if [ "$state" = "RUNNING" ]; then + clear_alert "container_$name" + else + if should_alert "container_$name"; then + log_alert "Container $name is $state" + fi + echo "$section" >> "$failed_containers" + failed=1 + fi + } + config_foreach check_and_record_container container + + # Check services and record failures 
+ local check_and_record_service + check_and_record_service() { + local section="$1" + local enabled name process + + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + + config_get name "$section" name + config_get process "$section" process + + if pgrep "$process" >/dev/null 2>&1; then + clear_alert "service_$name" + else + if should_alert "service_$name"; then + log_alert "Service $name is not running" + fi + echo "$section" >> "$failed_services" + failed=1 + fi + } + config_foreach check_and_record_service service + + # Check endpoints + config_foreach check_endpoint endpoint || failed=1 + + # Auto-recovery if enabled + if [ "$auto_recover" = "1" ]; then + # Restart failed containers + while read section; do + [ -n "$section" ] && restart_container "$section" + done < "$failed_containers" + + # Restart failed services + while read section; do + [ -n "$section" ] && restart_service "$section" + done < "$failed_services" + fi + + rm -f "$failed_containers" "$failed_services" + return $failed +} + +# Get status of a single container +get_container_status() { + local name="$1" + local state=$(lxc-info -n "$name" 2>/dev/null | grep "State:" | awk '{print $2}') + local pid=$(lxc-info -n "$name" 2>/dev/null | grep "PID:" | awk '{print $2}') + + if [ "$state" = "RUNNING" ]; then + echo "running:$pid" + else + echo "stopped:0" + fi +} + +# Get status of a single service +get_service_status() { + local process="$1" + local pid=$(pgrep "$process" 2>/dev/null | head -1) + + if [ -n "$pid" ]; then + echo "running:$pid" + else + echo "stopped:0" + fi +} + +# Get endpoint status +get_endpoint_status() { + local host="$1" + local code=$(curl -sk -o /dev/null -w "%{http_code}" -H "Host: $host" --connect-timeout 5 https://127.0.0.1/ 2>/dev/null) + echo "$code" +} + +# Show status command +cmd_status() { + load_config + + echo "" + echo -e "${CYAN}SecuBox Watchdog Status${NC}" + echo "========================" + echo "" + + echo -e "${CYAN}LXC 
Containers:${NC}" + echo "---------------" + config_load "$CONFIG_NAME" + + local container_status + container_status() { + local section="$1" + local enabled name + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + config_get name "$section" name + + local result=$(get_container_status "$name") + local state=$(echo "$result" | cut -d: -f1) + local pid=$(echo "$result" | cut -d: -f2) + + if [ "$state" = "running" ]; then + printf " %-20s ${GREEN}RUNNING${NC} (PID: %s)\n" "$name:" "$pid" + else + printf " %-20s ${RED}STOPPED${NC}\n" "$name:" + fi + } + config_foreach container_status container + + echo "" + echo -e "${CYAN}Host Services:${NC}" + echo "--------------" + + local service_status + service_status() { + local section="$1" + local enabled name process + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + config_get name "$section" name + config_get process "$section" process + + local result=$(get_service_status "$process") + local state=$(echo "$result" | cut -d: -f1) + local pid=$(echo "$result" | cut -d: -f2) + + if [ "$state" = "running" ]; then + printf " %-20s ${GREEN}RUNNING${NC} (PID: %s)\n" "$name:" "$pid" + else + printf " %-20s ${RED}NOT RUNNING${NC}\n" "$name:" + fi + } + config_foreach service_status service + + echo "" + echo -e "${CYAN}HTTPS Endpoints:${NC}" + echo "----------------" + + local endpoint_status + endpoint_status() { + local section="$1" + local enabled name host expected_codes + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + config_get name "$section" name + config_get host "$section" host + config_get expected_codes "$section" expected_codes '200' + + local code=$(get_endpoint_status "$host") + + local match=0 + for expected in $expected_codes; do + [ "$code" = "$expected" ] && match=1 && break + done + + if [ "$match" = "1" ]; then + printf " %-25s ${GREEN}HTTP %s${NC}\n" "$host:" "$code" + else + printf " %-25s ${RED}HTTP %s${NC}\n" 
"$host:" "$code" + fi + } + config_foreach endpoint_status endpoint + + echo "" +} + +# JSON status output for RPCD +cmd_json_status() { + load_config + + local json='{"containers":[' + local first=1 + + local container_json + container_json() { + local section="$1" + local enabled name critical + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + config_get name "$section" name + config_get critical "$section" critical '0' + + local result=$(get_container_status "$name") + local state=$(echo "$result" | cut -d: -f1) + local pid=$(echo "$result" | cut -d: -f2) + + [ "$first" = "0" ] && json="$json," + first=0 + json="$json{\"name\":\"$name\",\"state\":\"$state\",\"pid\":$pid,\"critical\":$critical}" + } + config_foreach container_json container + + json="$json],\"services\":[" + first=1 + + local service_json + service_json() { + local section="$1" + local enabled name process critical + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + config_get name "$section" name + config_get process "$section" process + config_get critical "$section" critical '0' + + local result=$(get_service_status "$process") + local state=$(echo "$result" | cut -d: -f1) + local pid=$(echo "$result" | cut -d: -f2) + + [ "$first" = "0" ] && json="$json," + first=0 + json="$json{\"name\":\"$name\",\"process\":\"$process\",\"state\":\"$state\",\"pid\":$pid,\"critical\":$critical}" + } + config_foreach service_json service + + json="$json],\"endpoints\":[" + first=1 + + local endpoint_json + endpoint_json() { + local section="$1" + local enabled name host expected_codes + config_get enabled "$section" enabled '0' + [ "$enabled" = "1" ] || return 0 + config_get name "$section" name + config_get host "$section" host + config_get expected_codes "$section" expected_codes '200' + + local code=$(get_endpoint_status "$host") + + local healthy=0 + for expected in $expected_codes; do + [ "$code" = "$expected" ] && healthy=1 && break + done + + [ 
"$first" = "0" ] && json="$json," + first=0 + json="$json{\"name\":\"$name\",\"host\":\"$host\",\"code\":\"$code\",\"healthy\":$healthy}" + } + config_foreach endpoint_json endpoint + + json="$json]}" + echo "$json" +} + +# Check command (single cycle) +cmd_check() { + load_config + local auto_recover="${1:-0}" + + echo "Running health check..." + if run_check "$auto_recover"; then + echo -e "${GREEN}All systems healthy${NC}" + return 0 + else + echo -e "${RED}Issues detected${NC}" + return 1 + fi +} + +# Watch command (continuous loop) +cmd_watch() { + load_config + config_get interval main interval '60' + + echo "Starting watchdog (interval: ${interval}s)..." + log_info "Watchdog started (interval: ${interval}s)" + + while true; do + run_check 1 + sleep "$interval" + done +} + +# Restart a specific container +cmd_restart_container() { + local name="$1" + [ -z "$name" ] && echo "Usage: watchdogctl restart-container <name>" && return 1 + + load_config + + echo "Restarting container: $name" + lxc-stop -n "$name" 2>/dev/null + sleep 1 + lxc-start -n "$name" 2>/dev/null + sleep 2 + + # Check for service start + local start_service + start_service() { + local section="$1" + local c_name service_name start_svc + config_get c_name "$section" name + [ "$c_name" = "$name" ] || return 0 + config_get start_svc "$section" start_service '0' + config_get service_name "$section" service_name '' + + if [ "$start_svc" = "1" ] && [ -n "$service_name" ]; then + sleep 2 + lxc-attach -n "$name" -- /etc/init.d/"$service_name" start 2>/dev/null + echo "Started $service_name service inside container" + fi + } + config_foreach start_service container + + local state=$(lxc-info -n "$name" 2>/dev/null | grep "State:" | awk '{print $2}') + if [ "$state" = "RUNNING" ]; then + echo -e "${GREEN}Container $name is now running${NC}" + else + echo -e "${RED}Container $name failed to start${NC}" + return 1 + fi +} + +# Restart a specific service +cmd_restart_service() { + local name="$1" + [ -z "$name" ] &&
echo "Usage: watchdogctl restart-service <name>" && return 1 + + load_config + + local do_restart + do_restart() { + local section="$1" + local s_name init_script process + config_get s_name "$section" name + [ "$s_name" = "$name" ] || return 0 + config_get init_script "$section" init_script + config_get process "$section" process + + if [ -x "$init_script" ]; then + echo "Restarting service: $name" + "$init_script" restart 2>/dev/null + sleep 2 + + if pgrep "$process" >/dev/null 2>&1; then + echo -e "${GREEN}Service $name is now running${NC}" + else + echo -e "${RED}Service $name failed to start${NC}" + fi + fi + } + config_foreach do_restart service +} + +# Show logs +cmd_logs() { + local lines="${1:-50}" + if [ -f "$LOG_FILE" ]; then + tail -n "$lines" "$LOG_FILE" + else + echo "No log file found" + fi +} + +# Clear logs +cmd_clear_logs() { + > "$LOG_FILE" + rm -f "$ALERT_STATE_DIR"/*.alert + echo "Logs cleared" +} + +# Main +case "$1" in + status) + cmd_status + ;; + json-status) + cmd_json_status + ;; + check) + cmd_check 0 + ;; + check-recover) + cmd_check 1 + ;; + watch) + cmd_watch + ;; + restart-container) + cmd_restart_container "$2" + ;; + restart-service) + cmd_restart_service "$2" + ;; + logs) + cmd_logs "$2" + ;; + clear-logs) + cmd_clear_logs + ;; + version) + echo "watchdogctl version $VERSION" + ;; + *) + echo "SecuBox Watchdog Control v$VERSION" + echo "" + echo "Usage: watchdogctl <command> [options]" + echo "" + echo "Commands:" + echo " status Show current status of all monitored services" + echo " json-status Output status as JSON (for RPCD)" + echo " check Run single health check (no auto-recovery)" + echo " check-recover Run single health check with auto-recovery" + echo " watch Start continuous monitoring loop" + echo " restart-container <name> Restart a specific container" + echo " restart-service <name> Restart a specific service" + echo " logs [N] Show last N log lines (default: 50)" + echo " clear-logs Clear all logs and alert states" + echo " version Show version"
+ ;; +esac