Compare commits

..

2 Commits

Author SHA1 Message Date
36cfb72e41 Merge feature/727 — aggregator auto-heal watchdog
Some checks are pending
License Headers / check (push) Waiting to run
2026-06-24 12:32:59 +02:00
6e62c0166d feat(aggregator): packaged auto-heal watchdog timer (#727)
Ships secubox-aggregator-watchdog.{sh,service,timer}: probes aggregator.sock
/api/v1/hub/public/menu every 2min, restarts secubox-aggregator after 2
consecutive failures (the hub/auth/menu SPOF wedged under a load spike in the
2026-06-24 incident). State file kept in /run (root-owned), NOT the shared
sticky /run/secubox — a stale secubox-owned file there can't be overwritten by
CAP_DAC_OVERRIDE-less root, which would freeze the streak and stop it triggering.
Enabled in postinst (respects masking). Verified live: state persists root:root,
timer active. Bump 0.2.3.
2026-06-24 12:32:55 +02:00
8 changed files with 110 additions and 0 deletions

View File

@ -1,3 +1,14 @@
secubox-aggregator (0.2.3-1~bookworm1) bookworm; urgency=medium
* #727 auto-heal watchdog: ship secubox-aggregator-watchdog.{sh,service,timer}.
The in-process aggregator is the hub/auth/menu SPOF; under a host load spike
its event loop can wedge and the socket stops answering (board-wide 502/000:
sparse navbar, login errors). The timer probes aggregator.sock every 2 min
and restarts the service after 2 consecutive failures. Enabled in postinst
(respects operator masking). Packages the live fix from the 2026-06-24 incident.
-- Gerald KERMA <devel@cybermind.fr> Wed, 24 Jun 2026 15:10:00 +0000
secubox-aggregator (0.2.1-1~bookworm1) bookworm; urgency=medium
* Phase 7 follow-up (#498) — relax hardening for module sudoers :

View File

@ -17,6 +17,12 @@ case "$1" in
systemctl enable secubox-aggregator.service systemctl enable secubox-aggregator.service
systemctl start secubox-aggregator.service || true systemctl start secubox-aggregator.service || true
# Auto-heal watchdog (#727): restart the aggregator if its socket wedges
# under load (the hub/auth/menu SPOF). Respect operator masking: systemctl
# is-enabled reports "masked" for a persistent mask and "masked-runtime" for
# a mask that only lasts until reboot; in both cases leave the timer alone.
case "$(systemctl is-enabled secubox-aggregator-watchdog.timer 2>/dev/null)" in
masked|masked-runtime)
    # Operator opted out — do not enable or start the timer.
    ;;
*)
    # Best effort: a failure here must not fail the package configuration.
    systemctl enable --now secubox-aggregator-watchdog.timer 2>/dev/null || true
    ;;
esac
echo "secubox-aggregator: to migrate all installed SecuBox modules into" echo "secubox-aggregator: to migrate all installed SecuBox modules into"
echo " the aggregator (replaces per-module uvicorn processes) run :" echo " the aggregator (replaces per-module uvicorn processes) run :"
echo " sudo /usr/sbin/secubox-aggregator-migrate" echo " sudo /usr/sbin/secubox-aggregator-migrate"

6
packages/secubox-aggregator/debian/rules Normal file → Executable file
View File

@ -14,6 +14,12 @@ override_dh_auto_install:
install -d $(CURDIR)/debian/secubox-aggregator/lib/systemd/system install -d $(CURDIR)/debian/secubox-aggregator/lib/systemd/system
install -m 644 systemd/secubox-aggregator.service \ install -m 644 systemd/secubox-aggregator.service \
$(CURDIR)/debian/secubox-aggregator/lib/systemd/system/ $(CURDIR)/debian/secubox-aggregator/lib/systemd/system/
install -m 644 systemd/secubox-aggregator-watchdog.service \
$(CURDIR)/debian/secubox-aggregator/lib/systemd/system/
install -m 644 systemd/secubox-aggregator-watchdog.timer \
$(CURDIR)/debian/secubox-aggregator/lib/systemd/system/
install -d $(CURDIR)/debian/secubox-aggregator/usr/sbin install -d $(CURDIR)/debian/secubox-aggregator/usr/sbin
install -m 755 sbin/secubox-aggregator-migrate \ install -m 755 sbin/secubox-aggregator-migrate \
$(CURDIR)/debian/secubox-aggregator/usr/sbin/ $(CURDIR)/debian/secubox-aggregator/usr/sbin/
install -m 755 sbin/secubox-aggregator-watchdog.sh \
$(CURDIR)/debian/secubox-aggregator/usr/sbin/

View File

@ -0,0 +1,10 @@
# Automatically added by dh_python3
# Maintainer-script fragment: runs the Python byte-compilation helpers for the
# secubox-aggregator package, each only when the helper is found on PATH.
# NOTE(review): this text is generated by debhelper at build time; presumably
# it should not be hand-edited or tracked in the repository — confirm.
#
# CPython 3: the exit status of py3compile is NOT masked, so a failure here
# propagates to whatever script this fragment is embedded in.
if command -v py3compile >/dev/null 2>&1; then
py3compile -p secubox-aggregator
fi
# PyPy 3: same operation, but a pypy3compile failure is tolerated (|| true).
if command -v pypy3compile >/dev/null 2>&1; then
pypy3compile -p secubox-aggregator || true
fi
# End automatically added section

View File

@ -0,0 +1,10 @@
# Automatically added by dh_python3
# Maintainer-script fragment: removes byte-compiled Python files belonging to
# the secubox-aggregator package.
# NOTE(review): this text is generated by debhelper at build time; presumably
# it should not be hand-edited or tracked in the repository — confirm.
#
# Preferred path: delegate the cleanup to py3clean when it is on PATH.
if command -v py3clean >/dev/null 2>&1; then
py3clean -p secubox-aggregator
else
# Fallback without py3clean: for every path ending in .py listed by dpkg -L,
# sed rewrites the line into  rm "<dir>/__pycache__/<name>".*  and its 'e'
# flag (GNU sed) executes that command, deleting the cached bytecode files.
dpkg -L secubox-aggregator | sed -En -e '/^(.*)\/(.+)\.py$/s,,rm "\1/__pycache__/\2".*,e'
# Then remove any __pycache__ directories left empty under dist-packages.
find /usr/lib/python3/dist-packages/ -type d -name __pycache__ -empty -print0 | xargs --null --no-run-if-empty rmdir
fi
# End automatically added section

View File

@ -0,0 +1,48 @@
#!/bin/bash
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
# Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
# Source-Disclosed License — All rights reserved except as expressly granted.
# See LICENCE-CMSD-1.0.md for terms.
#
# SecuBox-Deb :: secubox-aggregator-watchdog
#
# Auto-heal the in-process aggregator — the hub/auth/menu single point of
# failure. Under a host load spike its shared event loop can wedge and its
# socket stops answering, taking down the navbar, login and service status
# board-wide (incident 2026-06-24). Probe the socket; if /api/v1/hub/public/menu
# stops answering for N consecutive checks, restart the service. Idempotent,
# safe to run on a timer.
#
# Environment:
#   SECUBOX_AGG_WD_THRESHOLD  consecutive failed probes before a restart (default 2)
#   SECUBOX_AGG_WD_TIMEOUT    curl --max-time for one probe, in seconds (default 12)
#
# Exit status: always 0 — a failed probe or a failed restart is reported via
# logger, never via the exit code, so the calling unit does not enter "failed".
set -uo pipefail

readonly MODULE="secubox-aggregator-watchdog"
readonly VERSION="1.0"

SOCK="/run/secubox/aggregator.sock"
# State lives in /run (root-owned), NOT the shared sticky /run/secubox: that dir
# is 1777 and a stale secubox-owned file there can't be overwritten by this
# (CSPN-hardened, CAP_DAC_OVERRIDE-less) root — which would silently freeze the
# streak counter and stop the watchdog ever triggering.
STATE="/run/secubox-aggregator-watchdog.fails"
FAIL_THRESHOLD="${SECUBOX_AGG_WD_THRESHOLD:-2}"
TIMEOUT="${SECUBOX_AGG_WD_TIMEOUT:-12}"

# A non-numeric threshold would make the "-ge" test below error out on every
# run, so the watchdog would never restart anything. Fall back to the default.
case "$FAIL_THRESHOLD" in
    ''|*[!0-9]*)
        logger -t "$MODULE" "invalid SECUBOX_AGG_WD_THRESHOLD '$FAIL_THRESHOLD', using 2"
        FAIL_THRESHOLD=2
        ;;
esac

# A malformed timeout makes curl fail immediately, which would be counted as a
# probe failure and restart a healthy service. Fall back to the default.
if ! [[ "$TIMEOUT" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    logger -t "$MODULE" "invalid SECUBOX_AGG_WD_TIMEOUT '$TIMEOUT', using 12"
    TIMEOUT=12
fi

# Persist the failure streak ($1). A write failure is logged rather than
# swallowed: a counter that cannot be stored can never reach the threshold.
write_streak() {
    if ! { printf '%s\n' "$1" > "$STATE"; } 2>/dev/null; then
        logger -t "$MODULE" "cannot write $STATE; failure streak will not persist"
    fi
}

# No socket yet (service still starting / not migrated) → nothing to heal.
[ -S "$SOCK" ] || exit 0

# Without curl the probe cannot run at all; treating that as "aggregator down"
# would restart a healthy service every few runs.
if ! command -v curl >/dev/null 2>&1; then
    logger -t "$MODULE" "curl not found, cannot probe aggregator; skipping"
    exit 0
fi

# curl prints "000" through -w when the transfer fails, so no fallback is
# appended inside the substitution (that would yield "000000"); only an empty
# result — curl produced no output at all — is mapped to 000.
code=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$TIMEOUT" \
    --unix-socket "$SOCK" http://localhost/api/v1/hub/public/menu 2>/dev/null)
[ -n "$code" ] || code="000"

if [ "$code" = "200" ]; then
    write_streak 0
    exit 0
fi

# Read the previous streak; a missing, empty or corrupt file counts as 0.
prev=0
if [ -r "$STATE" ]; then
    read -r prev < "$STATE" 2>/dev/null || true
fi
case "$prev" in
    ''|*[!0-9]*) prev=0 ;;
esac
# 10# forces base 10 so a value with a leading zero is not parsed as octal.
n=$(( 10#$prev + 1 ))
write_streak "$n"
logger -t "$MODULE" "aggregator probe failed (code=$code, streak=$n/$FAIL_THRESHOLD)"

if [ "$n" -ge "$FAIL_THRESHOLD" ]; then
    logger -t "$MODULE" "restarting secubox-aggregator (auto-heal)"
    if ! systemctl restart secubox-aggregator.service; then
        logger -t "$MODULE" "systemctl restart secubox-aggregator.service failed"
    fi
    # Reset the streak either way, so a failing restart is retried only after
    # another full streak instead of on every run.
    write_streak 0
fi
exit 0

View File

@ -0,0 +1,9 @@
# One-shot job that runs the aggregator watchdog script once per activation.
[Unit]
Description=SecuBox aggregator auto-heal watchdog
Documentation=https://github.com/CyberMind-FR/secubox-deb/issues/727
# Ordering only (no Requires/Wants): when both units are started together,
# the aggregator is started first.
After=secubox-aggregator.service

[Service]
Type=oneshot
# Run at reduced CPU priority.
Nice=10
ExecStart=/usr/sbin/secubox-aggregator-watchdog.sh

View File

@ -0,0 +1,10 @@
# Periodic trigger for the aggregator watchdog probe.
[Unit]
Description=Probe + auto-heal secubox-aggregator every 2 min

[Timer]
# Allow systemd to shift each firing by up to 20 s.
AccuracySec=20s
# First run 2 min after boot, then 2 min after each activation of the
# unit this timer triggers.
OnBootSec=2min
OnUnitActiveSec=2min

[Install]
WantedBy=timers.target