Compare commits

...

8 Commits

9 changed files with 815 additions and 66 deletions

View File

@ -0,0 +1,482 @@
# Hub Dashboard Latency Fix Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Turn the hub `/dashboard`, `/status`, `/modules` (9-12 s) and `/public/health-batch` (3.3 s) into cache-served sub-100 ms responses, and cap the toolbox `/admin/clients/rich` enrichment to the displayed rows.
**Architecture:** Both hub portals run mounted in `secubox-aggregator` (no sub-app lifespan → cold caches). Fix the cold path: dashboard/status/modules `await` ONE batched `systemctl is-active` (warm-on-demand) instead of ~16 per-module calls; health-batch serves a TTL snapshot built by the existing background loop, cold-miss = ONE offloaded `list-units`. Toolbox bounds geo/UA enrichment to the first 12 most-recent clients.
**Tech Stack:** FastAPI, asyncio, systemctl, pytest + monkeypatch.
---
### Task 1: Hub — `_ensure_services_warm()` + health-batch snapshot helper
**Files:**
- Modify: `packages/secubox-hub/api/main.py`
- Test: `packages/secubox-hub/tests/test_cache_warm.py` (create)
- [ ] **Step 1: Write failing tests**
Create `packages/secubox-hub/tests/test_cache_warm.py`:
```python
import asyncio
import importlib
import sys
from pathlib import Path
import pytest
# Import the hub app module.
sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "api"))
main = importlib.import_module("main")
def _reset_cache():
main._cache["services"] = {}
main._cache["last_refresh"] = 0
main._cache["health_batch"] = None
main._cache["health_batch_ts"] = 0
def test_ensure_services_warm_refreshes_when_cold(monkeypatch):
_reset_cache()
calls = {"n": 0}
def fake_refresh():
calls["n"] += 1
main._cache["services"]["secubox-x"] = {"name": "secubox-x", "active": True, "socket": False}
monkeypatch.setattr(main, "_refresh_services_cache", fake_refresh)
asyncio.run(main._ensure_services_warm())
assert calls["n"] == 1
assert main._cache["last_refresh"] > 0
def test_ensure_services_warm_skips_when_fresh(monkeypatch):
_reset_cache()
main._cache["last_refresh"] = main.time.time()
calls = {"n": 0}
monkeypatch.setattr(main, "_refresh_services_cache", lambda: calls.__setitem__("n", calls["n"] + 1))
asyncio.run(main._ensure_services_warm())
assert calls["n"] == 0
def test_refresh_health_batch_parses_units(monkeypatch):
_reset_cache()
class R:
stdout = (
"secubox-hub.service loaded active running Hub\n"
"secubox-dpi.service loaded active exited DPI\n"
"secubox-cdn.service loaded failed failed CDN\n"
)
monkeypatch.setattr(main.subprocess, "run", lambda *a, **k: R())
# No sockets present for these in the test env.
main._refresh_health_batch()
hb = main._cache["health_batch"]
assert hb["modules"]["hub"]["status"] == "ok"
assert hb["modules"]["dpi"]["status"] == "warn"
assert hb["modules"]["cdn"]["status"] == "error"
assert main._cache["health_batch_ts"] > 0
```
Run: `cd packages/secubox-hub && python -m pytest tests/test_cache_warm.py -v`
Expected: FAIL (`_ensure_services_warm` / `_refresh_health_batch` not defined; `health_batch` key missing).
- [ ] **Step 2: Add cache keys**
In `main.py`, extend the `_cache` dict literal (currently lines ~322-327):
```python
_cache = {
"services": {}, # module_id -> {name, active, socket}
"menu": None, # Full menu response
"system_stats": {}, # CPU, memory, disk
"last_refresh": 0,
"health_batch": None, # {modules: {...}, count: int} snapshot for sidebar LEDs
"health_batch_ts": 0, # monotonic-ish wall time of last health_batch build
}
```
- [ ] **Step 3: Add `_refresh_health_batch()` (sync)**
Insert right after `_refresh_services_cache()` (after line ~399). This MOVES the
parse logic out of the handler so both the background loop and the cold-miss path
share it:
```python
def _refresh_health_batch():
"""Build the sidebar health snapshot in ONE systemctl list-units call.
Stores _cache["health_batch"] = {modules, count} + stamps health_batch_ts.
Shared by the background loop and the /public/health-batch cold-miss path so
the request never makes its own (3.3 s) synchronous systemctl call.
"""
modules = {}
try:
result = subprocess.run(
["systemctl", "list-units", "--type=service",
"--state=running,failed,inactive", "--no-legend", "--plain",
"secubox-*"],
capture_output=True, text=True, timeout=5
)
for line in result.stdout.strip().split("\n"):
if not line.strip():
continue
parts = line.split()
if len(parts) >= 4:
unit, _load, active, sub = parts[0], parts[1], parts[2], parts[3]
if unit.startswith("secubox-") and unit.endswith(".service"):
mod_id = unit[8:-8]
if active == "active" and sub == "running":
modules[mod_id] = {"status": "ok", "msg": "Running"}
elif active == "active":
modules[mod_id] = {"status": "warn", "msg": f"Active ({sub})"}
elif active == "failed":
modules[mod_id] = {"status": "error", "msg": "Failed"}
else:
modules[mod_id] = {"status": "warn", "msg": f"{active}/{sub}"}
except Exception as e:
log.warning("health-batch systemctl error: %s", e)
socket_dir = Path("/run/secubox")
if socket_dir.exists():
for sock in socket_dir.glob("*.sock"):
mod_id = sock.stem
if mod_id not in modules:
modules[mod_id] = {"status": "ok", "msg": "Socket active"}
_cache["health_batch"] = {"modules": modules, "count": len(modules)}
_cache["health_batch_ts"] = time.time()
```
- [ ] **Step 4: Add `_ensure_services_warm()` (async)**
Insert right after `_svc()` (after line ~542):
```python
async def _ensure_services_warm():
"""Refresh the services cache in ONE batched call when cold/stale.
Replaces the ~16 per-module `systemctl is-active` fallbacks inside _svc()
with a single offloaded `is-active -- [all]` so dashboard/status/modules cold
paths cost one call instead of sixteen, and never block the shared loop.
"""
if (time.time() - _cache["last_refresh"]) >= CACHE_TTL * 2:
await asyncio.to_thread(_refresh_services_cache)
_cache["last_refresh"] = time.time()
```
- [ ] **Step 5: Run tests**
Run: `cd packages/secubox-hub && python -m pytest tests/test_cache_warm.py -v`
Expected: PASS (3 tests).
- [ ] **Step 6: Commit**
```bash
git add packages/secubox-hub/api/main.py packages/secubox-hub/tests/test_cache_warm.py
git commit -m "perf(hub): add _ensure_services_warm + _refresh_health_batch cache helpers (ref #644)"
```
---
### Task 2: Hub — wire helpers into handlers + background loop
**Files:**
- Modify: `packages/secubox-hub/api/main.py`
- Test: `packages/secubox-hub/tests/test_cache_warm.py` (extend)
- [ ] **Step 1: Add failing test for health-batch cache-serve**
Append to `tests/test_cache_warm.py`:
```python
def test_health_batch_serves_cache_without_subprocess(monkeypatch):
_reset_cache()
main._cache["health_batch"] = {"modules": {"hub": {"status": "ok", "msg": "Running"}}, "count": 1}
main._cache["health_batch_ts"] = main.time.time()
def boom(*a, **k):
raise AssertionError("subprocess must NOT be called when cache is warm")
monkeypatch.setattr(main.subprocess, "run", boom)
out = asyncio.run(main.public_health_batch())
assert out["count"] == 1
assert out["modules"]["hub"]["status"] == "ok"
def test_health_batch_cold_miss_builds_once(monkeypatch):
_reset_cache()
class R:
stdout = "secubox-hub.service loaded active running Hub\n"
calls = {"n": 0}
def fake_run(*a, **k):
calls["n"] += 1
return R()
monkeypatch.setattr(main.subprocess, "run", fake_run)
out = asyncio.run(main.public_health_batch())
assert out["count"] >= 1
assert calls["n"] == 1
```
Run: `cd packages/secubox-hub && python -m pytest tests/test_cache_warm.py -v`
Expected: FAIL (handler still builds inline every call / no cache check).
- [ ] **Step 2: Rewrite `public_health_batch` to serve the snapshot**
Replace the body of `public_health_batch` (lines ~263-314) with:
```python
@public_router.get("/health-batch")
async def public_health_batch():
"""Batch health check for all modules — returns status for sidebar LEDs.
Serves the TTL snapshot built by the background loop; on a cold miss it
builds it ONCE off the event loop. Never makes a synchronous systemctl call
on the request path.
"""
hb = _cache.get("health_batch")
if not hb or (time.time() - _cache.get("health_batch_ts", 0)) >= CACHE_TTL * 2:
await asyncio.to_thread(_refresh_health_batch)
hb = _cache.get("health_batch") or {"modules": {}, "count": 0}
return hb
```
(The `import subprocess` inside the old handler is removed; `subprocess` is
already imported at module level.)
- [ ] **Step 3: Warm the services cache on `/dashboard`, `/status`, `/modules`**
In each of `status` (line ~545), `modules` (~554), and `dashboard` (~593), add
`await _ensure_services_warm()` immediately before the `_svc` loop. Example for
`dashboard`:
```python
@router.get("/dashboard")
async def dashboard(user=Depends(require_jwt)):
"""Données complètes du dashboard (uses cached stats for speed)."""
board = get_board_info()
await _ensure_services_warm()
# _svc() now hits the warm cache (one batched refresh above on a cold cache).
modules_status = await asyncio.to_thread(lambda: {k: _svc(v) for k, v in MODULES.items()})
```
Apply the same single `await _ensure_services_warm()` line before the `_svc`
comprehension in `status` and `modules` (keep the existing `await
asyncio.to_thread(...)` comprehension that follows).
- [ ] **Step 4: Build health-batch in the background loop**
In `_background_cache_refresh` (after the `_refresh_system_stats` line ~428) add:
```python
await asyncio.to_thread(_refresh_health_batch)
```
And in `_start_background_once` initial warm (after line ~466
`_refresh_system_stats`) add the same line so the snapshot is warm before the
first sidebar poll:
```python
await asyncio.to_thread(_refresh_health_batch)
```
- [ ] **Step 5: Run full hub test suite**
Run: `cd packages/secubox-hub && python -m pytest tests/ -v`
Expected: PASS (new + any existing).
- [ ] **Step 6: Commit**
```bash
git add packages/secubox-hub/api/main.py packages/secubox-hub/tests/test_cache_warm.py
git commit -m "perf(hub): serve dashboard/health-batch from cache, one batched systemctl on cold path (ref #644)"
```
---
### Task 3: Toolbox — cap `/admin/clients/rich` enrichment to displayed rows
**Files:**
- Modify: `packages/secubox-toolbox/secubox_toolbox/api.py`
- Test: `packages/secubox-toolbox/tests/test_clients_rich_cap.py` (create)
- [ ] **Step 1: Write failing test**
Create `packages/secubox-toolbox/tests/test_clients_rich_cap.py`:
```python
import asyncio
from secubox_toolbox import api
def test_clients_rich_caps_enrichment(monkeypatch):
rows = [
{"mac_hash": f"m{i}", "ip": f"10.0.0.{i}", "state": "active",
"level": "r1", "score": 0, "last_seen": float(i), "first_seen": 0.0}
for i in range(20)
]
monkeypatch.setattr(api.store, "list_clients", lambda: rows)
monkeypatch.setattr(api.store, "latest_user_agent", lambda mh: "Mozilla/5.0")
geo_calls = {"n": 0}
def fake_lookup(ip):
geo_calls["n"] += 1
return {"flag": "🇫🇷", "country_iso": "FR", "asn_org": "X"}
monkeypatch.setattr(api.geo, "lookup", fake_lookup)
out = asyncio.run(api.admin_clients_rich())
assert out["count"] == 20
# Geo enrichment bounded to ENRICH_LIMIT, not all 20 clients.
assert geo_calls["n"] == api.ENRICH_LIMIT
# Most-recent client (last_seen highest) is enriched.
assert out["clients"][0]["flag"] == "🇫🇷"
# A client beyond the cap has bare geo fields.
assert out["clients"][-1]["flag"] == ""
```
Run: `cd packages/secubox-toolbox && python -m pytest tests/test_clients_rich_cap.py -v`
Expected: FAIL (`ENRICH_LIMIT` undefined; all rows enriched).
- [ ] **Step 2: Add `ENRICH_LIMIT` + cap enrichment**
In `api.py`, add a module-level constant near the other config constants (top of
file, after imports):
```python
# Cap geo/UA enrichment on /admin/clients/rich to the rows the UI actually shows
# (top-5 + headroom). Beyond this, clients get bare fields — avoids ~51 cached
# geo lookups per poll (ref #644).
ENRICH_LIMIT = 12
```
Then rewrite the loop in `admin_clients_rich` (lines ~2917-2978) to sort by
`last_seen` desc and only enrich the first `ENRICH_LIMIT`:
```python
rows = store.list_clients()
rows = sorted(rows, key=lambda r: (r.get("last_seen") or 0), reverse=True)
now = _t.time()
enriched = []
for idx, r in enumerate(rows):
age_min = (now - (r.get("last_seen") or 0)) / 60.0
if age_min < 5:
status_emoji = "🟢"
status_label = "actif"
elif age_min < 60:
status_emoji = "🟡"
status_label = "idle"
else:
status_emoji = "⚪"
status_label = "expiré"
if r.get("state") == "quarantine":
status_emoji = "🔴"
status_label = "quarantine"
level = r.get("level") or "r1"
level_emoji = {"r0": "🌐", "r1": "🛡", "r2": "🔍", "r3": "🌐"}.get(level, "❔")
score = r.get("score", 0)
risk_emoji = "🟢" if score < 30 else "🟡" if score < 70 else "🔴"
# Device + geo enrichment only for the displayed rows (ENRICH_LIMIT).
dev_emoji, dev_label = "📱", ""
flag = country_iso = asn_org = ""
if idx < ENRICH_LIMIT:
try:
ua = store.latest_user_agent(r.get("mac_hash") or "")
if ua:
cl = _av.classify_user_agent(ua)
dev_emoji = cl.get("device_emoji") or dev_emoji
dev_label = cl.get("device") or ""
except Exception:
pass
try:
gi = _geo.lookup(r.get("ip") or "")
flag = gi.get("flag", "") or ""
country_iso = gi.get("country_iso", "") or ""
asn_org = gi.get("asn_org", "") or ""
except Exception:
pass
enriched.append({
"mac_hash": r.get("mac_hash"),
"ip": r.get("ip"),
"state": r.get("state"),
"level": level,
"level_emoji": level_emoji,
"score": score,
"risk_emoji": risk_emoji,
"status_emoji": status_emoji,
"status_label": status_label,
"first_seen": r.get("first_seen"),
"last_seen": r.get("last_seen"),
"device_emoji": dev_emoji,
"device": dev_label,
"flag": flag,
"country_iso": country_iso,
"asn_org": asn_org,
})
return {"clients": enriched, "count": len(enriched)}
```
- [ ] **Step 3: Run tests**
Run: `cd packages/secubox-toolbox && python -m pytest tests/test_clients_rich_cap.py -v`
Expected: PASS.
- [ ] **Step 4: Run full toolbox suite (no regressions)**
Run: `cd packages/secubox-toolbox && python -m pytest tests/ -q`
Expected: PASS (existing clients/rich test, if any, still green).
- [ ] **Step 5: Commit**
```bash
git add packages/secubox-toolbox/secubox_toolbox/api.py packages/secubox-toolbox/tests/test_clients_rich_cap.py
git commit -m "perf(toolbox): cap /admin/clients/rich enrichment to ENRICH_LIMIT most-recent rows (ref #644)"
```
---
### Task 4: Changelogs + version bumps
**Files:**
- Modify: `packages/secubox-hub/debian/changelog`
- Modify: `packages/secubox-toolbox/debian/changelog`
- [ ] **Step 1: Bump hub changelog**
Add a new top entry (use the next patch version after the current one; check the
current top with `head -1 packages/secubox-hub/debian/changelog`). Body:
`perf: dashboard/status/modules + health-batch served from TTL cache; one batched
systemctl on cold path (9-12 s → <100 ms) (ref #644)`.
- [ ] **Step 2: Bump toolbox changelog**
Add a new top entry using the next patch version after the current top (2.6.51 → 2.6.52). Body:
`perf: /admin/clients/rich enriches only the ENRICH_LIMIT most-recent rows (ref #644)`.
- [ ] **Step 3: Commit**
```bash
git add packages/secubox-hub/debian/changelog packages/secubox-toolbox/debian/changelog
git commit -m "chore: changelogs for hub latency + toolbox clients/rich cap (ref #644)"
```
---
## Self-Review notes
- Spec coverage: Task1=helpers(#1,#2), Task2=wiring(#3,#4,#5), Task3=clients cap(#6). All spec items mapped.
- `_ensure_services_warm` uses `CACHE_TTL*2` staleness window — identical to `_svc`'s own freshness gate, so a warm cache short-circuits and a stale one triggers exactly one batched refresh.
- health-batch returns the raw snapshot dict (same shape as before: `{modules,count}`), so `sidebar.js` is unaffected.
- clients/rich sorts by `last_seen` desc so the enriched rows are the ones the UI surfaces (top-5 + headroom to 12).

View File

@ -0,0 +1,67 @@
# hub dashboard latency + clients/rich cap (#644)
- **Date:** 2026-06-18 · **Packages:** `secubox-hub`, `secubox-toolbox` · **Issue:** #644
- **Status:** Design approved (scope chosen: targeted cache fix), pending plan
## Problem (measured live on gk2)
- `GET /api/v1/hub/dashboard` = **9-12 s** every request: on cold/stale cache,
`_svc()` (main.py:537) falls back to **~16 sequential** `systemctl is-active <svc>`
calls inside the request.
- `GET /api/v1/hub/public/health-batch` = **3.3 s**: one `systemctl list-units
secubox-*` over ~111 services, **synchronous + uncached**, blocking the shared
aggregator loop.
- `GET /admin/clients/rich` (toolbox) = **~180 ms**: #635 geo/UA enrichment of all
~51 clients though the UI shows top-5.
- Root: the hub is mounted in the aggregator which doesn't run sub-app lifespans;
the #619 lazy-start warmer fires but the cold request path is per-module / the
health-batch path is never cached.
## Fix
### secubox-hub (`api/main.py`)
1. **`_refresh_health_batch()`** (new, sync): one `systemctl list-units secubox-*`
call + the existing parse → store `_cache["health_batch"] = {modules, count}`
and stamp `_cache["health_batch_ts"]`. (Extract the parse from the current
handler.)
2. **`_ensure_services_warm()`** (new, async): if `now - _cache["last_refresh"]
>= CACHE_TTL*2`, `await asyncio.to_thread(_refresh_services_cache)` and stamp
`last_refresh`. One batched `is-active -- [all]` call, off the loop.
3. **`/dashboard`, `/status`, `/modules`**: `await _ensure_services_warm()` before
the `_svc` loop → every `_svc` now hits the warm cache (no per-module fallback).
Replaces ~16 calls with **1** on a cold cache.
4. **`/public/health-batch`**: serve `_cache["health_batch"]` when fresh
(`now - health_batch_ts < CACHE_TTL*2`); cold `await
asyncio.to_thread(_refresh_health_batch)` then serve. Never sync-blocks.
5. **`_background_cache_refresh`**: add `await
asyncio.to_thread(_refresh_health_batch)` each cycle so both snapshots stay warm.
(Keep the existing lazy-start middleware + startup hook.)
### secubox-toolbox (`api.py`)
6. **`admin_clients_rich`**: only geo/UA-enrich the first `ENRICH_LIMIT = 12`
clients (UI shows top-5; headroom). Beyond that, emit bare fields
(`device_emoji:"📱"`, empty `device`/`flag`/`country_iso`/`asn_org`). Bounds
the endpoint to ≤12 cached geo lookups.
## Targets
dashboard 9-12 s → <100 ms (warm) / 1 call (cold); health-batch 3.3 s → <50 ms
(warm) / 1 offloaded call (cold); clients/rich 180 ms → ~15 ms.
## Error handling / rollout
- All systemctl work stays offloaded (`to_thread`) + best-effort try/except;
stale-but-served beats blocking. Cache misses degrade to one batched call, never
the 16-call loop.
- No schema/config change. Deploy = rebuild + `dpkg -i` both packages; restart the
hub portal (it runs in the aggregator — restart `secubox-aggregator` OR the
hub's own service if standalone; confirm at deploy) + toolbox portal. No mass
restart.
## Tests
- hub: `_refresh_health_batch` parses canned `list-units` → populates cache;
`/health-batch` warm-cache path makes NO subprocess call (monkeypatch
subprocess to raise → still returns cache); `_ensure_services_warm` calls the
batched refresh once when stale, zero when warm (count calls).
- toolbox: `admin_clients_rich` with >12 clients → clients[12:] have bare fields
(geo NOT called for them — monkeypatch geo.lookup to count calls ≤12).
## Out of scope
- Aggregator running sub-app lifespans / process isolation (the bigger documented
effort). This is the bounded targeted fix.

View File

@ -264,54 +264,20 @@ async def public_led_status():
async def public_health_batch():
    """Batch health check for all modules — returns status for sidebar LEDs.

    Returns the cached snapshot
    ``{"modules": {module_id: {"status": "ok"|"warn"|"error", "msg": str}},
    "count": int}`` used by sidebar.js.

    The snapshot is served straight from ``_cache`` while it is younger than
    ``CACHE_TTL * 2`` seconds. When it is missing or stale it is rebuilt once,
    in a worker thread, so the handler never runs systemctl synchronously on
    the request path. The superseded inline ``systemctl list-units`` build has
    been removed: it returned before the cache path could ever run.
    """
    max_age = CACHE_TTL * 2

    def _fresh_snapshot():
        """Return the cached snapshot if present and younger than max_age."""
        snapshot = _cache.get("health_batch")
        age = time.time() - _cache.get("health_batch_ts", 0)
        return snapshot if snapshot and age < max_age else None

    hb = _fresh_snapshot()
    if hb:
        return hb
    async with _health_batch_lock:
        # Re-check under the lock: a concurrent waiter may have just rebuilt it.
        if not _fresh_snapshot():
            await asyncio.to_thread(_refresh_health_batch)
        # Fall back to an empty, well-formed payload if the rebuild stored nothing.
        return _cache.get("health_batch") or {"modules": {}, "count": 0}
app.include_router(public_router)
@ -324,9 +290,16 @@ _cache = {
"menu": None, # Full menu response
"system_stats": {}, # CPU, memory, disk
"last_refresh": 0,
"health_batch": None, # {modules: {...}, count: int} snapshot for sidebar LEDs
"health_batch_ts": 0, # monotonic-ish wall time of last health_batch build
}
CACHE_TTL = 5 # seconds - cache valid for 5 seconds
# Under aggregator saturation the background loop can be starved for longer than
# CACHE_TTL * 2 (10 s), so many concurrent requests find a cache stale at once.
# These locks collapse that thundering herd into a single refresh per cache.
_services_warm_lock = asyncio.Lock()
_health_batch_lock = asyncio.Lock()
# MODULES dict is dynamically populated from installed services
# These are the "expected" core modules - actual list comes from systemd
CORE_MODULES = {
@ -399,6 +372,51 @@ def _refresh_services_cache():
log.warning("Cache refresh failed: %s", e)
def _refresh_health_batch():
    """Build the sidebar health snapshot in ONE systemctl list-units call.

    Stores ``_cache["health_batch"] = {"modules": ..., "count": ...}`` and stamps
    ``_cache["health_batch_ts"]``. Shared by the background loop and the
    /public/health-batch cold-miss path so the request never makes its own
    synchronous systemctl call.

    Status mapping per ``secubox-<id>.service`` unit:
      * active + running -> ``ok``
      * active + any other sub-state -> ``warn``
      * failed -> ``error``
      * anything else -> ``warn`` with ``"<active>/<sub>"`` as the message

    Every ``/run/secubox/*.sock`` whose stem is not already listed is added as
    ``ok`` ("Socket active").

    If the systemctl call fails (timeout, missing binary, ...) the modules of
    the previous snapshot, when there is one, are kept rather than replaced by
    an empty result: stale-but-served beats blanking every sidebar LED. The
    timestamp is still refreshed so request-path callers do not re-run the
    failing call on every request; the next refresh tries again.
    """
    unit_prefix, unit_suffix = "secubox-", ".service"
    modules = {}
    try:
        result = subprocess.run(
            ["systemctl", "list-units", "--type=service",
             "--state=running,failed,inactive", "--no-legend", "--plain",
             "secubox-*"],
            capture_output=True, text=True, timeout=5,
        )
        for line in result.stdout.strip().split("\n"):
            parts = line.split()
            # Columns: UNIT LOAD ACTIVE SUB DESCRIPTION...; blank lines are skipped here too.
            if len(parts) < 4:
                continue
            unit, active, sub = parts[0], parts[2], parts[3]
            if not (unit.startswith(unit_prefix) and unit.endswith(unit_suffix)):
                continue
            # secubox-xxx.service -> xxx
            mod_id = unit[len(unit_prefix):-len(unit_suffix)]
            if active == "active" and sub == "running":
                modules[mod_id] = {"status": "ok", "msg": "Running"}
            elif active == "active":
                modules[mod_id] = {"status": "warn", "msg": f"Active ({sub})"}
            elif active == "failed":
                modules[mod_id] = {"status": "error", "msg": "Failed"}
            else:
                modules[mod_id] = {"status": "warn", "msg": f"{active}/{sub}"}
    except Exception as e:
        log.warning("health-batch systemctl error: %s", e)
        # Keep the last known state instead of wiping it on a transient failure.
        previous = _cache.get("health_batch") or {}
        modules = dict(previous.get("modules") or {})

    socket_dir = Path("/run/secubox")
    if socket_dir.exists():
        for sock in socket_dir.glob("*.sock"):
            # Units reported by systemctl win over a bare socket file.
            modules.setdefault(sock.stem, {"status": "ok", "msg": "Socket active"})

    _cache["health_batch"] = {"modules": modules, "count": len(modules)}
    _cache["health_batch_ts"] = time.time()
def _refresh_system_stats():
"""Refresh system stats (CPU, memory, disk)."""
try:
@ -426,6 +444,7 @@ async def _background_cache_refresh():
try:
await asyncio.to_thread(_refresh_services_cache)
await asyncio.to_thread(_refresh_system_stats)
await asyncio.to_thread(_refresh_health_batch)
_cache["last_refresh"] = time.time()
# Refresh package versions every 12 cycles (~60s)
_version_refresh_counter += 1
@ -464,6 +483,7 @@ async def _start_background_once():
try:
await asyncio.to_thread(_refresh_services_cache)
await asyncio.to_thread(_refresh_system_stats)
await asyncio.to_thread(_refresh_health_batch)
_cache["last_refresh"] = time.time()
except Exception as e:
log.warning("Initial cache warm failed: %s", e)
@ -541,9 +561,28 @@ def _svc(name: str) -> dict:
except Exception:
return {"name": name, "active": False, "socket": False, "version": "-"}
async def _ensure_services_warm():
    """Make sure the services cache is no older than ``CACHE_TTL * 2`` seconds.

    When ``_cache["last_refresh"]`` is older than that, ``_refresh_services_cache``
    is run once in a worker thread and ``last_refresh`` is stamped. The intent is
    that the ``_svc()`` calls made by dashboard/status/modules right afterwards
    hit a warm cache instead of each falling back to its own
    ``systemctl is-active`` call, and that the shared loop is never blocked.
    """
    def _is_stale():
        return (time.time() - _cache["last_refresh"]) >= CACHE_TTL * 2

    if not _is_stale():
        return
    async with _services_warm_lock:
        # Check again once the lock is held: a concurrent waiter may have
        # refreshed the cache while this request was queued.
        if _is_stale():
            await asyncio.to_thread(_refresh_services_cache)
            _cache["last_refresh"] = time.time()
@router.get("/status")
async def status(user=Depends(require_jwt)):
board = get_board_info()
await _ensure_services_warm()
# Offload _svc() — blocking systemctl on a cold cache must not stall the loop.
modules_status = await asyncio.to_thread(lambda: {k: _svc(v) for k, v in MODULES.items()})
active = sum(1 for m in modules_status.values() if m["active"])
@ -552,12 +591,14 @@ async def status(user=Depends(require_jwt)):
@router.get("/modules")
async def modules(user=Depends(require_jwt)):
    """Return one ``{"id": <module id>, **_svc(<service>)}`` entry per MODULES item.

    The services cache is warmed first; the ``_svc()`` calls are then run in a
    worker thread so they cannot stall the event loop.
    """
    await _ensure_services_warm()

    def _collect():
        listing = []
        for module_id, service in MODULES.items():
            listing.append({"id": module_id, **_svc(service)})
        return listing

    return await asyncio.to_thread(_collect)
@router.get("/alerts")
async def alerts(user=Depends(require_jwt)):
# Use the cached _svc() (offloaded) instead of an un-timed per-module
# systemctl loop that blocked the shared aggregator event loop.
await _ensure_services_warm()
statuses = await asyncio.to_thread(lambda: {m: _svc(svc) for m, svc in MODULES.items()})
return [{"type": "service_down", "module": m, "service": MODULES[m],
"severity": "warning"}
@ -593,6 +634,7 @@ def _get_build_info() -> dict:
async def dashboard(user=Depends(require_jwt)):
"""Données complètes du dashboard (uses cached stats for speed)."""
board = get_board_info()
await _ensure_services_warm()
# Offload _svc() — on a cold cache it makes blocking systemctl calls that
# must not stall the shared aggregator event loop.
modules_status = await asyncio.to_thread(lambda: {k: _svc(v) for k, v in MODULES.items()})

View File

@ -1,3 +1,14 @@
secubox-hub (1.4.6-1~bookworm1) bookworm; urgency=medium
* perf(#644): dashboard/status/modules/alerts + public/health-batch now served
from the TTL background-refreshed cache. Cold path makes ONE batched
`systemctl is-active` (via _ensure_services_warm) instead of ~16 per-module
calls, and health-batch serves a snapshot (one offloaded list-units on a cold
miss) instead of a 3.3 s synchronous call — eliminates the 9-12 s dashboard
latency under the aggregator (no sub-app lifespan -> cold caches).
-- Gerald Kerma <devel@cybermind.fr> Thu, 18 Jun 2026 09:00:00 +0200
secubox-hub (1.4.5-1~bookworm1) bookworm; urgency=medium
* feat(health): Health Monitor page (/health/) — live status of vital +

View File

@ -0,0 +1,11 @@
"""Add the package's api/ and the repo-wide common/ to sys.path for tests."""
import sys
from pathlib import Path
_here = Path(__file__).resolve()
# packages/secubox-hub/
_pkg_root = _here.parents[1]
# repo root → common/
_repo_root = _here.parents[3]
# Each path is inserted at index 0 in turn, so common/ ends up ahead of api/.
for _extra in (_pkg_root / "api", _repo_root / "common"):
    sys.path.insert(0, str(_extra))

View File

@ -0,0 +1,92 @@
import asyncio
import importlib
import sys
from pathlib import Path
import pytest
# Import the hub app module.
sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "api"))
main = importlib.import_module("main")
def _reset_cache():
    """Put the hub cache entries used by these tests back into a cold state."""
    main._cache.update({
        "services": {},
        "last_refresh": 0,
        "health_batch": None,
        "health_batch_ts": 0,
    })
def test_ensure_services_warm_refreshes_when_cold(monkeypatch):
    """A cold cache triggers exactly one refresh and stamps last_refresh."""
    _reset_cache()
    refresh_calls = []

    def fake_refresh():
        refresh_calls.append(1)
        main._cache["services"]["secubox-x"] = {"name": "secubox-x", "active": True, "socket": False}

    monkeypatch.setattr(main, "_refresh_services_cache", fake_refresh)

    asyncio.run(main._ensure_services_warm())

    assert len(refresh_calls) == 1
    assert main._cache["last_refresh"] > 0
def test_ensure_services_warm_skips_when_fresh(monkeypatch):
    """A cache refreshed just now must not be refreshed again."""
    _reset_cache()
    main._cache["last_refresh"] = main.time.time()
    refresh_calls = []

    def fake_refresh():
        refresh_calls.append(1)

    monkeypatch.setattr(main, "_refresh_services_cache", fake_refresh)

    asyncio.run(main._ensure_services_warm())

    assert len(refresh_calls) == 0
def test_refresh_health_batch_parses_units(monkeypatch):
    """Canned list-units output maps to ok / warn / error and stamps the snapshot."""
    _reset_cache()

    class FakeCompleted:
        """Stand-in for the object returned by subprocess.run."""
        stdout = "".join([
            "secubox-hub.service loaded active running Hub\n",
            "secubox-dpi.service loaded active exited DPI\n",
            "secubox-cdn.service loaded failed failed CDN\n",
        ])

    def fake_run(*args, **kwargs):
        return FakeCompleted()

    monkeypatch.setattr(main.subprocess, "run", fake_run)

    main._refresh_health_batch()

    snapshot = main._cache["health_batch"]
    expected = {"hub": "ok", "dpi": "warn", "cdn": "error"}
    for mod_id, status in expected.items():
        assert snapshot["modules"][mod_id]["status"] == status
    assert main._cache["health_batch_ts"] > 0
def test_health_batch_serves_cache_without_subprocess(monkeypatch):
    """A warm snapshot is returned as-is; subprocess.run must never be reached."""
    _reset_cache()
    warm = {"modules": {"hub": {"status": "ok", "msg": "Running"}}, "count": 1}
    main._cache["health_batch"] = warm
    main._cache["health_batch_ts"] = main.time.time()

    def forbidden_run(*args, **kwargs):
        raise AssertionError("subprocess must NOT be called when cache is warm")

    monkeypatch.setattr(main.subprocess, "run", forbidden_run)

    response = asyncio.run(main.public_health_batch())

    assert response["count"] == 1
    assert response["modules"]["hub"]["status"] == "ok"
def test_health_batch_cold_miss_builds_once(monkeypatch):
    """With no snapshot cached, one request triggers exactly one subprocess.run."""
    _reset_cache()
    run_calls = []

    class FakeCompleted:
        """Stand-in for the object returned by subprocess.run."""
        stdout = "secubox-hub.service loaded active running Hub\n"

    def fake_run(*args, **kwargs):
        run_calls.append(1)
        return FakeCompleted()

    monkeypatch.setattr(main.subprocess, "run", fake_run)

    response = asyncio.run(main.public_health_batch())

    assert response["count"] >= 1
    assert len(run_calls) == 1

View File

@ -1,3 +1,12 @@
secubox-toolbox (2.6.52-1~bookworm1) bookworm; urgency=medium
* perf(#644): /admin/clients/rich enriches only the ENRICH_LIMIT (12)
most-recent clients (sorted by last_seen). Rows beyond the cap return bare
device/geo fields — avoids ~51 cached geo/UA lookups per poll. All keys
preserved so the admin table is unaffected.
-- Gerald KERMA <devel@cybermind.fr> Thu, 18 Jun 2026 09:00:00 +0200
secubox-toolbox (2.6.51-1~bookworm1) bookworm; urgency=medium
* social-graph (#642): stop recording IP-literal "tracker" edges (no-SNI

View File

@ -73,6 +73,11 @@ async def toolbox_bundle(mh: str = Query(default=""), wg: int = Query(default=0)
headers={"Cache-Control": "no-store"},
)
# Cap geo/UA enrichment on /admin/clients/rich to the rows the UI actually shows
# (top-5 + headroom). Beyond this, clients get bare fields — avoids ~51 cached
# geo lookups per poll (ref #644).
ENRICH_LIMIT = 12
TEMPLATE_DIR = Path(__file__).resolve().parent.parent / "conf"
_env = jinja2.Environment(
loader=jinja2.FileSystemLoader(TEMPLATE_DIR),
@ -2915,9 +2920,10 @@ async def admin_clients_rich() -> dict:
_av = avatar_analysis
_geo = geo
rows = store.list_clients()
rows = sorted(rows, key=lambda r: (r.get("last_seen") or 0), reverse=True)
now = _t.time()
enriched = []
for r in rows:
for idx, r in enumerate(rows):
age_min = (now - (r.get("last_seen") or 0)) / 60.0
if age_min < 5:
status_emoji = "🟢"
@ -2936,8 +2942,10 @@ async def admin_clients_rich() -> dict:
score = r.get("score", 0)
risk_emoji = "🟢" if score < 30 else "🟡" if score < 70 else "🔴"
# --- Device classification (UA-based) ---
# Device + geo enrichment only for the displayed rows (ENRICH_LIMIT).
dev_emoji, dev_label = "📱", ""
flag = country_iso = asn_org = ""
if idx < ENRICH_LIMIT:
try:
ua = store.latest_user_agent(r.get("mac_hash") or "")
if ua:
@ -2946,9 +2954,6 @@ async def admin_clients_rich() -> dict:
dev_label = cl.get("device") or ""
except Exception:
pass
# --- Geo enrichment (country flag, ISO, ASN org) ---
flag = country_iso = asn_org = ""
try:
gi = _geo.lookup(r.get("ip") or "")
flag = gi.get("flag", "") or ""

View File

@ -0,0 +1,30 @@
import asyncio
from secubox_toolbox import api
def test_clients_rich_caps_enrichment(monkeypatch):
    """Only the ENRICH_LIMIT most-recent clients get geo enrichment.

    Twenty clients with distinct, increasing ``last_seen`` values are served by
    a stubbed store. The test pins three things:
      * rows come back sorted by ``last_seen`` descending (without this, the
        "most-recent client is enriched" check would also pass on unsorted rows);
      * ``geo.lookup`` is called exactly ENRICH_LIMIT times and every row inside
        the cap carries the looked-up flag;
      * every row past the cap keeps all keys but with bare device/geo values.
    """
    total = 20
    rows = [
        {"mac_hash": f"m{i}", "ip": f"10.0.0.{i}", "state": "active",
         "level": "r1", "score": 0, "last_seen": float(i), "first_seen": 0.0}
        for i in range(total)
    ]
    monkeypatch.setattr(api.store, "list_clients", lambda: rows)
    monkeypatch.setattr(api.store, "latest_user_agent", lambda mh: "Mozilla/5.0")
    geo_calls = {"n": 0}

    def fake_lookup(ip):
        geo_calls["n"] += 1
        return {"flag": "🇫🇷", "country_iso": "FR", "asn_org": "X"}

    monkeypatch.setattr(api.geo, "lookup", fake_lookup)

    out = asyncio.run(api.admin_clients_rich())
    clients = out["clients"]

    assert out["count"] == total
    # Geo enrichment bounded to ENRICH_LIMIT, not all 20 clients.
    assert geo_calls["n"] == api.ENRICH_LIMIT
    # Rows are ordered most-recent first (m19 has the highest last_seen).
    assert [c["mac_hash"] for c in clients] == [f"m{i}" for i in reversed(range(total))]
    # Every row inside the cap is enriched, starting with the most recent one.
    assert clients[0]["flag"] == "🇫🇷"
    assert all(c["flag"] == "🇫🇷" for c in clients[:api.ENRICH_LIMIT])
    # Every row beyond the cap has bare device/geo fields.
    for bare in clients[api.ENRICH_LIMIT:]:
        assert bare["device_emoji"] == "📱"
        assert bare["device"] == ""
        assert bare["flag"] == ""
        assert bare["country_iso"] == ""
        assert bare["asn_org"] == ""