Compare commits

..

No commits in common. "2523333fc8f6b07c9bcad0d749a0e579610394c4" and "630cb81e03d8696d2dc0907c146948e1ae174c15" have entirely different histories.

8 changed files with 3 additions and 211 deletions

View File

@ -1,27 +1,3 @@
secubox-toolbox (2.6.36-1~bookworm1) bookworm; urgency=medium
* fix(autolearn): exclude anti-bot vendors from the auto-block list (#589
follow-up). Anti-bot WADs (Datadome/PerimeterX) often sit in the visited
site's own path, so auto-blocking them would break the page. The learner
now feeds only OPERATOR-GRADE/data-broker classified trackers (+ threat-
intel domains); cross-site threshold lowered 4→2.
-- Gerald KERMA <devel@cybermind.fr> Sun, 14 Jun 2026 16:50:00 +0200
secubox-toolbox (2.6.35-1~bookworm1) bookworm; urgency=medium
* Autolearn bad trackers/actors (#589) — feeds ad_ghost's block set.
- sbin/secubox-toolbox-autolearn (+ hourly timer) builds a HIGH-
confidence list /var/lib/secubox/toolbox/learned-trackers.txt from
(1) threat-intel domain IOCs (threatfox malicious) and (2) cross-site
domains CLASSIFIED anti-bot/operator-grade seen on >=4 sites.
Conservative — plain cross-site CDNs are NOT learned.
- ad_ghost.py loads it (mtime-cached) and 204s learned hosts too
(X-SecuBox-Ghost: learned), gated by the new `autolearn` filter
(default on). postinst enables the timer + runs once.
-- Gerald KERMA <devel@cybermind.fr> Sun, 14 Jun 2026 16:30:00 +0200
secubox-toolbox (2.6.34-1~bookworm1) bookworm; urgency=medium
* Cartographie: domain-nugget cloud view (#587). New "🏷️ Domaines" toggle

View File

@ -211,11 +211,6 @@ fi
# until the operator opts in via a SECUBOX_ESCALATE_* drop-in.
systemctl enable secubox-escalate.timer 2>/dev/null || true
systemctl start secubox-escalate.timer 2>/dev/null || true
# #589 : autolearn bad-tracker timer (hourly) + a first run now so
# the learned list exists immediately for ad_ghost.
systemctl enable secubox-toolbox-autolearn.timer 2>/dev/null || true
systemctl start secubox-toolbox-autolearn.timer 2>/dev/null || true
/usr/sbin/secubox-toolbox-autolearn 2>&1 | head -1 || true
fi
fi

View File

@ -35,10 +35,6 @@ override_dh_installsystemd:
# Install the secondary unit manually (dh_installsystemd expects 1 unit/pkg).
install -d debian/secubox-toolbox/lib/systemd/system
install -m 0644 systemd/secubox-toolbox-mitm.service debian/secubox-toolbox/lib/systemd/system/
# #589 : autolearn bad-tracker learner + hourly timer
install -m 0755 sbin/secubox-toolbox-autolearn debian/secubox-toolbox/usr/sbin/
install -m 0644 systemd/secubox-toolbox-autolearn.service debian/secubox-toolbox/lib/systemd/system/
install -m 0644 systemd/secubox-toolbox-autolearn.timer debian/secubox-toolbox/lib/systemd/system/
# Phase 6.P (#496) : systemd drop-ins for RuntimeMaxSec=6h on mitm + mitm-wg
install -d debian/secubox-toolbox/lib/systemd/system/secubox-toolbox-mitm.service.d
install -m 0644 systemd/secubox-toolbox-mitm.service.d/10-runtime-max.conf \

View File

@ -46,43 +46,6 @@ _AD_HOST = re.compile(
re.IGNORECASE,
)
# #589 — auto-learned bad hosts (threat-intel + classified cross-site
# trackers), rebuilt hourly by secubox-toolbox-autolearn. Loaded with a
# mtime check so a fresh learn takes effect within ~60 s, no restart.
# File holding the learned hosts, one per line; read by _learned_set().
_LEARNED_PATH = "/var/lib/secubox/toolbox/learned-trackers.txt"
# Most recently loaded host set (entries stripped and lower-cased on load).
_learned: set = set()
# mtime of _LEARNED_PATH at the last successful load; 0.0 means never loaded.
_learned_mtime = 0.0
# time.time() of the last freshness check; _learned_set() re-checks the file
# at most once every 60 s.
_learned_check = 0.0
_2L_TLD = ("co.uk", "com.au", "co.jp", "co.nz", "com.br", "co.za", "gouv.fr")
def _registrable(host: str):
host = (host or "").split(":")[0].lower().strip(".")
if not host or host.replace(".", "").isdigit() or ":" in host:
return None
p = host.split(".")
if len(p) <= 2:
return host
last2 = ".".join(p[-2:])
return ".".join(p[-3:]) if (last2 in _2L_TLD and len(p) >= 3) else last2
def _learned_set() -> set:
    """Return the cached set of auto-learned hosts, reloading it when stale.

    The list file is stat'ed at most once every 60 seconds; it is re-read
    only when its mtime differs from the one recorded at the previous load.
    Entries are stripped and lower-cased, and blank lines are ignored.

    If the file is missing, unreadable or not valid UTF-8, the previously
    loaded set (initially empty) is returned unchanged.
    """
    global _learned, _learned_mtime, _learned_check
    now = time.time()
    if now - _learned_check < 60:
        return _learned
    # Record the check time first so a failing read is not retried on
    # every call, only after the next 60 s window.
    _learned_check = now
    try:
        mtime = os.path.getmtime(_LEARNED_PATH)
        if mtime != _learned_mtime:
            with open(_LEARNED_PATH, encoding="utf-8") as f:
                _learned = {ln.strip().lower() for ln in f if ln.strip()}
            _learned_mtime = mtime
    except (OSError, ValueError):
        # OSError: file absent/unreadable; ValueError covers
        # UnicodeDecodeError. Keep the last good set (best-effort).
        pass
    return _learned
# Cosmetic hide selectors, grouped so the WebUI can toggle each category.
_COSMETIC = {
"ads": (
@ -161,18 +124,10 @@ class AdGhost:
if not _is_r3plus(flow):
return
host = flow.request.pretty_host or ""
blocked = bool(_AD_HOST.search(host))
learned = False
if not blocked and f.get("autolearn", True):
reg = _registrable(host)
if reg and (reg in _learned_set() or host.lower() in _learned_set()):
blocked = learned = True
if blocked:
if _AD_HOST.search(host):
flow.response = http.Response.make(
204, b"", {"X-SecuBox-Ghost": "learned" if learned else "blocked"})
204, b"", {"X-SecuBox-Ghost": "blocked"})
_counts["blocked_requests"] += 1
if learned:
_counts["learned_blocks"] = _counts.get("learned_blocks", 0) + 1
_counts["bytes_saved_est"] += _EST_BYTES_PER_REQ
_flush()

View File

@ -1,106 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
# Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
#
# #589 — autolearn bad trackers/actors. Builds a HIGH-CONFIDENCE block list
# that ad_ghost consults (in addition to its static ad-host regex), from:
# 1. threat-intel domain IOCs (threatfox malicious C2/malware domains) ;
# 2. cross-site OPERATOR-GRADE / data-broker tracker domains
# (social_host_meta.opgrade_vendor) seen on >= MIN_SITES sites.
# Deliberately conservative — a plain cross-site CDN (fonts, shared assets)
# is NOT learned, and ANTI-BOT vendors are NOT learned either : a site's own
# WAF (Datadome/PerimeterX) sits in the 1st-party path, so blocking it would
# break the site. So live R3 users don't get legit sites broken. Run hourly
# by secubox-toolbox-autolearn.timer ; output read by ad_ghost (cached).
from __future__ import annotations
import json
import sqlite3
import sys
import time
# SQLite database queried by main() (threat_intel, social_host_meta and
# social_nodes tables).
DB = "/var/lib/secubox/toolbox/toolbox.db"
# Output list, one registrable domain per line; main() writes a ".tmp"
# sibling first and then moves it into place with os.replace().
OUT = "/var/lib/secubox/toolbox/learned-trackers.txt"
MIN_SITES = 2 # cross-site threshold for operator-grade trackers
# Upper bound on written entries: the sorted list is truncated to this length.
MAX_ENTRIES = 8000
# Two-label public suffixes under which the registrable domain has three labels.
_2L = ("co.uk", "com.au", "co.jp", "co.nz", "com.br", "co.za", "gouv.fr")


def registrable(host: str):
    """Return the registrable domain of *host*, or None for IPs/empty input.

    The host is lower-cased, an optional ``:port`` suffix and surrounding
    dots are removed, then the last two labels are kept (three when the last
    two form one of the suffixes in ``_2L``). Hosts with one or two labels
    are returned unchanged.

    Returns None for empty/None input, dotted-decimal IPv4 addresses and
    IPv6 literals (bracketed or bare).
    """
    host = (host or "").lower()
    # IPv6 must be rejected BEFORE the port is stripped: splitting on ":"
    # would otherwise reduce "fe80::1" to "fe80" and "[::1]:443" to "[",
    # and those fragments would end up in the block list.
    if host.startswith("[") or host.count(":") > 1:
        return None
    host = host.split(":")[0].strip(".")
    if not host or host.replace(".", "").isdigit():
        return None
    labels = host.split(".")
    if len(labels) <= 2:
        return host
    last2 = ".".join(labels[-2:])
    return ".".join(labels[-3:]) if last2 in _2L else last2
def main() -> int:
    """Rebuild the learned-tracker list at OUT from the toolbox database.

    Two sources feed the list:
      1. every ``threat_intel`` row of type 'domain';
      2. tracker domains that have a non-NULL ``opgrade_vendor`` in
         ``social_host_meta`` AND appear on at least MIN_SITES distinct
         sites according to ``social_nodes.sites_jsonl``.
    All hosts are reduced to their registrable domain, sorted, truncated to
    MAX_ENTRIES and written to OUT through a temporary file.

    Returns:
        Always 0 — failures are reported on stderr but never fail the unit.
    """
    import contextlib
    import os

    try:
        # Open read-only through a URI: a plain connect() creates the
        # database file when it is missing, and this tool only reads.
        conn = sqlite3.connect(f"file:{DB}?mode=ro", uri=True, timeout=10)
        conn.row_factory = sqlite3.Row
    except Exception as e:
        sys.stderr.write(f"autolearn: cannot open {DB}: {e}\n")
        return 0

    threat_intel: set[str] = set()
    cross_site: set[str] = set()
    # closing() guarantees the connection is released on every path.
    with contextlib.closing(conn):
        # 1) threat-intel malicious domains (high confidence).
        try:
            for row in conn.execute(
                    "SELECT DISTINCT ioc FROM threat_intel WHERE type='domain'"):
                domain = registrable(row["ioc"])
                if domain:
                    threat_intel.add(domain)
        except Exception:
            # Deliberately best-effort and silent: a failing query must
            # not abort the run; whatever was collected so far is kept.
            pass

        # 2) cross-site OPERATOR-GRADE / data-broker trackers ONLY. Anti-bot
        #    vendors are deliberately excluded — they're frequently the
        #    visited site's own WAF (in-path), so blocking them breaks the
        #    page.
        try:
            classified = set()
            for row in conn.execute(
                    "SELECT tracker_domain FROM social_host_meta "
                    "WHERE opgrade_vendor IS NOT NULL"):
                domain = registrable(row["tracker_domain"])
                if domain:
                    classified.add(domain)
            # distinct 1st-party sites per registrable tracker domain
            sites: dict[str, set] = {}
            for row in conn.execute(
                    "SELECT tracker_domain, sites_jsonl FROM social_nodes"):
                domain = registrable(row["tracker_domain"])
                if not domain or domain not in classified:
                    continue
                try:
                    for site in json.loads(row["sites_jsonl"] or "[]"):
                        sites.setdefault(domain, set()).add(site)
                except (ValueError, TypeError):
                    # Malformed JSON or a non-iterable/unhashable payload:
                    # skip the rest of this row only.
                    pass
            cross_site = {d for d, seen in sites.items()
                          if len(seen) >= MIN_SITES}
        except Exception:
            # Same best-effort policy as step 1; nothing is learned from
            # this source if it fails part-way.
            pass

    out = sorted(threat_intel | cross_site)[:MAX_ENTRIES]
    try:
        tmp = OUT + ".tmp"
        with open(tmp, "w", encoding="utf-8") as f:
            f.write("\n".join(out) + ("\n" if out else ""))
        # Move into place so readers never observe a half-written list.
        os.replace(tmp, OUT)
    except Exception as e:
        sys.stderr.write(f"autolearn: write failed: {e}\n")
        return 0
    # Count from what was actually written, so the breakdown stays correct
    # (and non-negative) when the list was truncated to MAX_ENTRIES.
    ti = sum(1 for d in out if d in threat_intel)
    sys.stderr.write(
        f"autolearn: {len(out)} hosts learned ({ti} threat-intel + "
        f"{len(out) - ti} classified cross-site) @ {int(time.time())}\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -22,7 +22,6 @@ DEFAULTS: Dict = {
"ad_ghost": True, # R3+/R4 silent ad/banner/widget ghosting
"ad_ghost_block": True, # 204 known ad/tracker hosts (save bandwidth)
"media_cache": False, # #577 shared media proxy-cache (opt-in)
"autolearn": True, # #589 also block auto-learned bad hosts
"ad_ghost_categories": { # cosmetic ghost groups
"ads": True,
"consent_nag": True,
@ -74,7 +73,7 @@ def set_filters(patch: Dict) -> Dict:
if ck in DEFAULTS["ad_ghost_categories"]})
elif k == "protective" and v in _VALID_PROTECTIVE:
cur["protective"] = v
elif k in ("banner", "ad_ghost", "ad_ghost_block", "media_cache", "autolearn"):
elif k in ("banner", "ad_ghost", "ad_ghost_block", "media_cache"):
cur[k] = bool(v)
try:
os.makedirs(os.path.dirname(FILTERS_PATH), exist_ok=True)

View File

@ -1,12 +0,0 @@
[Unit]
Description=SecuBox ToolBoX — autolearn bad trackers/actors (#589)
Documentation=https://github.com/CyberMind-FR/secubox-deb/issues/589
After=secubox-toolbox.service
[Service]
Type=oneshot
ExecStart=/usr/sbin/secubox-toolbox-autolearn
Nice=10
IOSchedulingClass=idle
# best-effort, never block boot
TimeoutStartSec=120

View File

@ -1,11 +0,0 @@
[Unit]
Description=SecuBox ToolBoX — hourly autolearn of bad trackers (#589)
[Timer]
OnBootSec=10min
OnUnitActiveSec=1h
Persistent=true
RandomizedDelaySec=5min
[Install]
WantedBy=timers.target