2026-06-30 08:00:54 +00:00
8 changed files with 3 additions and 211 deletions
--- a/packages/secubox-toolbox/debian/changelog
+++ b/packages/secubox-toolbox/debian/changelog
@ -1,27 +1,3 @@
-secubox-toolbox (2.6.36-1~bookworm1) bookworm; urgency=medium
-
-  * fix(autolearn): exclude anti-bot vendors from the auto-block list (#589
-    follow-up). Anti-bot WADs (Datadome/PerimeterX) often sit in the visited
-    site's own path, so auto-blocking them would break the page. The learner
-    now feeds only OPERATOR-GRADE/data-broker classified trackers (+ threat-
-    intel domains); cross-site threshold lowered 4→2.
-
- -- Gerald KERMA <devel@cybermind.fr>  Sun, 14 Jun 2026 16:50:00 +0200
-
-secubox-toolbox (2.6.35-1~bookworm1) bookworm; urgency=medium
-
-  * Autolearn bad trackers/actors (#589) — feeds ad_ghost's block set.
-      - sbin/secubox-toolbox-autolearn (+ hourly timer) builds a HIGH-
-        confidence list /var/lib/secubox/toolbox/learned-trackers.txt from
-        (1) threat-intel domain IOCs (threatfox malicious) and (2) cross-site
-        domains CLASSIFIED anti-bot/operator-grade seen on >=4 sites.
-        Conservative — plain cross-site CDNs are NOT learned.
-      - ad_ghost.py loads it (mtime-cached) and 204s learned hosts too
-        (X-SecuBox-Ghost: learned), gated by the new `autolearn` filter
-        (default on). postinst enables the timer + runs once.
-
- -- Gerald KERMA <devel@cybermind.fr>  Sun, 14 Jun 2026 16:30:00 +0200
-
 secubox-toolbox (2.6.34-1~bookworm1) bookworm; urgency=medium

  * Cartographie: domain-nugget cloud view (#587). New "🏷️ Domaines" toggle
--- a/packages/secubox-toolbox/debian/postinst
+++ b/packages/secubox-toolbox/debian/postinst
@ -211,11 +211,6 @@ fi
        # until the operator opts in via a SECUBOX_ESCALATE_* drop-in.
        systemctl enable secubox-escalate.timer 2>/dev/null || true
        systemctl start secubox-escalate.timer 2>/dev/null || true
-        # #589 : autolearn bad-tracker timer (hourly) + a first run now so
-        # the learned list exists immediately for ad_ghost.
-        systemctl enable secubox-toolbox-autolearn.timer 2>/dev/null || true
-        systemctl start secubox-toolbox-autolearn.timer 2>/dev/null || true
-        /usr/sbin/secubox-toolbox-autolearn 2>&1 | head -1 || true
      fi
    fi

--- a/packages/secubox-toolbox/debian/rules
+++ b/packages/secubox-toolbox/debian/rules
@ -35,10 +35,6 @@ override_dh_installsystemd:
 	# Install the secondary unit manually (dh_installsystemd expects 1 unit/pkg).
 	install -d debian/secubox-toolbox/lib/systemd/system
 	install -m 0644 systemd/secubox-toolbox-mitm.service debian/secubox-toolbox/lib/systemd/system/
-	# #589 : autolearn bad-tracker learner + hourly timer
-	install -m 0755 sbin/secubox-toolbox-autolearn debian/secubox-toolbox/usr/sbin/
-	install -m 0644 systemd/secubox-toolbox-autolearn.service debian/secubox-toolbox/lib/systemd/system/
-	install -m 0644 systemd/secubox-toolbox-autolearn.timer debian/secubox-toolbox/lib/systemd/system/
 	# Phase 6.P (#496) : systemd drop-ins for RuntimeMaxSec=6h on mitm + mitm-wg
 	install -d debian/secubox-toolbox/lib/systemd/system/secubox-toolbox-mitm.service.d
 	install -m 0644 systemd/secubox-toolbox-mitm.service.d/10-runtime-max.conf \
--- a/packages/secubox-toolbox/mitmproxy_addons/ad_ghost.py
+++ b/packages/secubox-toolbox/mitmproxy_addons/ad_ghost.py
@ -46,43 +46,6 @@ _AD_HOST = re.compile(
    re.IGNORECASE,
 )

-# #589 — auto-learned bad hosts (threat-intel + classified cross-site
-# trackers), rebuilt hourly by secubox-toolbox-autolearn. Loaded with a
-# mtime check so a fresh learn takes effect within ~60 s, no restart.
-_LEARNED_PATH = "/var/lib/secubox/toolbox/learned-trackers.txt"
-_learned: set = set()
-_learned_mtime = 0.0
-_learned_check = 0.0
-_2L_TLD = ("co.uk", "com.au", "co.jp", "co.nz", "com.br", "co.za", "gouv.fr")
-
-
-def _registrable(host: str):
-    host = (host or "").split(":")[0].lower().strip(".")
-    if not host or host.replace(".", "").isdigit() or ":" in host:
-        return None
-    p = host.split(".")
-    if len(p) <= 2:
-        return host
-    last2 = ".".join(p[-2:])
-    return ".".join(p[-3:]) if (last2 in _2L_TLD and len(p) >= 3) else last2
-
-
-def _learned_set() -> set:
-    global _learned, _learned_mtime, _learned_check
-    now = time.time()
-    if now - _learned_check < 60:
-        return _learned
-    _learned_check = now
-    try:
-        m = os.path.getmtime(_LEARNED_PATH)
-        if m != _learned_mtime:
-            with open(_LEARNED_PATH, encoding="utf-8") as f:
-                _learned = {ln.strip().lower() for ln in f if ln.strip()}
-            _learned_mtime = m
-    except Exception:
-        pass
-    return _learned
-
 # Cosmetic hide selectors, grouped so the WebUI can toggle each category.
 _COSMETIC = {
    "ads": (
@ -161,18 +124,10 @@ class AdGhost:
        if not _is_r3plus(flow):
            return
        host = flow.request.pretty_host or ""
-        blocked = bool(_AD_HOST.search(host))
-        learned = False
-        if not blocked and f.get("autolearn", True):
-            reg = _registrable(host)
-            if reg and (reg in _learned_set() or host.lower() in _learned_set()):
-                blocked = learned = True
-        if blocked:
+        if _AD_HOST.search(host):
            flow.response = http.Response.make(
-                204, b"", {"X-SecuBox-Ghost": "learned" if learned else "blocked"})
+                204, b"", {"X-SecuBox-Ghost": "blocked"})
            _counts["blocked_requests"] += 1
-            if learned:
-                _counts["learned_blocks"] = _counts.get("learned_blocks", 0) + 1
            _counts["bytes_saved_est"] += _EST_BYTES_PER_REQ
            _flush()

--- a/packages/secubox-toolbox/sbin/secubox-toolbox-autolearn
+++ b/packages/secubox-toolbox/sbin/secubox-toolbox-autolearn
@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: LicenseRef-CMSD-1.0
-# Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
-#
-# #589 — autolearn bad trackers/actors. Builds a HIGH-CONFIDENCE block list
-# that ad_ghost consults (in addition to its static ad-host regex), from:
-#   1. threat-intel domain IOCs (threatfox malicious C2/malware domains) ;
-#   2. cross-site OPERATOR-GRADE / data-broker tracker domains
-#      (social_host_meta.opgrade_vendor) seen on >= MIN_SITES sites.
-# Deliberately conservative — a plain cross-site CDN (fonts, shared assets)
-# is NOT learned, and ANTI-BOT vendors are NOT learned either : a site's own
-# WAF (Datadome/PerimeterX) sits in the 1st-party path, so blocking it would
-# break the site. So live R3 users don't get legit sites broken. Run hourly
-# by secubox-toolbox-autolearn.timer ; output read by ad_ghost (cached).
-from __future__ import annotations
-
-import json
-import sqlite3
-import sys
-import time
-
-DB = "/var/lib/secubox/toolbox/toolbox.db"
-OUT = "/var/lib/secubox/toolbox/learned-trackers.txt"
-MIN_SITES = 2          # cross-site threshold for operator-grade trackers
-MAX_ENTRIES = 8000
-_2L = ("co.uk", "com.au", "co.jp", "co.nz", "com.br", "co.za", "gouv.fr")
-
-
-def registrable(host: str):
-    host = (host or "").split(":")[0].lower().strip(".")
-    if not host or host.replace(".", "").isdigit() or ":" in host:
-        return None
-    p = host.split(".")
-    if len(p) <= 2:
-        return host
-    last2 = ".".join(p[-2:])
-    return ".".join(p[-3:]) if (last2 in _2L and len(p) >= 3) else last2
-
-
-def main() -> int:
-    learned: set[str] = set()
-    try:
-        c = sqlite3.connect(DB, timeout=10)
-        c.row_factory = sqlite3.Row
-    except Exception as e:
-        sys.stderr.write(f"autolearn: cannot open {DB}: {e}\n")
-        return 0
-
-    # 1) threat-intel malicious domains (high confidence).
-    try:
-        for r in c.execute("SELECT DISTINCT ioc FROM threat_intel WHERE type='domain'"):
-            d = registrable(r["ioc"])
-            if d:
-                learned.add(d)
-    except Exception:
-        pass
-    ti = len(learned)
-
-    # 2) cross-site OPERATOR-GRADE / data-broker trackers ONLY. Anti-bot
-    # vendors are deliberately excluded — they're frequently the visited
-    # site's own WAF (in-path), so blocking them breaks the page.
-    try:
-        classified = set()
-        for r in c.execute(
-            "SELECT tracker_domain FROM social_host_meta "
-            "WHERE opgrade_vendor IS NOT NULL"):
-            d = registrable(r["tracker_domain"])
-            if d:
-                classified.add(d)
-        # distinct 1st-party sites per registrable tracker domain
-        sites: dict[str, set] = {}
-        for r in c.execute("SELECT tracker_domain, sites_jsonl FROM social_nodes"):
-            d = registrable(r["tracker_domain"])
-            if not d or d not in classified:
-                continue
-            try:
-                for s in json.loads(r["sites_jsonl"] or "[]"):
-                    sites.setdefault(d, set()).add(s)
-            except Exception:
-                pass
-        for d, ss in sites.items():
-            if len(ss) >= MIN_SITES:
-                learned.add(d)
-    except Exception:
-        pass
-
-    c.close()
-    learned.discard(None)
-    out = sorted(learned)[:MAX_ENTRIES]
-    try:
-        tmp = OUT + ".tmp"
-        with open(tmp, "w", encoding="utf-8") as f:
-            f.write("\n".join(out) + ("\n" if out else ""))
-        import os
-        os.replace(tmp, OUT)
-    except Exception as e:
-        sys.stderr.write(f"autolearn: write failed: {e}\n")
-        return 0
-    sys.stderr.write(
-        f"autolearn: {len(out)} hosts learned ({ti} threat-intel + "
-        f"{len(out) - ti} classified cross-site) @ {int(time.time())}\n")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
--- a/packages/secubox-toolbox/secubox_toolbox/filters.py
+++ b/packages/secubox-toolbox/secubox_toolbox/filters.py
@ -22,7 +22,6 @@ DEFAULTS: Dict = {
    "ad_ghost": True,               # R3+/R4 silent ad/banner/widget ghosting
    "ad_ghost_block": True,         # 204 known ad/tracker hosts (save bandwidth)
    "media_cache": False,           # #577 shared media proxy-cache (opt-in)
-    "autolearn": True,              # #589 also block auto-learned bad hosts
    "ad_ghost_categories": {        # cosmetic ghost groups
        "ads": True,
        "consent_nag": True,
@ -74,7 +73,7 @@ def set_filters(patch: Dict) -> Dict:
                 if ck in DEFAULTS["ad_ghost_categories"]})
        elif k == "protective" and v in _VALID_PROTECTIVE:
            cur["protective"] = v
-        elif k in ("banner", "ad_ghost", "ad_ghost_block", "media_cache", "autolearn"):
+        elif k in ("banner", "ad_ghost", "ad_ghost_block", "media_cache"):
            cur[k] = bool(v)
    try:
        os.makedirs(os.path.dirname(FILTERS_PATH), exist_ok=True)
--- a/packages/secubox-toolbox/systemd/secubox-toolbox-autolearn.service
+++ b/packages/secubox-toolbox/systemd/secubox-toolbox-autolearn.service
@ -1,12 +0,0 @@
-[Unit]
-Description=SecuBox ToolBoX — autolearn bad trackers/actors (#589)
-Documentation=https://github.com/CyberMind-FR/secubox-deb/issues/589
-After=secubox-toolbox.service
-
-[Service]
-Type=oneshot
-ExecStart=/usr/sbin/secubox-toolbox-autolearn
-Nice=10
-IOSchedulingClass=idle
-# best-effort, never block boot
-TimeoutStartSec=120
--- a/packages/secubox-toolbox/systemd/secubox-toolbox-autolearn.timer
+++ b/packages/secubox-toolbox/systemd/secubox-toolbox-autolearn.timer
@ -1,11 +0,0 @@
-[Unit]
-Description=SecuBox ToolBoX — hourly autolearn of bad trackers (#589)
-
-[Timer]
-OnBootSec=10min
-OnUnitActiveSec=1h
-Persistent=true
-RandomizedDelaySec=5min
-
-[Install]
-WantedBy=timers.target