Merge pull request #679 from CyberMind-FR/feat/662-adlearn

feat(#662): restore + strengthen ad/tracker auto-learn (live-reload, candidate emit, cross-site promotion)
chore: changelog 0.1.12 — ad auto-learn loop + live-reload (ref #662 )
2026-06-29 13:31:30 +00:00 · 2026-06-19 10:40:07 +02:00 · 2026-06-19 10:37:49 +02:00 · 2026-06-19 10:36:02 +02:00
12 changed files with 938 additions and 35 deletions
--- a/packages/secubox-toolbox-ng/cmd/sbxmitm/adcand_test.go
+++ b/packages/secubox-toolbox-ng/cmd/sbxmitm/adcand_test.go
@ -0,0 +1,147 @@
+// SPDX-License-Identifier: LicenseRef-CMSD-1.0
+// Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
+//
+// SecuBox-Deb :: toolbox-ng :: ad-candidate learning-feed tests (#662)
+//
+// The Go cutover blocked from STATIC lists but never emitted LEARNING
+// candidates, so a brand-new adware (acotedemoi.com) was never observed → never
+// promoted → slipped through forever. These tests prove the engine now ports
+// ad_ghost's _AD_PATH heuristic and records a candidate (host,site) for every
+// 3rd-party ad-path request on the allow/mitm path — the feed autolearn promotes.
+package main
+
+import (
+	"path/filepath"
+	"testing"
+)
+
+func TestAdPathRegex(t *testing.T) {
+	hit := []string{
+		"/ad/1.gif", "/ads/x", "/adserver/req", "/pagead/conversion",
+		"/gampad/ads", "/doubleclick/x", "/beacon", "/pixel.gif",
+		"/collect", "/track", "/tracking/p", "/telemetry/v2", "/metric",
+		"/PAGEAD/Upper", // case-insensitive
+	}
+	for _, p := range hit {
+		if !adPathRE.MatchString(p) {
+			t.Errorf("adPathRE should MATCH %q", p)
+		}
+	}
+	miss := []string{"/", "/index.html", "/api/users", "/static/app.js", "/cart", "/headline"}
+	for _, p := range miss {
+		if adPathRE.MatchString(p) {
+			t.Errorf("adPathRE should NOT match %q", p)
+		}
+	}
+}
+
+// newAdCandTestPolicy builds a Policy with doubleclick.net allowlisted (so the
+// allowlist-skip branch is exercised) and nothing learned.
+func newAdCandTestPolicy(t *testing.T) *Policy {
+	t.Helper()
+	pol, err := LoadPolicy(PolicyOpts{
+		AllowPath:        writeTemp(t, "doubleclick.net\n"),
+		LearnedPath:      writeTemp(t, ""),
+		SpliceSeedPath:   writeTemp(t, ""),
+		SpliceLearnPath:  writeTemp(t, ""),
+		PureTrackersPath: writeTemp(t, ""),
+		SelfDomains:      []string{"secubox.in"},
+	})
+	if err != nil {
+		t.Fatalf("LoadPolicy: %v", err)
+	}
+	return pol
+}
+
+func TestMaybeRecordAdCandidate(t *testing.T) {
+	pol := newAdCandTestPolicy(t)
+
+	cases := []struct {
+		name   string
+		host   string // request host
+		site   string // referer site (registrable)
+		path   string
+		want   bool // candidate recorded?
+		wantHK string
+	}{
+		{"3rd-party ad-path → candidate", "metrics.acotedemoi.com", "lemonde.fr", "/collect", true, "metrics.acotedemoi.com"},
+		{"3rd-party ad-path /pagead", "ads.foo.io", "news.example", "/pagead/x", true, "ads.foo.io"},
+		{"1st-party (same registrable) → no candidate", "static.lemonde.fr", "lemonde.fr", "/ads/x", false, ""},
+		{"3rd-party non-ad-path → no candidate", "cdn.acotedemoi.com", "lemonde.fr", "/app.js", false, ""},
+		{"no site (no Referer) → no candidate", "metrics.acotedemoi.com", "", "/collect", false, ""},
+		{"allowlisted host → no candidate", "ads.doubleclick.net", "lemonde.fr", "/pagead/x", false, ""},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			cand := newAdCandidates()
+			px := &Proxy{pol: pol, cand: cand, analysisRelay: true}
+			px.maybeRecordAdCandidate(tc.host, tc.site, tc.path)
+			snap := cand.snapshot()
+			if tc.want {
+				if len(snap) != 1 {
+					t.Fatalf("want 1 candidate, got %d (%+v)", len(snap), snap)
+				}
+				if snap[0].Host != tc.wantHK {
+					t.Fatalf("candidate host = %q, want %q", snap[0].Host, tc.wantHK)
+				}
+				if snap[0].Site != tc.site {
+					t.Fatalf("candidate site = %q, want %q", snap[0].Site, tc.site)
+				}
+				if snap[0].Hits != 1 {
+					t.Fatalf("candidate hits = %d, want 1", snap[0].Hits)
+				}
+			} else if len(snap) != 0 {
+				t.Fatalf("want 0 candidates, got %d (%+v)", len(snap), snap)
+			}
+		})
+	}
+}
+
+// TestAdCandidateGatedByRelay proves the feed is gated behind the analysis/ad
+// relay flag: with the gate off, nothing is recorded even on a textbook hit.
+func TestAdCandidateGatedByRelay(t *testing.T) {
+	pol := newAdCandTestPolicy(t)
+	cand := newAdCandidates()
+	px := &Proxy{pol: pol, cand: cand, analysisRelay: false}
+	px.maybeRecordAdCandidate("metrics.acotedemoi.com", "lemonde.fr", "/collect")
+	if n := len(cand.snapshot()); n != 0 {
+		t.Fatalf("relay off: want 0 candidates, got %d", n)
+	}
+}
+
+// TestAdCandidateHitsAccumulate proves repeated (host,site) hits coalesce.
+func TestAdCandidateHitsAccumulate(t *testing.T) {
+	cand := newAdCandidates()
+	for i := 0; i < 5; i++ {
+		cand.record("x.tracker.io", "site.example")
+	}
+	snap := cand.snapshot()
+	if len(snap) != 1 || snap[0].Hits != 5 {
+		t.Fatalf("want 1 row hits=5, got %+v", snap)
+	}
+	// snapshot clears.
+	if n := len(cand.snapshot()); n != 0 {
+		t.Fatalf("snapshot should clear: got %d", n)
+	}
+}
+
+// TestAdCandidatePayloadShape proves the candidates list serialises into the
+// extended ad-event payload (host/site/hits keys).
+func TestAdCandidatePayloadShape(t *testing.T) {
+	cand := newAdCandidates()
+	cand.record("x.tracker.io", "site.example")
+	rows := cand.snapshot()
+	p := adEventPayload{Candidates: rows}
+	if p.empty() {
+		t.Fatal("payload with candidates must not be empty()")
+	}
+}
+
+// writeTemp writes content to a fresh temp file and returns its path.
+func writeTemp(t *testing.T, content string) string {
+	t.Helper()
+	f := filepath.Join(t.TempDir(), "list.txt")
+	writeFile(t, f, content)
+	return f
+}
--- a/packages/secubox-toolbox-ng/cmd/sbxmitm/adstats.go
+++ b/packages/secubox-toolbox-ng/cmd/sbxmitm/adstats.go
@ -26,10 +26,74 @@ import (
 	"log"
 	"net/http"
 	"net/url"
+	"regexp"
 	"sync"
 	"time"
 )

+// ── ad-candidate learning feed (#662 auto-learn loop) ─────────────────────────
+//
+// The STATIC block list never grows on its own; ad_ghost fed autolearn by
+// capturing CANDIDATES — 3rd-party requests whose PATH smells like an ad/track
+// endpoint — into ad_candidates, which secubox-toolbox-autolearn later promotes
+// into learned-trackers.txt at AD_MIN_SITES distinct sites. The Go cutover
+// dropped this feed, so new adwares (acotedemoi.com) were never observed. This
+// restores it in the engine: the allow/mitm hot path records (host,site) when
+// the request is 3rd-party AND adPathRE matches, buffered + flushed with the
+// existing ad-event machinery.
+
+// adPathRE ports ad_ghost._AD_PATH (RE2-safe, case-insensitive). Matches a path
+// that looks like an ad/track endpoint. Learning only — never a block decision.
+//
+//	Python: re.compile(r"/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|"
+//	                    r"/pixel|/collect|/track(ing)?|/telemetry|/metric", re.I)
+var adPathRE = regexp.MustCompile(`(?i)/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|/pixel|/collect|/track(ing)?|/telemetry|/metric`)
+
+// adCandMapCap bounds the candidate buffer (mirrors ad_ghost's `len(_cand) <
+// 20000` guard): NEW keys past the cap are dropped until the next flush clears
+// it, so a dead portal can never grow memory unbounded.
+const adCandMapCap = 20000
+
+// adCandidates is the lock-guarded (host,site)→hits candidate aggregator,
+// drained by the ad-stats flusher into the ad-event payload's "candidates" list.
+type adCandidates struct {
+	mu  sync.Mutex
+	hit map[adKey]int64
+}
+
+func newAdCandidates() *adCandidates { return &adCandidates{hit: map[adKey]int64{}} }
+
+// record tallies one ad-candidate (host,site). O(1); the cap drops only NEW keys
+// (existing keys keep accumulating). Empty host is ignored.
+func (a *adCandidates) record(host, site string) {
+	if host == "" {
+		return
+	}
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	k := adKey{adHost: host, site: site}
+	if _, ok := a.hit[k]; ok {
+		a.hit[k]++
+	} else if len(a.hit) < adCandMapCap {
+		a.hit[k] = 1
+	}
+}
+
+// snapshot atomically reads-and-clears the buffer, returning the candidate rows.
+func (a *adCandidates) snapshot() []adCandidateRow {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	if len(a.hit) == 0 {
+		return nil
+	}
+	rows := make([]adCandidateRow, 0, len(a.hit))
+	for k, n := range a.hit {
+		rows = append(rows, adCandidateRow{Host: k.adHost, Site: k.site, Hits: n})
+	}
+	a.hit = map[adKey]int64{}
+	return rows
+}
+
 // refererSite ports the ad_ghost _site_of logic: parse the Referer header as a
 // URL, take its hostname, and return registrable(hostname). Empty Referer or a
 // parse failure → "" (the page that issued the blocked request is unknown).
@ -133,9 +197,19 @@ type adClientRow struct {
 	Bytes   int64  `json:"bytes"`
 }

+// adCandidateRow is one learning candidate (3rd-party host hit by ad-path
+// requests from a 1st-party site). Mirrors the portal /__toolbox/ad-event
+// "candidates" contract → store.record_ad_candidates([(host, site, hits), ...]).
+type adCandidateRow struct {
+	Host string `json:"host"`
+	Site string `json:"site"`
+	Hits int64  `json:"hits"`
+}
+
 type adEventPayload struct {
-	Blocks  []adBlockRow  `json:"blocks"`
-	Clients []adClientRow `json:"clients"`
+	Blocks     []adBlockRow     `json:"blocks"`
+	Clients    []adClientRow    `json:"clients"`
+	Candidates []adCandidateRow `json:"candidates,omitempty"`
 }

 // snapshot atomically reads-and-clears both maps, returning the accumulated rows.
@ -159,7 +233,9 @@ func (a *adStats) snapshot() adEventPayload {
 }

 // empty reports whether a payload carries no rows (nothing to POST).
-func (p adEventPayload) empty() bool { return len(p.Blocks) == 0 && len(p.Clients) == 0 }
+func (p adEventPayload) empty() bool {
+	return len(p.Blocks) == 0 && len(p.Clients) == 0 && len(p.Candidates) == 0
+}

 // adEventClient is a short-timeout fire-and-forget client for the ad-event POST.
 // Sibling of portalClient (banner.go): the portal is a fixed loopback base, so
@ -175,8 +251,15 @@ var adEventClient = &http.Client{
 // non-2xx) is swallowed with at most a debug log — the metrics are stats, not
 // security, and the engine must never block on the portal. Exposed (returns the
 // flushed payload) so the test can assert the snapshot/clear + payload shape.
-func (a *adStats) flushOnce(portal string) adEventPayload {
+//
+// cand may be nil (the CONNECT PoC / tests with no learning feed); when set its
+// candidate rows are drained into the SAME payload so the learning feed rides
+// the existing ad-event channel (one POST per 10s, not two).
+func (a *adStats) flushOnce(portal string, cand *adCandidates) adEventPayload {
 	p := a.snapshot()
+	if cand != nil {
+		p.Candidates = cand.snapshot()
+	}
 	if p.empty() {
 		return p
 	}
@ -198,10 +281,10 @@ func (a *adStats) flushOnce(portal string) adEventPayload {
 // runAdStatsFlusher is the background flusher goroutine: every adFlushInterval it
 // drains the aggregator to the portal. Start it once from main() (like the
 // engine's other startup goroutines). It runs forever (the process lifetime).
-func (a *adStats) runAdStatsFlusher(portal string) {
+func (a *adStats) runAdStatsFlusher(portal string, cand *adCandidates) {
 	t := time.NewTicker(adFlushInterval)
 	defer t.Stop()
 	for range t.C {
-		a.flushOnce(portal)
+		a.flushOnce(portal, cand)
 	}
 }
--- a/packages/secubox-toolbox-ng/cmd/sbxmitm/adstats_test.go
+++ b/packages/secubox-toolbox-ng/cmd/sbxmitm/adstats_test.go
@ -41,7 +41,7 @@ func TestRecordAdBlockEmptyHostIgnored(t *testing.T) {

 func TestRecordAdBlockPerClientOnlyWhenMacSet(t *testing.T) {
 	a := newAdStats()
-	a.recordAdBlock("ads.example.com", "site", "")    // no mac → no client row
+	a.recordAdBlock("ads.example.com", "site", "")     // no mac → no client row
 	a.recordAdBlock("ads.example.com", "site", "mac1") // mac → client row
 	a.recordAdBlock("ads.example.com", "site", "mac1")

@ -111,7 +111,7 @@ func TestFlushOncePayloadShapeMatchesContract(t *testing.T) {
 	}))
 	defer srv.Close()

-	a.flushOnce(srv.URL)
+	a.flushOnce(srv.URL, nil)

 	if ct != "application/json" {
 		t.Fatalf("Content-Type = %q, want application/json", ct)
@ -145,7 +145,7 @@ func TestFlushOnceEmptySkipsPost(t *testing.T) {
 		w.WriteHeader(http.StatusNoContent)
 	}))
 	defer srv.Close()
-	a.flushOnce(srv.URL)
+	a.flushOnce(srv.URL, nil)
 	if posted {
 		t.Fatalf("flushOnce on empty aggregator must not POST")
 	}
@ -156,7 +156,7 @@ func TestFlushOnceSwallowsPortalError(t *testing.T) {
 	a.recordAdBlock("ads.example.com", "site", "")
 	// Unreachable portal → must not panic, must still clear the maps (snapshot
 	// happens before the POST).
-	a.flushOnce("http://127.0.0.1:1")
+	a.flushOnce("http://127.0.0.1:1", nil)
 	if len(a.blocks) != 0 {
 		t.Fatalf("flushOnce must clear maps even on POST failure")
 	}
--- a/packages/secubox-toolbox-ng/cmd/sbxmitm/main.go
+++ b/packages/secubox-toolbox-ng/cmd/sbxmitm/main.go
@ -199,12 +199,13 @@ func ja4ish(h *tls.ClientHelloInfo) string {
 type Proxy struct {
 	ca      *CA
 	pol     *Policy
-	jaSink  func(string) // JA4 observations (logged; a sidecar in prod)
-	jarKey  []byte       // anti-track HMAC fake-identity seed (nil → poison off)
-	poison  bool         // master gate: poison tracker Set-Cookies (default on when jarKey present)
-	portal  string       // portal base URL for /__toolbox/* reverse-proxy (banner assets)
-	ads     *adStats     // #662 — ad-block metrics aggregator (flushed to the portal)
-	cspDemo bool         // #662 CONSENTED-DEMONSTRATION: relax a page's CSP so the injected loader runs, and flag the bypass (data-csp=1 → 🔓). Default on.
+	jaSink  func(string)  // JA4 observations (logged; a sidecar in prod)
+	jarKey  []byte        // anti-track HMAC fake-identity seed (nil → poison off)
+	poison  bool          // master gate: poison tracker Set-Cookies (default on when jarKey present)
+	portal  string        // portal base URL for /__toolbox/* reverse-proxy (banner assets)
+	ads     *adStats      // #662 — ad-block metrics aggregator (flushed to the portal)
+	cand    *adCandidates // #662 — ad-candidate learning feed (flushed with ads to the portal)
+	cspDemo bool          // #662 CONSENTED-DEMONSTRATION: relax a page's CSP so the injected loader runs, and flag the bypass (data-csp=1 → 🔓). Default on.

 	// analysisRelay gates the per-flow telemetry relay to the dpi/cookies/ja4
 	// analysis sidecar sockets (#662 — restoring the "Qui te piste?" events the
@ -229,6 +230,33 @@ func (px *Proxy) recordAdBlock(adHost, site, macHash string) {
 	}
 }

+// maybeRecordAdCandidate feeds the auto-learn loop (#662): on the allow/mitm
+// path (NOT block — already caught; NOT allowlisted/own-infra), it records an
+// ad-candidate (host, site) when the request is 3rd-party
+// (registrable(host) != registrable(site)) AND the path smells like an ad/track
+// endpoint (adPathRE). It is the engine port of ad_ghost's candidate capture —
+// the feed secubox-toolbox-autolearn promotes into learned-trackers.txt at
+// AD_MIN_SITES distinct sites. Gated behind the analysis/ad relay flag, O(1) hot
+// path, fire-and-forget, nil-safe (CONNECT PoC / tests with no feed).
+func (px *Proxy) maybeRecordAdCandidate(host, site, path string) {
+	if px == nil || px.cand == nil || !px.relayEnabled() || px.pol == nil {
+		return
+	}
+	if site == "" || host == "" {
+		return // no 1st-party context (no Referer) → nothing to attribute.
+	}
+	if px.pol.allowedSafe(host) {
+		return // own-infra / allowlist: never learn our own / trusted hosts.
+	}
+	if registrable(host) == registrable(site) {
+		return // 1st-party request: not a cross-site ad/track signal.
+	}
+	if !adPathRE.MatchString(path) {
+		return // path doesn't look like an ad/track endpoint.
+	}
+	px.cand.record(host, site)
+}
+
 func (px *Proxy) serverTLSConfig() *tls.Config {
 	return px.serverTLSConfigCapture(nil)
 }
@ -414,6 +442,15 @@ func (px *Proxy) mitmPipeline(tconn *tls.Conn, rawClient net.Conn, host, verdict
 	relayIP := peerIP(rawClient)
 	px.emitDPI(relayIP, clientHash, host, req)

+	// #662 — feed the auto-learn loop: on this allow/mitm flow, record an
+	// ad-candidate when the request is 3rd-party AND its path smells like an
+	// ad/track endpoint (ad_ghost's _AD_PATH heuristic). site = registrable of
+	// the Referer (the ad_ghost _site_of flavour). Done BEFORE anonymize mutates
+	// headers (so the Referer is the client's original). O(1), gated,
+	// fire-and-forget — a new adware host gets observed here, promoted by
+	// autolearn, then blocked+smogged after the policy live-reloads it.
+	px.maybeRecordAdCandidate(host, refererSite(req.Header.Get("Referer")), req.URL.Path)
+
 	anonymizeRequest(req.Header)

 	// #662 — do NOT touch Accept-Encoding. We FORWARD the client's original
@ -569,6 +606,7 @@ func main() {
 		poison:  *poison,
 		portal:  *portal,
 		ads:     newAdStats(),
+		cand:    newAdCandidates(),
 		cspDemo: *cspDemo,

 		analysisRelay: *analysisRelay,
@ -585,7 +623,9 @@ func main() {
 	// #662 — start the ad-block metrics flusher: the block path tallies every
 	// 204 into px.ads, drained every 10s to the portal's /__toolbox/ad-event
 	// (best-effort, fire-and-forget) so the #ads dashboard sees blocks again.
-	go px.ads.runAdStatsFlusher(*portal)
+	// #662 — the candidate feed (px.cand) is drained in the SAME flush so the
+	// learning candidates ride the existing ad-event channel (one POST / 10s).
+	go px.ads.runAdStatsFlusher(*portal, px.cand)
 	if *transparent {
 		// Transparent R3 mode: raw accept loop, each conn carries its pre-DNAT
 		// destination via SO_ORIGINAL_DST (recovered in handleTransparent). The
--- a/packages/secubox-toolbox-ng/cmd/sbxmitm/policy.go
+++ b/packages/secubox-toolbox-ng/cmd/sbxmitm/policy.go
@ -17,6 +17,8 @@ import (
 	"os"
 	"regexp"
 	"strings"
+	"sync"
+	"time"
 )

 // ── ad_ghost: static ad/tracker host pattern (port of _AD_HOST) ──────────────
@ -95,19 +97,55 @@ func envOr(key, def string) string {
 // Policy carries the loaded sets/regex and decides per-host actions. It also
 // keeps the legacy PoC fields (Inject) so the existing wiring/tests still work.
 type Policy struct {
-	adHost       *regexp.Regexp
-	learned      map[string]bool // learned-trackers (host or registrable, lowercased)
-	allow        map[string]bool // ad-allowlist (host or registrable, lowercased)
-	spliceSeed   map[string]bool // splice seed patterns
-	spliceLearn  map[string]bool // splice learned patterns
-	never        map[string]bool // pure-trackers ∪ fortknox (splice never-set)
-	selfRegs     map[string]bool // own-infra registrable domains
-	selfDomains  []string        // own-infra (for the host==d || host endswith .d guard)
+	// mu guards the live-reloadable map fields below. Decide, shouldPoison and
+	// allowedSafe take RLock on behalf of the lock-free allowed/blockedByAd/
+	// shouldSplice; maybeReload takes Lock only when a file changed (see reloadMu).
+	mu sync.RWMutex
+
+	adHost      *regexp.Regexp
+	learned     map[string]bool // learned-trackers (host or registrable, lowercased)
+	allow       map[string]bool // ad-allowlist (host or registrable, lowercased)
+	spliceSeed  map[string]bool // splice seed patterns
+	spliceLearn map[string]bool // splice learned patterns
+	never       map[string]bool // pure-trackers ∪ fortknox (splice never-set)
+	selfRegs    map[string]bool // own-infra registrable domains
+	selfDomains []string        // own-infra (for the host==d || host endswith .d guard)
+
+	// ── live-reload state (#662 auto-learn loop) ─────────────────────────────
+	//
+	// The lists are loaded once at startup, then re-read on-disk when their
+	// mtime changes so autolearn promotions / manual edits take effect WITHOUT a
+	// worker restart (mirrors ad_ghost._maybe_reload). The hot path (Decide)
+	// calls maybeReload(): a throttle check, then — at most every reloadThrottle —
+	// a cheap stat() of each backing file. Only a changed file is re-read and its
+	// map atomically swapped under mu.
+	reloadFiles    []reloadTarget // backing files + their swap target
+	fortknoxSites  []string       // kept for rebuilding the never-set on pure-trackers reload
+	reloadMu       sync.Mutex     // guards lastReloadID + the per-file mtimes
+	lastReloadID   int64          // unix-nano of the last throttle pass (0 = never)
+	reloadThrottle time.Duration  // min interval between stat passes (0 in tests = eager)

 	// Legacy PoC fields kept so non-policy behaviour is unchanged.
 	Inject []byte // banner / ad-CSS marker injected before </head> or </body>
 }

+// reloadTarget describes one backing file the engine live-reloads: its path, the
+// last mtime we read, whether comment-stripping applies (loadLines vs
+// loadLinesRaw), and an applier that swaps the freshly-read set into the right
+// Policy field (under p.mu, held by the caller). pure-trackers re-derives the
+// never-set (∪ fortknox) so it stays consistent.
+type reloadTarget struct {
+	path      string
+	stripComm bool
+	lastMtime int64
+	apply     func(p *Policy, set map[string]bool)
+}
+
+// defaultReloadThrottle is the production stat cadence: a backing-file change
+// (autolearn runs hourly; a promotion is rare) is observed within ~15s, and the
+// hot path stats at most ~4×/minute regardless of request rate.
+const defaultReloadThrottle = 15 * time.Second
+
 // loadLines mirrors the comment-stripping Python loaders (splice._load_lines,
 // ad_ghost._allowed's allowlist read): split on first '#', trim, lowercase,
 // skip blanks. Missing/unreadable file → empty set (best-effort).
@ -196,16 +234,107 @@ func LoadPolicy(opts PolicyOpts) (*Policy, error) {
 		selfDomains = append(selfDomains, d)
 	}

-	return &Policy{
-		adHost:      re,
-		learned:     loadLinesRaw(opts.LearnedPath), // mirrors _learned_set (no comment-strip)
-		allow:       loadLines(opts.AllowPath),
-		spliceSeed:  loadLines(opts.SpliceSeedPath),
-		spliceLearn: loadLines(opts.SpliceLearnPath),
-		never:       never,
-		selfRegs:    selfRegs,
-		selfDomains: selfDomains,
-	}, nil
+	p := &Policy{
+		adHost:         re,
+		learned:        loadLinesRaw(opts.LearnedPath), // mirrors _learned_set (no comment-strip)
+		allow:          loadLines(opts.AllowPath),
+		spliceSeed:     loadLines(opts.SpliceSeedPath),
+		spliceLearn:    loadLines(opts.SpliceLearnPath),
+		never:          never,
+		selfRegs:       selfRegs,
+		selfDomains:    selfDomains,
+		fortknoxSites:  append([]string(nil), opts.FortknoxSites...),
+		reloadThrottle: defaultReloadThrottle,
+	}
+
+	// ── register the live-reloadable backing files (#662 auto-learn loop) ─────
+	//
+	// Each entry re-reads its file when its mtime changes and atomically swaps
+	// the map under p.mu (held by maybeReload). learned-trackers + ad-allowlist
+	// are the load-bearing pair (autolearn promotes into learned; the operator
+	// edits the allowlist); the splice seed/learned + pure-trackers files are
+	// reloaded too for consistency (pure-trackers re-derives the never-set).
+	p.reloadFiles = []reloadTarget{
+		{path: opts.LearnedPath, stripComm: false, lastMtime: statMtime(opts.LearnedPath),
+			apply: func(p *Policy, s map[string]bool) { p.learned = s }},
+		{path: opts.AllowPath, stripComm: true, lastMtime: statMtime(opts.AllowPath),
+			apply: func(p *Policy, s map[string]bool) { p.allow = s }},
+		{path: opts.SpliceSeedPath, stripComm: true, lastMtime: statMtime(opts.SpliceSeedPath),
+			apply: func(p *Policy, s map[string]bool) { p.spliceSeed = s }},
+		{path: opts.SpliceLearnPath, stripComm: true, lastMtime: statMtime(opts.SpliceLearnPath),
+			apply: func(p *Policy, s map[string]bool) { p.spliceLearn = s }},
+		{path: opts.PureTrackersPath, stripComm: true, lastMtime: statMtime(opts.PureTrackersPath),
+			apply: func(p *Policy, s map[string]bool) {
+				// pure-trackers ∪ fortknox → never-set (mirrors LoadPolicy above).
+				for _, fk := range p.fortknoxSites {
+					if fk = strings.Trim(strings.ToLower(strings.TrimSpace(fk)), "."); fk != "" {
+						s[fk] = true
+					}
+				}
+				p.never = s
+			}},
+	}
+	return p, nil
+}
+
+// statMtime returns the file's mtime in unix-nano, or 0 when the file is missing
+// or unreadable (best-effort, like the Python loaders: a missing file → empty
+// set, mtime 0). A file appearing/disappearing therefore registers as a change.
+func statMtime(path string) int64 {
+	if path == "" {
+		return 0
+	}
+	fi, err := os.Stat(path)
+	if err != nil {
+		return 0
+	}
+	return fi.ModTime().UnixNano()
+}
+
+// maybeReload re-reads any backing list whose on-disk mtime changed since the
+// last pass, swapping the affected map(s) under p.mu. Throttled to at most one
+// stat pass per p.reloadThrottle (cheap: a time compare + a few stats), so the
+// Decide hot path pays almost nothing. Concurrency-safe: the throttle/mtime
+// bookkeeping is under reloadMu and the map swap under mu — Decide's readers
+// hold mu.RLock, so a swap is atomic w.r.t. any in-flight decision.
+func (p *Policy) maybeReload() {
+	now := time.Now()
+	p.reloadMu.Lock()
+	if p.reloadThrottle > 0 && p.lastReloadID != 0 &&
+		now.Sub(time.Unix(0, p.lastReloadID)) < p.reloadThrottle {
+		p.reloadMu.Unlock()
+		return
+	}
+	p.lastReloadID = now.UnixNano()
+
+	// Collect the files that changed (stat + re-read under reloadMu, outside p.mu).
+	type pending struct {
+		idx int
+		set map[string]bool
+	}
+	var changed []pending
+	for i := range p.reloadFiles {
+		rt := &p.reloadFiles[i]
+		if rt.path == "" {
+			continue
+		}
+		m := statMtime(rt.path)
+		if m != rt.lastMtime {
+			rt.lastMtime = m
+			changed = append(changed, pending{idx: i, set: scanLines(rt.path, rt.stripComm)})
+		}
+	}
+	p.reloadMu.Unlock()
+
+	if len(changed) == 0 {
+		return
+	}
+	// Swap the affected maps atomically under the write lock.
+	p.mu.Lock()
+	for _, c := range changed {
+		p.reloadFiles[c.idx].apply(p, c.set)
+	}
+	p.mu.Unlock()
 }

 // ── registrable: port of ad_ghost._registrable ───────────────────────────────
@ -279,6 +408,11 @@ func hostMatches(host string, patterns map[string]bool) bool {

 // allowed: port of ad_ghost._allowed. Own-infra ALWAYS wins (reflash-safe),
 // then the operator allowlist (host or registrable).
+//
+// LOCK CONTRACT: reads the reloadable allow map — the caller MUST hold at least
+// p.mu.RLock (Decide / shouldPoison do). Lock-free internally so Decide can call
+// it alongside shouldSplice/blockedByAd under a single RLock (sync.RWMutex is
+// not reentrant).
 func (p *Policy) allowed(host string) bool {
 	h := strings.ToLower(host)
 	reg := registrable(h)
@ -297,7 +431,19 @@ func (p *Policy) allowed(host string) bool {
 	return p.allow[h] || p.allow[reg]
 }

+// allowedSafe is the lock-taking entry point to allowed() for callers OUTSIDE a
+// Decide RLock (e.g. the ad-candidate feed). It also picks up a live-reloaded
+// allowlist via maybeReload, so a freshly-allowlisted host stops being learned.
+func (p *Policy) allowedSafe(host string) bool {
+	p.maybeReload()
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+	return p.allowed(host)
+}
+
 // shouldSplice: port of splice.should_splice (never wins; then seed ∪ learned).
+// LOCK CONTRACT: reads the reloadable never/spliceSeed/spliceLearn maps — the
+// caller MUST hold at least p.mu.RLock (Decide does).
 func (p *Policy) shouldSplice(sni string) bool {
 	s := strings.Trim(strings.ToLower(sni), ".")
 	if s == "" {
@ -312,6 +458,10 @@ func (p *Policy) shouldSplice(sni string) bool {
 // blockedByAd: port of the ad_ghost requestheaders block decision (sans the
 // allowlist guard, which Decide applies first): _AD_HOST match OR
 // registrable/host in learned-trackers.
+//
+// LOCK CONTRACT: reads the reloadable learned map — the caller MUST hold at
+// least p.mu.RLock. Decide and shouldPoison (via isTracker) do; the candidate-
+// emit path calls it only through those.
 func (p *Policy) blockedByAd(host string) bool {
 	if p.adHost.MatchString(host) {
 		return true
@ -339,9 +489,16 @@ func (p *Policy) blockedByAd(host string) bool {
 // sni defaults to host when empty (the live engine splices on SNI == the TLS
 // host; for the parity harness host and sni are the same value).
 func (p *Policy) Decide(host, sni string) string {
+	// #662 — pick up autolearn promotions / manual edits without a worker
+	// restart. Throttled to ~every reloadThrottle and best-effort, so the hot
+	// path normally pays only a time compare. Done BEFORE taking the read lock
+	// (maybeReload may take the write lock to swap a changed map).
+	p.maybeReload()
 	if sni == "" {
 		sni = host
 	}
+	p.mu.RLock()
+	defer p.mu.RUnlock()
 	if p.allowed(host) {
 		return "allow"
 	}
--- a/packages/secubox-toolbox-ng/cmd/sbxmitm/privacy.go
+++ b/packages/secubox-toolbox-ng/cmd/sbxmitm/privacy.go
@ -148,6 +148,12 @@ func (p *Policy) isTracker(host string) bool {
 // allowlisted — own-infra flows are left clean (same dark safety as the block
 // path). The caller additionally requires a loaded jar key.
 func (p *Policy) shouldPoison(host string) bool {
+	// #662 — consult the same live-reloaded learned set Decide uses, so a host
+	// promoted into learned-trackers (by autolearn) is poisoned (smogged), not
+	// only 204'd, without a worker restart. RLock-guard the reloadable maps
+	// (allowed + isTracker→blockedByAd read them); maybeReload may swap them.
+	p.mu.RLock()
+	defer p.mu.RUnlock()
 	if p.allowed(host) {
 		return false // own-infra / allowlist → never poison
 	}
--- a/packages/secubox-toolbox-ng/cmd/sbxmitm/reload_test.go
+++ b/packages/secubox-toolbox-ng/cmd/sbxmitm/reload_test.go
@ -0,0 +1,189 @@
+// SPDX-License-Identifier: LicenseRef-CMSD-1.0
+// Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
+//
+// SecuBox-Deb :: toolbox-ng :: policy live-reload tests (#662 auto-learn loop)
+//
+// The #662 Go cutover loaded the BLOCK/SPLICE lists ONCE at startup, so an
+// autolearn promotion (or a manual edit) of learned-trackers.txt never took
+// effect until a worker restart — the very thing that made new adwares slip
+// through forever. These tests prove the mtime-based live-reload: after the
+// throttle window, a host appended to learned-trackers.txt flips Decide from
+// "mitm" to "block" with NO restart. Concurrency is exercised under -race.
+package main
+
+import (
+	"os"
+	"path/filepath"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// writeFile creates or overwrites the backing list file at path with content
+// (mode 0o644) and fails the calling test immediately if the write errors.
+func writeFile(t *testing.T, path, content string) {
+	t.Helper()
+	err := os.WriteFile(path, []byte(content), 0o644)
+	if err == nil {
+		return
+	}
+	t.Fatalf("write %s: %v", path, err)
+}
+
+// bumpMtime stamps path's access and modification times with now+d, so the
+// reload's stat sees a change even on coarse-granularity filesystems or in
+// sub-second test runs. A Chtimes failure aborts the calling test.
+func bumpMtime(t *testing.T, path string, d time.Duration) {
+	t.Helper()
+	stamp := time.Now().Add(d)
+	err := os.Chtimes(path, stamp, stamp)
+	if err != nil {
+		t.Fatalf("chtimes %s: %v", path, err)
+	}
+}
+
+// TestMaybeReloadPicksUpAppendedLearnedTracker is the central live-reload check:
+// a host that Decide first answers "mitm" for must be answered "block" once it
+// has been written to learned-trackers.txt — on the same Policy value, with no
+// second LoadPolicy.
+func TestMaybeReloadPicksUpAppendedLearnedTracker(t *testing.T) {
+	const host = "acotedemoi.com"
+
+	tmp := t.TempDir()
+	learnedPath := filepath.Join(tmp, "learned-trackers.txt")
+	allowPath := filepath.Join(tmp, "ad-allowlist.txt")
+	for _, p := range []string{learnedPath, allowPath} {
+		writeFile(t, p, "")
+	}
+
+	opts := PolicyOpts{
+		LearnedPath: learnedPath,
+		AllowPath:   allowPath,
+		// The splice/never lists also point into the temp dir, so their
+		// missing-file behaviour (empty set) is deterministic.
+		SpliceSeedPath:   filepath.Join(tmp, "seed"),
+		SpliceLearnPath:  filepath.Join(tmp, "slearn"),
+		PureTrackersPath: filepath.Join(tmp, "pure"),
+		SelfDomains:      []string{"secubox.in"},
+	}
+	pol, err := LoadPolicy(opts)
+	if err != nil {
+		t.Fatalf("LoadPolicy: %v", err)
+	}
+	// Zero throttle: the test must not wait out the reload window.
+	pol.reloadThrottle = 0
+
+	verdict := pol.Decide(host, host)
+	if verdict != "mitm" {
+		t.Fatalf("before promotion: Decide(%q) = %q, want mitm", host, verdict)
+	}
+
+	// Promotion: the host lands in the learned file and the mtime moves forward.
+	writeFile(t, learnedPath, host+"\n")
+	bumpMtime(t, learnedPath, 2*time.Second)
+
+	verdict = pol.Decide(host, host)
+	if verdict != "block" {
+		t.Fatalf("after promotion: Decide(%q) = %q, want block", host, verdict)
+	}
+}
+
+// TestMaybeReloadThrottled checks the throttle itself: while the window is
+// still open, an edit to the learned file must NOT be visible to Decide, which
+// is what keeps the hot path cheap (one stat per ~window, not per request).
+func TestMaybeReloadThrottled(t *testing.T) {
+	const host = "tracker.example"
+
+	tmp := t.TempDir()
+	learnedPath := filepath.Join(tmp, "learned-trackers.txt")
+	writeFile(t, learnedPath, "")
+
+	opts := PolicyOpts{LearnedPath: learnedPath, AllowPath: filepath.Join(tmp, "allow")}
+	pol, err := LoadPolicy(opts)
+	if err != nil {
+		t.Fatalf("LoadPolicy: %v", err)
+	}
+	// An hour-long window is effectively "never re-stat during the test".
+	pol.reloadThrottle = time.Hour
+
+	// One Decide up front primes the throttle clock (does the initial stat).
+	_ = pol.Decide("x.example", "x.example")
+
+	writeFile(t, learnedPath, host+"\n")
+	bumpMtime(t, learnedPath, 2*time.Second)
+
+	got := pol.Decide(host, host)
+	if got == "mitm" {
+		return
+	}
+	t.Fatalf("throttled: Decide(%q) = %q, want mitm (change not yet observed)", host, got)
+}
+
+// TestMaybeReloadAllowlist covers live-reload of the allowlist file: a host the
+// ad-host regex blocks ("doubleclick.net") must flip from "block" to "allow"
+// once it has been written to the allowlist.
+func TestMaybeReloadAllowlist(t *testing.T) {
+	const host = "doubleclick.net"
+
+	tmp := t.TempDir()
+	learnedPath := filepath.Join(tmp, "learned-trackers.txt")
+	allowPath := filepath.Join(tmp, "ad-allowlist.txt")
+	for _, p := range []string{learnedPath, allowPath} {
+		writeFile(t, p, "")
+	}
+
+	pol, err := LoadPolicy(PolicyOpts{LearnedPath: learnedPath, AllowPath: allowPath})
+	if err != nil {
+		t.Fatalf("LoadPolicy: %v", err)
+	}
+	pol.reloadThrottle = 0
+
+	verdict := pol.Decide(host, host)
+	if verdict != "block" {
+		t.Fatalf("before allow: Decide(%q) = %q, want block", host, verdict)
+	}
+
+	writeFile(t, allowPath, host+"\n")
+	bumpMtime(t, allowPath, 2*time.Second)
+
+	verdict = pol.Decide(host, host)
+	if verdict != "allow" {
+		t.Fatalf("after allow: Decide(%q) = %q, want allow", host, verdict)
+	}
+}
+
+// TestMaybeReloadConcurrent runs Decide from many goroutines while the backing
+// learned file is rewritten concurrently. Under `go test -race` this proves the
+// RWMutex-guarded swap is data-race-free.
+//
+// The writer goroutine performs its file I/O inline and reports failures with
+// t.Errorf: the writeFile/bumpMtime helpers end in t.Fatalf, and FailNow must
+// only be called from the goroutine running the test function.
+func TestMaybeReloadConcurrent(t *testing.T) {
+	dir := t.TempDir()
+	learned := filepath.Join(dir, "learned-trackers.txt")
+	writeFile(t, learned, "seed.example\n")
+
+	pol, err := LoadPolicy(PolicyOpts{LearnedPath: learned, AllowPath: filepath.Join(dir, "allow")})
+	if err != nil {
+		t.Fatalf("LoadPolicy: %v", err)
+	}
+	pol.reloadThrottle = 0 // force a stat on every Decide → maximal contention
+
+	var wg sync.WaitGroup
+	var blocks int64
+	stop := make(chan struct{})
+
+	// Writer: keep rewriting the list with a fresh host + bumping mtime until
+	// stop is closed. On an I/O error it records the failure and exits; the
+	// test goroutine still joins it through wg.Wait below.
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for i := 0; ; i++ {
+			select {
+			case <-stop:
+				return
+			default:
+			}
+			body := "seed.example\nh" + itoa(i) + ".example\n"
+			if err := os.WriteFile(learned, []byte(body), 0o644); err != nil {
+				t.Errorf("write %s: %v", learned, err)
+				return
+			}
+			ft := time.Now().Add(time.Duration(i+1) * time.Second)
+			if err := os.Chtimes(learned, ft, ft); err != nil {
+				t.Errorf("chtimes %s: %v", learned, err)
+				return
+			}
+		}
+	}()
+
+	// Readers: hammer Decide on the seed host (present in every version of the
+	// file the writer produces) and on a host that comes and goes.
+	for r := 0; r < 8; r++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for j := 0; j < 2000; j++ {
+				if pol.Decide("seed.example", "seed.example") == "block" {
+					atomic.AddInt64(&blocks, 1)
+				}
+				pol.Decide("h0.example", "h0.example")
+			}
+		}()
+	}
+	time.Sleep(50 * time.Millisecond)
+	close(stop)
+	wg.Wait()
+	if atomic.LoadInt64(&blocks) == 0 {
+		t.Fatal("expected the stable seed host to block at least once")
+	}
+}
--- a/packages/secubox-toolbox-ng/debian/changelog
+++ b/packages/secubox-toolbox-ng/debian/changelog
@ -1,3 +1,12 @@
+secubox-toolbox-ng (0.1.12-1~bookworm1) bookworm; urgency=medium
+
+  * adlearn: live-reload the blocklist (mtime) so promotions/edits block without
+    a worker restart; emit ad-candidates (3rd-party ad-path) to the portal;
+    autolearn also promotes cross-site trackers from social_edges. Learned
+    trackers are auto-204 + poison-smogged. (ref #662)
+
+ -- Gerald KERMA <devel@cybermind.fr>  Fri, 19 Jun 2026 12:30:00 +0000
+
 secubox-toolbox-ng (0.1.11-1~bookworm1) bookworm; urgency=medium

  * social: ALSO correlate on the block path — blocked 3rd-party trackers still
--- a/packages/secubox-toolbox/sbin/secubox-toolbox-autolearn
+++ b/packages/secubox-toolbox/sbin/secubox-toolbox-autolearn
@ -221,6 +221,92 @@ def _ad_feed() -> int:
    return len(promoted)


+# #662 — cross-site-reuse promotion. A tracker_domain seen issuing cookies on
+# >= SOCIAL_MIN_SITES DISTINCT src_site (across peers, recent window) is a
+# BEHAVIOURALLY-confirmed cross-site tracker (the social graph), independent of
+# the ad-path heuristic. Promote it into learned-trackers.txt so the engine
+# blocks (204) + smogs it. Conservative + reuses the SAME allowlist/self guard as
+# _ad_feed (NEVER promote allowlisted or self domains). De-dups against OUT.
+#
+# Both knobs are read from the environment once, when this module is loaded:
+#   SECUBOX_SOCIAL_MIN_SITES    — minimum number of distinct src_site (default 3)
+#   SECUBOX_SOCIAL_WINDOW_HOURS — look-back window in hours (default 168 = 7 days)
+SOCIAL_MIN_SITES = int(os.environ.get("SECUBOX_SOCIAL_MIN_SITES", "3"))
+SOCIAL_WINDOW_HOURS = int(os.environ.get("SECUBOX_SOCIAL_WINDOW_HOURS", "168"))
+
+
+def _social_feed() -> int:
+    """Promote cross-site cookie-reuse trackers (social_edges) into the learned
+    blocklist. A tracker_domain linking >= SOCIAL_MIN_SITES distinct src_site in
+    the last SOCIAL_WINDOW_HOURS is promoted. Allowlist + self domains excluded
+    (reused guard). MERGES into OUT (never overwrites). Returns count promoted, or
+    -1 if unavailable (e.g. no social_edges table). Best-effort: never raises."""
+    cutoff = int(time.time()) - SOCIAL_WINDOW_HOURS * 3600
+    try:
+        con = sqlite3.connect(DB, timeout=5)
+        rows = con.execute(
+            "SELECT tracker_domain, COUNT(DISTINCT src_site) AS sites "
+            "FROM social_edges WHERE ts >= ? "
+            "GROUP BY tracker_domain", (cutoff,)).fetchall()
+        con.close()
+    except Exception as e:
+        sys.stderr.write(f"autolearn: social query failed: {e}\n")
+        return -1
+    # Fold to registrable and aggregate the distinct-site count per eTLD+1 (two
+    # tracker subdomains of the same registrable jointly meet the threshold).
+    by_reg: dict[str, set] = {}
+    try:
+        scon = sqlite3.connect(DB, timeout=5)
+        for td, _sites in rows:
+            reg = registrable(td)
+            if not reg:
+                continue
+            ss = by_reg.setdefault(reg, set())
+            for (s,) in scon.execute(
+                "SELECT DISTINCT src_site FROM social_edges "
+                "WHERE ts >= ? AND tracker_domain = ?", (cutoff, td)):
+                if s:
+                    ss.add(s)
+        scon.close()
+    except Exception as e:
+        sys.stderr.write(f"autolearn: social fold failed: {e}\n")
+        return -1
+
+    allow = _load_ad_allowlist()
+    self_doms = {d.strip().lower() for d in
+                 os.environ.get("SECUBOX_SELF_DOMAINS", "secubox.in").split(",")
+                 if d.strip()}
+    promoted: set = set()
+    for reg, sites in by_reg.items():
+        if len(sites) < SOCIAL_MIN_SITES:
+            continue
+        if reg in allow:
+            continue
+        if reg in self_doms or any(reg == d or reg.endswith("." + d) for d in self_doms):
+            continue
+        promoted.add(reg)
+    if not promoted:
+        return 0
+    existing: set = set()
+    try:
+        if os.path.exists(OUT):
+            with open(OUT, encoding="utf-8") as fh:
+                for ln in fh:
+                    ln = ln.strip()
+                    if ln:
+                        existing.add(ln)
+    except Exception as e:
+        sys.stderr.write(f"autolearn: social merge read failed: {e}\n")
+    new = promoted - existing
+    merged = sorted(existing | promoted)[:MAX_ENTRIES]
+    try:
+        os.makedirs(os.path.dirname(OUT), exist_ok=True)
+        tmp = OUT + ".tmp"
+        with open(tmp, "w", encoding="utf-8") as fh:
+            fh.write("\n".join(merged) + ("\n" if merged else ""))
+        os.replace(tmp, OUT)
+    except Exception as e:
+        sys.stderr.write(f"autolearn: social write failed: {e}\n")
+        return -1
+    return len(new)
+
+
 def main() -> int:
    learned: set[str] = set()
    try:
@ -317,6 +403,11 @@ def main() -> int:
        sys.stderr.write(f"autolearn: {_n_ad} ad-candidate hosts promoted\n")
    except Exception as e:
        sys.stderr.write(f"autolearn: ad feed error: {e}\n")
+    try:
+        _n_social = _social_feed()
+        sys.stderr.write(f"autolearn: {_n_social} cross-site cookie trackers promoted\n")
+    except Exception as e:
+        sys.stderr.write(f"autolearn: social feed error: {e}\n")
    sys.stderr.write(
        f"autolearn: {len(out)} hosts learned ({ti} threat-intel + "
        f"{len(out) - ti} classified cross-site) @ {int(time.time())}"
--- a/packages/secubox-toolbox/secubox_toolbox/api.py
+++ b/packages/secubox-toolbox/secubox_toolbox/api.py
@ -113,12 +113,20 @@ async def toolbox_ad_event(request: Request) -> Response:
            return Response(status_code=204)
        blocks = body.get("blocks") or []
        clients = body.get("clients") or []
+        # #662 — the Go engine now also feeds the AUTO-LEARN loop: 3rd-party
+        # ad-path requests it saw on the allow/mitm path (ad_ghost's _AD_PATH
+        # heuristic), recorded as candidates here for secubox-toolbox-autolearn
+        # to promote into learned-trackers.txt at AD_MIN_SITES distinct sites.
+        candidates = body.get("candidates") or []
        if not isinstance(blocks, list):
            blocks = []
        if not isinstance(clients, list):
            clients = []
+        if not isinstance(candidates, list):
+            candidates = []
        blocks = blocks[:_AD_EVENT_ROW_CAP]
        clients = clients[:_AD_EVENT_ROW_CAP]
+        candidates = candidates[:_AD_EVENT_ROW_CAP]

        block_rows = [
            (b["ad_host"], b.get("site", ""), "block", int(b.get("hits", 0)), int(b.get("bytes", 0)))
@ -130,10 +138,17 @@ async def toolbox_ad_event(request: Request) -> Response:
            for c in clients
            if isinstance(c, dict) and c.get("mac_hash") and c.get("ad_host")
        ]
+        cand_rows = [
+            (c["host"], c.get("site", ""), int(c.get("hits", 0)))
+            for c in candidates
+            if isinstance(c, dict) and c.get("host")
+        ]
        if block_rows:
            store.record_ad_blocks(block_rows)
        if client_rows:
            store.record_ad_client_blocks(client_rows)
+        if cand_rows:
+            store.record_ad_candidates(cand_rows)
    except Exception as e:  # never raise into the engine's fire-and-forget POST
        log.debug("ad-event ingest failed: %s", e)
    return Response(status_code=204)
--- a/packages/secubox-toolbox/tests/test_ad_event_candidates.py
+++ b/packages/secubox-toolbox/tests/test_ad_event_candidates.py
@ -0,0 +1,68 @@
+# tests/test_ad_event_candidates.py
+# SPDX-License-Identifier: LicenseRef-CMSD-1.0
+"""#662 — /__toolbox/ad-event accepts a "candidates" list (the Go engine's
+auto-learn feed) → store.record_ad_candidates(). Never 500s the engine."""
+import asyncio
+import json
+
+from secubox_toolbox import api, store
+
+
+class _FakeRequest:
+    """Minimal Request stand-in: headers + an async json() body."""
+
+    def __init__(self, body: dict, content_length=None):
+        self._body = body
+        cl = content_length
+        if cl is None:
+            cl = len(json.dumps(body).encode())
+        self.headers = {"content-length": str(cl)}
+
+    async def json(self):
+        return self._body
+
+
+def test_candidates_ingested(monkeypatch):
+    captured = {}
+    monkeypatch.setattr(store, "record_ad_candidates", lambda rows: captured.setdefault("cand", list(rows)))
+    monkeypatch.setattr(store, "record_ad_blocks", lambda rows: None)
+    monkeypatch.setattr(store, "record_ad_client_blocks", lambda rows: None)
+
+    body = {
+        "blocks": [],
+        "clients": [],
+        "candidates": [
+            {"host": "metrics.acotedemoi.com", "site": "lemonde.fr", "hits": 3},
+            {"host": "ads.foo.io", "site": "news.example", "hits": 1},
+            {"site": "no-host.example", "hits": 9},        # missing host → skipped
+            {"host": "", "site": "x", "hits": 2},          # empty host → skipped
+        ],
+    }
+    resp = asyncio.run(api.toolbox_ad_event(_FakeRequest(body)))
+    assert resp.status_code == 204
+    rows = captured.get("cand")
+    assert rows == [
+        ("metrics.acotedemoi.com", "lemonde.fr", 3),
+        ("ads.foo.io", "news.example", 1),
+    ]
+
+
+def test_candidates_absent_is_noop(monkeypatch):
+    called = {"cand": False}
+    monkeypatch.setattr(store, "record_ad_candidates", lambda rows: called.__setitem__("cand", True))
+    monkeypatch.setattr(store, "record_ad_blocks", lambda rows: None)
+    monkeypatch.setattr(store, "record_ad_client_blocks", lambda rows: None)
+
+    resp = asyncio.run(api.toolbox_ad_event(_FakeRequest({"blocks": [], "clients": []})))
+    assert resp.status_code == 204
+    assert called["cand"] is False  # no candidates key → record_ad_candidates not called
+
+
+def test_candidates_bad_payload_never_500s(monkeypatch):
+    monkeypatch.setattr(store, "record_ad_candidates", lambda rows: (_ for _ in ()).throw(RuntimeError("boom")))
+    monkeypatch.setattr(store, "record_ad_blocks", lambda rows: None)
+    monkeypatch.setattr(store, "record_ad_client_blocks", lambda rows: None)
+
+    body = {"candidates": [{"host": "x.io", "site": "s", "hits": 1}]}
+    resp = asyncio.run(api.toolbox_ad_event(_FakeRequest(body)))
+    assert resp.status_code == 204  # store raised, but the endpoint swallows it
--- a/packages/secubox-toolbox/tests/test_autolearn_socialfeed.py
+++ b/packages/secubox-toolbox/tests/test_autolearn_socialfeed.py
@ -0,0 +1,98 @@
+# tests/test_autolearn_socialfeed.py
+# SPDX-License-Identifier: LicenseRef-CMSD-1.0
+"""#662 — cross-site-reuse promotion: a tracker_domain seen on >= N distinct
+src_site across recent social_edges is a behaviourally-confirmed cross-site
+tracker and gets promoted into learned-trackers.txt. Allowlist + self guard
+reused from _ad_feed; merges (never overwrites)."""
+import sqlite3
+import importlib.util
+import pathlib
+import time
+
+
+def _load_autolearn():
+    p = pathlib.Path(__file__).resolve().parents[1] / "sbin" / "secubox-toolbox-autolearn"
+    spec = importlib.util.spec_from_loader("autolearn", loader=None)
+    mod = importlib.util.module_from_spec(spec)
+    exec(compile(p.read_text(), str(p), "exec"), mod.__dict__)
+    return mod
+
+
+def _mk_db(db):
+    con = sqlite3.connect(db)
+    con.executescript(
+        "CREATE TABLE social_edges("
+        " id INTEGER PRIMARY KEY AUTOINCREMENT, ts INTEGER NOT NULL,"
+        " client_mac_hash TEXT, src_site TEXT NOT NULL,"
+        " tracker_domain TEXT NOT NULL, cookie_id_hash TEXT,"
+        " ja4_hash TEXT, consent_state TEXT DEFAULT 'none_seen');")
+    return con
+
+
+def test_social_feed_promotes_cross_site_tracker(tmp_path, monkeypatch):
+    db = tmp_path / "t.db"
+    con = _mk_db(db)
+    now = int(time.time())
+    rows = [
+        # tracker.io: 3 distinct src_sites (>= SOCIAL_MIN_SITES=3) → promote
+        (now, "m1", "cnn.com", "tracker.io"),
+        (now, "m1", "bbc.com", "tracker.io"),
+        (now, "m2", "lemonde.fr", "tracker.io"),
+        # twosite.net: only 2 distinct sites → NOT promoted
+        (now, "m1", "cnn.com", "twosite.net"),
+        (now, "m1", "bbc.com", "twosite.net"),
+        # safe.cdn.net: 3 sites but ALLOWLISTED → excluded
+        (now, "m1", "a.com", "safe.cdn.net"),
+        (now, "m1", "b.com", "safe.cdn.net"),
+        (now, "m1", "c.com", "safe.cdn.net"),
+        # secubox.in: 3 sites but SELF domain → excluded
+        (now, "m1", "a.com", "secubox.in"),
+        (now, "m1", "b.com", "secubox.in"),
+        (now, "m1", "c.com", "secubox.in"),
+        # stale.io: 3 sites but OUTSIDE the recent window → excluded
+        (now - 999999, "m1", "a.com", "stale.io"),
+        (now - 999999, "m1", "b.com", "stale.io"),
+        (now - 999999, "m1", "c.com", "stale.io"),
+    ]
+    con.executemany(
+        "INSERT INTO social_edges(ts,client_mac_hash,src_site,tracker_domain) "
+        "VALUES(?,?,?,?)", rows)
+    con.commit()
+    con.close()
+
+    allow = tmp_path / "ad-allowlist.txt"
+    allow.write_text("safe.cdn.net\n")
+    out = tmp_path / "learned-trackers.txt"
+    out.write_text("preexisting.tracker.com\n")
+
+    monkeypatch.setenv("SECUBOX_AUTOLEARN_DB", str(db))
+    monkeypatch.setenv("SECUBOX_AUTOLEARN_OUT", str(out))
+    monkeypatch.setenv("SECUBOX_AD_ALLOWLIST", str(allow))
+    monkeypatch.setenv("SECUBOX_SOCIAL_MIN_SITES", "3")
+    monkeypatch.setenv("SECUBOX_SOCIAL_WINDOW_HOURS", "168")
+
+    al = _load_autolearn()
+    n = al._social_feed()
+
+    lines = out.read_text().split()
+    assert "tracker.io" in lines                # 3 distinct sites, recent → promoted
+    assert "twosite.net" not in lines           # below threshold
+    assert "safe.cdn.net" not in lines          # allowlisted
+    assert "secubox.in" not in lines            # self domain
+    assert "stale.io" not in lines              # outside window
+    assert "preexisting.tracker.com" in lines   # merge, not overwrite
+    assert len(lines) == len(set(lines))        # no dups
+    assert n == 1
+
+
+def test_social_feed_no_table_is_safe(tmp_path, monkeypatch):
+    db = tmp_path / "empty.db"
+    sqlite3.connect(db).close()  # no social_edges table
+    out = tmp_path / "learned-trackers.txt"
+    out.write_text("x.tracker.com\n")
+    monkeypatch.setenv("SECUBOX_AUTOLEARN_DB", str(db))
+    monkeypatch.setenv("SECUBOX_AUTOLEARN_OUT", str(out))
+    al = _load_autolearn()
+    n = al._social_feed()
+    assert n == -1                              # gated/unavailable, not a crash
+    assert "x.tracker.com" in out.read_text()   # file untouched