mirror of
https://github.com/CyberMind-FR/secubox-deb.git
synced 2026-06-29 13:31:30 +00:00
Compare commits
3 Commits
77da033371
...
78ad554ece
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
78ad554ece | ||
| 895356dc00 | |||
| 4063ae1a95 |
147
packages/secubox-toolbox-ng/cmd/sbxmitm/adcand_test.go
Normal file
147
packages/secubox-toolbox-ng/cmd/sbxmitm/adcand_test.go
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
// SPDX-License-Identifier: LicenseRef-CMSD-1.0
|
||||
// Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
|
||||
//
|
||||
// SecuBox-Deb :: toolbox-ng :: ad-candidate learning-feed tests (#662)
|
||||
//
|
||||
// The Go cutover blocked from STATIC lists but never emitted LEARNING
|
||||
// candidates, so a brand-new adware (acotedemoi.com) was never observed → never
|
||||
// promoted → slipped through forever. These tests prove the engine now ports
|
||||
// ad_ghost's _AD_PATH heuristic and records a candidate (host,site) for every
|
||||
// 3rd-party ad-path request on the allow/mitm path — the feed autolearn promotes.
|
||||
package main
|
||||
|
||||
import (
	"encoding/json"
	"path/filepath"
	"testing"
)
|
||||
|
||||
func TestAdPathRegex(t *testing.T) {
|
||||
hit := []string{
|
||||
"/ad/1.gif", "/ads/x", "/adserver/req", "/pagead/conversion",
|
||||
"/gampad/ads", "/doubleclick/x", "/beacon", "/pixel.gif",
|
||||
"/collect", "/track", "/tracking/p", "/telemetry/v2", "/metric",
|
||||
"/PAGEAD/Upper", // case-insensitive
|
||||
}
|
||||
for _, p := range hit {
|
||||
if !adPathRE.MatchString(p) {
|
||||
t.Errorf("adPathRE should MATCH %q", p)
|
||||
}
|
||||
}
|
||||
miss := []string{"/", "/index.html", "/api/users", "/static/app.js", "/cart", "/headline"}
|
||||
for _, p := range miss {
|
||||
if adPathRE.MatchString(p) {
|
||||
t.Errorf("adPathRE should NOT match %q", p)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// newAdCandTestPolicy builds a Policy with doubleclick.net allowlisted (so the
|
||||
// allowlist-skip branch is exercised) and nothing learned.
|
||||
func newAdCandTestPolicy(t *testing.T) *Policy {
|
||||
t.Helper()
|
||||
pol, err := LoadPolicy(PolicyOpts{
|
||||
AllowPath: writeTemp(t, "doubleclick.net\n"),
|
||||
LearnedPath: writeTemp(t, ""),
|
||||
SpliceSeedPath: writeTemp(t, ""),
|
||||
SpliceLearnPath: writeTemp(t, ""),
|
||||
PureTrackersPath: writeTemp(t, ""),
|
||||
SelfDomains: []string{"secubox.in"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("LoadPolicy: %v", err)
|
||||
}
|
||||
return pol
|
||||
}
|
||||
|
||||
func TestMaybeRecordAdCandidate(t *testing.T) {
|
||||
pol := newAdCandTestPolicy(t)
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
host string // request host
|
||||
site string // referer site (registrable)
|
||||
path string
|
||||
want bool // candidate recorded?
|
||||
wantHK string
|
||||
}{
|
||||
{"3rd-party ad-path → candidate", "metrics.acotedemoi.com", "lemonde.fr", "/collect", true, "metrics.acotedemoi.com"},
|
||||
{"3rd-party ad-path /pagead", "ads.foo.io", "news.example", "/pagead/x", true, "ads.foo.io"},
|
||||
{"1st-party (same registrable) → no candidate", "static.lemonde.fr", "lemonde.fr", "/ads/x", false, ""},
|
||||
{"3rd-party non-ad-path → no candidate", "cdn.acotedemoi.com", "lemonde.fr", "/app.js", false, ""},
|
||||
{"no site (no Referer) → no candidate", "metrics.acotedemoi.com", "", "/collect", false, ""},
|
||||
{"allowlisted host → no candidate", "ads.doubleclick.net", "lemonde.fr", "/pagead/x", false, ""},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
cand := newAdCandidates()
|
||||
px := &Proxy{pol: pol, cand: cand, analysisRelay: true}
|
||||
px.maybeRecordAdCandidate(tc.host, tc.site, tc.path)
|
||||
snap := cand.snapshot()
|
||||
if tc.want {
|
||||
if len(snap) != 1 {
|
||||
t.Fatalf("want 1 candidate, got %d (%+v)", len(snap), snap)
|
||||
}
|
||||
if snap[0].Host != tc.wantHK {
|
||||
t.Fatalf("candidate host = %q, want %q", snap[0].Host, tc.wantHK)
|
||||
}
|
||||
if snap[0].Site != tc.site {
|
||||
t.Fatalf("candidate site = %q, want %q", snap[0].Site, tc.site)
|
||||
}
|
||||
if snap[0].Hits != 1 {
|
||||
t.Fatalf("candidate hits = %d, want 1", snap[0].Hits)
|
||||
}
|
||||
} else if len(snap) != 0 {
|
||||
t.Fatalf("want 0 candidates, got %d (%+v)", len(snap), snap)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestAdCandidateGatedByRelay proves the feed is gated behind the analysis/ad
|
||||
// relay flag: with the gate off, nothing is recorded even on a textbook hit.
|
||||
func TestAdCandidateGatedByRelay(t *testing.T) {
|
||||
pol := newAdCandTestPolicy(t)
|
||||
cand := newAdCandidates()
|
||||
px := &Proxy{pol: pol, cand: cand, analysisRelay: false}
|
||||
px.maybeRecordAdCandidate("metrics.acotedemoi.com", "lemonde.fr", "/collect")
|
||||
if n := len(cand.snapshot()); n != 0 {
|
||||
t.Fatalf("relay off: want 0 candidates, got %d", n)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAdCandidateHitsAccumulate proves repeated (host,site) hits coalesce.
|
||||
func TestAdCandidateHitsAccumulate(t *testing.T) {
|
||||
cand := newAdCandidates()
|
||||
for i := 0; i < 5; i++ {
|
||||
cand.record("x.tracker.io", "site.example")
|
||||
}
|
||||
snap := cand.snapshot()
|
||||
if len(snap) != 1 || snap[0].Hits != 5 {
|
||||
t.Fatalf("want 1 row hits=5, got %+v", snap)
|
||||
}
|
||||
// snapshot clears.
|
||||
if n := len(cand.snapshot()); n != 0 {
|
||||
t.Fatalf("snapshot should clear: got %d", n)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAdCandidatePayloadShape proves the candidates list serialises into the
|
||||
// extended ad-event payload (host/site/hits keys).
|
||||
func TestAdCandidatePayloadShape(t *testing.T) {
|
||||
cand := newAdCandidates()
|
||||
cand.record("x.tracker.io", "site.example")
|
||||
rows := cand.snapshot()
|
||||
p := adEventPayload{Candidates: rows}
|
||||
if p.empty() {
|
||||
t.Fatal("payload with candidates must not be empty()")
|
||||
}
|
||||
}
|
||||
|
||||
// writeTemp writes content to a fresh temp file and returns its path.
|
||||
func writeTemp(t *testing.T, content string) string {
|
||||
t.Helper()
|
||||
f := filepath.Join(t.TempDir(), "list.txt")
|
||||
writeFile(t, f, content)
|
||||
return f
|
||||
}
|
||||
|
|
@ -26,10 +26,74 @@ import (
|
|||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── ad-candidate learning feed (#662 auto-learn loop) ─────────────────────────
|
||||
//
|
||||
// The STATIC block list never grows on its own; ad_ghost fed autolearn by
|
||||
// capturing CANDIDATES — 3rd-party requests whose PATH smells like an ad/track
|
||||
// endpoint — into ad_candidates, which secubox-toolbox-autolearn later promotes
|
||||
// into learned-trackers.txt at AD_MIN_SITES distinct sites. The Go cutover
|
||||
// dropped this feed, so new adwares (acotedemoi.com) were never observed. This
|
||||
// restores it in the engine: the allow/mitm hot path records (host,site) when
|
||||
// the request is 3rd-party AND adPathRE matches, buffered + flushed with the
|
||||
// existing ad-event machinery.
|
||||
|
||||
// adPathRE ports ad_ghost._AD_PATH (RE2-safe, case-insensitive). It flags a path
|
||||
// that looks like an ad/track endpoint. Learning only — never a block decision.
|
||||
//
// The alternatives are unanchored substrings, so "/collect" also matches
// "/collections", "/track" also matches "/tracker" and "/metric" also matches
// "/metrics"; only the "/ad" form requires a following "/" (after an optional
// "s"). This mirrors the Python pattern below.
//
|
||||
// Python: re.compile(r"/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|"
|
||||
// r"/pixel|/collect|/track(ing)?|/telemetry|/metric", re.I)
|
||||
var adPathRE = regexp.MustCompile(`(?i)/ads?/|/adserver|/pagead|/gampad|/doubleclick|/beacon|/pixel|/collect|/track(ing)?|/telemetry|/metric`)
|
||||
|
||||
// adCandMapCap is the maximum number of distinct (host, site) keys the
|
||||
// candidate buffer holds (mirrors ad_ghost's `len(_cand) < 20000` guard). Once
|
||||
// reached, record drops NEW keys until snapshot replaces the map with an empty one.
|
||||
const adCandMapCap = 20000
|
||||
|
||||
// adCandidates is the lock-guarded (host,site)→hits candidate aggregator,
|
||||
// drained by the ad-stats flusher into the ad-event payload's "candidates" list.
|
||||
type adCandidates struct {
|
||||
mu sync.Mutex // guards hit; taken by record and snapshot
|
||||
hit map[adKey]int64 // hit count per (adHost, site) key since the last snapshot
|
||||
}
|
||||
|
||||
// newAdCandidates returns an aggregator with an empty, non-nil hit map, ready
// for record and snapshot.
func newAdCandidates() *adCandidates {
	return &adCandidates{hit: make(map[adKey]int64)}
}
|
||||
|
||||
// record tallies one ad-candidate (host,site). O(1); the cap drops only NEW keys
|
||||
// (existing keys keep accumulating). Empty host is ignored.
|
||||
func (a *adCandidates) record(host, site string) {
|
||||
if host == "" {
|
||||
return
|
||||
}
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
k := adKey{adHost: host, site: site}
|
||||
if _, ok := a.hit[k]; ok {
|
||||
a.hit[k]++
|
||||
} else if len(a.hit) < adCandMapCap {
|
||||
a.hit[k] = 1
|
||||
}
|
||||
}
|
||||
|
||||
// snapshot atomically reads-and-clears the buffer, returning the candidate rows.
|
||||
func (a *adCandidates) snapshot() []adCandidateRow {
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
if len(a.hit) == 0 {
|
||||
return nil
|
||||
}
|
||||
rows := make([]adCandidateRow, 0, len(a.hit))
|
||||
for k, n := range a.hit {
|
||||
rows = append(rows, adCandidateRow{Host: k.adHost, Site: k.site, Hits: n})
|
||||
}
|
||||
a.hit = map[adKey]int64{}
|
||||
return rows
|
||||
}
|
||||
|
||||
// refererSite ports the ad_ghost _site_of logic: parse the Referer header as a
|
||||
// URL, take its hostname, and return registrable(hostname). Empty Referer or a
|
||||
// parse failure → "" (the page that issued the blocked request is unknown).
|
||||
|
|
@ -133,9 +197,19 @@ type adClientRow struct {
|
|||
Bytes int64 `json:"bytes"`
|
||||
}
|
||||
|
||||
// adCandidateRow is one learning candidate: a host that received ad-path
|
||||
// requests attributed to a 1st-party site. Mirrors the portal /__toolbox/ad-event
|
||||
// "candidates" contract → store.record_ad_candidates([(host, site, hits), ...]).
|
||||
type adCandidateRow struct {
|
||||
Host string `json:"host"` // request host the candidate was recorded for
|
||||
Site string `json:"site"` // site the request was attributed to (from the Referer)
|
||||
Hits int64 `json:"hits"` // observations of this (host, site) pair since the previous snapshot
|
||||
}
|
||||
|
||||
type adEventPayload struct {
|
||||
Blocks []adBlockRow `json:"blocks"`
|
||||
Clients []adClientRow `json:"clients"`
|
||||
Blocks []adBlockRow `json:"blocks"`
|
||||
Clients []adClientRow `json:"clients"`
|
||||
Candidates []adCandidateRow `json:"candidates,omitempty"`
|
||||
}
|
||||
|
||||
// snapshot atomically reads-and-clears both maps, returning the accumulated rows.
|
||||
|
|
@ -159,7 +233,9 @@ func (a *adStats) snapshot() adEventPayload {
|
|||
}
|
||||
|
||||
// empty reports whether a payload carries no rows (nothing to POST).
|
||||
func (p adEventPayload) empty() bool { return len(p.Blocks) == 0 && len(p.Clients) == 0 }
|
||||
func (p adEventPayload) empty() bool {
|
||||
return len(p.Blocks) == 0 && len(p.Clients) == 0 && len(p.Candidates) == 0
|
||||
}
|
||||
|
||||
// adEventClient is a short-timeout fire-and-forget client for the ad-event POST.
|
||||
// Sibling of portalClient (banner.go): the portal is a fixed loopback base, so
|
||||
|
|
@ -175,8 +251,15 @@ var adEventClient = &http.Client{
|
|||
// non-2xx) is swallowed with at most a debug log — the metrics are stats, not
|
||||
// security, and the engine must never block on the portal. Exposed (returns the
|
||||
// flushed payload) so the test can assert the snapshot/clear + payload shape.
|
||||
func (a *adStats) flushOnce(portal string) adEventPayload {
|
||||
//
|
||||
// cand may be nil (the CONNECT PoC / tests with no learning feed); when set its
|
||||
// candidate rows are drained into the SAME payload so the learning feed rides
|
||||
// the existing ad-event channel (one POST per 10s, not two).
|
||||
func (a *adStats) flushOnce(portal string, cand *adCandidates) adEventPayload {
|
||||
p := a.snapshot()
|
||||
if cand != nil {
|
||||
p.Candidates = cand.snapshot()
|
||||
}
|
||||
if p.empty() {
|
||||
return p
|
||||
}
|
||||
|
|
@ -198,10 +281,10 @@ func (a *adStats) flushOnce(portal string) adEventPayload {
|
|||
// runAdStatsFlusher is the background flusher goroutine: every adFlushInterval it
|
||||
// drains the aggregator to the portal. Start it once from main() (like the
|
||||
// engine's other startup goroutines). It runs forever (the process lifetime).
|
||||
func (a *adStats) runAdStatsFlusher(portal string) {
|
||||
func (a *adStats) runAdStatsFlusher(portal string, cand *adCandidates) {
|
||||
t := time.NewTicker(adFlushInterval)
|
||||
defer t.Stop()
|
||||
for range t.C {
|
||||
a.flushOnce(portal)
|
||||
a.flushOnce(portal, cand)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ func TestRecordAdBlockEmptyHostIgnored(t *testing.T) {
|
|||
|
||||
func TestRecordAdBlockPerClientOnlyWhenMacSet(t *testing.T) {
|
||||
a := newAdStats()
|
||||
a.recordAdBlock("ads.example.com", "site", "") // no mac → no client row
|
||||
a.recordAdBlock("ads.example.com", "site", "") // no mac → no client row
|
||||
a.recordAdBlock("ads.example.com", "site", "mac1") // mac → client row
|
||||
a.recordAdBlock("ads.example.com", "site", "mac1")
|
||||
|
||||
|
|
@ -111,7 +111,7 @@ func TestFlushOncePayloadShapeMatchesContract(t *testing.T) {
|
|||
}))
|
||||
defer srv.Close()
|
||||
|
||||
a.flushOnce(srv.URL)
|
||||
a.flushOnce(srv.URL, nil)
|
||||
|
||||
if ct != "application/json" {
|
||||
t.Fatalf("Content-Type = %q, want application/json", ct)
|
||||
|
|
@ -145,7 +145,7 @@ func TestFlushOnceEmptySkipsPost(t *testing.T) {
|
|||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer srv.Close()
|
||||
a.flushOnce(srv.URL)
|
||||
a.flushOnce(srv.URL, nil)
|
||||
if posted {
|
||||
t.Fatalf("flushOnce on empty aggregator must not POST")
|
||||
}
|
||||
|
|
@ -156,7 +156,7 @@ func TestFlushOnceSwallowsPortalError(t *testing.T) {
|
|||
a.recordAdBlock("ads.example.com", "site", "")
|
||||
// Unreachable portal → must not panic, must still clear the maps (snapshot
|
||||
// happens before the POST).
|
||||
a.flushOnce("http://127.0.0.1:1")
|
||||
a.flushOnce("http://127.0.0.1:1", nil)
|
||||
if len(a.blocks) != 0 {
|
||||
t.Fatalf("flushOnce must clear maps even on POST failure")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -199,12 +199,13 @@ func ja4ish(h *tls.ClientHelloInfo) string {
|
|||
type Proxy struct {
|
||||
ca *CA
|
||||
pol *Policy
|
||||
jaSink func(string) // JA4 observations (logged; a sidecar in prod)
|
||||
jarKey []byte // anti-track HMAC fake-identity seed (nil → poison off)
|
||||
poison bool // master gate: poison tracker Set-Cookies (default on when jarKey present)
|
||||
portal string // portal base URL for /__toolbox/* reverse-proxy (banner assets)
|
||||
ads *adStats // #662 — ad-block metrics aggregator (flushed to the portal)
|
||||
cspDemo bool // #662 CONSENTED-DEMONSTRATION: relax a page's CSP so the injected loader runs, and flag the bypass (data-csp=1 → 🔓). Default on.
|
||||
jaSink func(string) // JA4 observations (logged; a sidecar in prod)
|
||||
jarKey []byte // anti-track HMAC fake-identity seed (nil → poison off)
|
||||
poison bool // master gate: poison tracker Set-Cookies (default on when jarKey present)
|
||||
portal string // portal base URL for /__toolbox/* reverse-proxy (banner assets)
|
||||
ads *adStats // #662 — ad-block metrics aggregator (flushed to the portal)
|
||||
cand *adCandidates // #662 — ad-candidate learning feed (flushed with ads to the portal)
|
||||
cspDemo bool // #662 CONSENTED-DEMONSTRATION: relax a page's CSP so the injected loader runs, and flag the bypass (data-csp=1 → 🔓). Default on.
|
||||
|
||||
// analysisRelay gates the per-flow telemetry relay to the dpi/cookies/ja4
|
||||
// analysis sidecar sockets (#662 — restoring the "Qui te piste?" events the
|
||||
|
|
@ -229,6 +230,33 @@ func (px *Proxy) recordAdBlock(adHost, site, macHash string) {
|
|||
}
|
||||
}
|
||||
|
||||
// maybeRecordAdCandidate feeds the auto-learn loop (#662): on the allow/mitm
|
||||
// path (NOT block — already caught; NOT allowlisted/own-infra), it records an
|
||||
// ad-candidate (host, site) when the request is 3rd-party
|
||||
// (registrable(host) != registrable(site)) AND the path smells like an ad/track
|
||||
// endpoint (adPathRE). It is the engine port of ad_ghost's candidate capture —
|
||||
// the feed secubox-toolbox-autolearn promotes into learned-trackers.txt at
|
||||
// AD_MIN_SITES distinct sites. Gated behind the analysis/ad relay flag, O(1) hot
|
||||
// path, fire-and-forget, nil-safe (CONNECT PoC / tests with no feed).
|
||||
func (px *Proxy) maybeRecordAdCandidate(host, site, path string) {
|
||||
if px == nil || px.cand == nil || !px.relayEnabled() || px.pol == nil {
|
||||
return
|
||||
}
|
||||
if site == "" || host == "" {
|
||||
return // no 1st-party context (no Referer) → nothing to attribute.
|
||||
}
|
||||
if px.pol.allowedSafe(host) {
|
||||
return // own-infra / allowlist: never learn our own / trusted hosts.
|
||||
}
|
||||
if registrable(host) == registrable(site) {
|
||||
return // 1st-party request: not a cross-site ad/track signal.
|
||||
}
|
||||
if !adPathRE.MatchString(path) {
|
||||
return // path doesn't look like an ad/track endpoint.
|
||||
}
|
||||
px.cand.record(host, site)
|
||||
}
|
||||
|
||||
// serverTLSConfig returns the *tls.Config produced by serverTLSConfigCapture
// when called with a nil argument.
func (px *Proxy) serverTLSConfig() *tls.Config {
|
||||
return px.serverTLSConfigCapture(nil)
|
||||
}
|
||||
|
|
@ -414,6 +442,15 @@ func (px *Proxy) mitmPipeline(tconn *tls.Conn, rawClient net.Conn, host, verdict
|
|||
relayIP := peerIP(rawClient)
|
||||
px.emitDPI(relayIP, clientHash, host, req)
|
||||
|
||||
// #662 — feed the auto-learn loop: on this allow/mitm flow, record an
|
||||
// ad-candidate when the request is 3rd-party AND its path smells like an
|
||||
// ad/track endpoint (ad_ghost's _AD_PATH heuristic). site = registrable of
|
||||
// the Referer (the ad_ghost _site_of flavour). Done BEFORE anonymize mutates
|
||||
// headers (so the Referer is the client's original). O(1), gated,
|
||||
// fire-and-forget — a new adware host gets observed here, promoted by
|
||||
// autolearn, then blocked+smogged after the policy live-reloads it.
|
||||
px.maybeRecordAdCandidate(host, refererSite(req.Header.Get("Referer")), req.URL.Path)
|
||||
|
||||
anonymizeRequest(req.Header)
|
||||
|
||||
// #662 — do NOT touch Accept-Encoding. We FORWARD the client's original
|
||||
|
|
@ -569,6 +606,7 @@ func main() {
|
|||
poison: *poison,
|
||||
portal: *portal,
|
||||
ads: newAdStats(),
|
||||
cand: newAdCandidates(),
|
||||
cspDemo: *cspDemo,
|
||||
|
||||
analysisRelay: *analysisRelay,
|
||||
|
|
@ -585,7 +623,9 @@ func main() {
|
|||
// #662 — start the ad-block metrics flusher: the block path tallies every
|
||||
// 204 into px.ads, drained every 10s to the portal's /__toolbox/ad-event
|
||||
// (best-effort, fire-and-forget) so the #ads dashboard sees blocks again.
|
||||
go px.ads.runAdStatsFlusher(*portal)
|
||||
// #662 — the candidate feed (px.cand) is drained in the SAME flush so the
|
||||
// learning candidates ride the existing ad-event channel (one POST / 10s).
|
||||
go px.ads.runAdStatsFlusher(*portal, px.cand)
|
||||
if *transparent {
|
||||
// Transparent R3 mode: raw accept loop, each conn carries its pre-DNAT
|
||||
// destination via SO_ORIGINAL_DST (recovered in handleTransparent). The
|
||||
|
|
|
|||
|
|
@ -17,6 +17,8 @@ import (
|
|||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ── ad_ghost: static ad/tracker host pattern (port of _AD_HOST) ──────────────
|
||||
|
|
@ -95,19 +97,55 @@ func envOr(key, def string) string {
|
|||
// Policy carries the loaded sets/regex and decides per-host actions. It also
|
||||
// keeps the legacy PoC fields (Inject) so the existing wiring/tests still work.
|
||||
type Policy struct {
|
||||
adHost *regexp.Regexp
|
||||
learned map[string]bool // learned-trackers (host or registrable, lowercased)
|
||||
allow map[string]bool // ad-allowlist (host or registrable, lowercased)
|
||||
spliceSeed map[string]bool // splice seed patterns
|
||||
spliceLearn map[string]bool // splice learned patterns
|
||||
never map[string]bool // pure-trackers ∪ fortknox (splice never-set)
|
||||
selfRegs map[string]bool // own-infra registrable domains
|
||||
selfDomains []string // own-infra (for the host==d || host endswith .d guard)
|
||||
// mu guards the live-reloadable map fields below. Decide/allowed/blockedByAd/
|
||||
// shouldSplice take RLock; maybeReload takes Lock only when a backing file
|
||||
// actually changed (the throttle + stat happen under a separate lighter lock).
|
||||
mu sync.RWMutex
|
||||
|
||||
adHost *regexp.Regexp
|
||||
learned map[string]bool // learned-trackers (host or registrable, lowercased)
|
||||
allow map[string]bool // ad-allowlist (host or registrable, lowercased)
|
||||
spliceSeed map[string]bool // splice seed patterns
|
||||
spliceLearn map[string]bool // splice learned patterns
|
||||
never map[string]bool // pure-trackers ∪ fortknox (splice never-set)
|
||||
selfRegs map[string]bool // own-infra registrable domains
|
||||
selfDomains []string // own-infra (for the host==d || host endswith .d guard)
|
||||
|
||||
// ── live-reload state (#662 auto-learn loop) ─────────────────────────────
|
||||
//
|
||||
// The lists are loaded once at startup, then re-read on-disk when their
|
||||
// mtime changes so autolearn promotions / manual edits take effect WITHOUT a
|
||||
// worker restart (mirrors ad_ghost._maybe_reload). The hot path (Decide)
|
||||
// calls maybeReload(): a throttle check, then — at most every reloadThrottle —
|
||||
// a cheap stat() of each backing file. Only a changed file is re-read and its
|
||||
// map atomically swapped under mu.
|
||||
reloadFiles []reloadTarget // backing files + their swap target
|
||||
fortknoxSites []string // kept for rebuilding the never-set on pure-trackers reload
|
||||
reloadMu sync.Mutex // guards lastReloadCheck + the per-file mtimes
|
||||
lastReloadID int64 // unix-nano of the last throttle pass (0 = never)
|
||||
reloadThrottle time.Duration // min interval between stat passes (0 in tests = eager)
|
||||
|
||||
// Legacy PoC fields kept so non-policy behaviour is unchanged.
|
||||
Inject []byte // banner / ad-CSS marker injected before </head> or </body>
|
||||
}
|
||||
|
||||
// reloadTarget describes one backing file the engine live-reloads: its path, the
|
||||
// last mtime we read, whether comment-stripping applies (loadLines vs
|
||||
// loadLinesRaw), and an applier that swaps the freshly-read set into the right
|
||||
// Policy field (under p.mu, held by the caller). pure-trackers re-derives the
|
||||
// never-set (∪ fortknox) so it stays consistent.
|
||||
type reloadTarget struct {
|
||||
path string // backing file; maybeReload skips entries with an empty path
|
||||
stripComm bool // passed to scanLines when the file is re-read
|
||||
lastMtime int64 // statMtime value seen at the last check (0 = missing/unreadable)
|
||||
apply func(p *Policy, set map[string]bool) // installs the re-read set; called with p.mu write-locked
|
||||
}
|
||||
|
||||
// defaultReloadThrottle is the production stat cadence: a backing-file change
|
||||
// (autolearn runs hourly; a promotion is rare) is observed within ~15s, and the
|
||||
// hot path stats at most ~4×/minute regardless of request rate. LoadPolicy
// installs it as Policy.reloadThrottle.
|
||||
const defaultReloadThrottle = 15 * time.Second
|
||||
|
||||
// loadLines mirrors the comment-stripping Python loaders (splice._load_lines,
|
||||
// ad_ghost._allowed's allowlist read): split on first '#', trim, lowercase,
|
||||
// skip blanks. Missing/unreadable file → empty set (best-effort).
|
||||
|
|
@ -196,16 +234,107 @@ func LoadPolicy(opts PolicyOpts) (*Policy, error) {
|
|||
selfDomains = append(selfDomains, d)
|
||||
}
|
||||
|
||||
return &Policy{
|
||||
adHost: re,
|
||||
learned: loadLinesRaw(opts.LearnedPath), // mirrors _learned_set (no comment-strip)
|
||||
allow: loadLines(opts.AllowPath),
|
||||
spliceSeed: loadLines(opts.SpliceSeedPath),
|
||||
spliceLearn: loadLines(opts.SpliceLearnPath),
|
||||
never: never,
|
||||
selfRegs: selfRegs,
|
||||
selfDomains: selfDomains,
|
||||
}, nil
|
||||
p := &Policy{
|
||||
adHost: re,
|
||||
learned: loadLinesRaw(opts.LearnedPath), // mirrors _learned_set (no comment-strip)
|
||||
allow: loadLines(opts.AllowPath),
|
||||
spliceSeed: loadLines(opts.SpliceSeedPath),
|
||||
spliceLearn: loadLines(opts.SpliceLearnPath),
|
||||
never: never,
|
||||
selfRegs: selfRegs,
|
||||
selfDomains: selfDomains,
|
||||
fortknoxSites: append([]string(nil), opts.FortknoxSites...),
|
||||
reloadThrottle: defaultReloadThrottle,
|
||||
}
|
||||
|
||||
// ── register the live-reloadable backing files (#662 auto-learn loop) ─────
|
||||
//
|
||||
// Each entry re-reads its file when its mtime changes and atomically swaps
|
||||
// the map under p.mu (held by maybeReload). learned-trackers + ad-allowlist
|
||||
// are the load-bearing pair (autolearn promotes into learned; the operator
|
||||
// edits the allowlist); the splice seed/learned + pure-trackers files are
|
||||
// reloaded too for consistency (pure-trackers re-derives the never-set).
|
||||
p.reloadFiles = []reloadTarget{
|
||||
{path: opts.LearnedPath, stripComm: false, lastMtime: statMtime(opts.LearnedPath),
|
||||
apply: func(p *Policy, s map[string]bool) { p.learned = s }},
|
||||
{path: opts.AllowPath, stripComm: true, lastMtime: statMtime(opts.AllowPath),
|
||||
apply: func(p *Policy, s map[string]bool) { p.allow = s }},
|
||||
{path: opts.SpliceSeedPath, stripComm: true, lastMtime: statMtime(opts.SpliceSeedPath),
|
||||
apply: func(p *Policy, s map[string]bool) { p.spliceSeed = s }},
|
||||
{path: opts.SpliceLearnPath, stripComm: true, lastMtime: statMtime(opts.SpliceLearnPath),
|
||||
apply: func(p *Policy, s map[string]bool) { p.spliceLearn = s }},
|
||||
{path: opts.PureTrackersPath, stripComm: true, lastMtime: statMtime(opts.PureTrackersPath),
|
||||
apply: func(p *Policy, s map[string]bool) {
|
||||
// pure-trackers ∪ fortknox → never-set (mirrors LoadPolicy above).
|
||||
for _, fk := range p.fortknoxSites {
|
||||
if fk = strings.Trim(strings.ToLower(strings.TrimSpace(fk)), "."); fk != "" {
|
||||
s[fk] = true
|
||||
}
|
||||
}
|
||||
p.never = s
|
||||
}},
|
||||
}
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// statMtime reports the modification time of path as Unix nanoseconds.
// It returns 0 for an empty path and for any path os.Stat cannot read
// (missing, permission denied, ...), so a file appearing or disappearing shows
// up to callers as a changed mtime.
func statMtime(path string) int64 {
	if path != "" {
		if info, statErr := os.Stat(path); statErr == nil {
			return info.ModTime().UnixNano()
		}
	}
	return 0
}
|
||||
|
||||
// maybeReload re-reads any backing list whose on-disk mtime changed since the
|
||||
// last pass, swapping the affected map(s) under p.mu. Throttled to at most one
|
||||
// stat pass per p.reloadThrottle (cheap: a time compare + a few stats), so the
|
||||
// Decide hot path pays almost nothing. Concurrency-safe: the throttle/mtime
|
||||
// bookkeeping is under reloadMu and the map swap under mu — Decide's readers
|
||||
// hold mu.RLock, so a swap is atomic w.r.t. any in-flight decision.
|
||||
func (p *Policy) maybeReload() {
|
||||
now := time.Now()
|
||||
p.reloadMu.Lock()
|
||||
if p.reloadThrottle > 0 && p.lastReloadID != 0 &&
|
||||
now.Sub(time.Unix(0, p.lastReloadID)) < p.reloadThrottle {
|
||||
p.reloadMu.Unlock()
|
||||
return
|
||||
}
|
||||
p.lastReloadID = now.UnixNano()
|
||||
|
||||
// Collect the files that changed (stat under reloadMu; re-read outside mu).
|
||||
type pending struct {
|
||||
idx int
|
||||
set map[string]bool
|
||||
}
|
||||
var changed []pending
|
||||
for i := range p.reloadFiles {
|
||||
rt := &p.reloadFiles[i]
|
||||
if rt.path == "" {
|
||||
continue
|
||||
}
|
||||
m := statMtime(rt.path)
|
||||
if m != rt.lastMtime {
|
||||
rt.lastMtime = m
|
||||
changed = append(changed, pending{idx: i, set: scanLines(rt.path, rt.stripComm)})
|
||||
}
|
||||
}
|
||||
p.reloadMu.Unlock()
|
||||
|
||||
if len(changed) == 0 {
|
||||
return
|
||||
}
|
||||
// Swap the affected maps atomically under the write lock.
|
||||
p.mu.Lock()
|
||||
for _, c := range changed {
|
||||
p.reloadFiles[c.idx].apply(p, c.set)
|
||||
}
|
||||
p.mu.Unlock()
|
||||
}
|
||||
|
||||
// ── registrable: port of ad_ghost._registrable ───────────────────────────────
|
||||
|
|
@ -279,6 +408,11 @@ func hostMatches(host string, patterns map[string]bool) bool {
|
|||
|
||||
// allowed: port of ad_ghost._allowed. Own-infra ALWAYS wins (reflash-safe),
|
||||
// then the operator allowlist (host or registrable).
|
||||
//
|
||||
// LOCK CONTRACT: reads the reloadable allow map — the caller MUST hold at least
|
||||
// p.mu.RLock (Decide / shouldPoison do). Lock-free internally so Decide can call
|
||||
// it alongside shouldSplice/blockedByAd under a single RLock (sync.RWMutex is
|
||||
// not reentrant).
|
||||
func (p *Policy) allowed(host string) bool {
|
||||
h := strings.ToLower(host)
|
||||
reg := registrable(h)
|
||||
|
|
@ -297,7 +431,19 @@ func (p *Policy) allowed(host string) bool {
|
|||
return p.allow[h] || p.allow[reg]
|
||||
}
|
||||
|
||||
// allowedSafe is the lock-taking entry point to allowed() for callers OUTSIDE a
|
||||
// Decide RLock (e.g. the ad-candidate feed). It also picks up a live-reloaded
|
||||
// allowlist via maybeReload, so a freshly-allowlisted host stops being learned.
// Do not call it while already holding p.mu.
|
||||
func (p *Policy) allowedSafe(host string) bool {
|
||||
p.maybeReload() // before RLock: maybeReload may need the write lock
|
||||
p.mu.RLock()
|
||||
defer p.mu.RUnlock()
|
||||
return p.allowed(host) // allowed() requires the caller to hold at least RLock
|
||||
}
|
||||
|
||||
// shouldSplice: port of splice.should_splice (never wins; then seed ∪ learned).
|
||||
// LOCK CONTRACT: reads the reloadable never/spliceSeed/spliceLearn maps — the
|
||||
// caller MUST hold at least p.mu.RLock (Decide does).
|
||||
func (p *Policy) shouldSplice(sni string) bool {
|
||||
s := strings.Trim(strings.ToLower(sni), ".")
|
||||
if s == "" {
|
||||
|
|
@ -312,6 +458,10 @@ func (p *Policy) shouldSplice(sni string) bool {
|
|||
// blockedByAd: port of the ad_ghost requestheaders block decision (sans the
|
||||
// allowlist guard, which Decide applies first): _AD_HOST match OR
|
||||
// registrable/host in learned-trackers.
|
||||
//
|
||||
// LOCK CONTRACT: reads the reloadable learned map — the caller MUST hold at
|
||||
// least p.mu.RLock. Decide and shouldPoison (via isTracker) do; the candidate-
|
||||
// emit path calls it only through those.
|
||||
func (p *Policy) blockedByAd(host string) bool {
|
||||
if p.adHost.MatchString(host) {
|
||||
return true
|
||||
|
|
@ -339,9 +489,16 @@ func (p *Policy) blockedByAd(host string) bool {
|
|||
// sni defaults to host when empty (the live engine splices on SNI == the TLS
|
||||
// host; for the parity harness host and sni are the same value).
|
||||
func (p *Policy) Decide(host, sni string) string {
|
||||
// #662 — pick up autolearn promotions / manual edits without a worker
|
||||
// restart. Throttled to ~every reloadThrottle and best-effort, so the hot
|
||||
// path normally pays only a time compare. Done BEFORE taking the read lock
|
||||
// (maybeReload may take the write lock to swap a changed map).
|
||||
p.maybeReload()
|
||||
if sni == "" {
|
||||
sni = host
|
||||
}
|
||||
p.mu.RLock()
|
||||
defer p.mu.RUnlock()
|
||||
if p.allowed(host) {
|
||||
return "allow"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -148,6 +148,12 @@ func (p *Policy) isTracker(host string) bool {
|
|||
// allowlisted — own-infra flows are left clean (same dark safety as the block
|
||||
// path). The caller additionally requires a loaded jar key.
|
||||
func (p *Policy) shouldPoison(host string) bool {
|
||||
// #662 — consult the same live-reloaded learned set Decide uses, so a host
|
||||
// promoted into learned-trackers (by autolearn) is poisoned (smogged), not
|
||||
// only 204'd, without a worker restart. RLock-guard the reloadable maps
|
||||
// (allowed + isTracker→blockedByAd read them); maybeReload may swap them.
|
||||
p.mu.RLock()
|
||||
defer p.mu.RUnlock()
|
||||
if p.allowed(host) {
|
||||
return false // own-infra / allowlist → never poison
|
||||
}
|
||||
|
|
|
|||
189
packages/secubox-toolbox-ng/cmd/sbxmitm/reload_test.go
Normal file
189
packages/secubox-toolbox-ng/cmd/sbxmitm/reload_test.go
Normal file
|
|
@ -0,0 +1,189 @@
|
|||
// SPDX-License-Identifier: LicenseRef-CMSD-1.0
|
||||
// Copyright (c) 2026 CyberMind — Gérald Kerma <devel@cybermind.fr>
|
||||
//
|
||||
// SecuBox-Deb :: toolbox-ng :: policy live-reload tests (#662 auto-learn loop)
|
||||
//
|
||||
// The #662 Go cutover loaded the BLOCK/SPLICE lists ONCE at startup, so an
|
||||
// autolearn promotion (or a manual edit) of learned-trackers.txt never took
|
||||
// effect until a worker restart — the very thing that made new adwares slip
|
||||
// through forever. These tests prove the mtime-based live-reload: after the
|
||||
// throttle window, a host appended to learned-trackers.txt flips Decide from
|
||||
// "mitm" to "block" with NO restart. Concurrency is exercised under -race.
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// writeFile replaces the backing list file at path with content (mode 0644).
// Any write error fails the calling test immediately, so it must only be used
// from the goroutine running the test.
func writeFile(t *testing.T, path, content string) {
	t.Helper()
	err := os.WriteFile(path, []byte(content), 0o644)
	if err == nil {
		return
	}
	t.Fatalf("write %s: %v", path, err)
}
|
||||
|
||||
// bumpMtime sets both the access and modification time of path to now+d, so a
// later stat observes a changed mtime even on coarse-granularity filesystems
// or in sub-second test runs. A failure aborts the calling test.
func bumpMtime(t *testing.T, path string, d time.Duration) {
	t.Helper()
	stamp := time.Now().Add(d)
	err := os.Chtimes(path, stamp, stamp)
	if err != nil {
		t.Fatalf("chtimes %s: %v", path, err)
	}
}
|
||||
|
||||
// TestMaybeReloadPicksUpAppendedLearnedTracker is the linchpin test: a host that
|
||||
// initially Decides "mitm" must flip to "block" once it is appended to
|
||||
// learned-trackers.txt and the throttle window elapses — without reloading the
|
||||
// Policy from scratch.
|
||||
func TestMaybeReloadPicksUpAppendedLearnedTracker(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
learned := filepath.Join(dir, "learned-trackers.txt")
|
||||
allow := filepath.Join(dir, "ad-allowlist.txt")
|
||||
writeFile(t, learned, "")
|
||||
writeFile(t, allow, "")
|
||||
|
||||
pol, err := LoadPolicy(PolicyOpts{
|
||||
LearnedPath: learned,
|
||||
AllowPath: allow,
|
||||
// keep the splice/never paths in the temp dir so missing-file behaviour
|
||||
// (empty set) is deterministic.
|
||||
SpliceSeedPath: filepath.Join(dir, "seed"),
|
||||
SpliceLearnPath: filepath.Join(dir, "slearn"),
|
||||
PureTrackersPath: filepath.Join(dir, "pure"),
|
||||
SelfDomains: []string{"secubox.in"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("LoadPolicy: %v", err)
|
||||
}
|
||||
// Make the reload eager for the test (no 15s wait): zero throttle.
|
||||
pol.reloadThrottle = 0
|
||||
|
||||
const host = "acotedemoi.com"
|
||||
if got := pol.Decide(host, host); got != "mitm" {
|
||||
t.Fatalf("before promotion: Decide(%q) = %q, want mitm", host, got)
|
||||
}
|
||||
|
||||
// Promote: append the host and bump mtime forward.
|
||||
writeFile(t, learned, host+"\n")
|
||||
bumpMtime(t, learned, 2*time.Second)
|
||||
|
||||
if got := pol.Decide(host, host); got != "block" {
|
||||
t.Fatalf("after promotion: Decide(%q) = %q, want block", host, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeReloadThrottled proves the throttle: with a non-zero throttle window,
|
||||
// a change made just after a reload is NOT observed until the window elapses,
|
||||
// keeping the hot path cheap (one stat per ~window, not per request).
|
||||
func TestMaybeReloadThrottled(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
learned := filepath.Join(dir, "learned-trackers.txt")
|
||||
writeFile(t, learned, "")
|
||||
|
||||
pol, err := LoadPolicy(PolicyOpts{LearnedPath: learned, AllowPath: filepath.Join(dir, "allow")})
|
||||
if err != nil {
|
||||
t.Fatalf("LoadPolicy: %v", err)
|
||||
}
|
||||
pol.reloadThrottle = time.Hour // effectively "never re-stat during the test"
|
||||
|
||||
// Prime the throttle clock with one Decide (does the initial stat).
|
||||
_ = pol.Decide("x.example", "x.example")
|
||||
|
||||
const host = "tracker.example"
|
||||
writeFile(t, learned, host+"\n")
|
||||
bumpMtime(t, learned, 2*time.Second)
|
||||
|
||||
if got := pol.Decide(host, host); got != "mitm" {
|
||||
t.Fatalf("throttled: Decide(%q) = %q, want mitm (change not yet observed)", host, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeReloadAllowlist proves the allowlist file is live-reloaded too: a
|
||||
// host the ad-host regex would block ("doubleclick.net") flips block→allow once
|
||||
// appended to the allowlist and the window elapses.
|
||||
func TestMaybeReloadAllowlist(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
learned := filepath.Join(dir, "learned-trackers.txt")
|
||||
allow := filepath.Join(dir, "ad-allowlist.txt")
|
||||
writeFile(t, learned, "")
|
||||
writeFile(t, allow, "")
|
||||
|
||||
pol, err := LoadPolicy(PolicyOpts{LearnedPath: learned, AllowPath: allow})
|
||||
if err != nil {
|
||||
t.Fatalf("LoadPolicy: %v", err)
|
||||
}
|
||||
pol.reloadThrottle = 0
|
||||
|
||||
const host = "doubleclick.net"
|
||||
if got := pol.Decide(host, host); got != "block" {
|
||||
t.Fatalf("before allow: Decide(%q) = %q, want block", host, got)
|
||||
}
|
||||
writeFile(t, allow, host+"\n")
|
||||
bumpMtime(t, allow, 2*time.Second)
|
||||
if got := pol.Decide(host, host); got != "allow" {
|
||||
t.Fatalf("after allow: Decide(%q) = %q, want allow", host, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeReloadConcurrent runs Decide from many goroutines while the backing
|
||||
// learned file is rewritten concurrently. Under `go test -race` this proves the
|
||||
// RWMutex-guarded swap is data-race-free.
|
||||
func TestMaybeReloadConcurrent(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
learned := filepath.Join(dir, "learned-trackers.txt")
|
||||
writeFile(t, learned, "seed.example\n")
|
||||
|
||||
pol, err := LoadPolicy(PolicyOpts{LearnedPath: learned, AllowPath: filepath.Join(dir, "allow")})
|
||||
if err != nil {
|
||||
t.Fatalf("LoadPolicy: %v", err)
|
||||
}
|
||||
pol.reloadThrottle = 0 // force a stat on every Decide → maximal contention
|
||||
|
||||
var wg sync.WaitGroup
|
||||
var blocks int64
|
||||
stop := make(chan struct{})
|
||||
|
||||
// Writer: keep appending hosts + bumping mtime.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
i := 0
|
||||
for {
|
||||
select {
|
||||
case <-stop:
|
||||
return
|
||||
default:
|
||||
}
|
||||
writeFile(t, learned, "seed.example\nh"+itoa(i)+".example\n")
|
||||
bumpMtime(t, learned, time.Duration(i+1)*time.Second)
|
||||
i++
|
||||
}
|
||||
}()
|
||||
|
||||
// Readers: hammer Decide on the seed (stable → always block) + a live host.
|
||||
for r := 0; r < 8; r++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for j := 0; j < 2000; j++ {
|
||||
if pol.Decide("seed.example", "seed.example") == "block" {
|
||||
atomic.AddInt64(&blocks, 1)
|
||||
}
|
||||
pol.Decide("h0.example", "h0.example")
|
||||
}
|
||||
}()
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
close(stop)
|
||||
wg.Wait()
|
||||
if blocks == 0 {
|
||||
t.Fatal("expected the stable seed host to block at least once")
|
||||
}
|
||||
}
|
||||
|
|
@ -1,3 +1,12 @@
|
|||
secubox-toolbox-ng (0.1.12-1~bookworm1) bookworm; urgency=medium
|
||||
|
||||
* adlearn: live-reload the blocklist (mtime) so promotions/edits block without
|
||||
a worker restart; emit ad-candidates (3rd-party ad-path) to the portal;
|
||||
autolearn also promotes cross-site trackers from social_edges. Learned
|
||||
trackers are auto-204 + poison-smogged. (ref #662)
|
||||
|
||||
-- Gerald KERMA <devel@cybermind.fr> Fri, 19 Jun 2026 12:30:00 +0000
|
||||
|
||||
secubox-toolbox-ng (0.1.11-1~bookworm1) bookworm; urgency=medium
|
||||
|
||||
* social: ALSO correlate on the block path — blocked 3rd-party trackers still
|
||||
|
|
|
|||
|
|
@ -221,6 +221,92 @@ def _ad_feed() -> int:
|
|||
return len(promoted)
|
||||
|
||||
|
||||
# #662 — cross-site-reuse promotion. A tracker_domain seen issuing cookies on
|
||||
# >= SOCIAL_MIN_SITES DISTINCT src_site (across peers, recent window) is a
|
||||
# BEHAVIOURALLY-confirmed cross-site tracker (the social graph), independent of
|
||||
# the ad-path heuristic. Promote it into learned-trackers.txt so the engine
|
||||
# blocks (204) + smogs it. Conservative + reuses the SAME allowlist/self guard as
|
||||
# _ad_feed (NEVER promote allowlisted or self domains). De-dups against OUT.
|
||||
# Tunables for the cross-site-reuse promotion, overridable via the environment:
# minimum number of distinct src_site values, and look-back window in hours.
SOCIAL_MIN_SITES = int(os.getenv("SECUBOX_SOCIAL_MIN_SITES", "3"))
SOCIAL_WINDOW_HOURS = int(os.getenv("SECUBOX_SOCIAL_WINDOW_HOURS", "168"))
|
||||
|
||||
|
||||
def _social_feed() -> int:
    """Promote cross-site cookie-reuse trackers (social_edges) into the learned
    blocklist.

    A tracker registrable domain that is linked to at least SOCIAL_MIN_SITES
    distinct src_site values within the last SOCIAL_WINDOW_HOURS is promoted.
    Sub-domains are folded to their registrable domain first, so two tracker
    sub-domains of the same registrable jointly count towards the threshold.
    Allowlisted and self domains are never promoted.

    The result is MERGED into OUT. If the existing list cannot be read the
    function aborts instead of overwriting it, and the file is left untouched
    (no mtime change) when nothing new was promoted.

    Returns the number of newly written hosts, 0 when there is nothing new, or
    -1 when the feed is unavailable or a read/write step failed (e.g. there is
    no social_edges table). DB, fold and file errors are logged to stderr and
    never raised.
    """
    cutoff = int(time.time()) - SOCIAL_WINDOW_HOURS * 3600

    # A single query yields every distinct (tracker, site) pair in the window;
    # the connection is closed even when the query raises.
    try:
        con = sqlite3.connect(DB, timeout=5)
        try:
            pairs = con.execute(
                "SELECT DISTINCT tracker_domain, src_site "
                "FROM social_edges WHERE ts >= ?", (cutoff,)).fetchall()
        finally:
            con.close()
    except Exception as e:
        sys.stderr.write(f"autolearn: social query failed: {e}\n")
        return -1

    # Fold to registrable and collect the distinct sites per registrable.
    by_reg: dict[str, set] = {}
    reg_cache: dict = {}  # tracker_domain -> registrable, one lookup per domain
    try:
        for td, site in pairs:
            if td not in reg_cache:
                reg_cache[td] = registrable(td)
            reg = reg_cache[td]
            if not reg:
                continue
            sites = by_reg.setdefault(reg, set())
            if site:
                sites.add(site)
    except Exception as e:
        sys.stderr.write(f"autolearn: social fold failed: {e}\n")
        return -1

    allow = _load_ad_allowlist()
    self_doms = {d.strip().lower() for d in
                 os.environ.get("SECUBOX_SELF_DOMAINS", "secubox.in").split(",")
                 if d.strip()}
    promoted: set = set()
    for reg, sites in by_reg.items():
        if len(sites) < SOCIAL_MIN_SITES:
            continue
        if reg in allow:
            continue
        if any(reg == d or reg.endswith("." + d) for d in self_doms):
            continue
        promoted.add(reg)
    if not promoted:
        return 0

    existing: set = set()
    try:
        if os.path.exists(OUT):
            with open(OUT, encoding="utf-8") as fh:
                for ln in fh:
                    ln = ln.strip()
                    if ln:
                        existing.add(ln)
    except Exception as e:
        # Merging against an unreadable list would silently drop every entry
        # already in it, so bail out rather than overwrite.
        sys.stderr.write(f"autolearn: social merge read failed: {e}\n")
        return -1

    new = promoted - existing
    if not new:
        # Nothing to add: do not rewrite the file, so its mtime stays put.
        return 0
    merged = sorted(existing | promoted)[:MAX_ENTRIES]
    try:
        out_dir = os.path.dirname(OUT)
        if out_dir:  # a bare filename has no directory to create
            os.makedirs(out_dir, exist_ok=True)
        tmp = OUT + ".tmp"
        with open(tmp, "w", encoding="utf-8") as fh:
            fh.write("\n".join(merged) + ("\n" if merged else ""))
        os.replace(tmp, OUT)  # atomic swap: readers never see a partial file
    except Exception as e:
        sys.stderr.write(f"autolearn: social write failed: {e}\n")
        return -1
    # Count only the new hosts that survived the MAX_ENTRIES truncation.
    return len(new & set(merged))
|
||||
|
||||
|
||||
def main() -> int:
|
||||
learned: set[str] = set()
|
||||
try:
|
||||
|
|
@ -317,6 +403,11 @@ def main() -> int:
|
|||
sys.stderr.write(f"autolearn: {_n_ad} ad-candidate hosts promoted\n")
|
||||
except Exception as e:
|
||||
sys.stderr.write(f"autolearn: ad feed error: {e}\n")
|
||||
try:
|
||||
_n_social = _social_feed()
|
||||
sys.stderr.write(f"autolearn: {_n_social} cross-site cookie trackers promoted\n")
|
||||
except Exception as e:
|
||||
sys.stderr.write(f"autolearn: social feed error: {e}\n")
|
||||
sys.stderr.write(
|
||||
f"autolearn: {len(out)} hosts learned ({ti} threat-intel + "
|
||||
f"{len(out) - ti} classified cross-site) @ {int(time.time())}"
|
||||
|
|
|
|||
|
|
@ -113,12 +113,20 @@ async def toolbox_ad_event(request: Request) -> Response:
|
|||
return Response(status_code=204)
|
||||
blocks = body.get("blocks") or []
|
||||
clients = body.get("clients") or []
|
||||
# #662 — the Go engine now also feeds the AUTO-LEARN loop: 3rd-party
|
||||
# ad-path requests it saw on the allow/mitm path (ad_ghost's _AD_PATH
|
||||
# heuristic), recorded as candidates here for secubox-toolbox-autolearn
|
||||
# to promote into learned-trackers.txt at AD_MIN_SITES distinct sites.
|
||||
candidates = body.get("candidates") or []
|
||||
if not isinstance(blocks, list):
|
||||
blocks = []
|
||||
if not isinstance(clients, list):
|
||||
clients = []
|
||||
if not isinstance(candidates, list):
|
||||
candidates = []
|
||||
blocks = blocks[:_AD_EVENT_ROW_CAP]
|
||||
clients = clients[:_AD_EVENT_ROW_CAP]
|
||||
candidates = candidates[:_AD_EVENT_ROW_CAP]
|
||||
|
||||
block_rows = [
|
||||
(b["ad_host"], b.get("site", ""), "block", int(b.get("hits", 0)), int(b.get("bytes", 0)))
|
||||
|
|
@ -130,10 +138,17 @@ async def toolbox_ad_event(request: Request) -> Response:
|
|||
for c in clients
|
||||
if isinstance(c, dict) and c.get("mac_hash") and c.get("ad_host")
|
||||
]
|
||||
cand_rows = [
|
||||
(c["host"], c.get("site", ""), int(c.get("hits", 0)))
|
||||
for c in candidates
|
||||
if isinstance(c, dict) and c.get("host")
|
||||
]
|
||||
if block_rows:
|
||||
store.record_ad_blocks(block_rows)
|
||||
if client_rows:
|
||||
store.record_ad_client_blocks(client_rows)
|
||||
if cand_rows:
|
||||
store.record_ad_candidates(cand_rows)
|
||||
except Exception as e: # never raise into the engine's fire-and-forget POST
|
||||
log.debug("ad-event ingest failed: %s", e)
|
||||
return Response(status_code=204)
|
||||
|
|
|
|||
68
packages/secubox-toolbox/tests/test_ad_event_candidates.py
Normal file
68
packages/secubox-toolbox/tests/test_ad_event_candidates.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
# tests/test_ad_event_candidates.py
|
||||
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
|
||||
"""#662 — /__toolbox/ad-event accepts a "candidates" list (the Go engine's
|
||||
auto-learn feed) → store.record_ad_candidates(). Never 500s the engine."""
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
from secubox_toolbox import api, store
|
||||
|
||||
|
||||
class _FakeRequest:
    """Bare-bones stand-in for a request object: a ``headers`` dict carrying
    ``content-length`` and an async ``json()`` that returns the given body."""

    def __init__(self, body: dict, content_length=None):
        self._body = body
        # Without an explicit override, use the byte length of the JSON body.
        if content_length is None:
            content_length = len(json.dumps(body).encode())
        self.headers = {"content-length": str(content_length)}

    async def json(self):
        return self._body
|
||||
|
||||
|
||||
def test_candidates_ingested(monkeypatch):
    """Well-formed candidates reach store.record_ad_candidates as
    (host, site, hits) tuples; entries with a missing or empty host are
    dropped."""
    seen = {}

    def _capture(rows):
        return seen.setdefault("cand", list(rows))

    monkeypatch.setattr(store, "record_ad_candidates", _capture)
    for name in ("record_ad_blocks", "record_ad_client_blocks"):
        monkeypatch.setattr(store, name, lambda rows: None)

    candidates = [
        {"host": "metrics.acotedemoi.com", "site": "lemonde.fr", "hits": 3},
        {"host": "ads.foo.io", "site": "news.example", "hits": 1},
        {"site": "no-host.example", "hits": 9},   # missing host → skipped
        {"host": "", "site": "x", "hits": 2},     # empty host → skipped
    ]
    payload = {"blocks": [], "clients": [], "candidates": candidates}

    resp = asyncio.run(api.toolbox_ad_event(_FakeRequest(payload)))

    assert resp.status_code == 204
    expected = [
        ("metrics.acotedemoi.com", "lemonde.fr", 3),
        ("ads.foo.io", "news.example", 1),
    ]
    assert seen.get("cand") == expected
|
||||
|
||||
|
||||
def test_candidates_absent_is_noop(monkeypatch):
    """A body without a "candidates" key still answers 204 and never calls
    store.record_ad_candidates."""
    state = {"cand": False}

    def _mark(rows):
        state["cand"] = True

    monkeypatch.setattr(store, "record_ad_candidates", _mark)
    for name in ("record_ad_blocks", "record_ad_client_blocks"):
        monkeypatch.setattr(store, name, lambda rows: None)

    payload = {"blocks": [], "clients": []}
    resp = asyncio.run(api.toolbox_ad_event(_FakeRequest(payload)))

    assert resp.status_code == 204
    assert state["cand"] is False  # no candidates key → record_ad_candidates not called
|
||||
|
||||
|
||||
def test_candidates_bad_payload_never_500s(monkeypatch):
    """Even when store.record_ad_candidates raises, the endpoint answers 204."""

    def _boom(rows):
        raise RuntimeError("boom")

    monkeypatch.setattr(store, "record_ad_candidates", _boom)
    for name in ("record_ad_blocks", "record_ad_client_blocks"):
        monkeypatch.setattr(store, name, lambda rows: None)

    payload = {"candidates": [{"host": "x.io", "site": "s", "hits": 1}]}
    resp = asyncio.run(api.toolbox_ad_event(_FakeRequest(payload)))

    assert resp.status_code == 204  # store raised, but the endpoint swallows it
|
||||
98
packages/secubox-toolbox/tests/test_autolearn_socialfeed.py
Normal file
98
packages/secubox-toolbox/tests/test_autolearn_socialfeed.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
# tests/test_autolearn_socialfeed.py
|
||||
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
|
||||
"""#662 — cross-site-reuse promotion: a tracker_domain seen on >= N distinct
|
||||
src_site across recent social_edges is a behaviourally-confirmed cross-site
|
||||
tracker and gets promoted into learned-trackers.txt. Allowlist + self guard
|
||||
reused from _ad_feed; merges (never overwrites)."""
|
||||
import sqlite3
|
||||
import importlib.util
|
||||
import pathlib
|
||||
import time
|
||||
|
||||
|
||||
def _load_autolearn():
    """Execute the extension-less sbin/secubox-toolbox-autolearn script inside a
    fresh module object named "autolearn" and return that module."""
    script = pathlib.Path(__file__).resolve().parents[1] / "sbin" / "secubox-toolbox-autolearn"
    module = importlib.util.module_from_spec(
        importlib.util.spec_from_loader("autolearn", loader=None))
    source = script.read_text()
    exec(compile(source, str(script), "exec"), module.__dict__)
    return module
|
||||
|
||||
|
||||
def _mk_db(db):
    """Open the SQLite database at ``db``, create the social_edges table in it
    and return the open connection."""
    schema = (
        "CREATE TABLE social_edges("
        " id INTEGER PRIMARY KEY AUTOINCREMENT, ts INTEGER NOT NULL,"
        " client_mac_hash TEXT, src_site TEXT NOT NULL,"
        " tracker_domain TEXT NOT NULL, cookie_id_hash TEXT,"
        " ja4_hash TEXT, consent_state TEXT DEFAULT 'none_seen');"
    )
    handle = sqlite3.connect(db)
    handle.executescript(schema)
    return handle
|
||||
|
||||
|
||||
def test_social_feed_promotes_cross_site_tracker(tmp_path, monkeypatch):
    """Only the recent tracker seen on three distinct sites is promoted; the
    below-threshold, allowlisted, self and stale trackers are not, and the
    pre-existing list entry is kept."""
    db_path = tmp_path / "t.db"
    recent = int(time.time())
    stale = recent - 999999
    edges = [
        # tracker.io: 3 distinct src_sites (>= SOCIAL_MIN_SITES=3) → promote
        (recent, "m1", "cnn.com", "tracker.io"),
        (recent, "m1", "bbc.com", "tracker.io"),
        (recent, "m2", "lemonde.fr", "tracker.io"),
        # twosite.net: only 2 distinct sites → NOT promoted
        (recent, "m1", "cnn.com", "twosite.net"),
        (recent, "m1", "bbc.com", "twosite.net"),
        # safe.cdn.net: 3 sites but ALLOWLISTED → excluded
        (recent, "m1", "a.com", "safe.cdn.net"),
        (recent, "m1", "b.com", "safe.cdn.net"),
        (recent, "m1", "c.com", "safe.cdn.net"),
        # secubox.in: 3 sites but SELF domain → excluded
        (recent, "m1", "a.com", "secubox.in"),
        (recent, "m1", "b.com", "secubox.in"),
        (recent, "m1", "c.com", "secubox.in"),
        # stale.io: 3 sites but OUTSIDE the recent window → excluded
        (stale, "m1", "a.com", "stale.io"),
        (stale, "m1", "b.com", "stale.io"),
        (stale, "m1", "c.com", "stale.io"),
    ]
    handle = _mk_db(db_path)
    handle.executemany(
        "INSERT INTO social_edges(ts,client_mac_hash,src_site,tracker_domain) "
        "VALUES(?,?,?,?)", edges)
    handle.commit()
    handle.close()

    allow_file = tmp_path / "ad-allowlist.txt"
    allow_file.write_text("safe.cdn.net\n")
    learned_file = tmp_path / "learned-trackers.txt"
    learned_file.write_text("preexisting.tracker.com\n")

    # The environment is set BEFORE the script is loaded.
    for key, value in (
        ("SECUBOX_AUTOLEARN_DB", str(db_path)),
        ("SECUBOX_AUTOLEARN_OUT", str(learned_file)),
        ("SECUBOX_AD_ALLOWLIST", str(allow_file)),
        ("SECUBOX_SOCIAL_MIN_SITES", "3"),
        ("SECUBOX_SOCIAL_WINDOW_HOURS", "168"),
    ):
        monkeypatch.setenv(key, value)

    autolearn = _load_autolearn()
    promoted_count = autolearn._social_feed()

    hosts = learned_file.read_text().split()
    assert "tracker.io" in hosts                # 3 distinct sites, recent → promoted
    for absent in ("twosite.net",               # below threshold
                   "safe.cdn.net",              # allowlisted
                   "secubox.in",                # self domain
                   "stale.io"):                 # outside window
        assert absent not in hosts
    assert "preexisting.tracker.com" in hosts   # merge, not overwrite
    assert len(hosts) == len(set(hosts))        # no dups
    assert promoted_count == 1
|
||||
|
||||
|
||||
def test_social_feed_no_table_is_safe(tmp_path, monkeypatch):
    """With a database that has no social_edges table, _social_feed returns -1
    and the learned list keeps its content."""
    db_path = tmp_path / "empty.db"
    sqlite3.connect(db_path).close()  # no social_edges table
    learned_file = tmp_path / "learned-trackers.txt"
    learned_file.write_text("x.tracker.com\n")
    for key, value in (
        ("SECUBOX_AUTOLEARN_DB", str(db_path)),
        ("SECUBOX_AUTOLEARN_OUT", str(learned_file)),
    ):
        monkeypatch.setenv(key, value)

    autolearn = _load_autolearn()
    result = autolearn._social_feed()

    assert result == -1                                  # gated/unavailable, not a crash
    assert "x.tracker.com" in learned_file.read_text()   # file untouched
|
||||
Loading…
Reference in New Issue
Block a user