mirror of
https://github.com/CyberMind-FR/secubox-deb.git
synced 2026-07-01 10:47:30 +00:00
Compare commits
1 Commits
3064fb61fd
...
55626e510b
| Author | SHA1 | Date | |
|---|---|---|---|
| 55626e510b |
|
|
@ -69,7 +69,12 @@ CREATE TABLE IF NOT EXISTS social_edges (
|
||||||
src_site TEXT NOT NULL,
|
src_site TEXT NOT NULL,
|
||||||
tracker_domain TEXT NOT NULL,
|
tracker_domain TEXT NOT NULL,
|
||||||
cookie_id_hash TEXT NOT NULL,
|
cookie_id_hash TEXT NOT NULL,
|
||||||
ja4_hash TEXT
|
ja4_hash TEXT,
|
||||||
|
-- Phase 11.C (#508) — consent state at the moment the edge was
|
||||||
|
-- recorded. Computed by the addon based on whether a consent
|
||||||
|
-- platform cookie (OneTrust/Didomi/Quantcast/Sourcepoint) has
|
||||||
|
-- already been observed for this peer × site pair.
|
||||||
|
consent_state TEXT NOT NULL DEFAULT 'none_seen'
|
||||||
);
|
);
|
||||||
CREATE INDEX IF NOT EXISTS idx_social_edges_mac_ts
|
CREATE INDEX IF NOT EXISTS idx_social_edges_mac_ts
|
||||||
ON social_edges(client_mac_hash, ts);
|
ON social_edges(client_mac_hash, ts);
|
||||||
|
|
@ -83,6 +88,16 @@ CREATE TABLE IF NOT EXISTS social_nodes (
|
||||||
first_seen INTEGER NOT NULL,
|
first_seen INTEGER NOT NULL,
|
||||||
last_seen INTEGER NOT NULL,
|
last_seen INTEGER NOT NULL,
|
||||||
sites_jsonl TEXT NOT NULL DEFAULT '[]',
|
sites_jsonl TEXT NOT NULL DEFAULT '[]',
|
||||||
|
-- Phase 11.C (#508) — GeoIP-derived metadata populated at fold
|
||||||
|
-- time so reads + PDF rendering don't have to do per-row mmdb
|
||||||
|
-- lookups. eu_inside is 1 when country_iso ∈ EU/EEA whitelist.
|
||||||
|
country_iso TEXT,
|
||||||
|
asn_org TEXT,
|
||||||
|
eu_inside INTEGER NOT NULL DEFAULT 1,
|
||||||
|
-- Number of edges recorded against this (peer, tracker) BEFORE a
|
||||||
|
-- consent cookie was observed. >0 = legal-grade evidence of
|
||||||
|
-- tracker firing before consent (RGPD art. 6.1.a + 7).
|
||||||
|
pre_consent_hits INTEGER NOT NULL DEFAULT 0,
|
||||||
PRIMARY KEY (client_mac_hash, tracker_domain)
|
PRIMARY KEY (client_mac_hash, tracker_domain)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
@ -103,9 +118,79 @@ def _conn() -> sqlite3.Connection:
|
||||||
c = sqlite3.connect(str(DB_PATH), timeout=5.0, isolation_level=None)
|
c = sqlite3.connect(str(DB_PATH), timeout=5.0, isolation_level=None)
|
||||||
c.row_factory = sqlite3.Row
|
c.row_factory = sqlite3.Row
|
||||||
c.executescript(_SCHEMA)
|
c.executescript(_SCHEMA)
|
||||||
|
_migrate_phase11c(c)
|
||||||
return c
|
return c
|
||||||
|
|
||||||
|
|
||||||
|
# ───── Phase 11.C migrations — additive columns on pre-existing tables ─────
|
||||||
|
# CREATE TABLE IF NOT EXISTS skips creation if the table already exists, so
|
||||||
|
# the new columns won't auto-appear on a 2.6.0 → 2.6.2 upgrade. Idempotent
|
||||||
|
# ALTERs : we probe the column list first to skip the duplicate-column
|
||||||
|
# error case (which would raise on every connection otherwise).
|
||||||
|
_PHASE11C_MIGRATIONS = (
    # (table, column, column declaration appended verbatim to ADD COLUMN)
    ("social_edges", "consent_state", "TEXT NOT NULL DEFAULT 'none_seen'"),
    ("social_nodes", "country_iso", "TEXT"),
    ("social_nodes", "asn_org", "TEXT"),
    ("social_nodes", "eu_inside", "INTEGER NOT NULL DEFAULT 1"),
    ("social_nodes", "pre_consent_hits", "INTEGER NOT NULL DEFAULT 0"),
)


def _migrate_phase11c(c: sqlite3.Connection) -> None:
    """Add the Phase 11.C columns to tables created by an older schema.

    For every ``(table, column, declaration)`` entry of
    ``_PHASE11C_MIGRATIONS`` the column is added with ``ALTER TABLE ... ADD
    COLUMN`` unless ``PRAGMA table_info`` already lists it, so calling this
    repeatedly on the same database is a no-op.

    Each column is migrated independently: a failure on one column (for
    example another connection adding it between our probe and our ALTER)
    is logged and the remaining columns are still processed. Never raises.

    Args:
        c: Open connection whose rows are addressable by column name
            (``_conn`` sets ``sqlite3.Row`` before calling this).
    """
    # Column names already known per table, probed lazily once per table
    # instead of once per migration entry.
    columns_by_table: dict = {}
    for table, col, decl in _PHASE11C_MIGRATIONS:
        try:
            known = columns_by_table.get(table)
            if known is None:
                known = {
                    r["name"]
                    for r in c.execute(f"PRAGMA table_info({table})").fetchall()
                }
                columns_by_table[table] = known
            if col in known:
                continue
            # Identifiers come from the constant above, never from user
            # input, so f-string interpolation is safe here (SQLite cannot
            # bind identifiers as parameters anyway).
            c.execute(f"ALTER TABLE {table} ADD COLUMN {col} {decl}")
            known.add(col)
        except Exception as e:  # pragma: no cover
            # Best-effort: keep going so one bad column cannot block the rest.
            log.warning("Phase 11.C migration failed: %s", e)
|
||||||
|
|
||||||
|
|
||||||
|
# ───── EU / EEA whitelist (RGPD scope, art. 45 + Schengen extension) ─────
|
||||||
|
# Codes are ISO 3166-1 alpha-2. Source : EU member state list + EEA EFTA states
|
||||||
|
# (NO, IS, LI) + UK (adequacy decision in force as of writing). The
|
||||||
|
# Phase C "extra_eu" flag is set when GeoIP says the tracker's country
|
||||||
|
# ISO is NOT in this set.
|
||||||
|
_EU_EEA_ISO: frozenset = frozenset({
    "AT", "BE", "BG", "CY", "CZ", "DE", "DK", "EE", "ES", "FI", "FR",
    "GR", "HR", "HU", "IE", "IT", "LT", "LU", "LV", "MT", "NL", "PL",
    "PT", "RO", "SE", "SI", "SK",               # 27 EU
    "IS", "LI", "NO",                           # EEA EFTA states
    "GB",                                       # UK adequacy decision
})


def is_eu_iso(iso: Optional[str]) -> bool:
    """Return True when *iso* is a country code listed in ``_EU_EEA_ISO``.

    The comparison is case-insensitive. ``None`` and the empty string are
    treated as "unknown" and yield False.
    """
    if not iso:
        return False
    return iso.upper() in _EU_EEA_ISO
|
||||||
|
|
||||||
|
|
||||||
|
# Lightweight cache around the existing `geo` module so the fold loop
|
||||||
|
# doesn't pay the lookup cost per repeated tracker_domain. Bounded to
|
||||||
|
# 4096 entries (well above any realistic distinct tracker count seen
|
||||||
|
# in a 7d retention window).
|
||||||
|
import functools as _functools
|
||||||
|
|
||||||
|
|
||||||
|
@_functools.lru_cache(maxsize=4096)
def _geo_lookup_cached(host: str) -> Tuple[Optional[str], Optional[str]]:
    """Memoised GeoIP lookup core for ``_geo_for``.

    Exceptions are allowed to propagate on purpose: ``lru_cache`` does not
    store a result when the wrapped call raises, so a transient failure
    (geo module not importable yet) is retried on the next call instead of
    being remembered as "unknown".
    """
    from secubox_toolbox import geo as _g  # type: ignore

    info = _g.lookup(host) or {}
    # Normalise: upper-case ISO code, ASN organisation capped at 64 chars,
    # empty values collapsed to None.
    iso = (info.get("country_iso") or "").upper() or None
    asn = (info.get("asn_org") or "")[:64] or None
    return iso, asn


def _geo_for(host: str) -> Tuple[Optional[str], Optional[str]]:
    """Return (country_iso, asn_org) for a tracker host.

    Best-effort and never raises. Falls back to (None, None) when the GeoIP
    module isn't importable (worker hasn't installed the mmdb yet) or when
    the underlying lookup fails. Successful lookups, including ones where
    the lookup returns nothing for the host, are cached (up to 4096 hosts);
    failures that raise are not cached, so hosts resolve as soon as the
    GeoIP backend becomes available.
    """
    try:
        return _geo_lookup_cached(host)
    except Exception:
        return None, None


# Keep the lru_cache management helpers reachable under the original name.
_geo_for.cache_clear = _geo_lookup_cached.cache_clear  # type: ignore[attr-defined]
_geo_for.cache_info = _geo_lookup_cached.cache_info  # type: ignore[attr-defined]
|
||||||
|
|
||||||
|
|
||||||
def cookie_id_hash(tracker_domain: str, cookie_name: str, cookie_value: str) -> str:
|
def cookie_id_hash(tracker_domain: str, cookie_name: str, cookie_value: str) -> str:
|
||||||
"""Stable short hash for an observed tracker identifier.
|
"""Stable short hash for an observed tracker identifier.
|
||||||
|
|
||||||
|
|
@ -147,13 +232,14 @@ def _record_edge_sync(
|
||||||
tracker_domain: str,
|
tracker_domain: str,
|
||||||
cookie_id_hash_val: str,
|
cookie_id_hash_val: str,
|
||||||
ja4_hash: Optional[str],
|
ja4_hash: Optional[str],
|
||||||
|
consent_state: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
try:
|
try:
|
||||||
with _conn() as c:
|
with _conn() as c:
|
||||||
c.execute(
|
c.execute(
|
||||||
"INSERT INTO social_edges(ts, client_mac_hash, src_site, "
|
"INSERT INTO social_edges(ts, client_mac_hash, src_site, "
|
||||||
"tracker_domain, cookie_id_hash, ja4_hash) "
|
"tracker_domain, cookie_id_hash, ja4_hash, consent_state) "
|
||||||
"VALUES (?, ?, ?, ?, ?, ?)",
|
"VALUES (?, ?, ?, ?, ?, ?, ?)",
|
||||||
(
|
(
|
||||||
int(time.time()),
|
int(time.time()),
|
||||||
client_mac_hash,
|
client_mac_hash,
|
||||||
|
|
@ -161,6 +247,7 @@ def _record_edge_sync(
|
||||||
tracker_domain,
|
tracker_domain,
|
||||||
cookie_id_hash_val,
|
cookie_id_hash_val,
|
||||||
ja4_hash,
|
ja4_hash,
|
||||||
|
consent_state or "none_seen",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
except Exception as e: # pragma: no cover — best-effort
|
except Exception as e: # pragma: no cover — best-effort
|
||||||
|
|
@ -174,9 +261,15 @@ def record_edge(
|
||||||
tracker_domain: str,
|
tracker_domain: str,
|
||||||
cookie_id_hash_val: str,
|
cookie_id_hash_val: str,
|
||||||
ja4_hash: Optional[str] = None,
|
ja4_hash: Optional[str] = None,
|
||||||
|
consent_state: str = "none_seen",
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Submit one edge off-thread. Best-effort, never raises into the
|
"""Submit one edge off-thread. Best-effort, never raises into the
|
||||||
addon, never blocks the mitmproxy asyncio loop."""
|
addon, never blocks the mitmproxy asyncio loop.
|
||||||
|
|
||||||
|
`consent_state` is one of {none_seen, pre_consent, post_consent} as
|
||||||
|
computed by the addon based on the per-peer × per-site consent
|
||||||
|
cookie observation log (Phase 11.C).
|
||||||
|
"""
|
||||||
if not (client_mac_hash and src_site and tracker_domain and cookie_id_hash_val):
|
if not (client_mac_hash and src_site and tracker_domain and cookie_id_hash_val):
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
|
|
@ -187,6 +280,7 @@ def record_edge(
|
||||||
tracker_domain,
|
tracker_domain,
|
||||||
cookie_id_hash_val,
|
cookie_id_hash_val,
|
||||||
ja4_hash,
|
ja4_hash,
|
||||||
|
consent_state,
|
||||||
)
|
)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
# Executor shut down (interpreter teardown) — silent drop.
|
# Executor shut down (interpreter teardown) — silent drop.
|
||||||
|
|
@ -208,7 +302,7 @@ def fold_recent(window_seconds: int = 300) -> Tuple[int, int]:
|
||||||
with _conn() as c:
|
with _conn() as c:
|
||||||
edges = c.execute(
|
edges = c.execute(
|
||||||
"SELECT client_mac_hash, src_site, tracker_domain, "
|
"SELECT client_mac_hash, src_site, tracker_domain, "
|
||||||
"cookie_id_hash, ja4_hash, ts "
|
"cookie_id_hash, ja4_hash, ts, consent_state "
|
||||||
"FROM social_edges WHERE ts >= ?",
|
"FROM social_edges WHERE ts >= ?",
|
||||||
(since,),
|
(since,),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
@ -233,12 +327,20 @@ def fold_recent(window_seconds: int = 300) -> Tuple[int, int]:
|
||||||
key_n = (mac, trk)
|
key_n = (mac, trk)
|
||||||
n = per_node.setdefault(
|
n = per_node.setdefault(
|
||||||
key_n,
|
key_n,
|
||||||
{"hits": 0, "first_seen": ts, "last_seen": ts, "sites": set()},
|
{
|
||||||
|
"hits": 0,
|
||||||
|
"first_seen": ts,
|
||||||
|
"last_seen": ts,
|
||||||
|
"sites": set(),
|
||||||
|
"pre_consent_hits": 0,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
n["hits"] += 1
|
n["hits"] += 1
|
||||||
n["first_seen"] = min(n["first_seen"], ts)
|
n["first_seen"] = min(n["first_seen"], ts)
|
||||||
n["last_seen"] = max(n["last_seen"], ts)
|
n["last_seen"] = max(n["last_seen"], ts)
|
||||||
n["sites"].add(site)
|
n["sites"].add(site)
|
||||||
|
if e["consent_state"] == "pre_consent":
|
||||||
|
n["pre_consent_hits"] += 1
|
||||||
|
|
||||||
# Per-site tracker index (for link fold below)
|
# Per-site tracker index (for link fold below)
|
||||||
per_site_trackers.setdefault((mac, site), set()).add(trk)
|
per_site_trackers.setdefault((mac, site), set()).add(trk)
|
||||||
|
|
@ -248,8 +350,13 @@ def fold_recent(window_seconds: int = 300) -> Tuple[int, int]:
|
||||||
# Persist nodes
|
# Persist nodes
|
||||||
for (mac, trk), n in per_node.items():
|
for (mac, trk), n in per_node.items():
|
||||||
# Merge into existing row if present
|
# Merge into existing row if present
|
||||||
|
# Phase 11.C : enrich with GeoIP at fold time so reads
|
||||||
|
# + PDF rendering never block on mmdb lookups.
|
||||||
|
country_iso, asn_org = _geo_for(trk)
|
||||||
|
eu_inside = 1 if is_eu_iso(country_iso) else 0
|
||||||
cur = c.execute(
|
cur = c.execute(
|
||||||
"SELECT hits, first_seen, sites_jsonl FROM social_nodes "
|
"SELECT hits, first_seen, sites_jsonl, pre_consent_hits "
|
||||||
|
"FROM social_nodes "
|
||||||
"WHERE client_mac_hash = ? AND tracker_domain = ?",
|
"WHERE client_mac_hash = ? AND tracker_domain = ?",
|
||||||
(mac, trk),
|
(mac, trk),
|
||||||
).fetchone()
|
).fetchone()
|
||||||
|
|
@ -261,17 +368,22 @@ def fold_recent(window_seconds: int = 300) -> Tuple[int, int]:
|
||||||
except Exception:
|
except Exception:
|
||||||
existing_sites = set()
|
existing_sites = set()
|
||||||
sites = sorted(existing_sites | n["sites"])
|
sites = sorted(existing_sites | n["sites"])
|
||||||
|
pre = (cur["pre_consent_hits"] or 0) + n["pre_consent_hits"]
|
||||||
c.execute(
|
c.execute(
|
||||||
"UPDATE social_nodes SET hits = ?, first_seen = ?, "
|
"UPDATE social_nodes SET hits = ?, first_seen = ?, "
|
||||||
"last_seen = ?, sites_jsonl = ? "
|
"last_seen = ?, sites_jsonl = ?, country_iso = ?, "
|
||||||
|
"asn_org = ?, eu_inside = ?, pre_consent_hits = ? "
|
||||||
"WHERE client_mac_hash = ? AND tracker_domain = ?",
|
"WHERE client_mac_hash = ? AND tracker_domain = ?",
|
||||||
(hits, first, n["last_seen"], json.dumps(sites), mac, trk),
|
(hits, first, n["last_seen"], json.dumps(sites),
|
||||||
|
country_iso, asn_org, eu_inside, pre, mac, trk),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
c.execute(
|
c.execute(
|
||||||
"INSERT INTO social_nodes(client_mac_hash, "
|
"INSERT INTO social_nodes(client_mac_hash, "
|
||||||
"tracker_domain, hits, first_seen, last_seen, "
|
"tracker_domain, hits, first_seen, last_seen, "
|
||||||
"sites_jsonl) VALUES (?, ?, ?, ?, ?, ?)",
|
"sites_jsonl, country_iso, asn_org, eu_inside, "
|
||||||
|
"pre_consent_hits) "
|
||||||
|
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
|
||||||
(
|
(
|
||||||
mac,
|
mac,
|
||||||
trk,
|
trk,
|
||||||
|
|
@ -279,6 +391,10 @@ def fold_recent(window_seconds: int = 300) -> Tuple[int, int]:
|
||||||
n["first_seen"],
|
n["first_seen"],
|
||||||
n["last_seen"],
|
n["last_seen"],
|
||||||
json.dumps(sorted(n["sites"])),
|
json.dumps(sorted(n["sites"])),
|
||||||
|
country_iso,
|
||||||
|
asn_org,
|
||||||
|
eu_inside,
|
||||||
|
n["pre_consent_hits"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
nodes_touched += 1
|
nodes_touched += 1
|
||||||
|
|
@ -502,6 +618,74 @@ def aggregate(hours: int = 24) -> Dict:
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _evidence_sites(raw) -> list:
    """Decode a ``sites_jsonl`` cell; undecodable content degrades to []."""
    try:
        return json.loads(raw)
    except Exception:
        return []


def evidence(mac_hash: str, since_seconds: int = 86400) -> Dict:
    """Phase 11.C evidence helper — returns the legal-grade slice
    consumed by the bilingual PDF report.

    Two evidence buckets, both fact-only (no interpretation) :
      - ``pre_consent`` : (tracker_domain, hits, pre_consent_hits, sites,
        country_iso, asn_org, last_seen). Trackers that fired BEFORE a
        consent cookie was observed for that peer × site. Direct RGPD
        art. 7 + art. 6.1.a evidence. Ordered by pre_consent_hits, then
        hits, both descending.
      - ``extra_eu``    : (tracker_domain, hits, sites, country_iso,
        asn_org, last_seen). Trackers resolving to non-EU/EEA countries
        (rows with an unknown country are excluded). Note : we report the
        fact, not SCC absence (we can't prove a negative). RGPD art. 44+
        evidence. Ordered by hits descending.

    Args:
        mac_hash: Hashed client identifier to report on. A falsy value
            returns both buckets empty without touching the database.
        since_seconds: Look-back window on ``last_seen``. Values below
            3600 are raised to one hour.

    Returns:
        ``{"pre_consent": [...], "extra_eu": [...]}``, at most 100 rows per
        bucket. Never raises: a database error is logged and whatever was
        collected so far is returned.
    """
    # Local import: the only name this function needs beyond what the
    # module already has in scope.
    from contextlib import closing

    since = int(time.time()) - max(since_seconds, 3600)
    out: Dict = {"pre_consent": [], "extra_eu": []}
    if not mac_hash:
        return out
    try:
        # ``with connection`` would only scope a transaction; ``closing``
        # guarantees the connection handed out by _conn() is released once
        # this read-only report is built.
        with closing(_conn()) as c:
            pre_rows = c.execute(
                "SELECT tracker_domain, hits, pre_consent_hits, sites_jsonl, "
                "country_iso, asn_org, last_seen "
                "FROM social_nodes "
                "WHERE client_mac_hash = ? AND last_seen >= ? "
                "AND pre_consent_hits > 0 "
                "ORDER BY pre_consent_hits DESC, hits DESC LIMIT 100",
                (mac_hash, since),
            ).fetchall()
            for r in pre_rows:
                out["pre_consent"].append({
                    "tracker_domain": r["tracker_domain"],
                    "hits": r["hits"],
                    "pre_consent_hits": r["pre_consent_hits"],
                    "sites": _evidence_sites(r["sites_jsonl"]),
                    "country_iso": r["country_iso"],
                    "asn_org": r["asn_org"],
                    "last_seen": r["last_seen"],
                })

            extra_rows = c.execute(
                "SELECT tracker_domain, hits, sites_jsonl, country_iso, "
                "asn_org, last_seen "
                "FROM social_nodes "
                "WHERE client_mac_hash = ? AND last_seen >= ? "
                "AND eu_inside = 0 AND country_iso IS NOT NULL "
                "ORDER BY hits DESC LIMIT 100",
                (mac_hash, since),
            ).fetchall()
            for r in extra_rows:
                out["extra_eu"].append({
                    "tracker_domain": r["tracker_domain"],
                    "hits": r["hits"],
                    "sites": _evidence_sites(r["sites_jsonl"]),
                    "country_iso": r["country_iso"],
                    "asn_org": r["asn_org"],
                    "last_seen": r["last_seen"],
                })
    except Exception as e:  # pragma: no cover
        log.warning("evidence query failed: %s", e)
    return out
|
||||||
|
|
||||||
|
|
||||||
def purge_older_than(days: int = 7) -> int:
|
def purge_older_than(days: int = 7) -> int:
|
||||||
"""Drop raw edges older than `days`. The aggregate node/link tables
|
"""Drop raw edges older than `days`. The aggregate node/link tables
|
||||||
stay : they represent the durable fold. Operator-side wipe goes
|
stay : they represent the durable fold. Operator-side wipe goes
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user