Compare commits

...

3 Commits

Author SHA1 Message Date
CyberMind
a44d9c51d8
Merge pull request #647 from CyberMind-FR/feature/646-perf-toolbox-adaptive-accept-encoding-st
Some checks are pending
License Headers / check (push) Waiting to run
perf(toolbox): adaptive Accept-Encoding strip — stop pulling CSP-strict pages uncompressed through R3
2026-06-18 09:34:00 +02:00
013385a6c4 chore(toolbox): changelog 2.6.53 for adaptive Accept-Encoding strip (ref #646) 2026-06-18 09:31:50 +02:00
6ee7fe3cbc perf(toolbox): adaptive Accept-Encoding strip — only force identity on stream-eligible hosts (ref #646) 2026-06-18 09:31:29 +02:00
3 changed files with 165 additions and 15 deletions

View File

@ -1,3 +1,16 @@
secubox-toolbox (2.6.53-1~bookworm1) bookworm; urgency=medium
* perf(#646): adaptive Accept-Encoding strip in inject_banner. Keep gzip/br by
default; only force identity (for stream-injecting the loader) on hosts
proven streaming-eligible (top-level html, 2xx-3xx, r2/r3, banner on, NOT
CSP-strict) on a prior response. CSP-strict / heavy sites stay compressed
instead of being pulled uncompressed (3-5x bytes) through the GIL-bound R3
worker for zero benefit — cuts per-document CPU + transfer on the slow path.
No feature loss: banner still injects via the buffer path; streaming TTFB
win preserved on eligible hosts after the first visit.
-- Gerald KERMA <devel@cybermind.fr> Thu, 18 Jun 2026 11:00:00 +0200
secubox-toolbox (2.6.52-1~bookworm1) bookworm; urgency=medium secubox-toolbox (2.6.52-1~bookworm1) bookworm; urgency=medium
* perf(#644): /admin/clients/rich enriches only the ENRICH_LIMIT (12) * perf(#644): /admin/clients/rich enriches only the ENRICH_LIMIT (12)

View File

@ -671,6 +671,26 @@ _MAX_INJECT_BYTES = 2 * 1024 * 1024 # Phase 10 perf cap : skip injection on hug
# critical path. Gated, fail-open: any miss falls back to passthrough (no # critical path. Gated, fail-open: any miss falls back to passthrough (no
# banner on that page) or to the legacy buffer path when the body is compressed. # banner on that page) or to the legacy buffer path when the body is compressed.
# ── #646 : adaptive Accept-Encoding strip ───────────────────────────────────
# Forcing identity on EVERY document pulled CSP-strict / heavy pages
# uncompressed (3-5x bytes) through the GIL-bound worker for ZERO benefit —
# streaming is disqualified on those pages anyway. So we keep gzip/br by
# default and only strip Accept-Encoding for hosts we've PROVEN
# streaming-eligible (top-level html + 2xx-3xx + r2/r3 + banner on + NOT
# CSP-strict) on a prior response. CSP-strict / non-doc hosts stay compressed
# until a later response proves them eligible (banner still injects via the
# buffer path, which decompresses fine).
# Per-process, in-memory, size-capped, self-healing — verdicts re-learn cheaply.
# host -> bool (True = strip Accept-Encoding to identity & stream-inject).
_STREAM_VERDICT: dict = {}
# Upper bound on the number of cached hosts before the cache is reset.
_STREAM_VERDICT_MAX = 8192


def _record_stream_verdict(host: str, eligible: bool) -> None:
    """Store the streaming-eligibility verdict observed for *host*.

    Args:
        host: Host name used as the cache key. Empty values are ignored.
        eligible: True when the host should be asked for an identity-encoded
            body on later requests, False to leave its compression alone.

    The cache is capped at ``_STREAM_VERDICT_MAX`` entries. When a NEW host
    would exceed the cap, the whole cache is cleared before the insert.
    Refreshing the verdict of a host that is already cached never grows the
    dict, so it must not trigger the reset and throw away every learned
    verdict.
    """
    if not host:
        return
    is_new_host = host not in _STREAM_VERDICT
    if is_new_host and len(_STREAM_VERDICT) >= _STREAM_VERDICT_MAX:
        _STREAM_VERDICT.clear()  # crude self-heal; cheap to re-learn
    _STREAM_VERDICT[host] = eligible
def _stream_enabled() -> bool: def _stream_enabled() -> bool:
try: try:
import sys as _sys import sys as _sys
@ -760,15 +780,18 @@ class InjectBanner:
return return
except Exception as e: except Exception as e:
log.warning("toolbox asset serve failed for %s: %s", flow.request.path, e) log.warning("toolbox asset serve failed for %s: %s", flow.request.path, e)
# #620 : for top-level HTML navigations, ask upstream for identity # #620/#646 : for top-level HTML navigations to hosts PROVEN
# encoding so we can stream-inject the loader without decompressing. # streaming-eligible, ask upstream for identity encoding so we can
# stream-inject the loader without decompressing. Unknown / CSP-strict
# hosts keep their gzip/br compression (learned in responseheaders).
if not _stream_enabled(): if not _stream_enabled():
return return
try: try:
req = flow.request req = flow.request
accept = (req.headers.get("accept", "") or "").lower() accept = (req.headers.get("accept", "") or "").lower()
dest = (req.headers.get("sec-fetch-dest", "") or "").lower() dest = (req.headers.get("sec-fetch-dest", "") or "").lower()
if dest == "document" or "text/html" in accept: is_doc = dest == "document" or "text/html" in accept
if is_doc and _STREAM_VERDICT.get(flow.request.pretty_host or ""):
if "accept-encoding" in req.headers: if "accept-encoding" in req.headers:
req.headers["accept-encoding"] = "identity" req.headers["accept-encoding"] = "identity"
except Exception: except Exception:
@ -787,10 +810,6 @@ class InjectBanner:
return return
if resp.status_code < 200 or resp.status_code >= 400: if resp.status_code < 200 or resp.status_code >= 400:
return return
# Compressed (upstream ignored our identity request) → let the buffer
# path handle it (mitmproxy auto-decodes there). Don't stream.
if resp.headers.get("content-encoding"):
return
if _client_level(flow) not in ("r2", "r3"): if _client_level(flow) not in ("r2", "r3"):
return return
try: try:
@ -802,15 +821,20 @@ class InjectBanner:
return return
except Exception: except Exception:
pass pass
# #636 — strict CSP would block the injected loader <script> and its # #646 — learn per-host streaming eligibility from THIS response, so the
# /__toolbox/bundle fetch → no banner. Don't stream; fall through to the # next visit's request() knows whether to strip Accept-Encoding. This is
# legacy buffer path, which injects an inline-CSS banner (no script/fetch) # independent of the current encoding: a host eligible today is worth
# that survives strict CSP. # asking identity from tomorrow. #636 — strict CSP blocks the injected
if _detect_csp_strict(flow): # loader <script>; #639 — only top-level navigations get the banner.
# Both disqualify streaming → ineligible → keep compression.
eligible = _is_top_level_document(flow) and not _detect_csp_strict(flow)
_record_stream_verdict(flow.request.pretty_host or "", eligible)
if not eligible:
return return
# #639 — only inject into top-level navigations; iframes/sub-documents # Compressed (upstream ignored our identity request, or this is the first
# each get their own responseheaders call → multiple banners per visit. # visit before we'd learned to ask identity) → let the buffer path handle
if not _is_top_level_document(flow): # it (mitmproxy auto-decodes there). Can only stream an identity body.
if resp.headers.get("content-encoding"):
return return
try: try:
resp.stream = _LoaderInjector(_loader_script(flow)) resp.stream = _LoaderInjector(_loader_script(flow))

View File

@ -0,0 +1,113 @@
# SPDX-License-Identifier: LicenseRef-CMSD-1.0
"""#646 — adaptive Accept-Encoding strip.
The loader stream-inject needs an identity-encoded body, but forcing identity on
EVERY document pulls CSP-strict/heavy pages uncompressed for no benefit. We only
strip Accept-Encoding for hosts proven streaming-eligible on a prior visit;
unknown/ineligible hosts keep gzip/br.
"""
import sys
import pathlib
import importlib
import json
ADDON_DIR = pathlib.Path(__file__).resolve().parents[1] / "mitmproxy_addons"
sys.path.insert(0, str(ADDON_DIR))
from mitmproxy.test import tflow, tutils # noqa: E402
from secubox_toolbox import filters # noqa: E402
def _addon(monkeypatch, tmp_path):
    """Return a freshly reloaded ``inject_banner`` module set up for a test.

    Writes a filters file with ``banner`` and ``stream_inject`` switched on,
    points ``filters.FILTERS_PATH`` at it and forces a filter reload, then
    reloads the addon module, pins ``_client_level`` to ``"r3"`` and empties
    the per-host verdict cache.
    """
    filters_file = tmp_path / "filters.json"
    enabled = {"banner": True, "stream_inject": True}
    filters_file.write_text(json.dumps(enabled))
    monkeypatch.setattr(filters, "FILTERS_PATH", str(filters_file))
    filters.get_filters(force=True)

    # Imported here so the module is loaded only after the filters are patched.
    import inject_banner

    importlib.reload(inject_banner)
    monkeypatch.setattr(inject_banner, "_client_level", lambda flow: "r3")
    inject_banner._STREAM_VERDICT.clear()
    return inject_banner
def _doc_request(host="example.com"):
    """Build a test flow that looks like a top-level HTML navigation to *host*.

    The request advertises ``gzip, br`` so tests can tell whether the addon
    rewrote ``accept-encoding``.
    """
    flow = tflow.tflow()
    flow.request.host = host
    navigation_headers = (
        ("accept", "text/html,application/xhtml+xml"),
        ("accept-encoding", "gzip, br"),
        ("sec-fetch-dest", "document"),
    )
    for header_name, header_value in navigation_headers:
        flow.request.headers[header_name] = header_value
    return flow
def _html_response(host="example.com"):
    """Build a test flow for *host* carrying a 200 ``text/html`` response.

    The request side is marked as a document fetch via ``sec-fetch-dest``.
    """
    flow = tflow.tflow(resp=tutils.tresp())

    # Request side: target host + top-level document marker.
    flow.request.host = host
    flow.request.headers["sec-fetch-dest"] = "document"

    # Response side: successful HTML page.
    flow.response.status_code = 200
    flow.response.headers["content-type"] = "text/html; charset=utf-8"
    return flow
def test_unknown_host_keeps_compression(monkeypatch, tmp_path):
    """A host with no recorded verdict keeps its original Accept-Encoding."""
    addon = _addon(monkeypatch, tmp_path)
    flow = _doc_request()
    addon.InjectBanner().request(flow)
    # No verdict cached → the header must NOT be stripped to identity.
    assert flow.request.headers["accept-encoding"] == "gzip, br"
def test_eligible_host_strips_identity(monkeypatch, tmp_path):
    """A host cached as eligible gets its Accept-Encoding forced to identity."""
    addon = _addon(monkeypatch, tmp_path)
    addon._STREAM_VERDICT["example.com"] = True
    flow = _doc_request()
    addon.InjectBanner().request(flow)
    assert flow.request.headers["accept-encoding"] == "identity"
def test_ineligible_host_keeps_compression(monkeypatch, tmp_path):
    """A host cached as ineligible keeps its original Accept-Encoding."""
    addon = _addon(monkeypatch, tmp_path)
    addon._STREAM_VERDICT["example.com"] = False
    flow = _doc_request()
    addon.InjectBanner().request(flow)
    assert flow.request.headers["accept-encoding"] == "gzip, br"
def test_responseheaders_records_eligible(monkeypatch, tmp_path):
    """Observing a plain HTML document response records the host as eligible."""
    addon = _addon(monkeypatch, tmp_path)
    flow = _html_response()
    addon.InjectBanner().responseheaders(flow)
    verdict = addon._STREAM_VERDICT.get("example.com")
    assert verdict is True
def test_responseheaders_records_ineligible_for_csp_strict(monkeypatch, tmp_path):
    """A response flagged CSP-strict records the host as ineligible."""
    addon = _addon(monkeypatch, tmp_path)
    # Force the CSP detector to report "strict" for every flow.
    monkeypatch.setattr(addon, "_detect_csp_strict", lambda flow: True)
    strict_host = "strict.example.com"
    flow = _html_response(host=strict_host)
    addon.InjectBanner().responseheaders(flow)
    assert addon._STREAM_VERDICT.get(strict_host) is False
    # CSP-strict → buffer path, not stream.
    assert not flow.metadata.get("sbx_streamed")
def test_learn_then_strip_end_to_end(monkeypatch, tmp_path):
    """Full cycle: first request untouched, response learned, second stripped."""
    addon = _addon(monkeypatch, tmp_path)

    # Visit 1: the host is unknown, so compression is left alone.
    first_visit = _doc_request()
    addon.InjectBanner().request(first_visit)
    assert first_visit.request.headers["accept-encoding"] == "gzip, br"

    # The response for that host is observed → verdict becomes "eligible".
    observed = _html_response()
    addon.InjectBanner().responseheaders(observed)
    assert addon._STREAM_VERDICT.get("example.com") is True

    # Visit 2: the learned verdict makes the addon ask for identity.
    second_visit = _doc_request()
    addon.InjectBanner().request(second_visit)
    assert second_visit.request.headers["accept-encoding"] == "identity"
def test_verdict_cache_self_heals_on_overflow(monkeypatch, tmp_path):
    """Inserting past the cap clears the cache and keeps only the new host."""
    addon = _addon(monkeypatch, tmp_path)
    monkeypatch.setattr(addon, "_STREAM_VERDICT_MAX", 4)

    # Fill the cache exactly to the (patched) cap.
    for hostname in ("h0", "h1", "h2", "h3"):
        addon._record_stream_verdict(hostname, True)
    assert len(addon._STREAM_VERDICT) == 4

    # One more distinct host overflows: cache is cleared, then the host added.
    addon._record_stream_verdict("h4", True)
    assert len(addon._STREAM_VERDICT) == 1
    assert addon._STREAM_VERDICT.get("h4") is True