Implement secubox-ai-gateway package with intelligent AI request routing
based on data sensitivity classification for GDPR/ANSSI compliance.

Features:
- 3-tier data classification: LOCAL_ONLY, SANITIZED, CLOUD_DIRECT
- Provider hierarchy: LocalAI > Mistral (EU) > Claude > GPT > Gemini > xAI
- PII sanitizer: IPv4/IPv6, MAC, credentials, private keys scrubbing
- OpenAI-compatible API proxy on port 4050
- aigatewayctl CLI: status, classify, sanitize, provider, audit commands
- RPCD backend with 11 ubus methods for LuCI integration
- ANSSI CSPN audit logging in JSONL format

Classification patterns detect:
- IP addresses, MAC addresses, private keys
- Credentials (password, secret, token, api_key)
- System paths, security tool references
- WireGuard configuration data

All cloud providers are opt-in. Default LOCAL_ONLY ensures data sovereignty -
sensitive data never leaves the device.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
137 lines
3.8 KiB
Bash
137 lines
3.8 KiB
Bash
#!/bin/sh
# SecuBox AI Gateway - Data Classifier (Sovereignty Engine)
# Classifies AI requests into LOCAL_ONLY, SANITIZED, or CLOUD_DIRECT tiers

# UCI package whose *_patterns sections hold the classification regexes
CONFIG="ai-gateway"

# Pattern cache files, one extended regex per line (written by init_patterns,
# read by classify_text / classify_with_reason)
PATTERNS_LOCAL="/tmp/ai-gateway/patterns_local.txt"
PATTERNS_SANITIZABLE="/tmp/ai-gateway/patterns_sanitizable.txt"

# Classification levels (numeric for comparison).
# Lower value = more restrictive; callers keep the minimum they have seen.
CLASS_LOCAL_ONLY=0
CLASS_SANITIZED=1
CLASS_CLOUD_DIRECT=2

# Write one UCI pattern list to a cache file, one pattern per line.
#   $1 - UCI section name inside the $CONFIG package
#   $2 - destination cache file (truncated first)
# Returns non-zero only if the cache file cannot be written.
_init_patterns_dump() {
    local patterns p

    # A missing section/option simply yields an empty cache file.
    patterns=$(uci -q get "${CONFIG}.$1.pattern" 2>/dev/null) || patterns=""

    # The list is split on whitespace (patterns therefore cannot contain
    # spaces), but globbing is switched off so regex metacharacters such as
    # `*`, `?` and `[0-9]` are never expanded against files in the current
    # directory. The subshell keeps `set -f` from leaking to the caller.
    (
        set -f
        for p in $patterns; do
            # printf, not echo: patterns may contain backslashes.
            printf '%s\n' "$p"
        done
    ) > "$2"
}

# Initialize pattern cache from UCI
#
# Rebuilds $PATTERNS_LOCAL from ${CONFIG}.local_only_patterns.pattern and
# $PATTERNS_SANITIZABLE from ${CONFIG}.sanitizable_patterns.pattern, creating
# the directories that hold the cache files if needed.
# Globals: CONFIG, PATTERNS_LOCAL, PATTERNS_SANITIZABLE (read)
# Returns: 0 on success, 1 if the cache directory or files cannot be written
init_patterns() {
    mkdir -p "$(dirname "$PATTERNS_LOCAL")" "$(dirname "$PATTERNS_SANITIZABLE")" \
        || return 1

    # Load LOCAL_ONLY patterns from UCI
    _init_patterns_dump local_only_patterns "$PATTERNS_LOCAL" || return 1

    # Load SANITIZABLE patterns from UCI
    _init_patterns_dump sanitizable_patterns "$PATTERNS_SANITIZABLE" || return 1
}

# Classify a text block
#
# Arguments: $1 - text to inspect (may span several lines)
# Globals:   PATTERNS_LOCAL, PATTERNS_SANITIZABLE, CLASS_* (read)
# Returns (exit status): 0=LOCAL_ONLY, 1=SANITIZED, 2=CLOUD_DIRECT
#
# Each non-empty line of the pattern files is used as a case-insensitive
# extended regex. Any LOCAL_ONLY match wins immediately; otherwise any
# SANITIZABLE match yields SANITIZED; otherwise CLOUD_DIRECT.
# NOTE: grep diagnostics are discarded, so an invalid regex in a pattern file
# behaves like a pattern that never matches.
classify_text() {
    local text="$1"
    local pattern
    local classification="${CLASS_CLOUD_DIRECT:-2}"

    # Fail closed: without a readable pattern cache nothing can be proven
    # safe, so report the most restrictive tier instead of CLOUD_DIRECT.
    if [ ! -r "$PATTERNS_LOCAL" ] || [ ! -r "$PATTERNS_SANITIZABLE" ]; then
        return "${CLASS_LOCAL_ONLY:-0}"
    fi

    # Check for LOCAL_ONLY patterns (highest priority)
    while IFS= read -r pattern || [ -n "$pattern" ]; do
        [ -z "$pattern" ] && continue
        # printf keeps texts like "-n" or backslashes intact; -e keeps
        # patterns that start with "-" from being parsed as grep options.
        if printf '%s\n' "$text" | grep -qiE -e "$pattern" 2>/dev/null; then
            return "${CLASS_LOCAL_ONLY:-0}"
        fi
    done < "$PATTERNS_LOCAL"

    # Check for SANITIZABLE patterns
    while IFS= read -r pattern || [ -n "$pattern" ]; do
        [ -z "$pattern" ] && continue
        if printf '%s\n' "$text" | grep -qiE -e "$pattern" 2>/dev/null; then
            classification="${CLASS_SANITIZED:-1}"
            break   # one match is enough; the result cannot change further
        fi
    done < "$PATTERNS_SANITIZABLE"

    return "$classification"
}

# Escape a string for embedding inside a JSON string literal
# (backslash and double quote; patterns are single-line by construction).
_classify_json_escape() {
    printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

# Detailed classification with reason (JSON output)
#
# Arguments: $1 - text to inspect
# Globals:   PATTERNS_LOCAL, PATTERNS_SANITIZABLE (read)
# Outputs:   one JSON object on stdout with the keys "classification",
#            "reason" and "pattern" (the first pattern that matched, or "")
# Returns:   always 0; the verdict is carried by the JSON, not the status
classify_with_reason() {
    local text="$1"
    local pattern

    # Fail closed: with no readable pattern cache, never claim cloud-safe.
    if [ ! -r "$PATTERNS_LOCAL" ] || [ ! -r "$PATTERNS_SANITIZABLE" ]; then
        printf '{"classification":"local_only","reason":"pattern_cache_unavailable","pattern":""}\n'
        return 0
    fi

    # Check LOCAL_ONLY patterns
    while IFS= read -r pattern || [ -n "$pattern" ]; do
        [ -z "$pattern" ] && continue
        # -e protects patterns that begin with "-"; printf protects the text.
        if printf '%s\n' "$text" | grep -qiE -e "$pattern" 2>/dev/null; then
            # Regexes routinely contain \ and ", which must be escaped to
            # keep the output valid JSON.
            printf '{"classification":"local_only","reason":"matched_sensitive_pattern","pattern":"%s"}\n' \
                "$(_classify_json_escape "$pattern")"
            return 0
        fi
    done < "$PATTERNS_LOCAL"

    # Check SANITIZABLE patterns
    while IFS= read -r pattern || [ -n "$pattern" ]; do
        [ -z "$pattern" ] && continue
        if printf '%s\n' "$text" | grep -qiE -e "$pattern" 2>/dev/null; then
            printf '{"classification":"sanitized","reason":"contains_pii","pattern":"%s"}\n' \
                "$(_classify_json_escape "$pattern")"
            return 0
        fi
    done < "$PATTERNS_SANITIZABLE"

    # Default: safe for cloud
    printf '{"classification":"cloud_direct","reason":"no_sensitive_data","pattern":""}\n'
}

# High-level classification of full JSON request
#
# Arguments: $1 - request body as a JSON string
# Globals:   CLASS_* (read)
# Returns (exit status): the most restrictive (lowest) class that
#            classify_text reports for the request's message contents and
#            its `prompt` field; CLOUD_DIRECT when neither yields any text.
classify_request() {
    local request_json="$1"
    local worst_classification="${CLASS_CLOUD_DIRECT:-2}"
    local messages words prompt field_class

    # Extract all text content from messages array (system message included,
    # since it is an element of that array).
    messages=$(printf '%s\n' "$request_json" \
        | jsonfilter -e '@.messages[*].content' 2>/dev/null)

    if [ -n "$messages" ]; then
        # Classify every message in a single call, presented two ways:
        #  - as whole lines, so patterns spanning several words can match;
        #  - one word per line, which is what the earlier per-word loop
        #    matched, so patterns anchored to a whole word keep working.
        # The text is never word-split by the shell, so `*` in user content
        # is not glob-expanded and only one grep per pattern is spawned.
        words=$(printf '%s\n' "$messages" | tr -s ' \t' '\n\n')
        classify_text "$(printf '%s\n%s' "$messages" "$words")"
        field_class=$?
        if [ "$field_class" -lt "$worst_classification" ]; then
            worst_classification=$field_class
        fi
    fi

    # Also check prompt field (for completions API)
    prompt=$(printf '%s\n' "$request_json" | jsonfilter -e '@.prompt' 2>/dev/null)
    if [ -n "$prompt" ]; then
        classify_text "$prompt"
        field_class=$?
        if [ "$field_class" -lt "$worst_classification" ]; then
            worst_classification=$field_class
        fi
    fi

    return "$worst_classification"
}

# Convert numeric classification to string
#
# Arguments: $1 - numeric class (0, 1 or 2)
# Outputs:   "local_only", "sanitized" or "cloud_direct" on stdout;
#            any other input maps to "local_only"
classification_to_string() {
    case "$1" in
        0) echo "local_only" ;;
        1) echo "sanitized" ;;
        2) echo "cloud_direct" ;;
        *) echo "local_only" ;;  # Default to most restrictive
    esac
}

# Convert string classification to numeric
#
# Arguments: $1 - "local_only", "sanitized" or "cloud_direct"
# Outputs:   0, 1 or 2 on stdout; any other input maps to 0
string_to_classification() {
    case "$1" in
        local_only) echo 0 ;;
        sanitized) echo 1 ;;
        cloud_direct) echo 2 ;;
        *) echo 0 ;;  # Default to most restrictive
    esac
}