#!/bin/sh
# PeerTube Video Transcript Extraction & AI Analysis
# SecuBox Intelligence Module
# Compatible: OpenWrt, Alpine, Debian, Ubuntu
#
# Pipeline: metadata (yt-dlp) -> subtitles (yt-dlp) or Whisper transcription
#           -> Claude analysis (Anthropic Messages API) -> Markdown report.
#
# NOTE: the script uses `local`, which is not strictly POSIX but is provided
# by dash, busybox ash and bash.

set -e

#=============================================================================
# CONFIGURATION
#=============================================================================

SCRIPT_VERSION="1.0.0"
PEERTUBE_INSTANCE="${PEERTUBE_INSTANCE:-tube.gk2.secubox.in}"
OUTPUT_BASE="${OUTPUT_BASE:-./output}"
WHISPER_MODEL="${WHISPER_MODEL:-medium}"
WHISPER_LANG="${WHISPER_LANG:-fr}"
CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
MAX_TRANSCRIPT_CHARS=12000
MAX_TOKENS=2000

#=============================================================================
# COLORS & LOGGING
#=============================================================================

# Colors are only enabled when stdout is a terminal, so redirected output
# stays free of escape sequences. The variables hold real escape bytes (not
# the text "\033"), which makes them usable both with printf %b and inside
# the here-document printed by show_help.
if [ -t 1 ]; then
    if command -v tput >/dev/null 2>&1 && tput setaf 1 >/dev/null 2>&1; then
        # "|| VAR=''" keeps a failing tput from aborting under set -e.
        RED=$(tput setaf 1) || RED=''
        GREEN=$(tput setaf 2) || GREEN=''
        YELLOW=$(tput setaf 3) || YELLOW=''
        CYAN=$(tput setaf 6) || CYAN=''
        BOLD=$(tput bold) || BOLD=''
        NC=$(tput sgr0) || NC=''
    else
        ESC=$(printf '\033')
        RED="${ESC}[0;31m"
        GREEN="${ESC}[0;32m"
        YELLOW="${ESC}[1;33m"
        CYAN="${ESC}[0;36m"
        BOLD="${ESC}[1m"
        NC="${ESC}[0m"
    fi
else
    RED=''
    GREEN=''
    YELLOW=''
    CYAN=''
    BOLD=''
    NC=''
fi

# Logging helpers. $1 is the message; only log_error writes to stderr.
log_info()  { printf "%b[INFO]%b %s\n" "$CYAN" "$NC" "$1"; }
log_ok()    { printf "%b[OK]%b %s\n" "$GREEN" "$NC" "$1"; }
log_warn()  { printf "%b[WARN]%b %s\n" "$YELLOW" "$NC" "$1"; }
log_error() { printf "%b[ERROR]%b %s\n" "$RED" "$NC" "$1" >&2; }
log_step()  { printf "\n%b==> %s%b\n" "$BOLD$CYAN" "$1" "$NC"; }

#=============================================================================
# DEPENDENCY CHECK
#=============================================================================

# Verify required tools and detect the optional ones.
# Globals:  WHISPER_CMD (written: "whisper", "whisper-cpp", "main" or "")
#           ANTHROPIC_API_KEY (read)
# Returns:  0 when yt-dlp, ffmpeg, jq and curl are all present, 1 otherwise.
check_dependencies() {
    local missing=""
    local deps="yt-dlp ffmpeg jq curl"
    local dep

    log_step "Checking dependencies"

    for dep in $deps; do
        if command -v "$dep" >/dev/null 2>&1; then
            log_ok "$dep found: $(command -v "$dep")"
        else
            log_error "$dep not found"
            missing="$missing $dep"
        fi
    done

    # Whisper check (optional but warned)
    if command -v whisper >/dev/null 2>&1; then
        log_ok "whisper found: $(command -v whisper)"
        WHISPER_CMD="whisper"
    elif command -v whisper-cpp >/dev/null 2>&1; then
        log_ok "whisper-cpp found: $(command -v whisper-cpp)"
        WHISPER_CMD="whisper-cpp"
    elif command -v main >/dev/null 2>&1 &&
        [ -f "$(dirname "$(command -v main)")/models/ggml-medium.bin" ]; then
        # A bare "main" binary only counts as whisper.cpp when a model file
        # sits next to it.
        log_ok "whisper.cpp (main) found"
        WHISPER_CMD="main"
    else
        log_warn "whisper not found - transcription will only work with existing subtitles"
        WHISPER_CMD=""
    fi

    # API key check
    if [ -z "$ANTHROPIC_API_KEY" ]; then
        log_warn "ANTHROPIC_API_KEY not set - AI analysis disabled"
    else
        log_ok "ANTHROPIC_API_KEY configured"
    fi

    if [ -n "$missing" ]; then
        log_error "Missing required dependencies:$missing"
        return 1
    fi
    return 0
}

#=============================================================================
# UTILITY FUNCTIONS
#=============================================================================

# Extract the video ID from a PeerTube URL and print it.
# Handled formats:
#   https://instance/w/VIDEO_ID
#   https://instance/videos/watch/VIDEO_ID
#   https://instance/videos/watch/VIDEO_UUID
# Query strings and fragments are dropped. A URL matching neither format is
# printed unchanged.
extract_video_id() {
    printf '%s\n' "$1" |
        sed -E 's|.*/w/([^/?#]+).*|\1|; s|.*/videos/watch/([^/?#]+).*|\1|'
}

# Generate a lowercase, dash-separated slug (max 50 chars) from a title.
# Accent folding depends on sed handling multibyte characters in the current
# locale; whatever is not folded ends up as a dash.
generate_slug() {
    printf '%s\n' "$1" |
        tr '[:upper:]' '[:lower:]' |
        sed -E 's/[àáâãäåÀÁÂÃÄÅ]/a/g; s/[èéêëÈÉÊË]/e/g; s/[ìíîïÌÍÎÏ]/i/g; s/[òóôõöÒÓÔÕÖ]/o/g; s/[ùúûüÙÚÛÜ]/u/g; s/[çÇ]/c/g' |
        sed -E 's/[^a-z0-9]+/-/g; s/^-+|-+$//g' |
        cut -c1-50 |
        sed -E 's/-+$//'    # the 50-char cut may land right after a dash
}

# Convert a WebVTT file ($1) into a single line of plain text written to $2.
# Removes headers, cue numbers, cue timings (with or without the hours
# field), inline tags and repeated cue lines, and decodes basic entities.
vtt_to_text() {
    local vtt_file="$1"
    local txt_file="$2"

    # tr strips CRs first so the anchored patterns also work on CRLF files.
    # &amp; is decoded last so "&amp;lt;" is not decoded twice.
    tr -d '\r' < "$vtt_file" |
        sed -E '
            /^WEBVTT/d
            /^Kind:/d
            /^Language:/d
            /^NOTE/d
            /^[0-9]+$/d
            /^([0-9]+:)?[0-9]{2}:[0-9]{2}\.[0-9]{3}[[:space:]]+-->/d
            s/<[^>]*>//g
            s/&nbsp;/ /g
            s/&lt;/</g
            s/&gt;/>/g
            s/&amp;/\&/g
            /^[[:space:]]*$/d
        ' |
        awk '!seen[$0]++' |
        tr '\n' ' ' |
        sed -E 's/ +/ /g; s/^ +| +$//g' > "$txt_file"
}

# Print text ($1) limited to $2 characters, cut back to the previous
# whitespace so no partial word remains. When truncation happens, a second
# line "... [TRUNCATED]" is appended. The text is handled as one string, so
# multi-line input is truncated as a whole.
truncate_text() {
    local text="$1"
    local max="$2"
    local head_part tail_word

    if [ "${#text}" -le "$max" ]; then
        printf '%s\n' "$text"
        return 0
    fi

    head_part=$(printf '%.*s' "$max" "$text")

    # Drop the (possibly partial) last word, unless the cut contains no
    # whitespace at all - then keep the hard cut instead of returning nothing.
    case "$head_part" in
        *[[:space:]]*)
            tail_word=${head_part##*[[:space:]]}
            head_part=${head_part%"$tail_word"}
            ;;
    esac

    # Strip trailing whitespace left behind by the word removal.
    head_part=${head_part%"${head_part##*[![:space:]]}"}

    printf '%s\n... [TRUNCATED]\n' "$head_part"
}

#=============================================================================
# 1. METADATA EXTRACTION
#=============================================================================

# Fetch video metadata with yt-dlp and store a reduced JSON document.
# Arguments: $1 video URL, $2 output directory, $3 slug
# Outputs:   <output_dir>/<slug>.meta.json, summary on stdout
# Returns:   0 on success, 1 when yt-dlp or jq fails
extract_metadata() {
    local url="$1"
    local output_dir="$2"
    local slug="$3"
    local meta_file="$output_dir/${slug}.meta.json"
    local raw_file="$meta_file.tmp"
    local title duration uploader

    log_step "Extracting metadata"

    # Use yt-dlp to dump JSON metadata
    if ! yt-dlp --dump-json --no-warnings "$url" 2>/dev/null > "$raw_file"; then
        rm -f "$raw_file"
        log_error "Failed to extract metadata"
        return 1
    fi

    # Extract relevant fields with jq
    if ! jq '{
        id: .id,
        title: .title,
        description: .description,
        duration: .duration,
        duration_string: .duration_string,
        upload_date: .upload_date,
        uploader: .uploader,
        uploader_id: .uploader_id,
        channel: .channel,
        view_count: .view_count,
        like_count: .like_count,
        tags: .tags,
        categories: .categories,
        webpage_url: .webpage_url,
        thumbnail: .thumbnail,
        language: .language,
        subtitles: ((.subtitles // {}) | keys),
        automatic_captions: ((.automatic_captions // {}) | keys)
    }' "$raw_file" > "$meta_file"; then
        rm -f "$raw_file" "$meta_file"
        log_error "Failed to extract metadata"
        return 1
    fi
    rm -f "$raw_file"

    log_ok "Metadata saved to $meta_file"

    # Display summary
    title=$(jq -r '.title' "$meta_file")
    duration=$(jq -r '.duration_string // .duration' "$meta_file")
    uploader=$(jq -r '.uploader // .channel // "Unknown"' "$meta_file")

    printf " Title: %s\n" "$title"
    printf " Duration: %s\n" "$duration"
    printf " Uploader: %s\n" "$uploader"
    return 0
}

#=============================================================================
# 2. SUBTITLE DOWNLOAD & CONVERSION
#=============================================================================

# Query the PeerTube captions endpoint for a video.
# Arguments: $1 video id, $2 instance host name
# Outputs:   one caption language id per line on stdout when captions exist
# Returns:   0 when at least one caption is listed, 1 otherwise
check_peertube_captions() {
    local video_id="$1"
    local instance="$2"
    local api_url="https://${instance}/api/v1/videos/${video_id}/captions"
    local response http_code body caption_count

    log_info "Checking PeerTube captions API..."

    # The HTTP status is appended as the last line of the captured output.
    response=$(curl -s -w "\n%{http_code}" "$api_url" 2>/dev/null) || return 1
    http_code=$(printf '%s\n' "$response" | tail -n 1)
    body=$(printf '%s\n' "$response" | sed '$d')

    [ "$http_code" = "200" ] || return 1

    caption_count=$(printf '%s\n' "$body" | jq '.total // 0' 2>/dev/null) || caption_count=0
    # Anything that is not a plain non-negative integer counts as "none".
    case "$caption_count" in
        '' | *[!0-9]*) caption_count=0 ;;
    esac

    if [ "$caption_count" -gt 0 ]; then
        log_ok "Found $caption_count caption(s) via API"
        printf '%s\n' "$body" | jq -r '.data[].language.id' 2>/dev/null
        return 0
    fi
    return 1
}

# Download subtitles with yt-dlp and convert them to plain text.
# Arguments: $1 video URL, $2 output directory, $3 slug, $4 language (default fr)
# Outputs:   <slug>.<lang>.vtt and <slug>.transcript.txt in the output dir
# Returns:   0 when a transcript was produced, 1 otherwise
download_subtitles() {
    local url="$1"
    local output_dir="$2"
    local slug="$3"
    local lang="${4:-fr}"
    local vtt_file="$output_dir/${slug}.${lang}.vtt"
    local txt_file="$output_dir/${slug}.transcript.txt"
    local found_vtt word_count

    log_step "Downloading subtitles"

    # Try to download subtitles with yt-dlp (requested language, then English)
    if yt-dlp --write-sub --write-auto-sub --sub-lang "$lang,en" \
        --sub-format vtt --skip-download \
        -o "$output_dir/${slug}" "$url" 2>/dev/null; then

        # Prefer the requested language; otherwise take whatever was fetched.
        if [ -f "$vtt_file" ]; then
            found_vtt="$vtt_file"
        else
            found_vtt=$(find "$output_dir" -name "${slug}*.vtt" -type f | head -n 1)
        fi

        if [ -n "$found_vtt" ] && [ -f "$found_vtt" ]; then
            # Rename to the standard name, unless the file already has it
            # (moving a file onto itself fails).
            if [ "$found_vtt" != "$vtt_file" ]; then
                mv -f "$found_vtt" "$vtt_file"
            fi
            log_ok "Subtitles downloaded: $vtt_file"

            # Convert to plain text
            vtt_to_text "$vtt_file" "$txt_file"
            log_ok "Converted to text: $txt_file"

            word_count=$(wc -w < "$txt_file" | tr -d ' ')
            printf " Word count: %s\n" "$word_count"
            return 0
        fi
    fi

    log_warn "No subtitles available for download"
    return 1
}

#=============================================================================
# 3. WHISPER TRANSCRIPTION
#=============================================================================

# Download the audio track and convert it to 16 kHz mono WAV.
# Arguments: $1 video URL, $2 output directory, $3 slug
# Outputs:   <output_dir>/<slug>.audio.wav
# Returns:   0 on success, 1 when the download or the conversion fails
extract_audio() {
    local url="$1"
    local output_dir="$2"
    local slug="$3"
    local audio_file="$output_dir/${slug}.audio.wav"
    local downloaded

    log_info "Extracting audio..."

    if yt-dlp -x --audio-format wav -o "$output_dir/${slug}.%(ext)s" "$url" 2>/dev/null; then
        # The target file is excluded so a leftover from an earlier run is
        # never used as ffmpeg input and output at the same time.
        downloaded=$(find "$output_dir" -name "${slug}.*" ! -name "${slug}.audio.wav" -type f |
            grep -E '\.(wav|mp3|m4a|opus|webm)$' | head -n 1)

        if [ -n "$downloaded" ]; then
            if ffmpeg -y -i "$downloaded" -vn -ac 1 -ar 16000 -f wav "$audio_file" 2>/dev/null; then
                rm -f "$downloaded"
                log_ok "Audio extracted: $audio_file"
                return 0
            fi
            rm -f "$downloaded" "$audio_file"
        fi
    fi

    log_error "Failed to extract audio"
    return 1
}

# Transcribe an audio file with the detected Whisper implementation.
# Globals:   WHISPER_CMD (read), WHISPER_MODELS_PATH (read, whisper.cpp only)
# Arguments: $1 audio file, $2 output directory, $3 slug, $4 model, $5 language
# Outputs:   <output_dir>/<slug>.transcript.txt
# Returns:   0 when the transcript file exists afterwards, 1 otherwise
run_whisper() {
    local audio_file="$1"
    local output_dir="$2"
    local slug="$3"
    local model="$4"
    local lang="$5"
    local txt_file="$output_dir/${slug}.transcript.txt"
    local whisper_out model_path word_count

    log_step "Running Whisper transcription"

    if [ -z "$WHISPER_CMD" ]; then
        log_error "Whisper not available"
        return 1
    fi

    log_info "Model: $model, Language: $lang"
    log_info "This may take a while..."

    case "$WHISPER_CMD" in
        whisper)
            # OpenAI Whisper Python; failure is detected below through the
            # missing output file.
            whisper "$audio_file" \
                --model "$model" \
                --language "$lang" \
                --output_format txt \
                --output_dir "$output_dir" \
                --verbose False 2>/dev/null || true

            # Rename output (named after the audio file) to the standard name
            whisper_out="$output_dir/$(basename "$audio_file" .wav).txt"
            if [ -f "$whisper_out" ]; then
                mv -f "$whisper_out" "$txt_file"
            fi
            ;;
        whisper-cpp | main)
            # whisper.cpp
            model_path="${WHISPER_MODELS_PATH:-$HOME/.cache/whisper}/ggml-${model}.bin"

            if [ ! -f "$model_path" ]; then
                log_warn "Model not found: $model_path"
                log_info "Downloading model..."
                mkdir -p "$(dirname "$model_path")"
                # -f: fail on HTTP errors instead of saving the error page as
                # the model file.
                if ! curl -fL "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-${model}.bin" \
                    -o "$model_path" 2>/dev/null; then
                    rm -f "$model_path"
                    log_error "Failed to download model"
                    return 1
                fi
            fi

            "$WHISPER_CMD" -m "$model_path" -l "$lang" -otxt \
                -of "$output_dir/${slug}.transcript" "$audio_file" 2>/dev/null || true
            ;;
    esac

    if [ ! -f "$txt_file" ]; then
        log_error "Whisper transcription failed"
        return 1
    fi

    # Clean up the transcript: drop bracketed markers such as timestamps and
    # squeeze spaces. A temporary file is used because "sed -i" is not
    # portable across sed implementations.
    if sed 's/\[[^]]*\]//g; s/  */ /g; s/^ //; s/ $//' "$txt_file" > "$txt_file.tmp"; then
        mv -f "$txt_file.tmp" "$txt_file"
    else
        rm -f "$txt_file.tmp"
    fi

    word_count=$(wc -w < "$txt_file" | tr -d ' ')
    log_ok "Transcription complete: $word_count words"
    return 0
}

#=============================================================================
# 4. CLAUDE AI ANALYSIS
#=============================================================================

# Send the transcript to the Anthropic Messages API and save the analysis.
# Globals:   ANTHROPIC_API_KEY, CLAUDE_MODEL, MAX_TOKENS,
#            MAX_TRANSCRIPT_CHARS, SCRIPT_VERSION (all read)
# Arguments: $1 metadata JSON file, $2 transcript file, $3 output dir, $4 slug
# Outputs:   <output_dir>/<slug>.analyse.md, short summary on stdout
# Returns:   0 on success, 1 on missing key/transcript or on API failure
analyse_with_claude() {
    local meta_file="$1"
    local transcript_file="$2"
    local output_dir="$3"
    local slug="$4"
    local analysis_file="$output_dir/${slug}.analyse.md"
    local title duration uploader upload_date tags url
    local transcript transcript_len
    local system_prompt user_prompt
    local response http_code body analysis

    log_step "Running Claude AI analysis"

    if [ -z "$ANTHROPIC_API_KEY" ]; then
        log_error "ANTHROPIC_API_KEY not set"
        return 1
    fi

    if [ ! -f "$transcript_file" ]; then
        log_error "Transcript file not found: $transcript_file"
        return 1
    fi

    # Read metadata; the file may be missing when metadata extraction failed,
    # in which case every field falls back to a placeholder.
    title=$(jq -r '.title // "Unknown"' "$meta_file" 2>/dev/null) || title=""
    duration=$(jq -r '.duration_string // .duration // "Unknown"' "$meta_file" 2>/dev/null) || duration=""
    uploader=$(jq -r '.uploader // .channel // "Unknown"' "$meta_file" 2>/dev/null) || uploader=""
    upload_date=$(jq -r '.upload_date // "Unknown"' "$meta_file" 2>/dev/null) || upload_date=""
    tags=$(jq -r '.tags | if . then join(", ") else "None" end' "$meta_file" 2>/dev/null) || tags=""
    url=$(jq -r '.webpage_url // "Unknown"' "$meta_file" 2>/dev/null) || url=""
    title=${title:-Unknown}
    duration=${duration:-Unknown}
    uploader=${uploader:-Unknown}
    upload_date=${upload_date:-Unknown}
    tags=${tags:-None}
    url=${url:-Unknown}

    # Read and truncate transcript
    transcript=$(cat "$transcript_file")
    transcript_len=${#transcript}

    if [ "$transcript_len" -gt "$MAX_TRANSCRIPT_CHARS" ]; then
        log_warn "Transcript truncated from $transcript_len to $MAX_TRANSCRIPT_CHARS chars"
        transcript=$(printf '%.*s' "$MAX_TRANSCRIPT_CHARS" "$transcript")
        transcript="${transcript}... [TRUNCATED - Original: ${transcript_len} chars]"
    fi

    # No manual JSON escaping here: "jq --arg" below encodes the prompt, and
    # escaping beforehand would put literal quotes and \n into the request.

    log_info "Calling Claude API ($CLAUDE_MODEL)..."

    # Build the API request
    system_prompt="Tu es un analyste expert en renseignement, cybersécurité et géopolitique. Tu analyses des transcripts vidéo de manière structurée et rigoureuse. Tu réponds toujours en français."

    user_prompt="Analyse le transcript vidéo suivant:

=== MÉTADONNÉES ===
Titre: ${title}
Durée: ${duration}
Auteur: ${uploader}
Date: ${upload_date}
Tags: ${tags}
URL: ${url}

=== TRANSCRIPT ===
${transcript}

=== INSTRUCTIONS ===
Produis une analyse structurée en Markdown comprenant:

1. **Résumé exécutif** (5 lignes maximum)
2. **Thèmes principaux et sous-thèmes**
3. **Acteurs / entités mentionnés** (personnes, organisations, pays)
4. **Points factuels clés et révélations notables**
5. **Angle narratif et biais éventuels**
6. **Pertinence pour un professionnel en cybersécurité et renseignement**
7. **Questions ouvertes ou points à approfondir**

Sois factuel, précis et critique dans ton analyse."

    # Make API call. The JSON body goes through stdin so a long transcript
    # cannot hit the argument-size limit. The HTTP status is appended as the
    # last line of the captured output.
    # NOTE(review): the API key is still passed on curl's command line and is
    # visible to other local users through the process list.
    response=$(jq -n \
        --arg model "$CLAUDE_MODEL" \
        --argjson max_tokens "$MAX_TOKENS" \
        --arg system "$system_prompt" \
        --arg user "$user_prompt" \
        '{
            model: $model,
            max_tokens: $max_tokens,
            system: $system,
            messages: [
                {role: "user", content: $user}
            ]
        }' |
        curl -s -w "\n%{http_code}" "https://api.anthropic.com/v1/messages" \
            -H "Content-Type: application/json" \
            -H "x-api-key: $ANTHROPIC_API_KEY" \
            -H "anthropic-version: 2023-06-01" \
            --data-binary @- 2>/dev/null) || response=""

    http_code=$(printf '%s\n' "$response" | tail -n 1)
    body=$(printf '%s\n' "$response" | sed '$d')

    if [ "$http_code" != "200" ]; then
        log_error "API error (HTTP $http_code)"
        printf '%s\n' "$body" | jq -r '.error.message // .' 2>/dev/null | head -n 3
        return 1
    fi

    # Extract and save the analysis
    analysis=$(printf '%s\n' "$body" | jq -r '.content[0].text // empty') || analysis=""

    if [ -z "$analysis" ]; then
        log_error "Empty response from Claude"
        return 1
    fi

    # Create Markdown file with header
    cat > "$analysis_file" << EOF
# Analyse: ${title}

**Source:** ${url}
**Durée:** ${duration}
**Auteur:** ${uploader}
**Date:** ${upload_date}
**Analysé le:** $(date +"%Y-%m-%d %H:%M")
**Modèle:** ${CLAUDE_MODEL}

---

${analysis}

---

*Analyse générée automatiquement par SecuBox Intelligence Module v${SCRIPT_VERSION}*
EOF

    log_ok "Analysis saved to $analysis_file"

    # Display summary in terminal: from the executive-summary heading up to
    # the next heading or numbered item, at most 10 lines.
    printf "\n%b=== RÉSUMÉ ===%b\n" "$BOLD$GREEN" "$NC"
    printf '%s\n' "$analysis" | sed -n -E '/Résumé exécutif/,/^(##|[0-9]\.)/p' | head -n 10

    return 0
}

#=============================================================================
# MAIN PIPELINE
#=============================================================================

# Run the whole pipeline for one video URL.
# Globals:   OUTPUT_BASE, WHISPER_MODEL, WHISPER_LANG, WHISPER_CMD,
#            NO_WHISPER, FORCE_WHISPER, NO_ANALYSE (all read)
# Arguments: $1 video URL
# Returns:   0 when processing finished (individual steps may have been
#            skipped with a warning), 1 when the video info or the output
#            directory could not be obtained
process_video() {
    local url="$1"
    local video_id instance temp_meta title slug output_dir
    local transcript_file audio_file meta_file
    local has_transcript=0
    local force_whisper=0

    log_step "Processing video: $url"

    # Extract video ID and instance host
    video_id=$(extract_video_id "$url")
    instance=$(printf '%s\n' "$url" | sed -E 's|https?://([^/]+).*|\1|')

    log_info "Video ID: $video_id"
    log_info "Instance: $instance"

    # Create temporary metadata to get title for slug
    temp_meta=$(mktemp) || {
        log_error "Failed to create temporary file"
        return 1
    }
    if ! yt-dlp --dump-json --no-warnings "$url" 2>/dev/null > "$temp_meta"; then
        log_error "Failed to fetch video info"
        rm -f "$temp_meta"
        return 1
    fi

    title=$(jq -r '.title // "video"' "$temp_meta") || title="video"
    rm -f "$temp_meta"
    slug=$(generate_slug "$title")
    slug="${slug:-$video_id}"

    log_info "Slug: $slug"

    # Create output directory
    output_dir="$OUTPUT_BASE/$slug"
    if ! mkdir -p "$output_dir"; then
        log_error "Cannot create output directory: $output_dir"
        return 1
    fi
    log_ok "Output directory: $output_dir"

    # 1. Extract metadata
    extract_metadata "$url" "$output_dir" "$slug" || {
        log_warn "Metadata extraction failed, continuing..."
    }

    transcript_file="$output_dir/${slug}.transcript.txt"

    # --force-whisper only takes effect when Whisper is allowed and present;
    # otherwise subtitles remain the only possible source.
    if [ "$FORCE_WHISPER" = "1" ] && [ "$NO_WHISPER" != "1" ] && [ -n "$WHISPER_CMD" ]; then
        force_whisper=1
    fi

    # 2. Try to download existing subtitles
    if [ "$force_whisper" = "0" ]; then
        # Check PeerTube API first. Informational only: its non-zero status
        # must not abort the script under set -e.
        check_peertube_captions "$video_id" "$instance" 2>/dev/null || true

        if download_subtitles "$url" "$output_dir" "$slug" "$WHISPER_LANG"; then
            has_transcript=1
        fi
    fi

    # 3. Run Whisper if needed
    if [ "$has_transcript" = "0" ]; then
        if [ "$NO_WHISPER" = "1" ]; then
            log_warn "Whisper disabled, no transcript available"
        elif [ -n "$WHISPER_CMD" ]; then
            audio_file="$output_dir/${slug}.audio.wav"

            if extract_audio "$url" "$output_dir" "$slug"; then
                if run_whisper "$audio_file" "$output_dir" "$slug" "$WHISPER_MODEL" "$WHISPER_LANG"; then
                    has_transcript=1
                fi
                # Clean up audio file
                rm -f "$audio_file"
            fi

            # A forced Whisper run that failed falls back to subtitles.
            if [ "$has_transcript" = "0" ] && [ "$force_whisper" = "1" ]; then
                if download_subtitles "$url" "$output_dir" "$slug" "$WHISPER_LANG"; then
                    has_transcript=1
                fi
            fi
        else
            log_warn "No Whisper available and no subtitles found"
        fi
    fi

    # 4. Run Claude analysis
    if [ "$NO_ANALYSE" != "1" ] && [ "$has_transcript" = "1" ]; then
        meta_file="$output_dir/${slug}.meta.json"
        analyse_with_claude "$meta_file" "$transcript_file" "$output_dir" "$slug" || {
            log_warn "Analysis failed"
        }
    elif [ "$NO_ANALYSE" = "1" ]; then
        log_info "Analysis disabled (--no-analyse)"
    else
        log_warn "No transcript available for analysis"
    fi

    # Summary
    log_step "Processing complete"
    printf "\nOutput files in %s:\n" "$output_dir"
    ls -la "$output_dir" 2>/dev/null | tail -n +2

    return 0
}

#=============================================================================
# CLI PARSING
#=============================================================================

# Print the usage text.
show_help() {
    cat << EOF
${BOLD}PeerTube Video Transcript & Analysis Tool${NC}
SecuBox Intelligence Module v${SCRIPT_VERSION}

${BOLD}Usage:${NC}
    $(basename "$0") [OPTIONS] --url <URL>
    $(basename "$0") [OPTIONS] <URL>

${BOLD}Options:${NC}
    --url <URL>             PeerTube video URL
    --no-whisper            Disable Whisper (subtitles only)
    --force-whisper         Force Whisper even if subtitles exist
    --no-analyse            Download/transcribe without Claude analysis
    --model <MODEL>         Whisper model (tiny, base, small, medium, large-v3)
                            Default: ${WHISPER_MODEL}
    --lang <LANG>           Whisper language code (fr, en, de, etc.)
                            Default: ${WHISPER_LANG}
    --output <DIR>          Output base directory
                            Default: ${OUTPUT_BASE}
    --claude-model <MODEL>  Claude model for analysis
                            Default: ${CLAUDE_MODEL}
    -h, --help              Show this help message
    -v, --version           Show version

${BOLD}Environment Variables:${NC}
    ANTHROPIC_API_KEY       Claude API key (required for analysis)
    PEERTUBE_INSTANCE       Default PeerTube instance
    WHISPER_MODELS_PATH     Path to Whisper models

${BOLD}Examples:${NC}
    # Basic usage
    $(basename "$0") https://tube.gk2.secubox.in/w/abc123

    # Force Whisper transcription with large model
    $(basename "$0") --force-whisper --model large-v3 --url https://...

    # Subtitles only, no AI analysis
    $(basename "$0") --no-whisper --no-analyse https://...

${BOLD}Output Structure:${NC}
    ./output/<slug>/
    ├── <slug>.meta.json        # Video metadata
    ├── <slug>.fr.vtt           # Original subtitles (if available)
    ├── <slug>.transcript.txt   # Plain text transcript
    └── <slug>.analyse.md       # Claude AI analysis
EOF
}

# Print the version banner.
show_version() {
    echo "PeerTube Analyse v${SCRIPT_VERSION}"
    echo "SecuBox Intelligence Module"
}

# Parse command-line arguments.
# Globals written: VIDEO_URL, NO_WHISPER, FORCE_WHISPER, NO_ANALYSE,
#                  WHISPER_MODEL, WHISPER_LANG, OUTPUT_BASE, CLAUDE_MODEL
# Exits with status 1 on an unknown option, a missing option value, several
# URLs or no URL at all; exits 0 after --help / --version.
parse_args() {
    VIDEO_URL=""
    NO_WHISPER=""
    FORCE_WHISPER=""
    NO_ANALYSE=""

    while [ $# -gt 0 ]; do
        case "$1" in
            --url | --model | --lang | --output | --claude-model)
                # Checked up front: "shift 2" with a single remaining
                # argument is an error in dash/ash.
                if [ $# -lt 2 ]; then
                    log_error "Option $1 requires a value"
                    exit 1
                fi
                case "$1" in
                    --url) VIDEO_URL="$2" ;;
                    --model) WHISPER_MODEL="$2" ;;
                    --lang) WHISPER_LANG="$2" ;;
                    --output) OUTPUT_BASE="$2" ;;
                    --claude-model) CLAUDE_MODEL="$2" ;;
                esac
                shift 2
                ;;
            --no-whisper)
                NO_WHISPER=1
                shift
                ;;
            --force-whisper)
                FORCE_WHISPER=1
                shift
                ;;
            --no-analyse | --no-analyze)
                NO_ANALYSE=1
                shift
                ;;
            -h | --help)
                show_help
                exit 0
                ;;
            -v | --version)
                show_version
                exit 0
                ;;
            -*)
                log_error "Unknown option: $1"
                show_help
                exit 1
                ;;
            *)
                # Positional argument = URL
                if [ -z "$VIDEO_URL" ]; then
                    VIDEO_URL="$1"
                else
                    log_error "Multiple URLs not supported"
                    exit 1
                fi
                shift
                ;;
        esac
    done

    if [ -z "$VIDEO_URL" ]; then
        log_error "No video URL provided"
        show_help
        exit 1
    fi
}

#=============================================================================
# ENTRY POINT
#=============================================================================

# Parse arguments, check dependencies and process the requested video.
# Returns the status of process_video.
main() {
    local exit_code=0

    parse_args "$@"

    printf "\n%b╔══════════════════════════════════════════════════════╗%b\n" "$BOLD$CYAN" "$NC"
    printf "%b║ PeerTube Transcript & Analysis Tool v%-16s║%b\n" "$BOLD$CYAN" "$SCRIPT_VERSION" "$NC"
    printf "%b║ SecuBox Intelligence Module ║%b\n" "$BOLD$CYAN" "$NC"
    printf "%b╚══════════════════════════════════════════════════════╝%b\n\n" "$BOLD$CYAN" "$NC"

    check_dependencies || exit 1

    # "|| exit_code=$?" captures the status; a bare call followed by $? would
    # never reach the status check under set -e.
    process_video "$VIDEO_URL" || exit_code=$?

    if [ "$exit_code" -eq 0 ]; then
        printf "\n%b✓ All done!%b\n\n" "$BOLD$GREEN" "$NC"
    else
        printf "\n%b✗ Completed with errors%b\n\n" "$BOLD$YELLOW" "$NC"
    fi

    return "$exit_code"
}

# Run if not sourced: main only starts when the script is invoked under one
# of its known names (or through a bare "sh").
case "${0##*/}" in
    peertube-analyse | peertube-analyse.sh | sh)
        main "$@"
        ;;
esac