PeerTube videos don't include the automatic_captions field in their
yt-dlp JSON output. The jq filter was attempting (.automatic_captions | keys)
which fails with "null (null) has no keys" when the field is missing.
Fixed by adding null-coalescing: ((.automatic_captions // {}) | keys)
Also applied same fix to subtitles field for consistency.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
779 lines
24 KiB
Bash
Executable File
#!/bin/sh
# PeerTube Video Transcript Extraction & AI Analysis
# SecuBox Intelligence Module
# Compatible: OpenWrt, Alpine, Debian, Ubuntu

# Abort on the first unhandled command failure.
set -e

#=============================================================================
# CONFIGURATION
#=============================================================================
# All of the following can be overridden from the environment.

SCRIPT_VERSION="1.0.0"
# Default PeerTube instance host (used for the captions API).
PEERTUBE_INSTANCE="${PEERTUBE_INSTANCE:-tube.gk2.secubox.in}"
# Base directory under which one folder per video slug is created.
OUTPUT_BASE="${OUTPUT_BASE:-./output}"
# Whisper model name (tiny, base, small, medium, large-v3).
WHISPER_MODEL="${WHISPER_MODEL:-medium}"
# Default transcription language code (also preferred subtitle language).
WHISPER_LANG="${WHISPER_LANG:-fr}"
# Claude model used for the AI analysis step.
CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"
# Transcript is truncated to this many characters before the API call.
MAX_TRANSCRIPT_CHARS=12000
# max_tokens value sent in the Claude API request.
MAX_TOKENS=2000
|
|
|
#=============================================================================
# COLORS & LOGGING
#=============================================================================

# Check if terminal supports colors: use tput capability strings when stdout
# is a tty and tput exists; otherwise fall back to raw ANSI escape literals
# (the log helpers print them through printf %b, which expands \033).
# NOTE(review): the fallback branch also fires when stdout is NOT a tty, so
# escape codes still end up in redirected logs — presumably acceptable; confirm.
if [ -t 1 ] && command -v tput >/dev/null 2>&1; then
    RED=$(tput setaf 1)
    GREEN=$(tput setaf 2)
    YELLOW=$(tput setaf 3)
    CYAN=$(tput setaf 6)
    BOLD=$(tput bold)
    NC=$(tput sgr0)
else
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[1;33m'
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    NC='\033[0m'
fi

# Leveled log helpers; log_error writes to stderr, log_step prints a banner.
log_info() { printf "%b[INFO]%b %s\n" "$CYAN" "$NC" "$1"; }
log_ok() { printf "%b[OK]%b %s\n" "$GREEN" "$NC" "$1"; }
log_warn() { printf "%b[WARN]%b %s\n" "$YELLOW" "$NC" "$1"; }
log_error() { printf "%b[ERROR]%b %s\n" "$RED" "$NC" "$1" >&2; }
log_step() { printf "\n%b==> %s%b\n" "$BOLD$CYAN" "$1" "$NC"; }
|
|
|
|
#=============================================================================
|
|
# DEPENDENCY CHECK
|
|
#=============================================================================
|
|
|
|
check_dependencies() {
    # Verify the hard external dependencies, detect an available Whisper
    # backend (result exported via the WHISPER_CMD global), and warn when the
    # Anthropic API key is missing. Returns 1 if any hard dependency is absent.
    log_step "Checking dependencies"

    local absent=""
    local tool
    for tool in yt-dlp ffmpeg jq curl; do
        if command -v "$tool" >/dev/null 2>&1; then
            log_ok "$tool found: $(command -v "$tool")"
        else
            log_error "$tool not found"
            absent="$absent $tool"
        fi
    done

    # Whisper backend detection, in order of preference. Whisper is optional:
    # without it, only videos with existing subtitles can be transcribed.
    if command -v whisper >/dev/null 2>&1; then
        log_ok "whisper found: $(command -v whisper)"
        WHISPER_CMD="whisper"
    elif command -v whisper-cpp >/dev/null 2>&1; then
        log_ok "whisper-cpp found: $(command -v whisper-cpp)"
        WHISPER_CMD="whisper-cpp"
    elif command -v main >/dev/null 2>&1 && [ -f "$(dirname "$(command -v main)")/models/ggml-medium.bin" ]; then
        log_ok "whisper.cpp (main) found"
        WHISPER_CMD="main"
    else
        log_warn "whisper not found - transcription will only work with existing subtitles"
        WHISPER_CMD=""
    fi

    # AI analysis is optional; only warn when the key is missing.
    if [ -n "$ANTHROPIC_API_KEY" ]; then
        log_ok "ANTHROPIC_API_KEY configured"
    else
        log_warn "ANTHROPIC_API_KEY not set - AI analysis disabled"
    fi

    if [ -n "$absent" ]; then
        log_error "Missing required dependencies:$absent"
        return 1
    fi

    return 0
}
|
|
|
|
#=============================================================================
|
|
# UTILITY FUNCTIONS
|
|
#=============================================================================
|
|
|
|
# Extract video ID from PeerTube URL
|
|
extract_video_id() {
    # Reduce a PeerTube watch URL to its trailing video identifier.
    # Supported shapes:
    #   https://instance/w/VIDEO_ID
    #   https://instance/videos/watch/VIDEO_ID (or UUID)
    printf '%s\n' "$1" \
        | sed -E -e 's|.*/w/([^/?]+).*|\1|' -e 's|.*/videos/watch/([^/?]+).*|\1|'
}
|
|
|
|
# Generate slug from title
|
|
generate_slug() {
    # Build a filesystem-friendly slug from a title: lowercase, ASCII-fold
    # common French accents, collapse non-alphanumerics to '-', trim dashes
    # from both ends, and cap the length at 50 characters.
    printf '%s\n' "$1" \
        | tr '[:upper:]' '[:lower:]' \
        | sed -E 's/[àáâãäå]/a/g; s/[èéêë]/e/g; s/[ìíîï]/i/g; s/[òóôõö]/o/g; s/[ùúûü]/u/g; s/[ç]/c/g' \
        | sed -E 's/[^a-z0-9]+/-/g; s/^-+|-+$//g' \
        | cut -c1-50
}
|
|
|
|
# Clean VTT to plain text
|
|
vtt_to_text() {
    # Convert a WebVTT subtitle file into a single line of de-duplicated
    # plain text.
    #   $1: input .vtt path
    #   $2: output .txt path
    local vtt_file="$1"
    local txt_file="$2"

    # Pipeline:
    #  1) sed: drop the WEBVTT header, Kind/Language/NOTE lines, cue numbers,
    #     timestamp lines and blanks; strip inline <tags>; decode common HTML
    #     entities. FIX: the entity substitutions had been corrupted into
    #     no-ops (e.g. 's/&/\&/g'); restored to their intended '&amp;',
    #     '&lt;', '&gt;', '&nbsp;' forms. '&amp;' is decoded LAST so that
    #     double-encoded input like '&amp;lt;' is not decoded twice.
    #  2) awk: remove duplicate lines (rolling/repeated captions).
    #  3) tr + sed: join into one space-normalized line.
    sed -E '
        /^WEBVTT/d
        /^Kind:/d
        /^Language:/d
        /^NOTE/d
        /^[0-9]+$/d
        /^[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/d
        /^$/d
        s/<[^>]*>//g
        s/&nbsp;/ /g
        s/&lt;/</g
        s/&gt;/>/g
        s/&amp;/\&/g
    ' "$vtt_file" | \
    awk '!seen[$0]++' | \
    tr '\n' ' ' | \
    sed -E 's/ +/ /g; s/^ +| +$//g' > "$txt_file"
}
|
|
|
|
# Truncate text to max chars while preserving word boundaries
|
|
truncate_text() {
    # Emit $1 unchanged when it fits within $2 characters; otherwise cut at
    # the last word boundary before the limit and append a truncation marker
    # on its own line.
    local input="$1"
    local limit="$2"

    if [ ${#input} -le "$limit" ]; then
        echo "$input"
        return
    fi

    echo "$input" | cut -c1-"$limit" | sed 's/[^ ]*$//'
    echo "... [TRUNCATED]"
}
|
|
|
|
#=============================================================================
|
|
# 1. METADATA EXTRACTION
|
|
#=============================================================================
|
|
|
|
extract_metadata() {
    # Dump yt-dlp JSON for the video at $1, project the fields we care about
    # with jq, and write the result to $2/$3.meta.json. Prints a short
    # title/duration/uploader summary. Returns 1 on yt-dlp failure.
    local url="$1"
    local output_dir="$2"
    local slug="$3"

    log_step "Extracting metadata"

    local meta_file="$output_dir/${slug}.meta.json"

    # Full yt-dlp dump goes to a temp file; jq then projects the subset.
    if yt-dlp --dump-json --no-warnings "$url" 2>/dev/null > "$meta_file.tmp"; then
        # subtitles/automatic_captions may be absent on PeerTube, hence '// {}'
        # before 'keys' (a bare '.automatic_captions | keys' fails on null).
        jq '{
            id: .id,
            title: .title,
            description: .description,
            duration: .duration,
            duration_string: .duration_string,
            upload_date: .upload_date,
            uploader: .uploader,
            uploader_id: .uploader_id,
            channel: .channel,
            view_count: .view_count,
            like_count: .like_count,
            tags: .tags,
            categories: .categories,
            webpage_url: .webpage_url,
            thumbnail: .thumbnail,
            language: .language,
            subtitles: ((.subtitles // {}) | keys),
            automatic_captions: ((.automatic_captions // {}) | keys)
        }' "$meta_file.tmp" > "$meta_file"
        rm -f "$meta_file.tmp"

        log_ok "Metadata saved to $meta_file"

        # Display summary. Declare and assign separately so a jq failure is
        # not masked by 'local' always returning 0.
        local title duration uploader
        title=$(jq -r '.title' "$meta_file")
        duration=$(jq -r '.duration_string // .duration' "$meta_file")
        uploader=$(jq -r '.uploader // .channel // "Unknown"' "$meta_file")

        printf " Title: %s\n" "$title"
        printf " Duration: %s\n" "$duration"
        printf " Uploader: %s\n" "$uploader"

        return 0
    else
        # FIX: previously the partial temp dump was left behind on failure.
        rm -f "$meta_file.tmp"
        log_error "Failed to extract metadata"
        return 1
    fi
}
|
|
|
|
#=============================================================================
|
|
# 2. SUBTITLE DOWNLOAD & CONVERSION
|
|
#=============================================================================
|
|
|
|
check_peertube_captions() {
    # Query the PeerTube captions API for video $1 on instance $2.
    # On success prints one caption language id per line and returns 0;
    # returns 1 when the API is unreachable, non-200, or reports no captions.
    local video_id="$1"
    local instance="$2"

    log_info "Checking PeerTube captions API..."

    local api_url="https://${instance}/api/v1/videos/${video_id}/captions"
    local response
    response=$(curl -s -w "\n%{http_code}" "$api_url" 2>/dev/null)

    # The last line is the HTTP status code; everything before it is the body.
    local http_code body
    http_code=$(printf '%s\n' "$response" | tail -n1)
    body=$(printf '%s\n' "$response" | sed '$d')

    [ "$http_code" = "200" ] || return 1

    local caption_count
    caption_count=$(printf '%s\n' "$body" | jq '.total // 0')
    [ "$caption_count" -gt 0 ] || return 1

    log_ok "Found $caption_count caption(s) via API"
    printf '%s\n' "$body" | jq -r '.data[].language.id' 2>/dev/null
    return 0
}
|
|
|
|
download_subtitles() {
    # Fetch existing subtitles for the video at $1 via yt-dlp (preferring
    # language $4, falling back to English), convert them to plain text, and
    # report the word count.
    # Writes: $2/$3.<lang>.vtt and $2/$3.transcript.txt
    # Returns 1 when yt-dlp fails or no VTT file appears on disk.
    local url="$1"
    local output_dir="$2"
    local slug="$3"
    local lang="${4:-fr}"

    log_step "Downloading subtitles"

    local vtt_file="$output_dir/${slug}.${lang}.vtt"
    local txt_file="$output_dir/${slug}.transcript.txt"

    # Try to download subtitles with yt-dlp (both uploaded and auto captions).
    if yt-dlp --write-sub --write-auto-sub --sub-lang "$lang,en" \
        --sub-format vtt --skip-download \
        -o "$output_dir/${slug}" "$url" 2>/dev/null; then

        # yt-dlp picks the filename itself; locate the first matching VTT.
        local found_vtt=$(find "$output_dir" -name "${slug}*.vtt" -type f | head -1)

        if [ -n "$found_vtt" ] && [ -f "$found_vtt" ]; then
            # Rename to the standard name (fall back to copy if mv fails).
            mv "$found_vtt" "$vtt_file" 2>/dev/null || cp "$found_vtt" "$vtt_file"

            log_ok "Subtitles downloaded: $vtt_file"

            # Convert to plain text for downstream analysis.
            vtt_to_text "$vtt_file" "$txt_file"
            log_ok "Converted to text: $txt_file"

            local word_count=$(wc -w < "$txt_file" | tr -d ' ')
            printf " Word count: %s\n" "$word_count"

            return 0
        fi
    fi

    log_warn "No subtitles available for download"
    return 1
}
|
|
|
|
#=============================================================================
|
|
# 3. WHISPER TRANSCRIPTION
|
|
#=============================================================================
|
|
|
|
extract_audio() {
    # Download the audio track for the video at $1 and normalize it to
    # 16 kHz mono WAV at $2/$3.audio.wav (the format Whisper handles best).
    # Returns 1 if the download fails or no audio file is found afterwards.
    local url="$1"
    local output_dir="$2"
    local slug="$3"

    log_info "Extracting audio..."

    local audio_file="$output_dir/${slug}.audio.wav"

    # Download and convert to 16kHz mono WAV (Whisper optimal format)
    if yt-dlp -x --audio-format wav -o "$output_dir/${slug}.%(ext)s" "$url" 2>/dev/null; then
        # Locate whatever audio container yt-dlp actually produced.
        local downloaded=$(find "$output_dir" -name "${slug}.*" -type f | grep -E '\.(wav|mp3|m4a|opus|webm)$' | head -1)

        if [ -n "$downloaded" ]; then
            # -vn: drop video; -ac 1: mono; -ar 16000: 16 kHz sample rate.
            ffmpeg -y -i "$downloaded" -vn -ac 1 -ar 16000 -f wav "$audio_file" 2>/dev/null
            rm -f "$downloaded"
            log_ok "Audio extracted: $audio_file"
            return 0
        fi
    fi

    log_error "Failed to extract audio"
    return 1
}
|
|
|
|
run_whisper() {
    # Transcribe the audio at $1 (16 kHz WAV) into $2/$3.transcript.txt using
    # the backend selected by check_dependencies (WHISPER_CMD global).
    #   $4: model name; $5: language code.
    # Returns 1 when no backend is available, a model download fails, or no
    # transcript file is produced.
    local audio_file="$1"
    local output_dir="$2"
    local slug="$3"
    local model="$4"
    local lang="$5"

    log_step "Running Whisper transcription"

    if [ -z "$WHISPER_CMD" ]; then
        log_error "Whisper not available"
        return 1
    fi

    local txt_file="$output_dir/${slug}.transcript.txt"

    log_info "Model: $model, Language: $lang"
    log_info "This may take a while..."

    case "$WHISPER_CMD" in
        whisper)
            # OpenAI Whisper Python CLI; it writes <audio-basename>.txt into
            # --output_dir, which we rename to the canonical transcript name.
            whisper "$audio_file" \
                --model "$model" \
                --language "$lang" \
                --output_format txt \
                --output_dir "$output_dir" \
                --verbose False 2>/dev/null

            # Rename output
            local whisper_out="$output_dir/$(basename "$audio_file" .wav).txt"
            [ -f "$whisper_out" ] && mv "$whisper_out" "$txt_file"
            ;;

        whisper-cpp|main)
            # whisper.cpp needs a local GGML model file; fetch it on demand.
            local model_path="${WHISPER_MODELS_PATH:-$HOME/.cache/whisper}/ggml-${model}.bin"

            if [ ! -f "$model_path" ]; then
                log_warn "Model not found: $model_path"
                log_info "Downloading model..."
                # Try to download model
                mkdir -p "$(dirname "$model_path")"
                curl -L "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-${model}.bin" \
                    -o "$model_path" 2>/dev/null || {
                    log_error "Failed to download model"
                    return 1
                }
            fi

            # NOTE(review): assumes whisper.cpp appends '.txt' to the -of
            # prefix, so output lands exactly at $txt_file -- confirm for the
            # installed whisper.cpp version.
            "$WHISPER_CMD" -m "$model_path" -l "$lang" -otxt -of "$output_dir/${slug}.transcript" "$audio_file" 2>/dev/null
            ;;
    esac

    if [ -f "$txt_file" ]; then
        # Clean up the transcript: strip bracketed cues (e.g. [Music]) and
        # collapse space runs. GNU 'sed -i' first; fall back to a temp-file
        # rewrite where -i is unsupported (BSD/busybox sed).
        sed -i 's/\[.*\]//g; s/ */ /g' "$txt_file" 2>/dev/null || \
        sed 's/\[.*\]//g; s/ */ /g' "$txt_file" > "$txt_file.tmp" && mv "$txt_file.tmp" "$txt_file"

        local word_count=$(wc -w < "$txt_file" | tr -d ' ')
        log_ok "Transcription complete: $word_count words"
        return 0
    else
        log_error "Whisper transcription failed"
        return 1
    fi
}
|
|
|
|
#=============================================================================
|
|
# 4. CLAUDE AI ANALYSIS
|
|
#=============================================================================
|
|
|
|
analyse_with_claude() {
    # Send the transcript ($2) plus video metadata ($1) to the Anthropic
    # Messages API and write the Markdown analysis to $3/$4.analyse.md.
    # Requires ANTHROPIC_API_KEY. Returns 1 on missing inputs, HTTP errors,
    # or an empty model response.
    local meta_file="$1"
    local transcript_file="$2"
    local output_dir="$3"
    local slug="$4"

    log_step "Running Claude AI analysis"

    if [ -z "$ANTHROPIC_API_KEY" ]; then
        log_error "ANTHROPIC_API_KEY not set"
        return 1
    fi

    if [ ! -f "$transcript_file" ]; then
        log_error "Transcript file not found: $transcript_file"
        return 1
    fi

    local analysis_file="$output_dir/${slug}.analyse.md"

    # Read metadata (each field falls back to a readable default).
    local title=$(jq -r '.title // "Unknown"' "$meta_file" 2>/dev/null)
    local duration=$(jq -r '.duration_string // .duration // "Unknown"' "$meta_file" 2>/dev/null)
    local uploader=$(jq -r '.uploader // .channel // "Unknown"' "$meta_file" 2>/dev/null)
    local upload_date=$(jq -r '.upload_date // "Unknown"' "$meta_file" 2>/dev/null)
    local tags=$(jq -r '.tags | if . then join(", ") else "None" end' "$meta_file" 2>/dev/null)
    local url=$(jq -r '.webpage_url // "Unknown"' "$meta_file" 2>/dev/null)

    # Read and truncate the transcript to keep the request within budget.
    local transcript=$(cat "$transcript_file")
    local transcript_len=${#transcript}

    if [ "$transcript_len" -gt "$MAX_TRANSCRIPT_CHARS" ]; then
        log_warn "Transcript truncated from $transcript_len to $MAX_TRANSCRIPT_CHARS chars"
        transcript=$(echo "$transcript" | head -c "$MAX_TRANSCRIPT_CHARS")
        transcript="${transcript}... [TRUNCATED - Original: ${transcript_len} chars]"
    fi

    # FIX: no manual JSON escaping here. The request body is built with
    # 'jq -n --arg', which already escapes its arguments; the previous
    # pre-encoding of transcript/title with 'jq -Rs' double-escaped them, so
    # the API received literal quotes and \n sequences (and the Markdown
    # header showed the escaped title).

    log_info "Calling Claude API ($CLAUDE_MODEL)..."

    # Build the API request
    local system_prompt="Tu es un analyste expert en renseignement, cybersécurité et géopolitique. Tu analyses des transcripts vidéo de manière structurée et rigoureuse. Tu réponds toujours en français."

    local user_prompt="Analyse le transcript vidéo suivant:

=== MÉTADONNÉES ===
Titre: ${title}
Durée: ${duration}
Auteur: ${uploader}
Date: ${upload_date}
Tags: ${tags}
URL: ${url}

=== TRANSCRIPT ===
${transcript}

=== INSTRUCTIONS ===
Produis une analyse structurée en Markdown comprenant:

1. **Résumé exécutif** (5 lignes maximum)
2. **Thèmes principaux et sous-thèmes**
3. **Acteurs / entités mentionnés** (personnes, organisations, pays)
4. **Points factuels clés et révélations notables**
5. **Angle narratif et biais éventuels**
6. **Pertinence pour un professionnel en cybersécurité et renseignement**
7. **Questions ouvertes ou points à approfondir**

Sois factuel, précis et critique dans ton analyse."

    # Make API call; the body is assembled by jq so all strings are escaped
    # exactly once.
    local response
    response=$(curl -s -w "\n%{http_code}" "https://api.anthropic.com/v1/messages" \
        -H "Content-Type: application/json" \
        -H "x-api-key: $ANTHROPIC_API_KEY" \
        -H "anthropic-version: 2023-06-01" \
        -d "$(jq -n \
            --arg model "$CLAUDE_MODEL" \
            --argjson max_tokens "$MAX_TOKENS" \
            --arg system "$system_prompt" \
            --arg user "$user_prompt" \
            '{
                model: $model,
                max_tokens: $max_tokens,
                system: $system,
                messages: [
                    {role: "user", content: $user}
                ]
            }')" 2>/dev/null)

    # Last line is the HTTP status; the rest is the JSON body.
    local http_code=$(echo "$response" | tail -n1)
    local body=$(echo "$response" | sed '$d')

    if [ "$http_code" != "200" ]; then
        log_error "API error (HTTP $http_code)"
        echo "$body" | jq -r '.error.message // .' 2>/dev/null | head -3
        return 1
    fi

    # Extract and save the analysis
    local analysis=$(echo "$body" | jq -r '.content[0].text // empty')

    if [ -z "$analysis" ]; then
        log_error "Empty response from Claude"
        return 1
    fi

    # Create Markdown file with a metadata header above the model output.
    cat > "$analysis_file" << EOF
# Analyse: ${title}

**Source:** ${url}
**Durée:** ${duration}
**Auteur:** ${uploader}
**Date:** ${upload_date}
**Analysé le:** $(date +"%Y-%m-%d %H:%M")
**Modèle:** ${CLAUDE_MODEL}

---

${analysis}

---

*Analyse générée automatiquement par SecuBox Intelligence Module v${SCRIPT_VERSION}*
EOF

    log_ok "Analysis saved to $analysis_file"

    # Display the executive-summary section in the terminal.
    printf "\n%b=== RÉSUMÉ ===%b\n" "$BOLD$GREEN" "$NC"
    echo "$analysis" | sed -n '/Résumé exécutif/,/^##\|^[0-9]\./p' | head -10

    return 0
}
|
|
|
|
#=============================================================================
|
|
# MAIN PIPELINE
|
|
#=============================================================================
|
|
|
|
process_video() {
    # Full pipeline for one video URL: metadata -> subtitles -> (optional)
    # Whisper transcription -> (optional) Claude analysis.
    # Honors the NO_WHISPER / FORCE_WHISPER / NO_ANALYSE globals set by
    # parse_args.
    local url="$1"

    log_step "Processing video: $url"

    # Extract video ID and instance host from the URL.
    local video_id=$(extract_video_id "$url")
    local instance=$(echo "$url" | sed -E 's|https?://([^/]+).*|\1|')

    log_info "Video ID: $video_id"
    log_info "Instance: $instance"

    # Fetch metadata once up front just to derive a slug from the title.
    local temp_meta=$(mktemp)
    if ! yt-dlp --dump-json --no-warnings "$url" 2>/dev/null > "$temp_meta"; then
        log_error "Failed to fetch video info"
        rm -f "$temp_meta"
        return 1
    fi

    local title=$(jq -r '.title // "video"' "$temp_meta")
    local slug=$(generate_slug "$title")
    slug="${slug:-$video_id}"    # fall back to the raw ID for empty slugs
    rm -f "$temp_meta"

    log_info "Slug: $slug"

    # Create output directory
    local output_dir="$OUTPUT_BASE/$slug"
    mkdir -p "$output_dir"
    log_ok "Output directory: $output_dir"

    # 1. Extract metadata (non-fatal on failure)
    extract_metadata "$url" "$output_dir" "$slug" || {
        log_warn "Metadata extraction failed, continuing..."
    }

    local transcript_file="$output_dir/${slug}.transcript.txt"
    local has_transcript=0

    # 2. Try to download existing subtitles, unless Whisper is forced.
    # FIX: the previous condition ('!= 1 || != 1') was true in every mode,
    # so --force-whisper still downloaded subtitles it was about to discard.
    if [ "$FORCE_WHISPER" != "1" ]; then
        # Check PeerTube API first (informational only).
        # FIX: '|| true' -- a bare call returning 1 aborted the whole script
        # under 'set -e'.
        check_peertube_captions "$video_id" "$instance" 2>/dev/null || true

        if download_subtitles "$url" "$output_dir" "$slug" "$WHISPER_LANG"; then
            has_transcript=1
        fi
    fi

    # 3. Run Whisper if needed
    if [ "$has_transcript" = "0" ] || [ "$FORCE_WHISPER" = "1" ]; then
        if [ "$NO_WHISPER" = "1" ]; then
            log_warn "Whisper disabled, no transcript available"
        elif [ -n "$WHISPER_CMD" ]; then
            local audio_file="$output_dir/${slug}.audio.wav"

            if extract_audio "$url" "$output_dir" "$slug"; then
                if run_whisper "$audio_file" "$output_dir" "$slug" "$WHISPER_MODEL" "$WHISPER_LANG"; then
                    has_transcript=1
                fi
                # Clean up audio file
                rm -f "$audio_file"
            fi
        else
            log_warn "No Whisper available and no subtitles found"
        fi
    fi

    # 4. Run Claude analysis
    if [ "$NO_ANALYSE" != "1" ] && [ "$has_transcript" = "1" ]; then
        local meta_file="$output_dir/${slug}.meta.json"
        analyse_with_claude "$meta_file" "$transcript_file" "$output_dir" "$slug" || {
            log_warn "Analysis failed"
        }
    elif [ "$NO_ANALYSE" = "1" ]; then
        log_info "Analysis disabled (--no-analyse)"
    else
        log_warn "No transcript available for analysis"
    fi

    # Summary
    log_step "Processing complete"
    printf "\nOutput files in %s:\n" "$output_dir"
    ls -la "$output_dir" 2>/dev/null | tail -n +2

    return 0
}
|
|
|
|
#=============================================================================
|
|
# CLI PARSING
|
|
#=============================================================================
|
|
|
|
show_help() {
    # Print usage/help. The heredoc delimiter is unquoted on purpose so that
    # ${BOLD}/${NC}, the current defaults, and $(basename "$0") expand.
    cat << EOF
${BOLD}PeerTube Video Transcript & Analysis Tool${NC}
SecuBox Intelligence Module v${SCRIPT_VERSION}

${BOLD}Usage:${NC}
  $(basename "$0") [OPTIONS] --url <video_url>
  $(basename "$0") [OPTIONS] <video_url>

${BOLD}Options:${NC}
  --url <url>         PeerTube video URL
  --no-whisper        Disable Whisper (subtitles only)
  --force-whisper     Force Whisper even if subtitles exist
  --no-analyse        Download/transcribe without Claude analysis
  --model <name>      Whisper model (tiny, base, small, medium, large-v3)
                      Default: ${WHISPER_MODEL}
  --lang <code>       Whisper language code (fr, en, de, etc.)
                      Default: ${WHISPER_LANG}
  --output <dir>      Output base directory
                      Default: ${OUTPUT_BASE}
  --claude-model <m>  Claude model for analysis
                      Default: ${CLAUDE_MODEL}
  -h, --help          Show this help message
  -v, --version       Show version

${BOLD}Environment Variables:${NC}
  ANTHROPIC_API_KEY    Claude API key (required for analysis)
  PEERTUBE_INSTANCE    Default PeerTube instance
  WHISPER_MODELS_PATH  Path to Whisper models

${BOLD}Examples:${NC}
  # Basic usage
  $(basename "$0") https://tube.gk2.secubox.in/w/abc123

  # Force Whisper transcription with large model
  $(basename "$0") --force-whisper --model large-v3 --url https://...

  # Subtitles only, no AI analysis
  $(basename "$0") --no-whisper --no-analyse https://...

${BOLD}Output Structure:${NC}
  ./output/<slug>/
  ├── <slug>.meta.json        # Video metadata
  ├── <slug>.fr.vtt           # Original subtitles (if available)
  ├── <slug>.transcript.txt   # Plain text transcript
  └── <slug>.analyse.md       # Claude AI analysis

EOF
}
|
|
|
|
show_version() {
    # Print the tool name/version and the module tagline, one per line.
    printf '%s\n' "PeerTube Analyse v${SCRIPT_VERSION}" "SecuBox Intelligence Module"
}
|
|
|
|
parse_args() {
    # Parse command-line arguments into globals: VIDEO_URL, NO_WHISPER,
    # FORCE_WHISPER, NO_ANALYSE, plus overrides for WHISPER_MODEL,
    # WHISPER_LANG, OUTPUT_BASE and CLAUDE_MODEL. Exits on bad usage.
    # FIX: value-taking options now verify a value is present; previously a
    # trailing '--url' made 'shift 2' fail and the script died silently
    # under 'set -e'.
    VIDEO_URL=""
    NO_WHISPER=""
    FORCE_WHISPER=""
    NO_ANALYSE=""

    while [ $# -gt 0 ]; do
        case "$1" in
            --url)
                [ -n "${2:-}" ] || { log_error "Option --url requires a value"; exit 1; }
                VIDEO_URL="$2"
                shift 2
                ;;
            --no-whisper)
                NO_WHISPER=1
                shift
                ;;
            --force-whisper)
                FORCE_WHISPER=1
                shift
                ;;
            --no-analyse|--no-analyze)
                NO_ANALYSE=1
                shift
                ;;
            --model)
                [ -n "${2:-}" ] || { log_error "Option --model requires a value"; exit 1; }
                WHISPER_MODEL="$2"
                shift 2
                ;;
            --lang)
                [ -n "${2:-}" ] || { log_error "Option --lang requires a value"; exit 1; }
                WHISPER_LANG="$2"
                shift 2
                ;;
            --output)
                [ -n "${2:-}" ] || { log_error "Option --output requires a value"; exit 1; }
                OUTPUT_BASE="$2"
                shift 2
                ;;
            --claude-model)
                [ -n "${2:-}" ] || { log_error "Option --claude-model requires a value"; exit 1; }
                CLAUDE_MODEL="$2"
                shift 2
                ;;
            -h|--help)
                show_help
                exit 0
                ;;
            -v|--version)
                show_version
                exit 0
                ;;
            -*)
                log_error "Unknown option: $1"
                show_help
                exit 1
                ;;
            *)
                # Positional argument = URL
                if [ -z "$VIDEO_URL" ]; then
                    VIDEO_URL="$1"
                else
                    log_error "Multiple URLs not supported"
                    exit 1
                fi
                shift
                ;;
        esac
    done

    if [ -z "$VIDEO_URL" ]; then
        log_error "No video URL provided"
        show_help
        exit 1
    fi
}
|
|
|
|
#=============================================================================
|
|
# ENTRY POINT
|
|
#=============================================================================
|
|
|
|
main() {
    # Entry point: parse args, print the banner, verify dependencies, run the
    # pipeline, and report overall success or failure.
    parse_args "$@"

    printf "\n%b╔══════════════════════════════════════════════════════╗%b\n" "$BOLD$CYAN" "$NC"
    printf "%b║   PeerTube Transcript & Analysis Tool v%-16s║%b\n" "$BOLD$CYAN" "$SCRIPT_VERSION" "$NC"
    printf "%b║   SecuBox Intelligence Module                        ║%b\n" "$BOLD$CYAN" "$NC"
    printf "%b╚══════════════════════════════════════════════════════╝%b\n\n" "$BOLD$CYAN" "$NC"

    check_dependencies || exit 1

    # FIX: capture the pipeline status in a guarded assignment. Previously a
    # bare 'process_video' call followed by 'exit_code=$?' could never see a
    # failure: 'set -e' exited the script before the capture ran.
    local exit_code=0
    process_video "$VIDEO_URL" || exit_code=$?

    if [ $exit_code -eq 0 ]; then
        printf "\n%b✓ All done!%b\n\n" "$BOLD$GREEN" "$NC"
    else
        printf "\n%b✗ Completed with errors%b\n\n" "$BOLD$YELLOW" "$NC"
    fi

    return $exit_code
}
|
|
|
|
# Run if not sourced.
# NOTE(review): this guard only fires when the script file is literally named
# 'peertube-analyse' or is being piped into 'sh' ($0 = "sh"); renaming the
# file silently makes it a no-op. Confirm this matches the install name.
if [ "${0##*/}" = "peertube-analyse" ] || [ "${0##*/}" = "sh" ]; then
    main "$@"
fi
|