feat(peertube): Add transcript extraction & AI analysis tool
New CLI: peertube-analyse
- Extract video metadata via yt-dlp
- Download existing PeerTube subtitles (VTT)
- Fallback to Whisper local transcription (medium model)
- Claude AI analysis with structured intelligence report
Features:
- POSIX-compatible (OpenWrt, Alpine, Debian)
- Modular pipeline with graceful degradation
- Colored terminal output with status indicators
- Configurable Whisper model and language
- Truncation for large transcripts (12k chars)
CLI flags:
--url <url> Video URL
--no-whisper Subtitles only
--force-whisper Force transcription
--no-analyse Skip Claude analysis
--model <name> Whisper model
--lang <code> Language code
Output structure:
./output/<slug>/
├── <slug>.meta.json
├── <slug>.transcript.txt
└── <slug>.analyse.md
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Parent commit: df58e96a9a
This commit:  ab49e19c32
@ -2,7 +2,7 @@ include $(TOPDIR)/rules.mk
|
||||
|
||||
PKG_NAME:=secubox-app-peertube
|
||||
PKG_RELEASE:=1
|
||||
PKG_VERSION:=1.0.0
|
||||
PKG_VERSION:=1.1.0
|
||||
PKG_ARCH:=all
|
||||
PKG_MAINTAINER:=CyberMind Studio <contact@cybermind.fr>
|
||||
PKG_LICENSE:=AGPL-3.0
|
||||
@ -22,6 +22,7 @@ define Package/secubox-app-peertube/description
|
||||
PeerTube federated video streaming platform.
|
||||
Runs in an LXC Debian container with PostgreSQL, Redis, and Node.js.
|
||||
Supports video hosting, live streaming, and ActivityPub federation.
|
||||
Includes peertube-analyse: transcript extraction and Claude AI analysis.
|
||||
endef
|
||||
|
||||
define Package/secubox-app-peertube/conffiles
|
||||
@ -40,6 +41,7 @@ define Package/secubox-app-peertube/install
|
||||
|
||||
$(INSTALL_DIR) $(1)/usr/sbin
|
||||
$(INSTALL_BIN) ./files/usr/sbin/peertubectl $(1)/usr/sbin/peertubectl
|
||||
$(INSTALL_BIN) ./files/usr/sbin/peertube-analyse $(1)/usr/sbin/peertube-analyse
|
||||
endef
|
||||
|
||||
$(eval $(call BuildPackage,secubox-app-peertube))
|
||||
|
||||
778
package/secubox/secubox-app-peertube/files/usr/sbin/peertube-analyse
Executable file
778
package/secubox/secubox-app-peertube/files/usr/sbin/peertube-analyse
Executable file
@ -0,0 +1,778 @@
|
||||
#!/bin/sh
# PeerTube Video Transcript Extraction & AI Analysis
# SecuBox Intelligence Module
# Compatible: OpenWrt, Alpine, Debian, Ubuntu

set -e

#=============================================================================
# CONFIGURATION
#=============================================================================

# Every setting below can be overridden from the environment.
SCRIPT_VERSION="1.0.0"
PEERTUBE_INSTANCE="${PEERTUBE_INSTANCE:-tube.gk2.secubox.in}"   # default instance host
OUTPUT_BASE="${OUTPUT_BASE:-./output}"             # base dir for per-video output
WHISPER_MODEL="${WHISPER_MODEL:-medium}"           # Whisper model name
WHISPER_LANG="${WHISPER_LANG:-fr}"                 # subtitle / transcription language
CLAUDE_MODEL="${CLAUDE_MODEL:-claude-sonnet-4-6}"  # model used for the analysis step
MAX_TRANSCRIPT_CHARS=12000   # transcript size cap before sending to the API
MAX_TOKENS=2000              # max_tokens requested from the Claude API

#=============================================================================
# COLORS & LOGGING
#=============================================================================

# Check if terminal supports colors
# NOTE(review): the else-branch assigns raw ANSI escape strings even when
# stdout is NOT a terminal, so redirected output still carries escape codes —
# confirm whether the fallback should be empty strings for non-tty output.
if [ -t 1 ] && command -v tput >/dev/null 2>&1; then
    RED=$(tput setaf 1)
    GREEN=$(tput setaf 2)
    YELLOW=$(tput setaf 3)
    CYAN=$(tput setaf 6)
    BOLD=$(tput bold)
    NC=$(tput sgr0)
else
    RED='\033[0;31m'
    GREEN='\033[0;32m'
    YELLOW='\033[1;33m'
    CYAN='\033[0;36m'
    BOLD='\033[1m'
    NC='\033[0m'
fi

# Log helpers: status-tagged one-liners. %b renders the color escapes;
# log_error writes to stderr so diagnostics survive stdout redirection.
log_info() { printf "%b[INFO]%b %s\n" "$CYAN" "$NC" "$1"; }
log_ok() { printf "%b[OK]%b %s\n" "$GREEN" "$NC" "$1"; }
log_warn() { printf "%b[WARN]%b %s\n" "$YELLOW" "$NC" "$1"; }
log_error() { printf "%b[ERROR]%b %s\n" "$RED" "$NC" "$1" >&2; }
log_step() { printf "\n%b==> %s%b\n" "$BOLD$CYAN" "$1" "$NC"; }

#=============================================================================
# DEPENDENCY CHECK
#=============================================================================
|
||||
|
||||
# Verify required tools and probe for optional ones.
#
# Required: yt-dlp, ffmpeg, jq, curl (any missing one aborts with status 1).
# Optional: a Whisper binary — sets the global WHISPER_CMD to "whisper",
# "whisper-cpp", "main" (whisper.cpp), or "" when none is usable.
# Also warns when ANTHROPIC_API_KEY is unset (AI analysis will be skipped).
#
# Returns: 0 when all required dependencies are present, 1 otherwise.
check_dependencies() {
    log_step "Checking dependencies"

    # NOTE(review): `local` is not strictly POSIX but is supported by
    # dash/ash/busybox, matching the OpenWrt/Alpine/Debian targets.
    local missing=""
    local deps="yt-dlp ffmpeg jq curl"

    for dep in $deps; do
        if command -v "$dep" >/dev/null 2>&1; then
            log_ok "$dep found: $(command -v "$dep")"
        else
            log_error "$dep not found"
            missing="$missing $dep"
        fi
    done

    # Whisper check (optional but warned)
    if command -v whisper >/dev/null 2>&1; then
        log_ok "whisper found: $(command -v whisper)"
        WHISPER_CMD="whisper"
    elif command -v whisper-cpp >/dev/null 2>&1; then
        log_ok "whisper-cpp found: $(command -v whisper-cpp)"
        WHISPER_CMD="whisper-cpp"
    elif command -v main >/dev/null 2>&1 && [ -f "$(dirname "$(command -v main)")/models/ggml-medium.bin" ]; then
        # whisper.cpp historically ships its CLI as "main"; require a model
        # file next to such a generic binary name before trusting it.
        log_ok "whisper.cpp (main) found"
        WHISPER_CMD="main"
    else
        log_warn "whisper not found - transcription will only work with existing subtitles"
        WHISPER_CMD=""
    fi

    # API key check
    if [ -z "$ANTHROPIC_API_KEY" ]; then
        log_warn "ANTHROPIC_API_KEY not set - AI analysis disabled"
    else
        log_ok "ANTHROPIC_API_KEY configured"
    fi

    if [ -n "$missing" ]; then
        log_error "Missing required dependencies:$missing"
        return 1
    fi

    return 0
}
|
||||
|
||||
#=============================================================================
|
||||
# UTILITY FUNCTIONS
|
||||
#=============================================================================
|
||||
|
||||
# Extract video ID from PeerTube URL
# Accepted formats:
#   https://instance/w/VIDEO_ID
#   https://instance/videos/watch/VIDEO_ID_OR_UUID
# Query strings and trailing path segments are stripped.
extract_video_id() {
    local source_url="$1"
    # Try the short "/w/" form first, then the long "/videos/watch/" form;
    # each expression only rewrites the line when its pattern matches.
    echo "$source_url" \
        | sed -E 's|.*/w/([^/?]+).*|\1|' \
        | sed -E 's|.*/videos/watch/([^/?]+).*|\1|'
}
|
||||
|
||||
# Generate slug from title
# Lowercase, fold common French accented letters to ASCII, collapse every
# run of non-[a-z0-9] characters into a single dash, trim edge dashes,
# and cap the result at 50 characters.
generate_slug() {
    echo "$1" | tr '[:upper:]' '[:lower:]' \
        | sed -E '
            s/[àáâãäå]/a/g; s/[èéêë]/e/g; s/[ìíîï]/i/g; s/[òóôõö]/o/g; s/[ùúûü]/u/g; s/[ç]/c/g
            s/[^a-z0-9]+/-/g
            s/^-+|-+$//g
          ' \
        | cut -c1-50
}
|
||||
|
||||
# Clean VTT to plain text
#
# Strips WEBVTT headers, cue numbers, timestamp lines, NOTE blocks and
# inline tags, decodes common HTML entities, de-duplicates repeated cue
# lines, and joins everything into one space-normalized line.
#
# $1 - input .vtt file
# $2 - output .txt file
#
# BUG FIX: the entity-decoding substitutions had degenerated into no-ops
# (e.g. `s/&/\&/g`, `s/</</g`) — the pattern side had lost its entity
# names. Restored to decode &nbsp; &amp; &lt; &gt; as clearly intended.
vtt_to_text() {
    local vtt_file="$1"
    local txt_file="$2"

    # Remove VTT headers, timestamps, positioning, and duplicates
    sed -E '
        /^WEBVTT/d
        /^Kind:/d
        /^Language:/d
        /^NOTE/d
        /^[0-9]+$/d
        /^[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/d
        /^$/d
        s/<[^>]*>//g
        s/&nbsp;/ /g
        s/&amp;/\&/g
        s/&lt;/</g
        s/&gt;/>/g
    ' "$vtt_file" | \
    awk '!seen[$0]++' | \
    tr '\n' ' ' | \
    sed -E 's/ +/ /g; s/^ +| +$//g' > "$txt_file"
}
|
||||
|
||||
# Truncate text to max chars while preserving word boundaries
#
# $1 - text, $2 - maximum character count.
# Prints the text unchanged when it fits; otherwise prints the text cut at
# the limit with the trailing partial word removed, followed by a
# "... [TRUNCATED]" marker line.
truncate_text() {
    local input="$1"
    local limit="$2"

    if [ ${#input} -gt "$limit" ]; then
        # Cut at the limit, drop the dangling partial word, then mark it.
        echo "$input" | cut -c1-"$limit" | sed 's/[^ ]*$//'
        echo "... [TRUNCATED]"
    else
        echo "$input"
    fi
}
|
||||
|
||||
#=============================================================================
# 1. METADATA EXTRACTION
#=============================================================================

# Dump video metadata with yt-dlp and keep only the relevant fields.
#
# $1 - video URL
# $2 - output directory
# $3 - slug used as the output file prefix
#
# Writes <slug>.meta.json into the output dir and prints a short summary.
# Returns 0 on success, 1 when yt-dlp fails.
extract_metadata() {
    local url="$1"
    local output_dir="$2"
    local slug="$3"

    log_step "Extracting metadata"

    local meta_file="$output_dir/${slug}.meta.json"

    # Use yt-dlp to dump JSON metadata
    if yt-dlp --dump-json --no-warnings "$url" 2>/dev/null > "$meta_file.tmp"; then
        # Extract relevant fields with jq
        # NOTE(review): `keys` errors out when .subtitles or
        # .automatic_captions is null — confirm yt-dlp always emits these
        # as objects for PeerTube sources.
        jq '{
            id: .id,
            title: .title,
            description: .description,
            duration: .duration,
            duration_string: .duration_string,
            upload_date: .upload_date,
            uploader: .uploader,
            uploader_id: .uploader_id,
            channel: .channel,
            view_count: .view_count,
            like_count: .like_count,
            tags: .tags,
            categories: .categories,
            webpage_url: .webpage_url,
            thumbnail: .thumbnail,
            language: .language,
            subtitles: (.subtitles | keys),
            automatic_captions: (.automatic_captions | keys)
        }' "$meta_file.tmp" > "$meta_file"
        rm -f "$meta_file.tmp"

        log_ok "Metadata saved to $meta_file"

        # Display summary
        local title=$(jq -r '.title' "$meta_file")
        local duration=$(jq -r '.duration_string // .duration' "$meta_file")
        local uploader=$(jq -r '.uploader // .channel // "Unknown"' "$meta_file")

        printf " Title: %s\n" "$title"
        printf " Duration: %s\n" "$duration"
        printf " Uploader: %s\n" "$uploader"

        return 0
    else
        log_error "Failed to extract metadata"
        return 1
    fi
}
|
||||
|
||||
#=============================================================================
# 2. SUBTITLE DOWNLOAD & CONVERSION
#=============================================================================

# Query the PeerTube captions API for a video.
#
# $1 - video id/uuid
# $2 - instance hostname
#
# Outputs: one caption language id per line when captions exist.
# Returns 0 when at least one caption was reported, 1 otherwise.
check_peertube_captions() {
    local video_id="$1"
    local instance="$2"

    log_info "Checking PeerTube captions API..."

    local api_url="https://${instance}/api/v1/videos/${video_id}/captions"
    local response

    # curl -w appends the HTTP status on its own line; split it off below.
    response=$(curl -s -w "\n%{http_code}" "$api_url" 2>/dev/null)
    local http_code=$(echo "$response" | tail -n1)
    local body=$(echo "$response" | sed '$d')

    if [ "$http_code" = "200" ]; then
        local caption_count=$(echo "$body" | jq '.total // 0')
        if [ "$caption_count" -gt 0 ]; then
            log_ok "Found $caption_count caption(s) via API"
            echo "$body" | jq -r '.data[].language.id' 2>/dev/null
            return 0
        fi
    fi

    return 1
}
|
||||
|
||||
# Download existing subtitles via yt-dlp and convert them to plain text.
#
# $1 - video URL
# $2 - output directory
# $3 - slug (output file prefix)
# $4 - preferred language code (default: fr; English is tried as fallback)
#
# Writes <slug>.<lang>.vtt and <slug>.transcript.txt on success.
# Returns 0 when a subtitle track was downloaded and converted, 1 otherwise.
download_subtitles() {
    local url="$1"
    local output_dir="$2"
    local slug="$3"
    local lang="${4:-fr}"

    log_step "Downloading subtitles"

    local vtt_file="$output_dir/${slug}.${lang}.vtt"
    local txt_file="$output_dir/${slug}.transcript.txt"

    # Try to download subtitles with yt-dlp
    if yt-dlp --write-sub --write-auto-sub --sub-lang "$lang,en" \
        --sub-format vtt --skip-download \
        -o "$output_dir/${slug}" "$url" 2>/dev/null; then

        # Find downloaded VTT file (yt-dlp appends the language code,
        # so the exact name is not known in advance)
        local found_vtt=$(find "$output_dir" -name "${slug}*.vtt" -type f | head -1)

        if [ -n "$found_vtt" ] && [ -f "$found_vtt" ]; then
            # Rename to standard name
            mv "$found_vtt" "$vtt_file" 2>/dev/null || cp "$found_vtt" "$vtt_file"

            log_ok "Subtitles downloaded: $vtt_file"

            # Convert to plain text
            vtt_to_text "$vtt_file" "$txt_file"
            log_ok "Converted to text: $txt_file"

            local word_count=$(wc -w < "$txt_file" | tr -d ' ')
            printf " Word count: %s\n" "$word_count"

            return 0
        fi
    fi

    log_warn "No subtitles available for download"
    return 1
}
|
||||
|
||||
#=============================================================================
# 3. WHISPER TRANSCRIPTION
#=============================================================================

# Download the video's audio track and convert it for Whisper.
#
# $1 - video URL
# $2 - output directory
# $3 - slug (output file prefix)
#
# Produces <slug>.audio.wav (16 kHz mono) in the output dir.
# Returns 0 on success, 1 when download or conversion failed.
extract_audio() {
    local url="$1"
    local output_dir="$2"
    local slug="$3"

    log_info "Extracting audio..."

    local audio_file="$output_dir/${slug}.audio.wav"

    # Download and convert to 16kHz mono WAV (Whisper optimal format)
    if yt-dlp -x --audio-format wav -o "$output_dir/${slug}.%(ext)s" "$url" 2>/dev/null; then
        # Convert to Whisper-optimal format
        # NOTE(review): the "${slug}.*" glob could also match a leftover
        # ${slug}.audio.wav from a previous run — confirm reruns are safe.
        local downloaded=$(find "$output_dir" -name "${slug}.*" -type f | grep -E '\.(wav|mp3|m4a|opus|webm)$' | head -1)

        if [ -n "$downloaded" ]; then
            ffmpeg -y -i "$downloaded" -vn -ac 1 -ar 16000 -f wav "$audio_file" 2>/dev/null
            rm -f "$downloaded"
            log_ok "Audio extracted: $audio_file"
            return 0
        fi
    fi

    log_error "Failed to extract audio"
    return 1
}
|
||||
|
||||
# Transcribe an audio file with whichever Whisper flavour is available.
#
# $1 - audio file (16 kHz mono WAV)
# $2 - output directory
# $3 - slug (output file prefix)
# $4 - model name
# $5 - language code
#
# Requires the WHISPER_CMD global set by check_dependencies().
# Writes <slug>.transcript.txt; returns 0 on success, 1 on failure.
run_whisper() {
    local audio_file="$1"
    local output_dir="$2"
    local slug="$3"
    local model="$4"
    local lang="$5"

    log_step "Running Whisper transcription"

    if [ -z "$WHISPER_CMD" ]; then
        log_error "Whisper not available"
        return 1
    fi

    local txt_file="$output_dir/${slug}.transcript.txt"

    log_info "Model: $model, Language: $lang"
    log_info "This may take a while..."

    case "$WHISPER_CMD" in
        whisper)
            # OpenAI Whisper Python
            whisper "$audio_file" \
                --model "$model" \
                --language "$lang" \
                --output_format txt \
                --output_dir "$output_dir" \
                --verbose False 2>/dev/null

            # Whisper names its output after the audio file; rename to our slug
            local whisper_out="$output_dir/$(basename "$audio_file" .wav).txt"
            [ -f "$whisper_out" ] && mv "$whisper_out" "$txt_file"
            ;;

        whisper-cpp|main)
            # whisper.cpp
            local model_path="${WHISPER_MODELS_PATH:-$HOME/.cache/whisper}/ggml-${model}.bin"

            if [ ! -f "$model_path" ]; then
                log_warn "Model not found: $model_path"
                log_info "Downloading model..."
                # Try to download model
                mkdir -p "$(dirname "$model_path")"
                curl -L "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-${model}.bin" \
                    -o "$model_path" 2>/dev/null || {
                    log_error "Failed to download model"
                    return 1
                }
            fi

            "$WHISPER_CMD" -m "$model_path" -l "$lang" -otxt -of "$output_dir/${slug}.transcript" "$audio_file" 2>/dev/null
            ;;
    esac

    if [ -f "$txt_file" ]; then
        # Strip bracketed cues (e.g. [MUSIC]) and squeeze runs of spaces.
        # BUG FIX: the squeeze pattern was 's/ */ /g' — " *" also matches
        # the EMPTY string at every position, which *inserts* a space
        # between every character. 's/  */ /g' squeezes runs instead.
        # BUG FIX: the previous 'sed -i … || sed … && mv' chain attempted
        # the 'mv' even after a successful GNU 'sed -i'; the explicit
        # if/else keeps the BSD fallback but only runs it when -i failed.
        # NOTE(review): '\[.*\]' is greedy — a line holding two bracketed
        # cues also loses the text between them; confirm this is acceptable.
        if ! sed -i 's/\[.*\]//g; s/  */ /g' "$txt_file" 2>/dev/null; then
            sed 's/\[.*\]//g; s/  */ /g' "$txt_file" > "$txt_file.tmp" && mv "$txt_file.tmp" "$txt_file"
        fi

        local word_count=$(wc -w < "$txt_file" | tr -d ' ')
        log_ok "Transcription complete: $word_count words"
        return 0
    else
        log_error "Whisper transcription failed"
        return 1
    fi
}
|
||||
|
||||
#=============================================================================
# 4. CLAUDE AI ANALYSIS
#=============================================================================

# Send metadata + transcript to the Anthropic Messages API and save the
# returned Markdown report.
#
# $1 - meta.json path
# $2 - transcript path
# $3 - output directory
# $4 - slug (output file prefix)
#
# Requires ANTHROPIC_API_KEY; honors CLAUDE_MODEL, MAX_TRANSCRIPT_CHARS
# and MAX_TOKENS. Writes <slug>.analyse.md.
# Returns 0 on success, 1 on any error.
analyse_with_claude() {
    local meta_file="$1"
    local transcript_file="$2"
    local output_dir="$3"
    local slug="$4"

    log_step "Running Claude AI analysis"

    if [ -z "$ANTHROPIC_API_KEY" ]; then
        log_error "ANTHROPIC_API_KEY not set"
        return 1
    fi

    if [ ! -f "$transcript_file" ]; then
        log_error "Transcript file not found: $transcript_file"
        return 1
    fi

    local analysis_file="$output_dir/${slug}.analyse.md"

    # Read metadata (each field falls back to a readable default)
    local title=$(jq -r '.title // "Unknown"' "$meta_file" 2>/dev/null)
    local duration=$(jq -r '.duration_string // .duration // "Unknown"' "$meta_file" 2>/dev/null)
    local uploader=$(jq -r '.uploader // .channel // "Unknown"' "$meta_file" 2>/dev/null)
    local upload_date=$(jq -r '.upload_date // "Unknown"' "$meta_file" 2>/dev/null)
    local tags=$(jq -r '.tags | if . then join(", ") else "None" end' "$meta_file" 2>/dev/null)
    local url=$(jq -r '.webpage_url // "Unknown"' "$meta_file" 2>/dev/null)

    # Read and truncate transcript
    local transcript=$(cat "$transcript_file")
    local transcript_len=${#transcript}

    if [ "$transcript_len" -gt "$MAX_TRANSCRIPT_CHARS" ]; then
        log_warn "Transcript truncated from $transcript_len to $MAX_TRANSCRIPT_CHARS chars"
        transcript=$(echo "$transcript" | head -c "$MAX_TRANSCRIPT_CHARS")
        transcript="${transcript}... [TRUNCATED - Original: ${transcript_len} chars]"
    fi

    # Escape special characters for JSON
    # NOTE(review): the transcript is JSON-quoted here AND later passed
    # through `jq --arg`, which escapes again — the model receives a
    # quoted/escaped blob rather than raw text. Confirm this is intended.
    transcript=$(echo "$transcript" | jq -Rs '.')
    # NOTE(review): `jq -Rs` keeps echo's trailing newline, so the stripped
    # title ends with a literal "\n" sequence — verify against the output.
    title=$(echo "$title" | jq -Rs '.' | sed 's/^"//;s/"$//')

    log_info "Calling Claude API ($CLAUDE_MODEL)..."

    # Build the API request (prompts are intentionally in French: the tool
    # defaults to French-language videos and reports)
    local system_prompt="Tu es un analyste expert en renseignement, cybersécurité et géopolitique. Tu analyses des transcripts vidéo de manière structurée et rigoureuse. Tu réponds toujours en français."

    local user_prompt="Analyse le transcript vidéo suivant:

=== MÉTADONNÉES ===
Titre: ${title}
Durée: ${duration}
Auteur: ${uploader}
Date: ${upload_date}
Tags: ${tags}
URL: ${url}

=== TRANSCRIPT ===
${transcript}

=== INSTRUCTIONS ===
Produis une analyse structurée en Markdown comprenant:

1. **Résumé exécutif** (5 lignes maximum)
2. **Thèmes principaux et sous-thèmes**
3. **Acteurs / entités mentionnés** (personnes, organisations, pays)
4. **Points factuels clés et révélations notables**
5. **Angle narratif et biais éventuels**
6. **Pertinence pour un professionnel en cybersécurité et renseignement**
7. **Questions ouvertes ou points à approfondir**

Sois factuel, précis et critique dans ton analyse."

    # Make API call; jq -n builds the JSON body safely from shell variables
    local response
    response=$(curl -s -w "\n%{http_code}" "https://api.anthropic.com/v1/messages" \
        -H "Content-Type: application/json" \
        -H "x-api-key: $ANTHROPIC_API_KEY" \
        -H "anthropic-version: 2023-06-01" \
        -d "$(jq -n \
            --arg model "$CLAUDE_MODEL" \
            --argjson max_tokens "$MAX_TOKENS" \
            --arg system "$system_prompt" \
            --arg user "$user_prompt" \
            '{
                model: $model,
                max_tokens: $max_tokens,
                system: $system,
                messages: [
                    {role: "user", content: $user}
                ]
            }')" 2>/dev/null)

    # Last line is the HTTP status appended by -w; the rest is the body.
    local http_code=$(echo "$response" | tail -n1)
    local body=$(echo "$response" | sed '$d')

    if [ "$http_code" != "200" ]; then
        log_error "API error (HTTP $http_code)"
        echo "$body" | jq -r '.error.message // .' 2>/dev/null | head -3
        return 1
    fi

    # Extract and save the analysis
    local analysis=$(echo "$body" | jq -r '.content[0].text // empty')

    if [ -z "$analysis" ]; then
        log_error "Empty response from Claude"
        return 1
    fi

    # Create Markdown file with header
    cat > "$analysis_file" << EOF
# Analyse: ${title}

**Source:** ${url}
**Durée:** ${duration}
**Auteur:** ${uploader}
**Date:** ${upload_date}
**Analysé le:** $(date +"%Y-%m-%d %H:%M")
**Modèle:** ${CLAUDE_MODEL}

---

${analysis}

---

*Analyse générée automatiquement par SecuBox Intelligence Module v${SCRIPT_VERSION}*
EOF

    log_ok "Analysis saved to $analysis_file"

    # Display summary in terminal
    printf "\n%b=== RÉSUMÉ ===%b\n" "$BOLD$GREEN" "$NC"
    echo "$analysis" | sed -n '/Résumé exécutif/,/^##\|^[0-9]\./p' | head -10

    return 0
}
|
||||
|
||||
#=============================================================================
# MAIN PIPELINE
#=============================================================================

# Full pipeline for one video: metadata → subtitles → Whisper → analysis.
#
# $1 - video URL.
# Honors the NO_WHISPER / FORCE_WHISPER / NO_ANALYSE flags set by
# parse_args and the OUTPUT_BASE / WHISPER_MODEL / WHISPER_LANG /
# WHISPER_CMD globals. Returns 1 when the video info cannot be fetched;
# later stages degrade gracefully and the function still returns 0.
process_video() {
    local url="$1"

    log_step "Processing video: $url"

    # Extract video ID and create output directory
    local video_id=$(extract_video_id "$url")
    local instance=$(echo "$url" | sed -E 's|https?://([^/]+).*|\1|')

    log_info "Video ID: $video_id"
    log_info "Instance: $instance"

    # Create temporary metadata to get title for slug
    local temp_meta=$(mktemp)
    if ! yt-dlp --dump-json --no-warnings "$url" 2>/dev/null > "$temp_meta"; then
        log_error "Failed to fetch video info"
        rm -f "$temp_meta"
        return 1
    fi

    local title=$(jq -r '.title // "video"' "$temp_meta")
    local slug=$(generate_slug "$title")
    slug="${slug:-$video_id}"   # fall back to the raw id for symbol-only titles
    rm -f "$temp_meta"

    log_info "Slug: $slug"

    # Create output directory
    local output_dir="$OUTPUT_BASE/$slug"
    mkdir -p "$output_dir"
    log_ok "Output directory: $output_dir"

    # 1. Extract metadata
    extract_metadata "$url" "$output_dir" "$slug" || {
        log_warn "Metadata extraction failed, continuing..."
    }

    local transcript_file="$output_dir/${slug}.transcript.txt"
    local has_transcript=0

    # 2. Try to download existing subtitles.
    # BUG FIX: the condition was
    #   [ "$NO_WHISPER" != "1" ] || [ "$FORCE_WHISPER" != "1" ]
    # which is true unless BOTH flags are set, so --force-whisper alone
    # still downloaded subtitles first. Subtitles are skipped exactly when
    # Whisper is forced; with --no-whisper they are the only source and
    # must still be fetched.
    if [ "$FORCE_WHISPER" != "1" ]; then
        # Check PeerTube API first (informational: logs available languages).
        # `|| true` keeps a caption-less video from tripping `set -e`.
        check_peertube_captions "$video_id" "$instance" 2>/dev/null || true

        if download_subtitles "$url" "$output_dir" "$slug" "$WHISPER_LANG"; then
            has_transcript=1
        fi
    fi

    # 3. Run Whisper if needed
    if [ "$has_transcript" = "0" ] || [ "$FORCE_WHISPER" = "1" ]; then
        if [ "$NO_WHISPER" = "1" ]; then
            log_warn "Whisper disabled, no transcript available"
        elif [ -n "$WHISPER_CMD" ]; then
            local audio_file="$output_dir/${slug}.audio.wav"

            if extract_audio "$url" "$output_dir" "$slug"; then
                if run_whisper "$audio_file" "$output_dir" "$slug" "$WHISPER_MODEL" "$WHISPER_LANG"; then
                    has_transcript=1
                fi
                # Clean up audio file
                rm -f "$audio_file"
            fi
        else
            log_warn "No Whisper available and no subtitles found"
        fi
    fi

    # 4. Run Claude analysis
    if [ "$NO_ANALYSE" != "1" ] && [ "$has_transcript" = "1" ]; then
        local meta_file="$output_dir/${slug}.meta.json"
        analyse_with_claude "$meta_file" "$transcript_file" "$output_dir" "$slug" || {
            log_warn "Analysis failed"
        }
    elif [ "$NO_ANALYSE" = "1" ]; then
        log_info "Analysis disabled (--no-analyse)"
    else
        log_warn "No transcript available for analysis"
    fi

    # Summary
    log_step "Processing complete"
    printf "\nOutput files in %s:\n" "$output_dir"
    ls -la "$output_dir" 2>/dev/null | tail -n +2

    return 0
}
|
||||
|
||||
#=============================================================================
# CLI PARSING
#=============================================================================

# Print the usage text to stdout. The heredoc delimiter is unquoted on
# purpose so the ${BOLD}/${NC} color codes and the current option defaults
# are expanded into the text.
show_help() {
    cat << EOF
${BOLD}PeerTube Video Transcript & Analysis Tool${NC}
SecuBox Intelligence Module v${SCRIPT_VERSION}

${BOLD}Usage:${NC}
  $(basename "$0") [OPTIONS] --url <video_url>
  $(basename "$0") [OPTIONS] <video_url>

${BOLD}Options:${NC}
  --url <url>         PeerTube video URL
  --no-whisper        Disable Whisper (subtitles only)
  --force-whisper     Force Whisper even if subtitles exist
  --no-analyse        Download/transcribe without Claude analysis
  --model <name>      Whisper model (tiny, base, small, medium, large-v3)
                      Default: ${WHISPER_MODEL}
  --lang <code>       Whisper language code (fr, en, de, etc.)
                      Default: ${WHISPER_LANG}
  --output <dir>      Output base directory
                      Default: ${OUTPUT_BASE}
  --claude-model <m>  Claude model for analysis
                      Default: ${CLAUDE_MODEL}
  -h, --help          Show this help message
  -v, --version       Show version

${BOLD}Environment Variables:${NC}
  ANTHROPIC_API_KEY   Claude API key (required for analysis)
  PEERTUBE_INSTANCE   Default PeerTube instance
  WHISPER_MODELS_PATH Path to Whisper models

${BOLD}Examples:${NC}
  # Basic usage
  $(basename "$0") https://tube.gk2.secubox.in/w/abc123

  # Force Whisper transcription with large model
  $(basename "$0") --force-whisper --model large-v3 --url https://...

  # Subtitles only, no AI analysis
  $(basename "$0") --no-whisper --no-analyse https://...

${BOLD}Output Structure:${NC}
  ./output/<slug>/
  ├── <slug>.meta.json        # Video metadata
  ├── <slug>.fr.vtt           # Original subtitles (if available)
  ├── <slug>.transcript.txt   # Plain text transcript
  └── <slug>.analyse.md       # Claude AI analysis

EOF
}
|
||||
|
||||
# Print the tool name/version and module banner to stdout.
show_version() {
    printf '%s\n' "PeerTube Analyse v${SCRIPT_VERSION}" "SecuBox Intelligence Module"
}
|
||||
|
||||
# Parse CLI arguments into globals:
#   VIDEO_URL, NO_WHISPER, FORCE_WHISPER, NO_ANALYSE
# and optionally override WHISPER_MODEL, WHISPER_LANG, OUTPUT_BASE,
# CLAUDE_MODEL. Exits 1 on bad usage, 0 after --help/--version.
#
# BUG FIX: value-taking options did an unconditional `shift 2`; when the
# value was missing, `shift 2` fails without consuming anything and the
# same option is re-examined forever (infinite loop). Each such option now
# verifies its value is present and reports a clear error instead.
parse_args() {
    VIDEO_URL=""
    NO_WHISPER=""
    FORCE_WHISPER=""
    NO_ANALYSE=""

    while [ $# -gt 0 ]; do
        case "$1" in
            --url)
                [ $# -ge 2 ] || { log_error "--url requires a value"; exit 1; }
                VIDEO_URL="$2"
                shift 2
                ;;
            --no-whisper)
                NO_WHISPER=1
                shift
                ;;
            --force-whisper)
                FORCE_WHISPER=1
                shift
                ;;
            --no-analyse|--no-analyze)
                NO_ANALYSE=1
                shift
                ;;
            --model)
                [ $# -ge 2 ] || { log_error "--model requires a value"; exit 1; }
                WHISPER_MODEL="$2"
                shift 2
                ;;
            --lang)
                [ $# -ge 2 ] || { log_error "--lang requires a value"; exit 1; }
                WHISPER_LANG="$2"
                shift 2
                ;;
            --output)
                [ $# -ge 2 ] || { log_error "--output requires a value"; exit 1; }
                OUTPUT_BASE="$2"
                shift 2
                ;;
            --claude-model)
                [ $# -ge 2 ] || { log_error "--claude-model requires a value"; exit 1; }
                CLAUDE_MODEL="$2"
                shift 2
                ;;
            -h|--help)
                show_help
                exit 0
                ;;
            -v|--version)
                show_version
                exit 0
                ;;
            -*)
                log_error "Unknown option: $1"
                show_help
                exit 1
                ;;
            *)
                # Positional argument = URL
                if [ -z "$VIDEO_URL" ]; then
                    VIDEO_URL="$1"
                else
                    log_error "Multiple URLs not supported"
                    exit 1
                fi
                shift
                ;;
        esac
    done

    if [ -z "$VIDEO_URL" ]; then
        log_error "No video URL provided"
        show_help
        exit 1
    fi
}
|
||||
|
||||
#=============================================================================
# ENTRY POINT
#=============================================================================

# Orchestrate the run: parse arguments, print the banner, verify
# dependencies (fatal on failure), then run the pipeline.
# Returns the pipeline's exit status.
main() {
    parse_args "$@"

    printf "\n%b╔══════════════════════════════════════════════════════╗%b\n" "$BOLD$CYAN" "$NC"
    printf "%b║ PeerTube Transcript & Analysis Tool v%-16s║%b\n" "$BOLD$CYAN" "$SCRIPT_VERSION" "$NC"
    printf "%b║ SecuBox Intelligence Module ║%b\n" "$BOLD$CYAN" "$NC"
    printf "%b╚══════════════════════════════════════════════════════╝%b\n\n" "$BOLD$CYAN" "$NC"

    check_dependencies || exit 1

    # BUG FIX: the original ran `process_video "$VIDEO_URL"` and captured $?
    # on the NEXT statement; with the script's `set -e`, a failing pipeline
    # aborts before the capture, so the "Completed with errors" summary was
    # unreachable. Capturing via `|| exit_code=$?` keeps -e satisfied.
    local exit_code=0
    process_video "$VIDEO_URL" || exit_code=$?

    if [ $exit_code -eq 0 ]; then
        printf "\n%b✓ All done!%b\n\n" "$BOLD$GREEN" "$NC"
    else
        printf "\n%b✗ Completed with errors%b\n\n" "$BOLD$YELLOW" "$NC"
    fi

    return $exit_code
}
|
||||
|
||||
# Run if not sourced
# NOTE(review): the guard matches on $0's basename: "peertube-analyse" is
# direct execution and "sh" covers `sh peertube-analyse` / piped-to-sh runs.
# When the file is *sourced*, $0 is the invoking shell's name, so main is
# normally not invoked — except when that shell is itself "sh"; confirm
# this edge case is intended.
if [ "${0##*/}" = "peertube-analyse" ] || [ "${0##*/}" = "sh" ]; then
    main "$@"
fi
|
||||
Loading…
Reference in New Issue
Block a user