BASH Post Services

Viewing: 1748228826_bodygram.sh

Try wget https://bash.commongrounds.cc/uploads/1748228826_bodygram.sh from the console

#!/bin/bash

# webgram by Page Telegram Volunteer Services 2025
# Crawls the text from the <body> of the pages linked from the provided URL (one link level deep)
# Saves the text to .txt files named after each link's suffix (without .html/.htm), with a simple progress bar
# Progress bar stays on one line, saved file messages on another
# Supports local HTML files and Wayback Machine for inaccessible URLs
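#
# Example invocation (the URL and filename below are illustrative only):
#   ./1748228826_bodygram.sh https://all-creatures.org/mfz/index.html
#   ./1748228826_bodygram.sh saved_page.html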

# Check if required tools are installed
command -v curl >/dev/null 2>&1 || { echo "curl is required but not installed. Exiting."; exit 1; }
command -v lynx >/dev/null 2>&1 || { echo "lynx is required but not installed. Exiting."; exit 1; }
command -v pup >/dev/null 2>&1 || { echo "pup is required but not installed. Install golang-github-ericchiang-pup. Exiting."; exit 1; }
command -v tidy >/dev/null 2>&1 || { echo "tidy is required but not installed. Install htmltidy. Exiting."; exit 1; }
command -v html2text >/dev/null 2>&1 || { echo "html2text is required but not installed. Install html2text. Exiting."; exit 1; }
command -v tput >/dev/null 2>&1 || { echo "tput is required but not installed. Exiting."; exit 1; }
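
# On Debian/Ubuntu most dependencies can typically be installed with something like:
#   sudo apt install curl lynx tidy html2text
# pup is packaged as golang-github-ericchiang-pup on some distros, or can be built from its Go source.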

# Check if URL or file is provided
if [ -z "$1" ]; then
    echo "Usage: $0 <website_url | local_html_file>"
    exit 1
fi

INPUT="$1"
WAYBACK=0
# Determine if input is a file or URL
if [ -f "$INPUT" ]; then
    IS_FILE=1
    HTML=$(cat "$INPUT")
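    # Base URL, domain, and path are hardcoded for the all-creatures.org /mfz section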
    BASE_URL="https://all-creatures.org/mfz"
    DOMAIN="all-creatures.org"
    BASE_PATH="/mfz"
    OUTPUT_DIR="webgram_output_${DOMAIN}_mfz"
    echo "Processing local file: $INPUT"
else
    IS_FILE=0
    URL="$INPUT"
    DOMAIN=$(echo "$URL" | awk -F/ '{print $3}')
    BASE_PATH="/mfz"  # Hardcode to /mfz/ for all-creatures.org
    BASE_URL="https://$DOMAIN$BASE_PATH"
    OUTPUT_DIR="webgram_output_${DOMAIN}_mfz"
fi

LOG_FILE="webgram_errors.log"
mkdir -p "$OUTPUT_DIR"

# Initialize error log
: > "$LOG_FILE"

# Function to resolve relative URLs
resolve_url() {
    local base="$1"
    local rel="$2"
    if [[ "$rel" =~ ^https?:// ]]; then
        echo "$rel"
    elif [[ "$rel" =~ ^file:// ]] || [[ "$rel" =~ ^# ]]; then
        echo "invalid"
    else
        if [[ "$rel" =~ ^/ ]]; then
            echo "https://$DOMAIN$rel"
        else
            echo "$base/$rel" | sed -e 's|//$|/|' -e 's|/\./|/|' -e 's|/\?[^/]*$|/|' -e "s|$|$rel|" | sed -e 's|//$|/|'
        fi
    fi
}
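
# For illustration (hypothetical page names), resolve_url behaves roughly like:
#   resolve_url "https://all-creatures.org/mfz" "/mfz/page.htm" -> https://all-creatures.org/mfz/page.htm
#   resolve_url "https://all-creatures.org/mfz" "page.htm"      -> https://all-creatures.org/mfz/page.htm
#   resolve_url "https://all-creatures.org/mfz" "#top"          -> invalid (fragment links are skipped)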

# Function to fetch from Wayback Machine
fetch_wayback() {
    local url="$1"
    local snapshot="20150401000000"  # Snapshot from April 1, 2015
    local wayback_url="https://web.archive.org/web/$snapshot/$url"
    curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 "$wayback_url" 2>>"$LOG_FILE"
}
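
# With the hardcoded snapshot this requests a URL of the form
#   https://web.archive.org/web/20150401000000/https://all-creatures.org/mfz/
# and curl -L follows the redirect to whatever capture is closest to that date.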

# Function to display progress bar on a fixed line
show_progress() {
    local current=$1
    local total=$2
    local width=$3
    local percent=$((current * 100 / total))
    local filled=$((current * width / total))
    local empty=$((width - filled))
    local bar=""
    for ((i=0; i<filled; i++)); do bar="${bar}#"; done
    for ((i=0; i<empty; i++)); do bar="${bar}-"; done
    # Save cursor, move to line 0, clear line, print progress, restore cursor
    tput sc
    tput cup 0 0
    tput el
    printf "Progress: [${bar}] %d%% (%d/%d)" "$percent" "$current" "$total"
    tput rc
}
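
# Rendered in place on terminal line 0, e.g. with BAR_WIDTH=50 at the halfway mark:
#   Progress: [#########################-------------------------] 50% (25/50)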

# Function to display saved file message on a separate line
show_saved() {
    local file="$1"
    # Save cursor, move to line 1, clear line, print message, restore cursor
    tput sc
    tput cup 1 0
    tput el
    printf "Saved text to %s" "$file"
    tput rc
    # Move to line 2 for subsequent output
    tput cup 2 0
}

# Fetch HTML if URL
if [ "$IS_FILE" -eq 0 ]; then
    echo "Checking $URL..."
    HTTP_STATUS=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 -o /dev/null -w "%{http_code}" "$URL" 2>>"$LOG_FILE")
    if [ "$HTTP_STATUS" != "200" ]; then
        echo "Input URL $URL returned HTTP $HTTP_STATUS (e.g., 404 Not Found). Attempting Wayback Machine..." | tee -a "$LOG_FILE"
        WAYBACK=1
        HTML=$(fetch_wayback "$URL")
        if [ -z "$HTML" ] || echo "$HTML" | grep -q "Wayback Machine"; then
            echo "Failed to fetch $URL from Wayback Machine. Please provide a local HTML file." | tee -a "$LOG_FILE"
            exit 1
        fi
    else
        echo "Fetching $URL..."
        HTML=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" "$URL" 2>>"$LOG_FILE")
        if [ -z "$HTML" ]; then
            echo "Failed to fetch $URL" >> "$LOG_FILE"
            exit 1
        fi
    fi
fi

# Extract links using pup
LINKS=$(echo "$HTML" | pup 'a[href] attr{href}' 2>>"$LOG_FILE" | while read -r href; do
    resolved=$(resolve_url "$BASE_URL" "$href")
    if [ "$resolved" != "invalid" ] && echo "$resolved" | grep -qE "^https?://$DOMAIN$BASE_PATH"; then
        echo "$resolved"
    fi
done | sort | uniq)
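
# pup prints one href per line; only links under https://$DOMAIN$BASE_PATH pass the filter
# above, e.g. (hypothetical) href="letter.html" -> https://all-creatures.org/mfz/letter.html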

# Add the input URL explicitly if not a file
if [ "$IS_FILE" -eq 0 ]; then
    LINKS=$(echo -e "$URL\n$LINKS" | sort | uniq)
fi

# Count total links for progress bar
TOTAL_LINKS=$(echo "$LINKS" | grep -c .)  # count non-empty lines so an empty list yields 0
CURRENT=0
BAR_WIDTH=50

if [ "$TOTAL_LINKS" -eq 0 ]; then
    echo "No valid links found. Check $LOG_FILE for errors." | tee -a "$LOG_FILE"
    exit 1
fi

echo "Found $TOTAL_LINKS links to process."
# Reserve space for progress bar and saved message
echo -e "\n\n"

# Process each link
echo "$LINKS" | while read -r link; do
    # Skip empty or invalid links
    if [ -z "$link" ] || ! echo "$link" | grep -qE "^https?://"; then
        echo "Skipping invalid link: '$link'" >> "$LOG_FILE"
        continue
    fi
    
    # Check HTTP status
    if [ "$WAYBACK" -eq 1 ]; then
        HTML=$(fetch_wayback "$link")
        # Treat an empty response or one still containing the archive banner
        # text as a 404 (same heuristic as the initial Wayback check above)
        if [ -z "$HTML" ] || echo "$HTML" | grep -q "Wayback Machine"; then
            HTTP_STATUS="404"
        else
            HTTP_STATUS="200"
        fi
    else
        HTTP_STATUS=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 -o /dev/null -w "%{http_code}" "$link" 2>>"$LOG_FILE")
    fi
    if [ "$HTTP_STATUS" != "200" ]; then
        echo "Skipping $link (HTTP $HTTP_STATUS)" >> "$LOG_FILE"
        continue
    fi
    
    # Increment progress
    ((CURRENT++))
    
    # Update progress bar
    show_progress "$CURRENT" "$TOTAL_LINKS" "$BAR_WIDTH"
    
    # Get the suffix for filename, removing .html/.htm
    SUFFIX=$(echo "$link" | sed -e 's|.*/||' -e 's|[?].*||' -e 's|#[^/]*$||' -e 's|\.html$||' -e 's|\.htm$||' -e 's|[^a-zA-Z0-9.]|-|g')
    if [ -z "$SUFFIX" ]; then
        SUFFIX="index"
    fi
    OUTPUT_FILE="$OUTPUT_DIR/$SUFFIX.txt"
    
    # Handle duplicate filenames
    if [ -f "$OUTPUT_FILE" ]; then
        COUNTER=1
        while [ -f "${OUTPUT_DIR}/${SUFFIX}-${COUNTER}.txt" ]; do
            ((COUNTER++))
        done
        OUTPUT_FILE="${OUTPUT_DIR}/${SUFFIX}-${COUNTER}.txt"
    fi
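    # e.g. a second page that also maps to "about-us" would land in about-us-1.txt (hypothetical name)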
    
    # Download HTML
    if [ "$WAYBACK" -eq 1 ]; then
        HTML=$(fetch_wayback "$link")
    else
        HTML=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 "$link" 2>>"$LOG_FILE")
    fi
    if [ -z "$HTML" ]; then
        echo "Failed to fetch HTML for $link" >> "$LOG_FILE"
        continue
    fi
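
    # The steps that follow amount to: tidy the raw HTML, extract <body> with pup,
    # and render it to plain text with lynx; html2text is used as a fallback
    # whenever tidy or pup produce no output.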
    
    # Clean HTML with tidy
    CLEANED_HTML=$(echo "$HTML" | tidy -q --show-warnings no --show-errors 0 --drop-empty-elements y --force-output y 2>>"$LOG_FILE")
    if [ -z "$CLEANED_HTML" ]; then
        echo "Failed to clean HTML for $link" >> "$LOG_FILE"
        echo "$HTML" > "$OUTPUT_DIR/failed_$SUFFIX.html"
        # Fallback: Use html2text
        echo "$HTML" | html2text > "$OUTPUT_FILE" 2>>"$LOG_FILE"
    else
        # Extract <body> with pup
        BODY=$(echo "$CLEANED_HTML" | pup 'body' 2>>"$LOG_FILE")
        if [ -z "$BODY" ]; then
            echo "Failed to extract <body> for $link" >> "$LOG_FILE"
            echo "$HTML" > "$OUTPUT_DIR/failed_$SUFFIX.html"
            # Fallback: Use html2text
            echo "$HTML" | html2text > "$OUTPUT_FILE" 2>>"$LOG_FILE"
        else
            echo "$BODY" | lynx -dump -force-html -stdin > "$OUTPUT_FILE" 2>>"$LOG_FILE"
        fi
    fi
    
    # Check if file is empty or contains 404
    if [ ! -s "$OUTPUT_FILE" ]; then
        echo "Empty or invalid output for $link" >> "$LOG_FILE"
        rm -f "$OUTPUT_FILE"
    elif grep -q "404 Not Found" "$OUTPUT_FILE"; then
        echo "Output for $link contains 404 error" >> "$LOG_FILE"
        rm -f "$OUTPUT_FILE"
    else
        show_saved "$OUTPUT_FILE"
    fi
done

# Clear progress bar and saved message lines
tput cup 0 0
tput el
tput cup 1 0
tput el
echo ""

if [ -s "$LOG_FILE" ]; then
    echo "Some errors occurred. See $LOG_FILE for details."
else
    rm -f "$LOG_FILE"
fi

# Strip any leftover .html/.htm suffixes from files in the output directory
for f in "$OUTPUT_DIR"/*; do
  [ -e "$f" ] || continue
  newname=$(echo "$f" | sed -e 's/\.html$//' -e 's/\.htm$//')
  [ "$f" != "$newname" ] && mv -- "$f" "$newname"
done

echo "Crawling complete. Output saved in $OUTPUT_DIR/"