To download the script, run the following from the console:

wget https://bash.commongrounds.cc/uploads/1748228826_bodygram.sh
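The script checks for curl, lynx, pup, tidy (HTML Tidy), html2text, and tput, and exits if any of them is missing. As a rough sketch for Debian/Ubuntu systems, the dependencies might be installed along these lines; the package names are assumptions and vary by distribution, and pup in particular may have to come from its upstream releases if your distribution does not package it (the script's own error message suggests golang-github-ericchiang-pup):

# Hypothetical install commands; adjust package names for your distribution
sudo apt install curl lynx html2text tidy ncurses-bin
sudo apt install golang-github-ericchiang-pup   # provides 'pup', if packaged; otherwise install pup from upstream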
The script itself:

#!/bin/bash
# webgram by Page Telegram Volunteer Services 2025
# Crawls text from the <body> of pages linked from the provided URL (one link level deep)
# Saves text to .txt files named after the link suffixes (without .html/.htm), with a simple progress bar
# Progress bar stays on one line, saved-file messages on another
# Supports local HTML files and the Wayback Machine for inaccessible URLs

# Check that required tools are installed
command -v curl >/dev/null 2>&1 || { echo "curl is required but not installed. Exiting."; exit 1; }
command -v lynx >/dev/null 2>&1 || { echo "lynx is required but not installed. Exiting."; exit 1; }
command -v pup >/dev/null 2>&1 || { echo "pup is required but not installed. Install golang-github-ericchiang-pup. Exiting."; exit 1; }
command -v tidy >/dev/null 2>&1 || { echo "tidy is required but not installed. Install htmltidy. Exiting."; exit 1; }
command -v html2text >/dev/null 2>&1 || { echo "html2text is required but not installed. Install html2text. Exiting."; exit 1; }
command -v tput >/dev/null 2>&1 || { echo "tput is required but not installed. Exiting."; exit 1; }

# Check that a URL or file is provided
if [ -z "$1" ]; then
    echo "Usage: $0 <website_url | local_html_file>"
    exit 1
fi

INPUT="$1"
WAYBACK=0

# Determine whether the input is a file or a URL
if [ -f "$INPUT" ]; then
    IS_FILE=1
    HTML=$(cat "$INPUT")
    # Base location is hardcoded for the all-creatures.org/mfz pages this script was written for
    BASE_URL="https://all-creatures.org/mfz"
    DOMAIN="all-creatures.org"
    BASE_PATH="/mfz"
    OUTPUT_DIR="webgram_output_${DOMAIN}_mfz"
    echo "Processing local file: $INPUT"
else
    IS_FILE=0
    URL="$INPUT"
    DOMAIN=$(echo "$URL" | awk -F/ '{print $3}')
    BASE_PATH="/mfz"  # Hardcoded to /mfz/ for all-creatures.org
    BASE_URL="https://$DOMAIN$BASE_PATH"
    OUTPUT_DIR="webgram_output_${DOMAIN}_mfz"
fi

LOG_FILE="webgram_errors.log"
mkdir -p "$OUTPUT_DIR"

# Initialize error log
: > "$LOG_FILE"

# Resolve a possibly relative URL against a base URL
resolve_url() {
    local base="$1"
    local rel="$2"
    if [[ "$rel" =~ ^https?:// ]]; then
        echo "$rel"
    elif [[ "$rel" =~ ^file:// ]] || [[ "$rel" =~ ^# ]]; then
        echo "invalid"
    else
        if [[ "$rel" =~ ^/ ]]; then
            echo "https://$DOMAIN$rel"
        else
            # Drop the last path segment of the base, then append the relative part
            echo "$base/$rel" | sed -e 's|//$|/|' -e 's|/\./|/|' -e 's|/\?[^/]*$|/|' -e "s|$|$rel|" | sed -e 's|//$|/|'
        fi
    fi
}

# Fetch a URL from the Wayback Machine
fetch_wayback() {
    local url="$1"
    local snapshot="20150401000000"  # Snapshot from April 1, 2015
    local wayback_url="https://web.archive.org/web/$snapshot/$url"
    curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 "$wayback_url" 2>>"$LOG_FILE"
}

# Display the progress bar on a fixed line
show_progress() {
    local current=$1
    local total=$2
    local width=$3
    local percent=$((current * 100 / total))
    local filled=$((current * width / total))
    local empty=$((width - filled))
    local bar=""
    for ((i=0; i<filled; i++)); do bar="${bar}#"; done
    for ((i=0; i<empty; i++)); do bar="${bar}-"; done
    # Save cursor, move to line 0, clear line, print progress, restore cursor
    tput sc
    tput cup 0 0
    tput el
    printf "Progress: [%s] %d%% (%d/%d)" "$bar" "$percent" "$current" "$total"
    tput rc
}

# Display the saved-file message on a separate line
show_saved() {
    local file="$1"
    # Save cursor, move to line 1, clear line, print message, restore cursor
    tput sc
    tput cup 1 0
    tput el
    printf "Saved text to %s" "$file"
    tput rc
    # Move to line 2 for subsequent output
    tput cup 2 0
}

# Fetch HTML if the input is a URL
if [ "$IS_FILE" -eq 0 ]; then
    echo "Checking $URL..."
    HTTP_STATUS=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 -o /dev/null -w "%{http_code}" "$URL" 2>>"$LOG_FILE")
    if [ "$HTTP_STATUS" != "200" ]; then
        echo "Input URL $URL returned HTTP $HTTP_STATUS (e.g., 404 Not Found). Attempting Wayback Machine..." | tee -a "$LOG_FILE"
        WAYBACK=1
        HTML=$(fetch_wayback "$URL")
        if [ -z "$HTML" ] || echo "$HTML" | grep -q "Wayback Machine"; then
            echo "Failed to fetch $URL from Wayback Machine. Please provide a local HTML file." | tee -a "$LOG_FILE"
            exit 1
        fi
    else
        echo "Fetching $URL..."
        HTML=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" "$URL" 2>>"$LOG_FILE")
        if [ -z "$HTML" ]; then
            echo "Failed to fetch $URL" >> "$LOG_FILE"
            exit 1
        fi
    fi
fi

# Extract links using pup and keep only those under the target domain/path
LINKS=$(echo "$HTML" | pup 'a[href] attr{href}' 2>>"$LOG_FILE" | while read -r href; do
    resolved=$(resolve_url "$BASE_URL" "$href")
    if [ "$resolved" != "invalid" ] && echo "$resolved" | grep -qE "^https?://$DOMAIN$BASE_PATH"; then
        echo "$resolved"
    fi
done | sort | uniq)

# Add the input URL explicitly if the input was not a file
if [ "$IS_FILE" -eq 0 ]; then
    LINKS=$(echo -e "$URL\n$LINKS" | sort | uniq)
fi

# Count total links for the progress bar
TOTAL_LINKS=$(echo "$LINKS" | wc -l)
CURRENT=0
BAR_WIDTH=50

if [ -z "$LINKS" ] || [ "$TOTAL_LINKS" -eq 0 ]; then
    echo "No valid links found. Check $LOG_FILE for errors." | tee -a "$LOG_FILE"
    exit 1
fi

echo "Found $TOTAL_LINKS links to process."

# Reserve space for the progress bar and saved-file message
echo -e "\n\n"

# Process each link
echo "$LINKS" | while read -r link; do
    # Skip empty or invalid links
    if [ -z "$link" ] || ! echo "$link" | grep -qE "^https?://"; then
        echo "Skipping invalid link: '$link'" >> "$LOG_FILE"
        continue
    fi

    # Check HTTP status (a failed Wayback fetch is treated as a 404)
    if [ "$WAYBACK" -eq 1 ]; then
        HTML=$(fetch_wayback "$link")
        HTTP_STATUS=$([ -z "$HTML" ] || echo "$HTML" | grep -q "Wayback Machine" && echo "404" || echo "200")
    else
        HTTP_STATUS=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 -o /dev/null -w "%{http_code}" "$link" 2>>"$LOG_FILE")
    fi
    if [ "$HTTP_STATUS" != "200" ]; then
        echo "Skipping $link (HTTP $HTTP_STATUS)" >> "$LOG_FILE"
        continue
    fi

    # Increment progress and update the bar
    ((CURRENT++))
    show_progress "$CURRENT" "$TOTAL_LINKS" "$BAR_WIDTH"

    # Derive the output filename from the link suffix, dropping query strings, fragments, and .html/.htm
    SUFFIX=$(echo "$link" | sed -e 's|.*/||' -e 's|?.*||' -e 's|#[^/]*$||' -e 's|\.html$||' -e 's|\.htm$||' -e 's|[^a-zA-Z0-9.]|-|g')
    if [ -z "$SUFFIX" ]; then
        SUFFIX="index"
    fi
    OUTPUT_FILE="$OUTPUT_DIR/$SUFFIX.txt"

    # Handle duplicate filenames
    if [ -f "$OUTPUT_FILE" ]; then
        COUNTER=1
        while [ -f "${OUTPUT_DIR}/${SUFFIX}-${COUNTER}.txt" ]; do
            ((COUNTER++))
        done
        OUTPUT_FILE="${OUTPUT_DIR}/${SUFFIX}-${COUNTER}.txt"
    fi

    # Download the HTML
    if [ "$WAYBACK" -eq 1 ]; then
        HTML=$(fetch_wayback "$link")
    else
        HTML=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 "$link" 2>>"$LOG_FILE")
    fi
    if [ -z "$HTML" ]; then
        echo "Failed to fetch HTML for $link" >> "$LOG_FILE"
        continue
    fi

    # Clean the HTML with tidy
    CLEANED_HTML=$(echo "$HTML" | tidy -q --show-warnings no --show-errors 0 --drop-empty-elements y --force-output y 2>>"$LOG_FILE")
    if [ -z "$CLEANED_HTML" ]; then
        echo "Failed to clean HTML for $link" >> "$LOG_FILE"
        echo "$HTML" > "$OUTPUT_DIR/failed_$SUFFIX.html"
        # Fallback: use html2text on the raw HTML
        echo "$HTML" | html2text > "$OUTPUT_FILE" 2>>"$LOG_FILE"
    else
        # Extract <body> with pup
        BODY=$(echo "$CLEANED_HTML" | pup 'body' 2>>"$LOG_FILE")
        if [ -z "$BODY" ]; then
            echo "Failed to extract <body> for $link" >> "$LOG_FILE"
            echo "$HTML" > "$OUTPUT_DIR/failed_$SUFFIX.html"
            # Fallback: use html2text on the raw HTML
            echo "$HTML" | html2text > "$OUTPUT_FILE" 2>>"$LOG_FILE"
        else
            echo "$BODY" | lynx -dump -force-html -stdin > "$OUTPUT_FILE" 2>>"$LOG_FILE"
        fi
    fi

    # Drop output that is empty or contains a 404 page
    if [ ! -s "$OUTPUT_FILE" ]; then
        echo "Empty or invalid output for $link" >> "$LOG_FILE"
        rm -f "$OUTPUT_FILE"
    elif grep -q "404 Not Found" "$OUTPUT_FILE"; then
        echo "Output for $link contains 404 error" >> "$LOG_FILE"
        rm -f "$OUTPUT_FILE"
    else
        show_saved "$OUTPUT_FILE"
    fi
done

# Clear the progress bar and saved-file message lines
tput cup 0 0
tput el
tput cup 1 0
tput el
echo ""

if [ -s "$LOG_FILE" ]; then
    echo "Some errors occurred. See $LOG_FILE for details."
else
    rm -f "$LOG_FILE"
fi

# Strip any leftover .html/.htm from text output filenames (scoped to the output directory)
for f in "$OUTPUT_DIR"/*.txt; do
    newname=$(echo "$f" | sed -e 's/\.html//g' -e 's/\.htm//g')
    [ "$f" != "$newname" ] && mv -- "$f" "$newname"
done

echo "Crawling complete. Output saved in $OUTPUT_DIR/"