Try wget https://bash.commongrounds.cc/uploads/1748228826_bodygram.sh from the console
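A minimal way to fetch and run it (a sketch; the output filename and the example target URL below are placeholders, and the script itself is hardcoded for the all-creatures.org/mfz pages it was written against):

  wget -O bodygram.sh https://bash.commongrounds.cc/uploads/1748228826_bodygram.sh
  chmod +x bodygram.sh
  ./bodygram.sh https://all-creatures.org/mfz/index.html
  ./bodygram.sh saved-page.html    # or point it at a locally saved HTML file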
#!/bin/bash
# webgram by Page Telegram Volunteer Services 2025
# Script to crawl the <body> text of the provided page and of the pages it links to (one link level deep)
# Saves text to txt files based on link suffixes (without .html/.htm) with a simple progress bar
# Progress bar stays on one line, saved file messages on another
# Supports local HTML files and Wayback Machine for inaccessible URLs
# Check if required tools are installed
command -v curl >/dev/null 2>&1 || { echo "curl is required but not installed. Exiting."; exit 1; }
command -v lynx >/dev/null 2>&1 || { echo "lynx is required but not installed. Exiting."; exit 1; }
command -v pup >/dev/null 2>&1 || { echo "pup is required but not installed. Install golang-github-ericchiang-pup. Exiting."; exit 1; }
command -v tidy >/dev/null 2>&1 || { echo "tidy is required but not installed. Install htmltidy. Exiting."; exit 1; }
command -v html2text >/dev/null 2>&1 || { echo "html2text is required but not installed. Install html2text. Exiting."; exit 1; }
command -v tput >/dev/null 2>&1 || { echo "tput is required but not installed. Exiting."; exit 1; }
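# On Debian/Ubuntu the dependencies can usually be pulled in with something like the
# line below (package names are a best guess and vary by distribution; pup may have
# to come from its upstream releases or the golang-github-ericchiang-pup package):
#   sudo apt-get install curl lynx tidy html2text ncurses-bin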
# Check if URL or file is provided
if [ -z "$1" ]; then
echo "Usage: $0 <website_url | local_html_file>"
exit 1
fi
INPUT="$1"
WAYBACK=0
# Determine if input is a file or URL
if [ -f "$INPUT" ]; then
IS_FILE=1
HTML=$(cat "$INPUT")
BASE_URL="https://all-creatures.org/mfz"
DOMAIN="all-creatures.org"
BASE_PATH="/mfz"
OUTPUT_DIR="webgram_output_${DOMAIN}_mfz"
echo "Processing local file: $INPUT"
else
IS_FILE=0
URL="$INPUT"
DOMAIN=$(echo "$URL" | awk -F/ '{print $3}')
BASE_PATH="/mfz" # Hardcode to /mfz/ for all-creatures.org
BASE_URL="https://$DOMAIN$BASE_PATH"
OUTPUT_DIR="webgram_output_${DOMAIN}_mfz"
fi
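# Example: for a URL such as "https://all-creatures.org/mfz/index.html" the third
# "/"-separated field is "all-creatures.org", so BASE_URL becomes "https://all-creatures.org/mfz".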
LOG_FILE="webgram_errors.log"
mkdir -p "$OUTPUT_DIR"
# Initialize error log
: > "$LOG_FILE"
# Function to resolve relative URLs
resolve_url() {
local base="$1"
local rel="$2"
if [[ "$rel" =~ ^https?:// ]]; then
echo "$rel"
elif [[ "$rel" =~ ^file:// ]] || [[ "$rel" =~ ^# ]]; then
echo "invalid"
else
if [[ "$rel" =~ ^/ ]]; then
echo "https://$DOMAIN$rel"
else
echo "$base/$rel" | sed -e 's|//$|/|' -e 's|/\./|/|' -e 's|/\?[^/]*$|/|' -e "s|$|$rel|" | sed -e 's|//$|/|'
fi
fi
}
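# Illustrative calls (outputs assume the hardcoded all-creatures.org base):
#   resolve_url "https://all-creatures.org/mfz" "page.htm"   -> https://all-creatures.org/mfz/page.htm
#   resolve_url "https://all-creatures.org/mfz" "/mfz/a.htm" -> https://all-creatures.org/mfz/a.htm
#   resolve_url "https://all-creatures.org/mfz" "#top"       -> invalid (fragment-only links are dropped)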
# Function to fetch from Wayback Machine
fetch_wayback() {
local url="$1"
local snapshot="20150401000000" # Snapshot from April 1, 2015
local wayback_url="https://web.archive.org/web/$snapshot/$url"
curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 "$wayback_url" 2>>"$LOG_FILE"
}
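# Possible refinement (untested sketch): rather than the fixed 2015 timestamp, the
# Wayback availability API can report the closest capture for a URL, e.g.
#   curl -s "https://archive.org/wayback/available?url=$url&timestamp=20150401"
# returns JSON whose archived_snapshots.closest.url field (parseable with jq)
# points at the nearest snapshot to request instead.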
# Function to display progress bar on a fixed line
show_progress() {
local current=$1
local total=$2
local width=$3
local percent=$((current * 100 / total))
local filled=$((current * width / total))
local empty=$((width - filled))
local bar=""
for ((i=0; i<filled; i++)); do bar="${bar}#"; done
for ((i=0; i<empty; i++)); do bar="${bar}-"; done
# Save cursor, move to line 0, clear line, print progress, restore cursor
tput sc
tput cup 0 0
tput el
printf "Progress: [${bar}] %d%% (%d/%d)" "$percent" "$current" "$total"
tput rc
}
# Function to display saved file message on a separate line
show_saved() {
local file="$1"
# Save cursor, move to line 1, clear line, print message, restore cursor
tput sc
tput cup 1 0
tput el
printf "Saved text to %s" "$file"
tput rc
# Move to line 2 for subsequent output
tput cup 2 0
}
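# Rough standalone demo of the two-line layout (commented out; assumes a terminal
# where tput cursor addressing works):
#   for i in 1 2 3 4 5; do
#     show_progress "$i" 5 50
#     show_saved "demo_$i.txt"
#     sleep 1
#   done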
# Fetch HTML if URL
if [ "$IS_FILE" -eq 0 ]; then
echo "Checking $URL..."
HTTP_STATUS=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 -o /dev/null -w "%{http_code}" "$URL" 2>>"$LOG_FILE")
if [ "$HTTP_STATUS" != "200" ]; then
echo "Input URL $URL returned HTTP $HTTP_STATUS (e.g., 404 Not Found). Attempting Wayback Machine..." | tee -a "$LOG_FILE"
WAYBACK=1
HTML=$(fetch_wayback "$URL")
if [ -z "$HTML" ] || echo "$HTML" | grep -q "Wayback Machine"; then
echo "Failed to fetch $URL from Wayback Machine. Please provide a local HTML file." | tee -a "$LOG_FILE"
exit 1
fi
else
echo "Fetching $URL..."
HTML=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" "$URL" 2>>"$LOG_FILE")
if [ -z "$HTML" ]; then
echo "Failed to fetch $URL" >> "$LOG_FILE"
exit 1
fi
fi
fi
# Extract links using pup
LINKS=$(echo "$HTML" | pup 'a[href] attr{href}' 2>>"$LOG_FILE" | while read -r href; do
resolved=$(resolve_url "$BASE_URL" "$href")
if [ "$resolved" != "invalid" ] && echo "$resolved" | grep -qE "^https?://$DOMAIN$BASE_PATH"; then
echo "$resolved"
fi
done | sort | uniq)
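# Example of what the pipeline above does for one anchor: given
#   <a href="letters.html">Letters</a>
# pup 'a[href] attr{href}' prints "letters.html", resolve_url expands it to
# "https://all-creatures.org/mfz/letters.html", and the grep keeps it because it
# stays under https://all-creatures.org/mfz; off-site and fragment links are dropped.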
# Add the input URL explicitly if not a file
if [ "$IS_FILE" -eq 0 ]; then
LINKS=$(echo -e "$URL\n$LINKS" | sort | uniq)
fi
# Count total links for progress bar
TOTAL_LINKS=$(echo "$LINKS" | grep -c .) # count non-empty lines only
CURRENT=0
BAR_WIDTH=50
if [ "$TOTAL_LINKS" -eq 0 ]; then
echo "No valid links found. Check $LOG_FILE for errors." | tee -a "$LOG_FILE"
exit 1
fi
echo "Found $TOTAL_LINKS links to process."
# Reserve space for progress bar and saved message
echo -e "\n\n"
# Process each link
echo "$LINKS" | while read -r link; do
# Skip empty or invalid links
if [ -z "$link" ] || ! echo "$link" | grep -qE "^https?://"; then
echo "Skipping invalid link: '$link'" >> "$LOG_FILE"
continue
fi
# Check HTTP status
if [ "$WAYBACK" -eq 1 ]; then
HTML=$(fetch_wayback "$link")
# Treat an empty response or a Wayback error page as HTTP 404, otherwise assume 200
HTTP_STATUS=$({ [ -z "$HTML" ] || echo "$HTML" | grep -q "Wayback Machine"; } && echo "404" || echo "200")
else
HTTP_STATUS=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 -o /dev/null -w "%{http_code}" "$link" 2>>"$LOG_FILE")
fi
if [ "$HTTP_STATUS" != "200" ]; then
echo "Skipping $link (HTTP $HTTP_STATUS)" >> "$LOG_FILE"
continue
fi
# Increment progress
((CURRENT++))
# Update progress bar
show_progress "$CURRENT" "$TOTAL_LINKS" "$BAR_WIDTH"
# Get the suffix for filename, removing .html/.htm
SUFFIX=$(echo "$link" | sed -e 's|.*/||' -e 's|[?].*||' -e 's|#[^/]*$||' -e 's|\.html$||' -e 's|\.htm$||' -e 's|[^a-zA-Z0-9.]|-|g')
if [ -z "$SUFFIX" ]; then
SUFFIX="index"
fi
OUTPUT_FILE="$OUTPUT_DIR/$SUFFIX.txt"
# Handle duplicate filenames
if [ -f "$OUTPUT_FILE" ]; then
COUNTER=1
while [ -f "${OUTPUT_DIR}/${SUFFIX}-${COUNTER}.txt" ]; do
((COUNTER++))
done
OUTPUT_FILE="${OUTPUT_DIR}/${SUFFIX}-${COUNTER}.txt"
fi
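# Example: "https://all-creatures.org/mfz/letters.html" yields SUFFIX "letters",
# so its text goes to $OUTPUT_DIR/letters.txt; a second link mapping to the same
# suffix would be written to letters-1.txt, then letters-2.txt, and so on.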
# Download HTML
if [ "$WAYBACK" -eq 1 ]; then
HTML=$(fetch_wayback "$link")
else
HTML=$(curl -s -L -A "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" --connect-timeout 10 --retry 2 "$link" 2>>"$LOG_FILE")
fi
if [ -z "$HTML" ]; then
echo "Failed to fetch HTML for $link" >> "$LOG_FILE"
continue
fi
# Clean HTML with tidy
CLEANED_HTML=$(echo "$HTML" | tidy -q --show-warnings no --show-errors 0 --drop-empty-elements y --force-output y 2>>"$LOG_FILE")
if [ -z "$CLEANED_HTML" ]; then
echo "Failed to clean HTML for $link" >> "$LOG_FILE"
echo "$HTML" > "$OUTPUT_DIR/failed_$SUFFIX.html"
# Fallback: Use html2text
echo "$HTML" | html2text > "$OUTPUT_FILE" 2>>"$LOG_FILE"
else
# Extract <body> with pup
BODY=$(echo "$CLEANED_HTML" | pup 'body' 2>>"$LOG_FILE")
if [ -z "$BODY" ]; then
echo "Failed to extract <body> for $link" >> "$LOG_FILE"
echo "$HTML" > "$OUTPUT_DIR/failed_$SUFFIX.html"
# Fallback: Use html2text
echo "$HTML" | html2text > "$OUTPUT_FILE" 2>>"$LOG_FILE"
else
echo "$BODY" | lynx -dump -force-html -stdin > "$OUTPUT_FILE" 2>>"$LOG_FILE"
fi
fi
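# The same extraction chain can be tried by hand on a single page, e.g.:
#   curl -sL "$link" | tidy -q --show-warnings no --force-output y 2>/dev/null \
#     | pup 'body' | lynx -dump -force-html -stdin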
# Check if file is empty or contains 404
if [ ! -s "$OUTPUT_FILE" ]; then
echo "Empty or invalid output for $link" >> "$LOG_FILE"
rm -f "$OUTPUT_FILE"
elif grep -q "404 Not Found" "$OUTPUT_FILE"; then
echo "Output for $link contains 404 error" >> "$LOG_FILE"
rm -f "$OUTPUT_FILE"
else
show_saved "$OUTPUT_FILE"
fi
done
# Clear progress bar and saved message lines
tput cup 0 0
tput el
tput cup 1 0
tput el
echo ""
if [ -s "$LOG_FILE" ]; then
echo "Some errors occurred. See $LOG_FILE for details."
else
rm -f "$LOG_FILE"
fi
# Strip stray .html/.htm extensions from files left in the output directory
for f in "$OUTPUT_DIR"/*; do
newname=$(echo "$f" | sed -e 's/\.html$//' -e 's/\.htm$//')
[ "$f" != "$newname" ] && mv -- "$f" "$newname"
done
echo "Crawling complete. Output saved in $OUTPUT_DIR/"