#!/bin/bash
# Usage: ./mirror_site.sh <mode> <url> [output_dir]
# mode: "1" for single HTML page with associated images/documents, "A" for entire website/directory
# url: The URL to mirror (http, https, ftp, sftp)
# output_dir: Optional directory to save the mirrored content (default: ./mirror)
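#
# Examples (hypothetical URLs):
#   ./mirror_site.sh 1 https://example.com/page.html           # one page into ./mirror
#   ./mirror_site.sh A https://example.com ./archive           # whole site into ./archive
#   ./mirror_site.sh 1 sftp://user@example.com/docs/report.pdf # one file over sftp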
if [ $# -lt 2 ]; then
echo "Usage: $0 <mode> <url> [output_dir]"
echo "mode: '1' for single HTML page with images/documents, 'A' for entire website/directory"
exit 1
fi
mode="$1"
url="$2"
output_dir="${3:-mirror}"
# Current date and time in America/Chicago local time
archive_date=$(TZ="America/Chicago" date "+%Y-%m-%d %H:%M:%S %Z")
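# e.g. "2025-07-18 14:30:05 CDT" (CDT or CST depending on daylight saving)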
# Check for required dependencies
command -v wget >/dev/null 2>&1 || { echo "wget is required"; exit 1; }
command -v python3 >/dev/null 2>&1 || { echo "python3 is required for HTML processing"; exit 1; }
command -v sed >/dev/null 2>&1 || { echo "sed is required for fallback processing"; exit 1; }
# Create output directory and images, files, and css directories inside it
mkdir -p "$output_dir" "$output_dir/images" "$output_dir/files" "$output_dir/css"
cd "$output_dir" || exit 1
# Create external custom CSS file
cat > css/custom.css << 'EOF'
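/* Amber-on-black monospace theme forced onto every archived page */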
html, body {
font-family: monospace !important;
font-size: 12pt !important;
color: #ffbf00 !important;
background-color: black !important;
padding: 1em !important;
margin: 0 auto !important;
max-width: 100% !important;
}
@media (min-width: 768px) {
body {
max-width: 80% !important;
}
}
a {
display: inline-block !important;
color: #ffbf00 !important;
background-color: black !important;
padding: 0.2em 0.5em !important;
border: 1px solid #ffbf00 !important;
border-radius: 4px !important;
text-decoration: none !important;
transition: background-color 0.3s, color 0.3s !important;
margin: 0.1em !important;
}
a:hover {
color: black !important;
background-color: #ffbf00 !important;
}
.ad, [class*="ad"], [id*="ad"], [class*="banner"], [id*="banner"] {
display: none !important;
}
img {
max-width: 100% !important;
height: auto !important;
}
EOF
protocol="${url%%://*}"
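# e.g. url="https://example.com/page.html" gives protocol="https"; a URL
# without "://" leaves protocol set to the whole URL, which lands in the
# "Unsupported protocol" branch below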
case "$mode" in
1)
if [[ "$protocol" == "http" || "$protocol" == "https" ]]; then
# Download only the single HTML page
echo "Downloading HTML from $url"
wget --adjust-extension --no-check-certificate --timeout=10 --tries=3 \
--output-document=index.html "$url" 2>> wget.log
# wget -O creates the file even on failure, so test for non-empty, not just existing
if [ ! -s "index.html" ]; then
echo "Failed to download index.html, check wget.log"
exit 1
fi
# Save raw HTML for debugging
cp index.html index.html.raw
# Extract and download images and documents
mkdir -p temp
echo "Extracting resource URLs from index.html"
grep -oE '(href|src)="[^"]*\.(png|jpg|jpeg|gif|pdf|csv|xls|xlsx)"' index.html | \
sed -E 's/(href|src)="([^"]*)"/\2/' | sort -u > temp/resource_urls.txt
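# Example: a page containing <img src="/img/logo.png"> and
# <a href="/docs/report%20Q1.pdf"> produces a resource_urls.txt of:
#   /docs/report%20Q1.pdf
#   /img/logo.png
# (the %20 is why each URL is decoded before downloading)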
# Download each resource to temp directory
while IFS= read -r resource; do
# Decode URL to handle encoded characters
decoded_resource=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))' "$resource")
# Convert relative URLs to absolute if needed
if [[ ! "$decoded_resource" =~ ^https?:// ]]; then
base_url=$(echo "$url" | sed -E 's|(https?://[^/]+).*|\1|')
resource_url="${base_url}${decoded_resource}"
else
resource_url="$decoded_resource"
fi
# Extract filename from decoded URL
filename=$(basename "$decoded_resource")
echo "Downloading resource: $resource_url as $filename"
wget --no-check-certificate --timeout=10 --tries=3 \
--output-document="temp/$filename" "$resource_url" 2>> wget.log
done < temp/resource_urls.txt
# Move images to ./images
find ./temp -maxdepth 1 -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
-exec mv {} ./images/ \;
# Move documents to ./files
find ./temp -maxdepth 1 -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
-exec mv {} ./files/ \;
# Remove temporary directory and any other subdirectories
rm -rf temp
find . -mindepth 1 -maxdepth 1 -type d ! -name images ! -name files ! -name css -exec rm -rf {} +
elif [[ "$protocol" == "ftp" ]]; then
wget --no-check-certificate --timeout=10 --tries=3 --directory-prefix=./files "$url" 2>> wget.log
elif [[ "$protocol" == "sftp" ]]; then
command -v lftp >/dev/null 2>&1 || { echo "lftp is required for sftp"; exit 1; }
hostpath="${url#*://}"
host="${hostpath%%/*}"
path="${hostpath#"$host"}" # keeps the leading "/", if present
path="${path:-/}"
if [ "$path" == "/" ]; then
echo "For single mode with sftp, provide a file path in the URL"
exit 1
fi
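# e.g. sftp://user@example.com/docs/report.pdf is saved as ./files/report.pdf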
lftp -c "open $protocol://$host; get $path -o ./files/$(basename "$path")" 2>> lftp.log
else
echo "Unsupported protocol: $protocol"
exit 1
fi
;;
A)
if [[ "$protocol" == "http" || "$protocol" == "https" || "$protocol" == "ftp" ]]; then
# Mirror entire site with images, PDFs, CSVs, and Excel files
echo "Mirroring entire site from $url"
wget --mirror --convert-links --adjust-extension --page-requisites \
--directory-prefix=. --accept '*.html,*.htm,*.png,*.jpg,*.jpeg,*.gif,*.pdf,*.csv,*.xls,*.xlsx' \
--reject-regex='(ad|advert|banner)' --no-check-certificate --timeout=10 --tries=3 "$url" 2>> wget.log
# Move images to ./images
find . -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
-not -path "./images/*" -exec mv {} ./images/ \;
# Move documents to ./files
find . -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
-not -path "./files/*" -exec mv {} ./files/ \;
elif [[ "$protocol" == "sftp" ]]; then
command -v lftp >/dev/null 2>&1 || { echo "lftp is required for sftp"; exit 1; }
hostpath="${url#*://}"
host="${hostpath%%/*}"
path="${hostpath#"$host"}"
path="${path:-/}"
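# mirror -e deletes local files that no longer exist remotely, -P10 transfers
# up to 10 files in parallel, and --exclude skips remote paths matching "ad"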
lftp -c "open $protocol://$host; mirror -e -P10 --exclude ad $path ." 2>> lftp.log
# Move images to ./images
find . -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
-not -path "./images/*" -exec mv {} ./images/ \;
# Move documents to ./files
find . -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
-not -path "./files/*" -exec mv {} ./files/ \;
else
echo "Unsupported protocol: $protocol"
exit 1
fi
;;
*)
echo "Invalid mode: $mode. Use '1' or 'A'."
exit 1
;;
esac
# Create a virtual environment and install beautifulsoup4
venv_dir=".venv"
if [ ! -d "$venv_dir" ]; then
echo "Creating virtual environment in $venv_dir"
python3 -m venv "$venv_dir" || { echo "Failed to create virtual environment"; exit 1; }
fi
source "$venv_dir/bin/activate"
pip install beautifulsoup4 lxml >/dev/null 2> pip.log || { echo "Failed to install beautifulsoup4 or lxml, check pip.log"; deactivate; exit 1; }
# Post-process HTML files with Python and BeautifulSoup
cat > process_html.py << 'EOF'
#!/usr/bin/env python3
import sys
import os
import logging
from bs4 import BeautifulSoup, Comment, NavigableString
from urllib.parse import unquote
# Set up logging
logging.basicConfig(filename='debug_html.log', level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')
# Allowed tags
ALLOWED_TAGS = {
'div', 'tt', 'p', 'a', 'li', 'ul', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'img', 'span', 'strong', 'em', 'b', 'i', 'br', 'hr', 'blockquote'
}
# Allowed attributes
ALLOWED_ATTRS = {'a': ['href'], 'img': ['src', 'alt', 'width', 'height']}
# Placeholder image patterns
PLACEHOLDER_PATTERNS = ['404', 'broken', 'missing', 'placeholder', 'default']
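# Example: src values like "img/404.png" or "/static/placeholder-300x200.jpg"
# contain one of these substrings, so their <img> tags get dropped below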
def clean_attributes(tag):
"""Remove unwanted attributes, keeping only allowed ones."""
attrs = dict(tag.attrs)
tag.attrs = {}
allowed = ALLOWED_ATTRS.get(tag.name, [])
for attr in allowed:
if attr in attrs:
tag[attr] = attrs[attr]
logging.debug(f"Cleaned attributes for tag {tag.name}: {tag.attrs}")
def process_html(input_file, base_url, archive_date):
"""Process HTML file to keep only allowed tags, update paths, and add footer."""
try:
# Read input HTML
logging.debug(f"Reading input file: {input_file}")
with open(input_file, 'r', encoding='utf-8') as f:
content = f.read()
if not content.strip():
logging.error("Input file is empty")
raise ValueError("Input file is empty")
soup = BeautifulSoup(content, 'lxml')
logging.debug("Parsed HTML with BeautifulSoup using lxml parser")
# Extract title and description
        title_tag = soup.find('title')
        description_tag = soup.find('meta', attrs={'name': 'description'})
        logging.debug(f"Extracted title: {title_tag}")
        logging.debug(f"Extracted description: {description_tag}")
        # Create new HTML structure
        new_soup = BeautifulSoup('<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body></body></html>', 'lxml')
        # Move the original title/description tags into the new head; re-parsing
        # their string form with lxml would wrap them in another <html> document
        if title_tag is None:
            title_tag = new_soup.new_tag('title')
            title_tag.string = 'Untitled'
        new_soup.head.append(title_tag)
        if description_tag:
            new_soup.head.append(description_tag)
css_link = new_soup.new_tag('link', rel='stylesheet', href='./css/custom.css')
new_soup.head.append(css_link)
logging.debug("Added CSS link to head")
        # Sanitize the original document in place, then move it into the new body.
        # (Appending allowed tags one by one while walking find_all() would rip
        # descendants out of parents already moved and flatten the document.)
        body = new_soup.body
        source_root = soup.body if soup.body else soup
        # Strip HTML comments first
        for comment in source_root.find_all(string=lambda s: isinstance(s, Comment)):
            comment.extract()
        logging.debug("Removed HTML comments")
        # Drop script-like tags outright; unwrap every other disallowed tag so
        # its children and text survive (find_all returns a snapshot list, so
        # the tree can be edited while iterating)
        for tag in source_root.find_all(True):
            if tag.parent is None:
                continue  # already removed along with a decomposed ancestor
            if tag.name in ('script', 'style', 'noscript', 'iframe'):
                logging.debug(f"Removing tag and its contents: {tag.name}")
                tag.decompose()
            elif tag.name not in ALLOWED_TAGS:
                logging.debug(f"Unwrapping disallowed tag: {tag.name}")
                tag.unwrap()
        # Clean attributes and rewrite resource paths on the surviving tags
        for element in source_root.find_all(True):
            clean_attributes(element)
            if element.name == 'img' and 'src' in element.attrs:
                src = element['src']
                # Decode URL-encoded src
                decoded_src = unquote(src)
                logging.debug(f"Decoded image src: {src} -> {decoded_src}")
                if any(pattern in decoded_src.lower() for pattern in PLACEHOLDER_PATTERNS):
                    logging.debug(f"Removing placeholder image: {decoded_src}")
                    element.decompose()
                    continue
                filename = os.path.basename(decoded_src)
                local_path = f'./images/{filename}'
                if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
                    element['src'] = local_path
                    logging.debug(f"Updated image src to: {local_path}")
                else:
                    logging.debug(f"Removing non-existent or empty image: {local_path}")
                    element.decompose()
            elif element.name == 'a' and 'href' in element.attrs:
                # Decode URL-encoded href
                decoded_href = unquote(element['href'])
                if decoded_href.lower().endswith(('.pdf', '.csv', '.xls', '.xlsx')):
                    filename = os.path.basename(decoded_href)
                    element['href'] = f'./files/{filename}'
                    logging.debug(f"Updated link href to: ./files/{filename}")
                else:
                    element['href'] = decoded_href
                    logging.debug(f"Kept decoded href: {decoded_href}")
        # Move the sanitized content into the new body
        for child in list(source_root.children):
            body.append(child)
        logging.debug("Moved sanitized content into new body")
# Add footer
footer = new_soup.new_tag('footer')
footer_p = new_soup.new_tag('p')
footer_p.string = f"Archived by AmFile.org Script on {archive_date}"
footer.append(footer_p)
body.append(footer)
logging.debug(f"Added footer: Archived by AmFile.org Script on {archive_date}")
# Write output HTML
output_file = input_file
with open(output_file, 'w', encoding='utf-8') as f:
            f.write(new_soup.prettify())
logging.debug(f"Wrote processed HTML to: {output_file}")
except Exception as e:
logging.error(f"Error processing {input_file}: {str(e)}")
raise
if __name__ == '__main__':
if len(sys.argv) != 4:
print("Usage: python3 process_html.py <input_file> <base_url> <archive_date>")
sys.exit(1)
process_html(sys.argv[1], sys.argv[2], sys.argv[3])
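# Standalone usage (hypothetical arguments):
#   python3 process_html.py index.html https://example.com "2025-07-18 14:30:05 CDT"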
EOF
# Make Python script executable
chmod +x process_html.py
# Post-process HTML files with Python
for file in *.html *.htm; do
if [ -f "$file" ]; then
echo "Processing file: $file"
cp "$file" "${file}.backup" # Backup original file
"$venv_dir/bin/python" process_html.py "$file" "$url" "$archive_date" 2>> debug_html.log
if [ $? -ne 0 ]; then
echo "Python processing failed for $file, falling back to sed"
temp_file=$(mktemp)
title=$(grep -oE '<title>[^<]*</title>' "$file" || echo "<title>Untitled</title>")
description=$(grep -oE '<meta[^>]*name=["'\'']description["'\''][^>]*>' "$file" || echo "")
echo "<!DOCTYPE html><html><head><meta charset=\"UTF-8\">$title$description<link rel=\"stylesheet\" href=\"./css/custom.css\"></head><body>" > "$temp_file"
# Keep only regions that open and close an allowed tag (ERE is required for the alternation)
sed -nE '/<(div|tt|p|a|li|ul|ol|h1|h2|h3|h4|h5|h6|img|span|strong|em|b|i|br|hr|blockquote)[^>]*>/,/<\/(div|tt|p|a|li|ul|ol|h1|h2|h3|h4|h5|h6|img|span|strong|em|b|i|br|hr|blockquote)>/p' "$file" | \
sed -E 's/\s+style\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
sed -E 's/\s+on[a-zA-Z]+\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
sed -E 's/\s+data-[a-zA-Z0-9_-]+\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
sed -E '/<img/s/\s+src\s*=\s*(["'\''])([^"'\'']*(404|broken|missing|placeholder|default)\.(png|jpg|jpeg|gif))\1//g' | \
sed -E '/<img/!s/\s+src\s*=\s*(["'\''][^"'\'']*["'\''])//g' | \
sed -E '/<a/!s/\s+href\s*=\s*(["'\''][^"'\'']*["'\''])//g' | \
sed -E 's/(href|src)\s*=\s*(["'\''])([^"'\'']*\.(png|jpg|jpeg|gif))\2/\1=\2.\/images\/\3\2/g' | \
sed -E 's/(href|src)\s*=\s*(["'\''])([^"'\'']*\.(pdf|csv|xls|xlsx))\2/\1=\2.\/files\/\3\2/g' >> "$temp_file"
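# Example: the pipeline above turns
#   <p style="color:red" onclick="track()" data-id="7">Hi</p>
# into
#   <p>Hi</p>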
while IFS= read -r img_line; do
src=$(echo "$img_line" | sed -E 's/.*src\s*=\s*(["'\''])([^"'\'']*)\1.*/\2/')
decoded_src=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))' "$src")
filename=$(basename "$decoded_src")
if [ ! -s "./images/$filename" ]; then
echo "Removing broken image: ./images/$filename"
sed -i -E "/$(echo "$img_line" | sed -e 's/[]\/$*.^|[]/\\&/g')/d" "$temp_file"
else
sed -i -E "s|$src|./images/$filename|g" "$temp_file"
fi
done < <(grep -oE '<img\b[^>]*src\s*=\s*["'\''][^"'\'']*\.(png|jpg|jpeg|gif)["'\''][^>]*>' "$temp_file")
echo "<footer><p>Archived by AmFile.org Script on $archive_date</p></footer>" >> "$temp_file"
echo "</body></html>" >> "$temp_file"
mv "$temp_file" "$file"
mv "${file}.backup" "${file}.failed" # Keep backup for debugging
else
rm "${file}.backup" # Remove backup if successful
fi
fi
done
# Clean up Python script and virtual environment
rm -f process_html.py
deactivate
rm -rf "$venv_dir"
echo "Mirroring complete. Content saved in $output_dir/index.html, images in $output_dir/images, documents in $output_dir/files, CSS in $output_dir/css/custom.css, custom formatting enforced, broken/placeholder images removed, all JavaScript/AJAX stripped, and all original CSS removed."