To fetch the script, run the following from the console:

wget https://bash.commongrounds.cc/uploads/1752820094_wwow.sh
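Once downloaded, the script takes a mode ('1' for a single page, 'A' for a whole site), a URL, and an optional output directory, matching the Usage block at the top of the script. A couple of hypothetical invocations (example.com and the directory name are placeholders):

# Archive one page plus its images/documents into ./mirror (the default)
bash 1752820094_wwow.sh 1 https://example.com/page.html

# Mirror an entire site into ./site-archive
bash 1752820094_wwow.sh A https://example.com site-archive

The full script: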
#!/bin/bash
# Usage: ./mirror_site.sh <mode> <url> [output_dir]
# mode: "1" for single HTML page with associated images/documents, "A" for entire website/directory
# url: The URL to mirror (http, https, ftp, sftp)
# output_dir: Optional directory to save the mirrored content (default: ./mirror)

if [ $# -lt 2 ]; then
    echo "Usage: $0 <mode> <url> [output_dir]"
    echo "mode: '1' for single HTML page with images/documents, 'A' for entire website/directory"
    exit 1
fi

mode="$1"
url="$2"
output_dir="${3:-mirror}"

# Current date and time in CDT
archive_date=$(TZ="America/Chicago" date "+%Y-%m-%d %H:%M:%S %Z")

# Check for required dependencies
command -v wget >/dev/null 2>&1 || { echo "wget is required"; exit 1; }
command -v python3 >/dev/null 2>&1 || { echo "python3 is required for HTML processing"; exit 1; }
command -v sed >/dev/null 2>&1 || { echo "sed is required for fallback processing"; exit 1; }

# Create output directory and images, files, and css directories inside it
mkdir -p "$output_dir" "$output_dir/images" "$output_dir/files" "$output_dir/css"
cd "$output_dir" || exit 1

# Create external custom CSS file
cat > css/custom.css << 'EOF'
html, body {
    font-family: monospace !important;
    font-size: 12pt !important;
    color: #ffbf00 !important;
    background-color: black !important;
    padding: 1em !important;
    margin: 0 auto !important;
    max-width: 100% !important;
}
@media (min-width: 768px) {
    body { max-width: 80% !important; }
}
a {
    display: inline-block !important;
    color: #ffbf00 !important;
    background-color: black !important;
    padding: 0.2em 0.5em !important;
    border: 1px solid #ffbf00 !important;
    border-radius: 4px !important;
    text-decoration: none !important;
    transition: background-color 0.3s, color 0.3s !important;
    margin: 0.1em !important;
}
a:hover {
    color: black !important;
    background-color: #ffbf00 !important;
}
.ad, [class*="ad"], [id*="ad"], [class*="banner"], [id*="banner"] {
    display: none !important;
}
img {
    max-width: 100% !important;
    height: auto !important;
}
EOF

protocol="${url%%://*}"

case "$mode" in
    1)
        if [[ "$protocol" == "http" || "$protocol" == "https" ]]; then
            # Download only the single HTML page
            echo "Downloading HTML from $url"
            wget --adjust-extension --no-check-certificate --timeout=10 --tries=3 \
                --level=1 --output-document=index.html "$url" 2>> wget.log
            if [ ! -f "index.html" ]; then
                echo "Failed to download index.html, check wget.log"
                exit 1
            fi

            # Save raw HTML for debugging
            cp index.html index.html.raw

            # Extract and download images and documents
            mkdir -p temp
            echo "Extracting resource URLs from index.html"
            grep -oE '(href|src)="[^"]*\.(png|jpg|jpeg|gif|pdf|csv|xls|xlsx)"' index.html | \
                sed -E 's/(href|src)="([^"]*)"/\2/' | sort -u > temp/resource_urls.txt

            # Download each resource to temp directory
            while IFS= read -r resource; do
                # Decode URL to handle encoded characters (passed via argv so quotes in URLs are safe)
                decoded_resource=$(python3 -c "import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))" "$resource")
                # Convert root-relative URLs to absolute if needed
                if [[ ! "$decoded_resource" =~ ^https?:// ]]; then
                    base_url=$(echo "$url" | sed -E 's|(https?://[^/]+).*|\1|')
                    resource_url="${base_url}${decoded_resource}"
                else
                    resource_url="$decoded_resource"
                fi
                # Extract filename from decoded URL
                filename=$(basename "$decoded_resource")
                echo "Downloading resource: $resource_url as $filename"
                wget --no-check-certificate --timeout=10 --tries=3 \
                    --output-document="temp/$filename" "$resource_url" 2>> wget.log
            done < temp/resource_urls.txt

            # Move images to ./images
            find ./temp -maxdepth 1 -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
                -exec mv {} ./images/ \;
            # Move documents to ./files
            find ./temp -maxdepth 1 -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
                -exec mv {} ./files/ \;

            # Remove temporary directory and any other subdirectories
            rm -rf temp
            find . -mindepth 1 -maxdepth 1 -type d ! -name images ! -name files ! -name css -exec rm -rf {} +
        elif [[ "$protocol" == "ftp" ]]; then
            wget --no-check-certificate --timeout=10 --tries=3 --directory-prefix=./files "$url" 2>> wget.log
        elif [[ "$protocol" == "sftp" ]]; then
            command -v lftp >/dev/null 2>&1 || { echo "lftp is required for sftp"; exit 1; }
            hostpath="${url#*://}"
            host="${hostpath%%/*}"
            path="${hostpath#"$host"}"   # keeps its leading "/" when a path is present
            if [ -z "$path" ] || [ "$path" == "/" ]; then
                echo "For single mode with sftp, provide a file path in the URL"
                exit 1
            fi
            lftp -c "open $protocol://$host; get $path -o ./files/$(basename "$path")" 2>> lftp.log
        else
            echo "Unsupported protocol: $protocol"
            exit 1
        fi
        ;;
    A)
        if [[ "$protocol" == "http" || "$protocol" == "https" || "$protocol" == "ftp" ]]; then
            # Mirror entire site with images, PDFs, CSVs, and Excel files
            echo "Mirroring entire site from $url"
            wget --mirror --convert-links --adjust-extension --page-requisites \
                --directory-prefix=. --accept '*.html,*.htm,*.png,*.jpg,*.jpeg,*.gif,*.pdf,*.csv,*.xls,*.xlsx' \
                --reject-regex='(ad|advert|banner)' --no-check-certificate --timeout=10 --tries=3 "$url" 2>> wget.log

            # Move images to ./images
            find . -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
                -not -path "./images/*" -exec mv {} ./images/ \;
            # Move documents to ./files
            find . -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
                -not -path "./files/*" -exec mv {} ./files/ \;
        elif [[ "$protocol" == "sftp" ]]; then
            command -v lftp >/dev/null 2>&1 || { echo "lftp is required for sftp"; exit 1; }
            hostpath="${url#*://}"
            host="${hostpath%%/*}"
            path="${hostpath#"$host"}"
            [ -z "$path" ] && path="/"
            lftp -c "open $protocol://$host; mirror -e -P10 --exclude ad $path ." 2>> lftp.log

            # Move images to ./images
            find . -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
                -not -path "./images/*" -exec mv {} ./images/ \;
            # Move documents to ./files
            find . -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
                -not -path "./files/*" -exec mv {} ./files/ \;
        else
            echo "Unsupported protocol: $protocol"
            exit 1
        fi
        ;;
    *)
        echo "Invalid mode: $mode. Use '1' or 'A'."
        exit 1
        ;;
esac

# Create a virtual environment and install beautifulsoup4
venv_dir=".venv"
if [ ! -d "$venv_dir" ]; then
    echo "Creating virtual environment in $venv_dir"
    python3 -m venv "$venv_dir" || { echo "Failed to create virtual environment"; exit 1; }
fi
source "$venv_dir/bin/activate"
pip install beautifulsoup4 lxml >/dev/null 2> pip.log || { echo "Failed to install beautifulsoup4 or lxml, check pip.log"; deactivate; exit 1; }

# Post-process HTML files with Python and BeautifulSoup
# (quoted heredoc delimiter so the shell leaves the Python source untouched)
cat > process_html.py << 'EOF'
#!/usr/bin/env python3
import sys
import os
import logging
from bs4 import BeautifulSoup, Comment, NavigableString
from urllib.parse import urljoin, urlparse, unquote

# Set up logging
logging.basicConfig(filename='debug_html.log', level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s: %(message)s')

# Allowed tags
ALLOWED_TAGS = {
    'div', 'tt', 'p', 'a', 'li', 'ul', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'img', 'span', 'strong', 'em', 'b', 'i', 'br', 'hr', 'blockquote'
}

# Allowed attributes
ALLOWED_ATTRS = {'a': ['href'], 'img': ['src', 'alt', 'width', 'height']}

# Placeholder image patterns
PLACEHOLDER_PATTERNS = ['404', 'broken', 'missing', 'placeholder', 'default']


def clean_attributes(tag):
    """Remove unwanted attributes, keeping only allowed ones."""
    attrs = dict(tag.attrs)
    tag.attrs = {}
    allowed = ALLOWED_ATTRS.get(tag.name, [])
    for attr in allowed:
        if attr in attrs:
            tag[attr] = attrs[attr]
    logging.debug(f"Cleaned attributes for tag {tag.name}: {tag.attrs}")


def process_html(input_file, base_url, archive_date):
    """Process HTML file to keep only allowed tags, update paths, and add footer."""
    try:
        # Read input HTML
        logging.debug(f"Reading input file: {input_file}")
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()
        if not content.strip():
            logging.error("Input file is empty")
            raise ValueError("Input file is empty")
        soup = BeautifulSoup(content, 'lxml')
        logging.debug("Parsed HTML with BeautifulSoup using lxml parser")

        # Extract title and description
        title_tag = soup.find('title')
        title = str(title_tag) if title_tag else '<title>Untitled</title>'
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = str(description_tag) if description_tag else ''
        logging.debug(f"Extracted title: {title}")
        logging.debug(f"Extracted description: {description}")

        # Create new HTML structure
        new_soup = BeautifulSoup('<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body></body></html>', 'lxml')

        # Add title and description to head
        new_soup.head.append(BeautifulSoup(title, 'lxml'))
        if description:
            new_soup.head.append(BeautifulSoup(description, 'lxml'))
        css_link = new_soup.new_tag('link', rel='stylesheet', href='./css/custom.css')
        new_soup.head.append(css_link)
        logging.debug("Added CSS link to head")

        # Process all elements in the document
        body = new_soup.body
        elements = soup.find_all(recursive=True)
        logging.debug(f"Found {len(elements)} elements in input HTML")

        # Create a temporary container to hold valid elements
        temp_container = BeautifulSoup('<div></div>', 'lxml').div
        for element in elements:
            if isinstance(element, Comment):
                logging.debug("Skipping comment")
                continue
            if isinstance(element, NavigableString) and element.strip():
                temp_container.append(NavigableString(element.strip()))
                logging.debug(f"Appended text: {element.strip()}")
            elif element.name in ALLOWED_TAGS:
                clean_attributes(element)
                if element.name == 'img' and 'src' in element.attrs:
                    src = element['src']
                    # Decode URL-encoded src
                    decoded_src = unquote(src)
                    logging.debug(f"Decoded image src: {src} -> {decoded_src}")
                    if any(pattern in decoded_src.lower() for pattern in PLACEHOLDER_PATTERNS):
                        logging.debug(f"Skipping placeholder image: {decoded_src}")
                        element.decompose()
                        continue
                    filename = os.path.basename(decoded_src)
                    if os.path.exists(f'./images/{filename}') and os.path.getsize(f'./images/{filename}') > 0:
                        element['src'] = f'./images/{filename}'
                        logging.debug(f"Updated image src to: ./images/{filename}")
                    else:
                        logging.debug(f"Removing non-existent or empty image: ./images/{filename}")
                        element.decompose()
                        continue
                elif element.name == 'a' and 'href' in element.attrs:
                    href = element['href']
                    # Decode URL-encoded href
                    decoded_href = unquote(href)
                    if decoded_href.lower().endswith(('.pdf', '.csv', '.xls', '.xlsx')):
                        filename = os.path.basename(decoded_href)
                        element['href'] = f'./files/{filename}'
                        logging.debug(f"Updated link href to: ./files/{filename}")
                    else:
                        element['href'] = decoded_href
                        logging.debug(f"Kept decoded href: {decoded_href}")
                temp_container.append(element)
                logging.debug(f"Appended tag: {element.name}")
            else:
                logging.debug(f"Skipping tag: {element.name if hasattr(element, 'name') else 'None'}")

        # Move valid elements to body (snapshot the children first, since
        # appending a node elsewhere removes it from the container mid-iteration)
        for child in list(temp_container.children):
            body.append(child)
        logging.debug("Moved valid elements to body")

        # Add footer
        footer = new_soup.new_tag('footer')
        footer_p = new_soup.new_tag('p')
        footer_p.string = f"Archived by AmFile.org Script on {archive_date}"
        footer.append(footer_p)
        body.append(footer)
        logging.debug(f"Added footer: Archived by AmFile.org Script on {archive_date}")

        # Write output HTML
        output_file = input_file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(new_soup.prettify())
        logging.debug(f"Wrote processed HTML to: {output_file}")
    except Exception as e:
        logging.error(f"Error processing {input_file}: {str(e)}")
        raise


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: python3 process_html.py <input_file> <base_url> <archive_date>")
        sys.exit(1)
    process_html(sys.argv[1], sys.argv[2], sys.argv[3])
EOF

# Make Python script executable
chmod +x process_html.py

# Post-process HTML files with Python
for file in *.html *.htm; do
    if [ -f "$file" ]; then
        echo "Processing file: $file"
        cp "$file" "${file}.backup" # Backup original file
        "$venv_dir/bin/python" process_html.py "$file" "$url" "$archive_date" 2>> debug_html.log
        if [ $? -ne 0 ]; then
            echo "Python processing failed for $file, falling back to sed"
            temp_file=$(mktemp)
            title=$(grep -oE '<title>[^<]*</title>' "$file" || echo "<title>Untitled</title>")
            description=$(grep -oE '<meta[^>]*name=["'\'']description["'\''][^>]*>' "$file" || echo "")
            echo "<!DOCTYPE html><html><head><meta charset=\"UTF-8\">$title$description<link rel=\"stylesheet\" href=\"./css/custom.css\"></head><body>" > "$temp_file"
            # Keep only allowed tags (extended regex is required for the alternation)
            sed -nE '/<(div|tt|p|a|li|ul|ol|h1|h2|h3|h4|h5|h6|img|span|strong|em|b|i|br|hr|blockquote)[^>]*>/,/<\/(div|tt|p|a|li|ul|ol|h1|h2|h3|h4|h5|h6|img|span|strong|em|b|i|br|hr|blockquote)>/p' "$file" | \
                sed -E 's/\s+style\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
                sed -E 's/\s+on[a-zA-Z]+\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
                sed -E 's/\s+data-[a-zA-Z0-9_-]+\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
                sed -E '/<img/s/\s+src\s*=\s*(["'\''])([^"'\'']*(404|broken|missing|placeholder|default)\.(png|jpg|jpeg|gif))\1//g' | \
                sed -E '/<img/!s/\s+src\s*=\s*(["'\''][^"'\'']*["'\''])//g' | \
                sed -E '/<a/!s/\s+href\s*=\s*(["'\''][^"'\'']*["'\''])//g' | \
                sed -E 's/(href|src)\s*=\s*(["'\''])([^"'\'']*\.(png|jpg|jpeg|gif))\2/\1=\2.\/images\/\3\2/g' | \
                sed -E 's/(href|src)\s*=\s*(["'\''])([^"'\'']*\.(pdf|csv|xls|xlsx))\2/\1=\2.\/files\/\3\2/g' >> "$temp_file"
            while IFS= read -r img_line; do
                src=$(echo "$img_line" | sed -E 's/.*src\s*=\s*(["'\''])([^"'\'']*)\1.*/\2/')
                decoded_src=$(python3 -c "import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))" "$src")
                filename=$(basename "$decoded_src")
                if [ ! -s "./images/$filename" ]; then
                    echo "Removing broken image: ./images/$filename"
                    sed -i -E "/$(echo "$img_line" | sed -e 's/[]\/$*.^|[]/\\&/g')/d" "$temp_file"
                else
                    sed -i -E "s|$src|./images/$filename|g" "$temp_file"
                fi
            done < <(grep -oE '<img\b[^>]*src\s*=\s*["'\''][^"'\'']*\.(png|jpg|jpeg|gif)["'\''][^>]*>' "$temp_file")
            echo "<footer><p>Archived by AmFile.org Script on $archive_date</p></footer>" >> "$temp_file"
            echo "</body></html>" >> "$temp_file"
            mv "$temp_file" "$file"
            mv "${file}.backup" "${file}.failed" # Keep backup for debugging
        else
            rm "${file}.backup" # Remove backup if successful
        fi
    fi
done

# Clean up Python script and virtual environment
rm -f process_html.py
deactivate
rm -rf "$venv_dir"

echo "Mirroring complete. Content saved in $output_dir/index.html, images in $output_dir/images, documents in $output_dir/files, CSS in $output_dir/css/custom.css, custom formatting enforced, broken/placeholder images removed, all JavaScript/AJAX stripped, and all original CSS removed."
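Because the processed page references ./css/custom.css, ./images, and ./files with relative paths, one quick way to eyeball the result (a suggestion, not part of the script) is Python's built-in web server:

cd mirror && python3 -m http.server 8000
# then open http://localhost:8000/index.html in a browser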