To fetch the script, run the following from the console:

wget https://bash.commongrounds.cc/uploads/1752820094_wwow.sh
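Once downloaded, the script takes a mode ('1' for a single page, 'A' for a whole site), a URL, and an optional output directory, matching the Usage block at the top of the script. A couple of hypothetical invocations (example.com and the directory name are placeholders):

# Archive one page plus its images/documents into ./mirror (the default)
bash 1752820094_wwow.sh 1 https://example.com/page.html

# Mirror an entire site into ./site-archive
bash 1752820094_wwow.sh A https://example.com site-archive

The full script: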
#!/bin/bash
# Usage: ./mirror_site.sh <mode> <url> [output_dir]
# mode: "1" for single HTML page with associated images/documents, "A" for entire website/directory
# url: The URL to mirror (http, https, ftp, sftp)
# output_dir: Optional directory to save the mirrored content (default: ./mirror)

if [ $# -lt 2 ]; then
    echo "Usage: $0 <mode> <url> [output_dir]"
    echo "mode: '1' for single HTML page with images/documents, 'A' for entire website/directory"
    exit 1
fi

mode="$1"
url="$2"
output_dir="${3:-mirror}"

# Current date and time in CDT
archive_date=$(TZ="America/Chicago" date "+%Y-%m-%d %H:%M:%S %Z")

# Check for required dependencies
command -v wget >/dev/null 2>&1 || { echo "wget is required"; exit 1; }
command -v python3 >/dev/null 2>&1 || { echo "python3 is required for HTML processing"; exit 1; }
command -v sed >/dev/null 2>&1 || { echo "sed is required for fallback processing"; exit 1; }

# Create output directory and images, files, and css directories inside it
mkdir -p "$output_dir" "$output_dir/images" "$output_dir/files" "$output_dir/css"
cd "$output_dir" || exit 1

# Create external custom CSS file
cat > css/custom.css << 'EOF'
html, body {
    font-family: monospace !important;
    font-size: 12pt !important;
    color: #ffbf00 !important;
    background-color: black !important;
    padding: 1em !important;
    margin: 0 auto !important;
    max-width: 100% !important;
}
@media (min-width: 768px) {
    body { max-width: 80% !important; }
}
a {
    display: inline-block !important;
    color: #ffbf00 !important;
    background-color: black !important;
    padding: 0.2em 0.5em !important;
    border: 1px solid #ffbf00 !important;
    border-radius: 4px !important;
    text-decoration: none !important;
    transition: background-color 0.3s, color 0.3s !important;
    margin: 0.1em !important;
}
a:hover {
    color: black !important;
    background-color: #ffbf00 !important;
}
.ad, [class*="ad"], [id*="ad"], [class*="banner"], [id*="banner"] {
    display: none !important;
}
img {
    max-width: 100% !important;
    height: auto !important;
}
EOF

protocol="${url%%://*}"

case "$mode" in
    1)
        if [[ "$protocol" == "http" || "$protocol" == "https" ]]; then
            # Download only the single HTML page
            echo "Downloading HTML from $url"
            wget --adjust-extension --no-check-certificate --timeout=10 --tries=3 \
                --level=1 --output-document=index.html "$url" 2>> wget.log
            if [ ! -f "index.html" ]; then
                echo "Failed to download index.html, check wget.log"
                exit 1
            fi

            # Save raw HTML for debugging
            cp index.html index.html.raw

            # Extract and download images and documents
            mkdir -p temp
            echo "Extracting resource URLs from index.html"
            grep -oE '(href|src)="[^"]*\.(png|jpg|jpeg|gif|pdf|csv|xls|xlsx)"' index.html | \
                sed -E 's/(href|src)="([^"]*)"/\2/' | sort -u > temp/resource_urls.txt

            # Download each resource to temp directory
            while IFS= read -r resource; do
                # Decode URL to handle encoded characters (passed via argv so quotes in URLs are safe)
                decoded_resource=$(python3 -c "import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))" "$resource")
                # Convert root-relative URLs to absolute if needed
                if [[ ! "$decoded_resource" =~ ^https?:// ]]; then
                    base_url=$(echo "$url" | sed -E 's|(https?://[^/]+).*|\1|')
                    resource_url="${base_url}${decoded_resource}"
                else
                    resource_url="$decoded_resource"
                fi
                # Extract filename from decoded URL
                filename=$(basename "$decoded_resource")
                echo "Downloading resource: $resource_url as $filename"
                wget --no-check-certificate --timeout=10 --tries=3 \
                    --output-document="temp/$filename" "$resource_url" 2>> wget.log
            done < temp/resource_urls.txt

            # Move images to ./images
            find ./temp -maxdepth 1 -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
                -exec mv {} ./images/ \;
            # Move documents to ./files
            find ./temp -maxdepth 1 -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
                -exec mv {} ./files/ \;

            # Remove temporary directory and any other subdirectories
            rm -rf temp
            find . -mindepth 1 -maxdepth 1 -type d ! -name images ! -name files ! -name css -exec rm -rf {} +
        elif [[ "$protocol" == "ftp" ]]; then
            wget --no-check-certificate --timeout=10 --tries=3 --directory-prefix=./files "$url" 2>> wget.log
        elif [[ "$protocol" == "sftp" ]]; then
            command -v lftp >/dev/null 2>&1 || { echo "lftp is required for sftp"; exit 1; }
            hostpath="${url#*://}"
            host="${hostpath%%/*}"
            path="${hostpath#"$host"}"   # keeps its leading "/" when a path is present
            if [ -z "$path" ] || [ "$path" == "/" ]; then
                echo "For single mode with sftp, provide a file path in the URL"
                exit 1
            fi
            lftp -c "open $protocol://$host; get $path -o ./files/$(basename "$path")" 2>> lftp.log
        else
            echo "Unsupported protocol: $protocol"
            exit 1
        fi
        ;;
    A)
        if [[ "$protocol" == "http" || "$protocol" == "https" || "$protocol" == "ftp" ]]; then
            # Mirror entire site with images, PDFs, CSVs, and Excel files
            echo "Mirroring entire site from $url"
            wget --mirror --convert-links --adjust-extension --page-requisites \
                --directory-prefix=. --accept '*.html,*.htm,*.png,*.jpg,*.jpeg,*.gif,*.pdf,*.csv,*.xls,*.xlsx' \
                --reject-regex='(ad|advert|banner)' --no-check-certificate --timeout=10 --tries=3 "$url" 2>> wget.log

            # Move images to ./images
            find . -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
                -not -path "./images/*" -exec mv {} ./images/ \;
            # Move documents to ./files
            find . -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
                -not -path "./files/*" -exec mv {} ./files/ \;
        elif [[ "$protocol" == "sftp" ]]; then
            command -v lftp >/dev/null 2>&1 || { echo "lftp is required for sftp"; exit 1; }
            hostpath="${url#*://}"
            host="${hostpath%%/*}"
            path="${hostpath#"$host"}"
            [ -z "$path" ] && path="/"
            lftp -c "open $protocol://$host; mirror -e -P10 --exclude ad $path ." 2>> lftp.log

            # Move images to ./images
            find . -type f \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" \) \
                -not -path "./images/*" -exec mv {} ./images/ \;
            # Move documents to ./files
            find . -type f \( -name "*.pdf" -o -name "*.csv" -o -name "*.xls" -o -name "*.xlsx" \) \
                -not -path "./files/*" -exec mv {} ./files/ \;
        else
            echo "Unsupported protocol: $protocol"
            exit 1
        fi
        ;;
    *)
        echo "Invalid mode: $mode. Use '1' or 'A'."
        exit 1
        ;;
esac

# Create a virtual environment and install beautifulsoup4
venv_dir=".venv"
if [ ! -d "$venv_dir" ]; then
    echo "Creating virtual environment in $venv_dir"
    python3 -m venv "$venv_dir" || { echo "Failed to create virtual environment"; exit 1; }
fi
source "$venv_dir/bin/activate"
pip install beautifulsoup4 lxml >/dev/null 2> pip.log || { echo "Failed to install beautifulsoup4 or lxml, check pip.log"; deactivate; exit 1; }

# Post-process HTML files with Python and BeautifulSoup
# (quoted heredoc delimiter so the shell leaves the Python source untouched)
cat > process_html.py << 'EOF'
#!/usr/bin/env python3
import sys
import os
import logging
from bs4 import BeautifulSoup, Comment, NavigableString
from urllib.parse import urljoin, urlparse, unquote

# Set up logging
logging.basicConfig(filename='debug_html.log', level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s: %(message)s')

# Allowed tags
ALLOWED_TAGS = {
    'div', 'tt', 'p', 'a', 'li', 'ul', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'img', 'span', 'strong', 'em', 'b', 'i', 'br', 'hr', 'blockquote'
}

# Allowed attributes
ALLOWED_ATTRS = {'a': ['href'], 'img': ['src', 'alt', 'width', 'height']}

# Placeholder image patterns
PLACEHOLDER_PATTERNS = ['404', 'broken', 'missing', 'placeholder', 'default']


def clean_attributes(tag):
    """Remove unwanted attributes, keeping only allowed ones."""
    attrs = dict(tag.attrs)
    tag.attrs = {}
    allowed = ALLOWED_ATTRS.get(tag.name, [])
    for attr in allowed:
        if attr in attrs:
            tag[attr] = attrs[attr]
    logging.debug(f"Cleaned attributes for tag {tag.name}: {tag.attrs}")


def process_html(input_file, base_url, archive_date):
    """Process HTML file to keep only allowed tags, update paths, and add footer."""
    try:
        # Read input HTML
        logging.debug(f"Reading input file: {input_file}")
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()
        if not content.strip():
            logging.error("Input file is empty")
            raise ValueError("Input file is empty")
        soup = BeautifulSoup(content, 'lxml')
        logging.debug("Parsed HTML with BeautifulSoup using lxml parser")

        # Extract title and description
        title_tag = soup.find('title')
        title = str(title_tag) if title_tag else '<title>Untitled</title>'
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = str(description_tag) if description_tag else ''
        logging.debug(f"Extracted title: {title}")
        logging.debug(f"Extracted description: {description}")

        # Create new HTML structure
        new_soup = BeautifulSoup('<!DOCTYPE html><html><head><meta charset="UTF-8"></head><body></body></html>', 'lxml')

        # Add title and description to head
        new_soup.head.append(BeautifulSoup(title, 'lxml'))
        if description:
            new_soup.head.append(BeautifulSoup(description, 'lxml'))
        css_link = new_soup.new_tag('link', rel='stylesheet', href='./css/custom.css')
        new_soup.head.append(css_link)
        logging.debug("Added CSS link to head")

        # Process all elements in the document
        body = new_soup.body
        elements = soup.find_all(recursive=True)
        logging.debug(f"Found {len(elements)} elements in input HTML")

        # Create a temporary container to hold valid elements
        temp_container = BeautifulSoup('<div></div>', 'lxml').div
        for element in elements:
            if isinstance(element, Comment):
                logging.debug("Skipping comment")
                continue
            if isinstance(element, NavigableString) and element.strip():
                temp_container.append(NavigableString(element.strip()))
                logging.debug(f"Appended text: {element.strip()}")
            elif element.name in ALLOWED_TAGS:
                clean_attributes(element)
                if element.name == 'img' and 'src' in element.attrs:
                    src = element['src']
                    # Decode URL-encoded src
                    decoded_src = unquote(src)
                    logging.debug(f"Decoded image src: {src} -> {decoded_src}")
                    if any(pattern in decoded_src.lower() for pattern in PLACEHOLDER_PATTERNS):
                        logging.debug(f"Skipping placeholder image: {decoded_src}")
                        element.decompose()
                        continue
                    filename = os.path.basename(decoded_src)
                    if os.path.exists(f'./images/{filename}') and os.path.getsize(f'./images/{filename}') > 0:
                        element['src'] = f'./images/{filename}'
                        logging.debug(f"Updated image src to: ./images/{filename}")
                    else:
                        logging.debug(f"Removing non-existent or empty image: ./images/{filename}")
                        element.decompose()
                        continue
                elif element.name == 'a' and 'href' in element.attrs:
                    href = element['href']
                    # Decode URL-encoded href
                    decoded_href = unquote(href)
                    if decoded_href.lower().endswith(('.pdf', '.csv', '.xls', '.xlsx')):
                        filename = os.path.basename(decoded_href)
                        element['href'] = f'./files/{filename}'
                        logging.debug(f"Updated link href to: ./files/{filename}")
                    else:
                        element['href'] = decoded_href
                        logging.debug(f"Kept decoded href: {decoded_href}")
                temp_container.append(element)
                logging.debug(f"Appended tag: {element.name}")
            else:
                logging.debug(f"Skipping tag: {element.name if hasattr(element, 'name') else 'None'}")

        # Move valid elements to body (snapshot the children first, since
        # appending a node elsewhere removes it from the container mid-iteration)
        for child in list(temp_container.children):
            body.append(child)
        logging.debug("Moved valid elements to body")

        # Add footer
        footer = new_soup.new_tag('footer')
        footer_p = new_soup.new_tag('p')
        footer_p.string = f"Archived by AmFile.org Script on {archive_date}"
        footer.append(footer_p)
        body.append(footer)
        logging.debug(f"Added footer: Archived by AmFile.org Script on {archive_date}")

        # Write output HTML
        output_file = input_file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(new_soup.prettify())
        logging.debug(f"Wrote processed HTML to: {output_file}")
    except Exception as e:
        logging.error(f"Error processing {input_file}: {str(e)}")
        raise


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: python3 process_html.py <input_file> <base_url> <archive_date>")
        sys.exit(1)
    process_html(sys.argv[1], sys.argv[2], sys.argv[3])
EOF

# Make Python script executable
chmod +x process_html.py

# Post-process HTML files with Python
for file in *.html *.htm; do
    if [ -f "$file" ]; then
        echo "Processing file: $file"
        cp "$file" "${file}.backup" # Backup original file
        "$venv_dir/bin/python" process_html.py "$file" "$url" "$archive_date" 2>> debug_html.log
        if [ $? -ne 0 ]; then
            echo "Python processing failed for $file, falling back to sed"
            temp_file=$(mktemp)
            title=$(grep -oE '<title>[^<]*</title>' "$file" || echo "<title>Untitled</title>")
            description=$(grep -oE '<meta[^>]*name=["'\'']description["'\''][^>]*>' "$file" || echo "")
            echo "<!DOCTYPE html><html><head><meta charset=\"UTF-8\">$title$description<link rel=\"stylesheet\" href=\"./css/custom.css\"></head><body>" > "$temp_file"
            # Keep only allowed tags (extended regex is required for the alternation)
            sed -nE '/<(div|tt|p|a|li|ul|ol|h1|h2|h3|h4|h5|h6|img|span|strong|em|b|i|br|hr|blockquote)[^>]*>/,/<\/(div|tt|p|a|li|ul|ol|h1|h2|h3|h4|h5|h6|img|span|strong|em|b|i|br|hr|blockquote)>/p' "$file" | \
                sed -E 's/\s+style\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
                sed -E 's/\s+on[a-zA-Z]+\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
                sed -E 's/\s+data-[a-zA-Z0-9_-]+\s*=\s*(["'\''][^"'\'']*["'\'']|[^"'\''[:space:]]+)//g' | \
                sed -E '/<img/s/\s+src\s*=\s*(["'\''])([^"'\'']*(404|broken|missing|placeholder|default)\.(png|jpg|jpeg|gif))\1//g' | \
                sed -E '/<img/!s/\s+src\s*=\s*(["'\''][^"'\'']*["'\''])//g' | \
                sed -E '/<a/!s/\s+href\s*=\s*(["'\''][^"'\'']*["'\''])//g' | \
                sed -E 's/(href|src)\s*=\s*(["'\''])([^"'\'']*\.(png|jpg|jpeg|gif))\2/\1=\2.\/images\/\3\2/g' | \
                sed -E 's/(href|src)\s*=\s*(["'\''])([^"'\'']*\.(pdf|csv|xls|xlsx))\2/\1=\2.\/files\/\3\2/g' >> "$temp_file"
            while IFS= read -r img_line; do
                src=$(echo "$img_line" | sed -E 's/.*src\s*=\s*(["'\''])([^"'\'']*)\1.*/\2/')
                decoded_src=$(python3 -c "import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))" "$src")
                filename=$(basename "$decoded_src")
                if [ ! -s "./images/$filename" ]; then
                    echo "Removing broken image: ./images/$filename"
                    sed -i -E "/$(echo "$img_line" | sed -e 's/[]\/$*.^|[]/\\&/g')/d" "$temp_file"
                else
                    sed -i -E "s|$src|./images/$filename|g" "$temp_file"
                fi
            done < <(grep -oE '<img\b[^>]*src\s*=\s*["'\''][^"'\'']*\.(png|jpg|jpeg|gif)["'\''][^>]*>' "$temp_file")
            echo "<footer><p>Archived by AmFile.org Script on $archive_date</p></footer>" >> "$temp_file"
            echo "</body></html>" >> "$temp_file"
            mv "$temp_file" "$file"
            mv "${file}.backup" "${file}.failed" # Keep backup for debugging
        else
            rm "${file}.backup" # Remove backup if successful
        fi
    fi
done

# Clean up Python script and virtual environment
rm -f process_html.py
deactivate
rm -rf "$venv_dir"

echo "Mirroring complete. Content saved in $output_dir/index.html, images in $output_dir/images, documents in $output_dir/files, CSS in $output_dir/css/custom.css, custom formatting enforced, broken/placeholder images removed, all JavaScript/AJAX stripped, and all original CSS removed."
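Because the processed page references ./css/custom.css, ./images, and ./files with relative paths, one quick way to eyeball the result (a suggestion, not part of the script) is Python's built-in web server:

cd mirror && python3 -m http.server 8000
# then open http://localhost:8000/index.html in a browser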