BASH Post Services

Viewing: 1744743653_pngocr.sh

Try wget https://bash.commongrounds.cc/uploads/1744743653_pngocr.sh from the console

Raw File Link

#!/bin/bash

# pngocr v1.0 by Page Telegram Volunteer Services.
# Runs OCR with EasyOCR on every PNG file in the current folder (for
# example, pages extracted from PDF documents). EasyOCR is meant to use
# the GPU when one is available and the CPU otherwise. The recognized
# text of all pages is combined into one output file, where each page's
# text is preceded by a "Processing: <PNG file name>" line.

# Verify that every external command the script depends on is available.
# flock is included because process_batch uses it to guard the shared
# progress file; without it the workers would run unlocked.
for cmd in python3 flock; do
    if ! command -v "$cmd" &> /dev/null; then
        echo "Error: $cmd is not installed." >&2
        exit 1
    fi
done

# Verify that the Python modules imported by the generated helper scripts
# can be loaded by the interpreter that will run them.
for package in cv2 easyocr; do
    if ! python3 -c "import $package" 2>/dev/null; then
        echo "Error: Python package '$package' is not installed." >&2
        exit 1
    fi
done

# Paths used by the run:
#   OUTPUT_FILE     final combined OCR text
#   TEMP_DIR        per-worker partial results and preprocessed images
#   BACKUP_DIR      copy of each worker's partial results
#   LOCK_FILE       lock file the workers pass to flock
#   PROCESSED_FILE  one line per image that has been OCR'ed
#   LOG_FILE        progress log
OUTPUT_FILE="output.txt"
TEMP_DIR="./temp_ocr"
BACKUP_DIR="./temp_ocr_backup"
# The lock lives inside the working directory rather than at a fixed path in
# /tmp: a fixed /tmp path is shared by every user and every run on the
# machine, and a lock file owned by another user cannot be opened for writing.
LOCK_FILE="$TEMP_DIR/ocr_progress.lock"
PROCESSED_FILE="processed_files.txt"
LOG_FILE="ocr_processing.log"

# Number of parallel workers. Defaults to 8; a different value can be
# supplied through the environment (e.g. NUM_CORES=4 ./pngocr.sh).
NUM_CORES="${NUM_CORES:-8}"
if ! [[ "$NUM_CORES" =~ ^[0-9]+$ ]] || (( 10#$NUM_CORES < 1 )); then
    echo "Error: NUM_CORES must be a positive integer (got '$NUM_CORES')." >&2
    exit 1
fi
# Force base 10 so a value such as "08" is not rejected as invalid octal later.
NUM_CORES=$((10#$NUM_CORES))

# Create directories and initialize files (existing files are kept so that
# a later run can see what an earlier run recorded).
if ! mkdir -p "$TEMP_DIR" "$BACKUP_DIR"; then
    echo "Error: could not create $TEMP_DIR or $BACKUP_DIR." >&2
    exit 1
fi
[ -f "$PROCESSED_FILE" ] || touch "$PROCESSED_FILE"
[ -f "$LOG_FILE" ] || echo "$(date '+%H:%M:%S') Starting OCR process with EasyOCR" > "$LOG_FILE"
[ -f "$OUTPUT_FILE" ] || > "$OUTPUT_FILE"

# Page separator: pad an empty string to 50 columns, then turn every
# padding space into an underscore.
printf -v HORIZONTAL_LINE '%50s' ''
HORIZONTAL_LINE=${HORIZONTAL_LINE// /_}

# Generate the Python helper that prepares one image for OCR: it converts
# the image to grayscale and binarizes it with OpenCV adaptive thresholding.
# The quoted heredoc delimiter keeps the shell from expanding anything
# inside the Python source.
PREPROCESS_SCRIPT="preprocess_image.py"
cat << 'EOF' > "$PREPROCESS_SCRIPT"
"""Binarize one image for OCR.

Usage: preprocess_image.py INPUT_IMAGE OUTPUT_IMAGE

Exits with status 1 if the image cannot be read, processed or written.
Diagnostics go to stderr so the calling shell script can redirect them
to its log file.
"""
import cv2
import numpy as np
import sys

# Read the arguments before the try block so the error handler below can
# always refer to input_path.
if len(sys.argv) < 3:
    print("Usage: preprocess_image.py INPUT_IMAGE OUTPUT_IMAGE", file=sys.stderr)
    sys.exit(1)
input_path = sys.argv[1]
output_path = sys.argv[2]

try:
    # Load image (imread returns None instead of raising on failure)
    img = cv2.imread(input_path)
    if img is None:
        print(f"Error: Could not load image {input_path}", file=sys.stderr)
        sys.exit(1)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply adaptive thresholding (Gaussian-weighted 11x11 neighborhood, C=2)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

    # Save preprocessed image; imwrite reports failure through its return value
    if not cv2.imwrite(output_path, thresh):
        print(f"Error: Could not write image {output_path}", file=sys.stderr)
        sys.exit(1)
    print(f"Successfully preprocessed {input_path}")

except Exception as e:
    print(f"Error processing {input_path}: {str(e)}", file=sys.stderr)
    sys.exit(1)
EOF

# Generate the Python helper that runs EasyOCR on one image and prints the
# recognized text, one detection per line, on stdout. Everything that is
# not recognized text (errors, warnings) goes to stderr, because the caller
# appends this script's stdout to the OCR results.
OCR_SCRIPT="run_easyocr.py"
cat << 'EOF' > "$OCR_SCRIPT"
"""Run EasyOCR (English) on one image.

Usage: run_easyocr.py IMAGE

stdout: one line of recognized text per detection.
stderr: warnings and errors.
Exit status: 0 on success (even if no text was found), 1 on error.
"""
import easyocr
import sys
import os

# Read the argument before the try block so the error handler below can
# always refer to image_path.
if len(sys.argv) < 2:
    print("Usage: run_easyocr.py IMAGE", file=sys.stderr)
    sys.exit(1)
image_path = sys.argv[1]

try:
    # Cheap existence check first, before the reader is constructed
    if not os.path.exists(image_path):
        print(f"Error: Image file {image_path} does not exist", file=sys.stderr)
        sys.exit(1)

    # Initialize EasyOCR reader with its default settings
    reader = easyocr.Reader(['en'])

    # Perform OCR
    result = reader.readtext(image_path)

    if not result:
        print(f"Warning: No text detected in {image_path}", file=sys.stderr)

    # Each detection is indexed here as [1] for its text
    for detection in result:
        print(detection[1])

except Exception as e:
    print(f"Error processing {image_path}: {str(e)}", file=sys.stderr)
    sys.exit(1)
EOF

#######################################
# Preprocess and OCR one worker's share of the images.
# Globals (read): TEMP_DIR, BACKUP_DIR, LOG_FILE, PROCESSED_FILE, LOCK_FILE,
#                 PREPROCESS_SCRIPT, OCR_SCRIPT, HORIZONTAL_LINE
# Arguments:
#   $1   - worker id, used in log lines and in the part-file name
#   $2.. - image paths to process
# Outputs:
#   $TEMP_DIR/ocr_part_<id>.txt    OCR text of this worker's images
#   $BACKUP_DIR/ocr_part_<id>.txt  copy of the above, refreshed per image
#   $PROCESSED_FILE                gains one line per successfully OCR'ed image
#   $LOG_FILE                      progress and error messages
#######################################
process_batch() {
    local core_id=$1
    local files=("${@:2}")
    local temp_file="$TEMP_DIR/ocr_part_${core_id}.txt"
    local backup_file="$BACKUP_DIR/ocr_part_${core_id}.txt"
    # Scratch file holding the OCR text of the image currently being handled.
    local ocr_text="$TEMP_DIR/ocr_text_${core_id}.txt"
    local img preprocessed_img ocr_status

    echo "Core $core_id starting to process ${#files[@]} files" >> "$LOG_FILE"
    # NOTE(review): the part file is emptied on every run, so text of images
    # that are skipped below as "already processed" is not carried over.
    > "$temp_file"

    for img in "${files[@]}"; do
        [ -f "$img" ] || continue

        # Skip if already processed (exact whole-line match in the progress file)
        if grep -Fxq -- "$img" "$PROCESSED_FILE"; then
            echo "$(date '+%H:%M:%S') Core $core_id: Skipping already processed $img" >> "$LOG_FILE"
            continue
        fi

        # Preprocess image
        preprocessed_img="$TEMP_DIR/preprocessed_${core_id}_$(basename -- "$img")"
        echo "$(date '+%H:%M:%S') Core $core_id: Preprocessing $img" >> "$LOG_FILE"

        if ! python3 "$PREPROCESS_SCRIPT" "$img" "$preprocessed_img" 2>> "$LOG_FILE"; then
            echo "$(date '+%H:%M:%S') Core $core_id: Preprocessing failed for $img" >> "$LOG_FILE"
            # Do not leave a partially written image behind.
            rm -f -- "$preprocessed_img"
            continue
        fi

        echo "$(date '+%H:%M:%S') Core $core_id: Running OCR on $img" >> "$LOG_FILE"
        (
            # Exclusive lock on fd 200: only one worker at a time runs this
            # section, which also covers the append to the shared progress file.
            # NOTE(review): this serializes the OCR step itself across workers;
            # confirm whether that is intended.
            flock -x 200
            python3 "$OCR_SCRIPT" "$preprocessed_img" > "$ocr_text" 2>> "$LOG_FILE"
            ocr_status=$?
            {
                echo "Processing: $img"
                cat -- "$ocr_text"
                echo ""
                echo "$HORIZONTAL_LINE"
                echo ""
            } >> "$temp_file"
            # Record the image as done only when OCR succeeded, so that a
            # failed image is picked up again by the next run.
            if [ "$ocr_status" -eq 0 ]; then
                echo "$img" >> "$PROCESSED_FILE"
            fi
            exit "$ocr_status"
        ) 200>"$LOCK_FILE"
        ocr_status=$?

        if [ "$ocr_status" -ne 0 ]; then
            echo "$(date '+%H:%M:%S') Core $core_id: OCR failed for $img (not marked as processed)" >> "$LOG_FILE"
        elif [ ! -s "$ocr_text" ]; then
            # Checked on the per-image scratch file: the part file always
            # contains at least the "Processing:" header at this point.
            echo "$(date '+%H:%M:%S') Core $core_id: Warning - No output generated for $img" >> "$LOG_FILE"
        fi

        # Backup temp file after each image
        cp "$temp_file" "$backup_file"

        # Clean up per-image scratch files
        rm -f -- "$preprocessed_img" "$ocr_text"
    done

    echo "Core $core_id finished processing" >> "$LOG_FILE"
}

# Collect the PNG files located directly in the current directory (no
# recursion), sorted, one array element per line of find output.
readarray -t IMAGE_FILES < <(find . -maxdepth 1 -type f -name "*.png" | sort)

# Nothing to do without input images.
TOTAL_FILES=${#IMAGE_FILES[@]}
if (( TOTAL_FILES == 0 )); then
    echo "No PNG files found in current directory"
    exit 1
fi

# Ceiling division: the largest number of images any one worker receives.
FILES_PER_CORE=$(( (TOTAL_FILES + NUM_CORES - 1) / NUM_CORES ))

# Every line of the progress file names one image that is already done.
PROCESSED_COUNT=$(wc -l < "$PROCESSED_FILE")
REMAINING_FILES=$(( TOTAL_FILES - PROCESSED_COUNT ))

# Announce the plan on the console and in the log.
printf '%s\n' "Performing OCR on $TOTAL_FILES PNG files ($PROCESSED_COUNT already processed, $REMAINING_FILES remaining) using $NUM_CORES cores..."
printf '%s\n' "$(date '+%H:%M:%S') Starting OCR: $TOTAL_FILES total, $PROCESSED_COUNT processed, $REMAINING_FILES remaining" >> "$LOG_FILE"

# Start one background worker per slice of IMAGE_FILES and remember its PID
# under the worker's index.
declare -a PIDS
for ((i = 0; i < NUM_CORES; i++)); do
    START_IDX=$(( i * FILES_PER_CORE ))
    END_IDX=$(( START_IDX + FILES_PER_CORE - 1 ))

    # Clamp the last index to the end of the file list.
    (( END_IDX < TOTAL_FILES )) || END_IDX=$(( TOTAL_FILES - 1 ))

    # A slice that would begin past the end of the list has no files.
    (( START_IDX < TOTAL_FILES )) || continue

    BATCH_FILES=("${IMAGE_FILES[@]:START_IDX:FILES_PER_CORE}")

    process_batch "$i" "${BATCH_FILES[@]}" &
    PIDS[i]=$!
done

# Block until every recorded worker has exited, logging each one as it is reaped.
printf '%s\n' "$(date '+%H:%M:%S') Waiting for all processes to complete..." >> "$LOG_FILE"
for pid in "${PIDS[@]}"; do
    [ -z "$pid" ] && continue
    wait "$pid"
    printf '%s\n' "$(date '+%H:%M:%S') Process $pid completed" >> "$LOG_FILE"
done

# Combine the per-worker part files into the main output file.
# NOTE(review): the output file is replaced, not appended to, so it holds
# only what the part files of this run contain.
echo "Combining results..." >> "$LOG_FILE"
TEMP_OUTPUT="temp_output.txt"
> "$TEMP_OUTPUT"
# Iterate over the indices of the workers that were actually launched
# (the keys of PIDS, in ascending order) instead of 0..NUM_CORES-1, so that
# a part file left in TEMP_DIR by an earlier run is not merged in.
for i in "${!PIDS[@]}"; do
    temp_file="$TEMP_DIR/ocr_part_${i}.txt"
    [ -f "$temp_file" ] || continue
    if [ -s "$temp_file" ]; then
        cat "$temp_file" >> "$TEMP_OUTPUT"
    else
        echo "Warning: temp file $temp_file is empty" >> "$LOG_FILE"
    fi
done

# Nothing was produced: remove the scratch files before bailing out so a
# failed run does not leave them behind.
if [ ! -s "$TEMP_OUTPUT" ]; then
    rm -f "$TEMP_OUTPUT" "$LOCK_FILE" 2>/dev/null
    echo "Error: No OCR output generated. Check $LOG_FILE for details." >&2
    exit 1
fi

# Move temporary output to final output
if ! mv "$TEMP_OUTPUT" "$OUTPUT_FILE"; then
    echo "Error: Could not write $OUTPUT_FILE." >&2
    exit 1
fi

# Clean up
rm -f "$LOCK_FILE" 2>/dev/null

# Report how many images the progress file now lists.
FINAL_PROCESSED=$(wc -l < "$PROCESSED_FILE")
echo "OCR completed. Processed $FINAL_PROCESSED/$TOTAL_FILES files. Results saved to $OUTPUT_FILE"
echo "Check $LOG_FILE for processing details"
echo "$(date '+%H:%M:%S') OCR completed: $FINAL_PROCESSED/$TOTAL_FILES files processed" >> "$LOG_FILE"
BASH to Home