To download this script, run the following from the console: wget https://bash.commongrounds.cc/uploads/1744743653_pngocr.sh
#!/bin/bash
# pngocr v1.0 by Page Telegram Volunteer Services.
# This script OCRs every PNG file in the current folder
# (for example, pages extracted from PDF documents),
# using the GPU first if one is available and the CPU
# otherwise. In the text output file, each page's text
# is labelled with the name of the PNG file it came from.
# Verify that every external command the script invokes is on PATH.
# flock is needed by process_batch, which takes an exclusive lock around
# each OCR run and the update of the processed-files list.
for cmd in python3 flock; do
  if ! command -v "$cmd" > /dev/null 2>&1; then
    echo "Error: $cmd is not installed." >&2
    exit 1
  fi
done

# Verify that the Python modules imported by the generated helper scripts
# can be loaded by the python3 interpreter found above.
for package in cv2 easyocr; do
  if ! python3 -c "import $package" 2> /dev/null; then
    echo "Error: Python package '$package' is not installed." >&2
    exit 1
  fi
done
# Output file for combined OCR results
OUTPUT_FILE="output.txt"
# Working directory for per-worker partial results and preprocessed images
TEMP_DIR="./temp_ocr"
# Directory that receives a copy of each partial result after every image
BACKUP_DIR="./temp_ocr_backup"
# Lock file shared by all workers. It lives inside the working directory
# rather than under a fixed name in /tmp, so runs started by other users or
# from other folders cannot block or break this one.
LOCK_FILE="$TEMP_DIR/ocr_progress.lock"
# One line per image that has already been OCRed (used to resume a run)
PROCESSED_FILE="processed_files.txt"
LOG_FILE="ocr_processing.log"

# Number of parallel workers. Defaults to 8; may be overridden from the
# environment, e.g. `NUM_CORES=4 ./pngocr.sh`.
NUM_CORES="${NUM_CORES:-8}"
if ! [[ "$NUM_CORES" =~ ^[1-9][0-9]*$ ]]; then
  # A zero or non-numeric value would break the batch-size arithmetic later.
  echo "Error: NUM_CORES must be a positive integer (got '$NUM_CORES')." >&2
  exit 1
fi

# Create directories and initialize files (existing files are kept so an
# interrupted run can be resumed).
if ! mkdir -p "$TEMP_DIR" "$BACKUP_DIR"; then
  echo "Error: could not create $TEMP_DIR or $BACKUP_DIR." >&2
  exit 1
fi
[ -f "$PROCESSED_FILE" ] || touch "$PROCESSED_FILE"
[ -f "$LOG_FILE" ] || echo "$(date '+%H:%M:%S') Starting OCR process with EasyOCR" > "$LOG_FILE"
[ -f "$OUTPUT_FILE" ] || : > "$OUTPUT_FILE"

# Separator written after each image's text: 50 underscores. The %.0s
# format consumes each brace-expanded argument while printing nothing for it.
HORIZONTAL_LINE=$(printf '_%.0s' {1..50})
# Generate the Python helper that prepares one image for OCR with OpenCV:
# it converts the image to grayscale and applies adaptive thresholding.
# Usage of the generated script:
#   python3 preprocess_image.py <input_image> <output_image>
# It exits non-zero, with a message on stderr, when the image cannot be
# read, processed or written.
PREPROCESS_SCRIPT="preprocess_image.py"
cat << 'EOF' > "$PREPROCESS_SCRIPT"
import sys

import cv2

if len(sys.argv) != 3:
    print(f"Usage: {sys.argv[0]} <input_image> <output_image>", file=sys.stderr)
    sys.exit(2)

# Bound before the try block so the error handler can always name the file.
input_path = sys.argv[1]
output_path = sys.argv[2]

try:
    # Load image (cv2.imread returns None instead of raising on failure)
    img = cv2.imread(input_path)
    if img is None:
        print(f"Error: Could not load image {input_path}", file=sys.stderr)
        sys.exit(1)
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Apply adaptive thresholding
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    # Save preprocessed image (cv2.imwrite reports failure via its return value)
    if not cv2.imwrite(output_path, thresh):
        print(f"Error: Could not write image {output_path}", file=sys.stderr)
        sys.exit(1)
    print(f"Successfully preprocessed {input_path}")
except Exception as e:
    print(f"Error processing {input_path}: {str(e)}", file=sys.stderr)
    sys.exit(1)
EOF
# Generate the Python helper that runs EasyOCR (English) on one image.
# Usage of the generated script:
#   python3 run_easyocr.py <image>
# Recognised text is printed to stdout, one detection per line. Errors go
# to stderr and produce a non-zero exit status.
OCR_SCRIPT="run_easyocr.py"
cat << 'EOF' > "$OCR_SCRIPT"
import os
import sys

import easyocr

if len(sys.argv) != 2:
    print(f"Usage: {sys.argv[0]} <image>", file=sys.stderr)
    sys.exit(2)

# Bound before the try block so the error handler can always name the file.
image_path = sys.argv[1]

try:
    # Check the input first so a missing file does not pay for model loading
    if not os.path.exists(image_path):
        print(f"Error: Image file {image_path} does not exist", file=sys.stderr)
        sys.exit(1)
    # Initialize EasyOCR reader
    reader = easyocr.Reader(['en'])
    # Perform OCR
    result = reader.readtext(image_path)
    # Output results in a simple format. The notice below stays on stdout:
    # stdout is what the caller captures as this image's text, so the
    # notice appears there in place of the text.
    if not result:
        print(f"Warning: No text detected in {image_path}")
    for detection in result:
        text = detection[1]  # Extract the text
        print(text)
except Exception as e:
    print(f"Error processing {image_path}: {str(e)}", file=sys.stderr)
    sys.exit(1)
EOF
#######################################
# Preprocess and OCR a batch of images, appending the recognised text of
# each image to this worker's partial result file.
# Globals (all read):
#   TEMP_DIR, BACKUP_DIR, LOCK_FILE, PROCESSED_FILE, LOG_FILE,
#   PREPROCESS_SCRIPT, OCR_SCRIPT, HORIZONTAL_LINE
# Arguments:
#   $1   - worker id; selects ocr_part_<id>.txt in TEMP_DIR and BACKUP_DIR
#   $2.. - paths of the images to process
# Outputs:
#   Progress and errors are appended to LOG_FILE. Every image whose OCR
#   succeeded is appended as one line to PROCESSED_FILE.
#######################################
process_batch() {
  local core_id=$1
  local files=("${@:2}")
  local temp_file="$TEMP_DIR/ocr_part_${core_id}.txt"
  local backup_file="$BACKUP_DIR/ocr_part_${core_id}.txt"
  # Holds the OCR text of the current image until the run is known to be good.
  local scratch_file="$TEMP_DIR/ocr_scratch_${core_id}.txt"
  local img preprocessed_img
  local resuming=0

  echo "Core $core_id starting to process ${#files[@]} files" >> "$LOG_FILE"

  # When any image of this batch was OCRed by an earlier run, its text is
  # only available in the existing partial file, so that file must be kept.
  # Otherwise start from an empty file so stale results cannot leak in.
  for img in "${files[@]}"; do
    if grep -Fxq -e "$img" "$PROCESSED_FILE"; then
      resuming=1
      break
    fi
  done
  if [ "$resuming" -eq 1 ]; then
    : >> "$temp_file"
  else
    : > "$temp_file"
  fi

  for img in "${files[@]}"; do
    [ -f "$img" ] || continue

    # Skip if already processed
    if grep -Fxq -e "$img" "$PROCESSED_FILE"; then
      echo "$(date '+%H:%M:%S') Core $core_id: Skipping already processed $img" >> "$LOG_FILE"
      continue
    fi

    # Preprocess image
    preprocessed_img="$TEMP_DIR/preprocessed_${core_id}_$(basename -- "$img")"
    echo "$(date '+%H:%M:%S') Core $core_id: Preprocessing $img" >> "$LOG_FILE"
    if ! python3 "$PREPROCESS_SCRIPT" "$img" "$preprocessed_img" 2>> "$LOG_FILE"; then
      echo "$(date '+%H:%M:%S') Core $core_id: Preprocessing failed for $img" >> "$LOG_FILE"
      continue
    fi

    echo "$(date '+%H:%M:%S') Core $core_id: Running OCR on $img" >> "$LOG_FILE"
    # LOCK_FILE is shared by all workers, so only one worker at a time runs
    # OCR and appends to PROCESSED_FILE. The image is recorded as processed
    # only when the OCR command succeeded; on failure nothing is added to
    # the results and the image is retried on the next run.
    if (
      flock -x 200
      python3 "$OCR_SCRIPT" "$preprocessed_img" > "$scratch_file" 2>> "$LOG_FILE" || exit 1
      {
        echo "Processing: $img"
        cat "$scratch_file"
        echo ""
        echo "$HORIZONTAL_LINE"
        echo ""
      } >> "$temp_file"
      echo "$img" >> "$PROCESSED_FILE"
    ) 200> "$LOCK_FILE"; then
      # Check if OCR produced output
      if [ ! -s "$scratch_file" ]; then
        echo "$(date '+%H:%M:%S') Core $core_id: Warning - No output generated for $img" >> "$LOG_FILE"
      fi
      # Backup temp file after each image
      cp "$temp_file" "$backup_file"
    else
      echo "$(date '+%H:%M:%S') Core $core_id: OCR failed for $img" >> "$LOG_FILE"
      # Keep whatever the OCR command printed before failing for diagnosis.
      if [ -s "$scratch_file" ]; then
        cat "$scratch_file" >> "$LOG_FILE"
      fi
    fi

    # Clean up per-image intermediates
    rm -f -- "$preprocessed_img" "$scratch_file"
  done

  echo "Core $core_id finished processing" >> "$LOG_FILE"
}
# Collect the PNG files of the current directory (non-recursive), sorted by
# name so that every run splits them into the same batches.
mapfile -t IMAGE_FILES < <(find . -maxdepth 1 -type f -name "*.png" | sort)
TOTAL_FILES=${#IMAGE_FILES[@]}
if [ "$TOTAL_FILES" -eq 0 ]; then
  echo "No PNG files found in current directory"
  exit 1
fi

# Batch size per worker, rounded up so every file is assigned.
FILES_PER_CORE=$(( (TOTAL_FILES + NUM_CORES - 1) / NUM_CORES ))

# Print how many of the PNG files found above are listed in PROCESSED_FILE.
# Counting matches instead of the lines of PROCESSED_FILE keeps the figure
# correct when that file holds duplicates or files that no longer exist.
count_processed() {
  printf '%s\n' "${IMAGE_FILES[@]}" | grep -Fxc -f "$PROCESSED_FILE" || true
}

# Count already processed files
PROCESSED_COUNT=$(count_processed)
PROCESSED_COUNT=${PROCESSED_COUNT:-0}
REMAINING_FILES=$((TOTAL_FILES - PROCESSED_COUNT))
echo "Performing OCR on $TOTAL_FILES PNG files ($PROCESSED_COUNT already processed, $REMAINING_FILES remaining) using $NUM_CORES cores..."
echo "$(date '+%H:%M:%S') Starting OCR: $TOTAL_FILES total, $PROCESSED_COUNT processed, $REMAINING_FILES remaining" >> "$LOG_FILE"

# Launch parallel processes. When nothing is left to OCR the workers are
# not started at all, so the partial results of the earlier run stay
# untouched and are simply combined again below.
declare -a PIDS=()
if [ "$REMAINING_FILES" -gt 0 ]; then
  for ((i = 0; i < NUM_CORES; i++)); do
    START_IDX=$((i * FILES_PER_CORE))
    # Batches are assigned in order, so the first empty one ends the loop.
    if [ "$START_IDX" -ge "$TOTAL_FILES" ]; then
      break
    fi
    # The slice is clamped to the end of the array automatically.
    BATCH_FILES=("${IMAGE_FILES[@]:START_IDX:FILES_PER_CORE}")
    process_batch "$i" "${BATCH_FILES[@]}" &
    PIDS+=("$!")
  done
else
  echo "$(date '+%H:%M:%S') Nothing left to OCR; reusing existing partial results" >> "$LOG_FILE"
fi

# Wait for all background processes to complete
echo "$(date '+%H:%M:%S') Waiting for all processes to complete..." >> "$LOG_FILE"
for pid in "${PIDS[@]}"; do
  wait "$pid"
  echo "$(date '+%H:%M:%S') Process $pid completed" >> "$LOG_FILE"
done

# Combine results into main output file, in worker order
echo "Combining results..." >> "$LOG_FILE"
TEMP_OUTPUT="temp_output.txt"
: > "$TEMP_OUTPUT"
for ((i = 0; i < NUM_CORES; i++)); do
  temp_file="$TEMP_DIR/ocr_part_${i}.txt"
  [ -f "$temp_file" ] || continue
  if [ -s "$temp_file" ]; then
    cat "$temp_file" >> "$TEMP_OUTPUT"
  else
    echo "Warning: temp file $temp_file is empty" >> "$LOG_FILE"
  fi
done

# Check if temporary output has content
if [ ! -s "$TEMP_OUTPUT" ]; then
  rm -f -- "$TEMP_OUTPUT" "$LOCK_FILE" 2> /dev/null
  if [ "$REMAINING_FILES" -eq 0 ]; then
    # Nothing was run and no partial results exist: this is not an OCR
    # failure, so the existing output file is left as it is.
    echo "All $TOTAL_FILES PNG files were already processed. $OUTPUT_FILE left unchanged."
    exit 0
  fi
  echo "Error: No OCR output generated. Check $LOG_FILE for details." >&2
  exit 1
fi

# Move temporary output to final output
mv "$TEMP_OUTPUT" "$OUTPUT_FILE"

# Clean up
rm -f "$LOCK_FILE" 2> /dev/null

FINAL_PROCESSED=$(count_processed)
FINAL_PROCESSED=${FINAL_PROCESSED:-0}
echo "OCR completed. Processed $FINAL_PROCESSED/$TOTAL_FILES files. Results saved to $OUTPUT_FILE"
echo "Check $LOG_FILE for processing details"
echo "$(date '+%H:%M:%S') OCR completed: $FINAL_PROCESSED/$TOTAL_FILES files processed" >> "$LOG_FILE"
BASH to Home