Rendere PDF che contiene del testo inserito in una immagine ricercabile con Tesseract e script bash

A volte ci sono dei pdf che non sono ricercabili. Per permettere la ricerca di una parola o di una frase (anche con l’uso del mio script pubblicato in una altro link) ho trovato questo script chiamato pdfocr.sh

#! /bin/bash

INPUT=
declare -a INPUT_FILES
ARGUMENTS=( “$@” )
OUTPUT=
OUT_FINAL=
TESS_CONFIG=pdf
TESS_LANG=ita
RES=300
IMG_FMT=
MODE=full
TESS_PARAMS=
KEEP_TMP=false
MODE=full
PARALLEL=false
JOBS=1
GSDEV=jpeg
INPUT_BASENAME=
TMPDIR_PATH=~/tmp
VERBOSE=false

RESET=”\e[0m”
PENDING=”\e[1;91m”
ERROR=”\e[1;31m”
DONE=”\e[1;36m”
MSG=”\e[1;36m”

#regexes
shopt -s extglob

#kill child processes on error
trap ‘end_jobs’ EXIT

function pdfocr {

if [ ${#ARGUMENTS[@]} -eq 0 ]; then
print_usage
exit
fi

parse_args

check_req

for input_file in “${INPUT_FILES[@]}”; do
echo -e “${MSG}Input file:$RESET” `basename “$input_file”`

if [[ $MODE = clean ]]; then
echo Cleaning temp files…
fi

check_file “$input_file”

init “$input_file”

pdfocr_proc “$input_file”

echo

done

echo -e “${DONE}Finished$RESET”

}

function pdfocr_proc {

local input_file=$1

split “$input_file”

local IN_P=”$TMPDIR_PATH/$INPUT_BASENAME”

if [[ -n $PREPROCESSOR ]]; then
preprocess “$IN_P” “$IMG_FMT”
fi

ocr “$IN_P” “$IMG_FMT”

merge “$IN_P”

clean_tmp “$IN_P” “$IMG_FMT”
}

function split {

if ! [[ $MODE = @(full|split) ]];then
return
fi

echo -e ${PENDING}Running$RESET ghostscript

local IN_F=$1

local GS_OUT=”$TMPDIR_PATH/$INPUT_BASENAME%04d_gs.$IMG_FMT”
gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=$GSDEV -r$RES -dTextAlphaBits=4 \
-o “$GS_OUT” -f “$IN_F” > /dev/null

try “Error while converting $IN_F”
}

function preprocess {

if [ $PARALLEL = true ]; then
echo -e ${PENDING}Running$RESET preprocessor in parallel
export -f run_preproc
export -f try
local PRE4PARA=`parallel –shellquote ::: “$PREPROCESSOR”`
parallel -n 1 -d ::: -j $JOBS -m run_preproc “$PRE4PARA” {} ::: “$1″*_gs.”$2”
try “Error during parallel preprocessing!”
else
echo -e ${PENDING}Running$RESET preprocessor
for f in “$1″*_gs.”$2”;do
run_preproc “$PREPROCESSOR” “$f”
done
fi

}

function ocr {

if ! [[ $MODE = @(full|ocr) ]]; then
return
fi

local IN_P=”$TMPDIR_PATH/$INPUT_BASENAME”
if [ $PARALLEL = true ]; then
echo -e ${PENDING}Running$RESET tesseract in parallel
if [ “$VERBOSE” = false ];then
run_wait
fi

export -f run_tess
export -f try
export -f print_errors

if [ -z “$TESS_PARAMS” ]; then
TESS_PARAMS=” ”
fi

local ESC=`echo $TESS_PARAMS| sed ‘s/-/__-/’`
ESC=`parallel –shellquote ::: “$ESC”`
parallel -n1 -j $JOBS -m run_tess {} “$TESS_LANG” “$ESC” “$TESS_CONFIG” “$VERBOSE” ::: “$1″*_gs.”$2”
try “Error while running parallel tesseract jobs!”

if [ “$VERBOSE” = false ]; then
kill -INT $!
fi

else
echo -e ${PENDING}Running$RESET tesseract
if [ “$VERBOSE” = false ];then
run_wait
fi

for f in “$1″*_gs.”$2”; do
run_tess “$f” “$TESS_LANG” “$TESS_PARAMS” “$TESS_CONFIG” “$VERBOSE”
done

if [ “$VERBOSE” = false ]; then
kill -INT $!
fi
fi
}

function merge {

if ! [[ $MODE = @(full|merge) ]]; then
return
fi

# local IN_P=”$TMPDIR_PATH/$INPUT_BASENAME”
local file_count=`ls -1 “$1″*_gs_tess.pdf | wc -l`
echo -e ${PENDING}Merging$RESET into $OUT_FINAL
if [[ $file_count -gt 1 ]]; then
pdfunite “$1″*_gs_tess.pdf “$OUT_FINAL”
else
cp “$1″*_gs_tess.pdf “$OUT_FINAL”
fi
try “Error while merging $1″*”$2 into $OUT_FINAL”
}

function clean_tmp {
if [ $KEEP_TMP = true ]; then
return
fi

if [[ $MODE != split ]]; then
rm “$1″*_gs.”$2”
fi
if [[ $MODE != ocr ]]; then
rm “$1″*_gs_tess.pdf
fi
}
function run_wait {

export -f wait_anim
wait_anim &

}

function run_tess {
local IN=$1
local TESS_LANG=$2
local TESS_PARAMS=$3
TESS_PARAMS=`echo $TESS_PARAMS | sed ‘s/__-/-/’`
local TESS_CONFIG=$4
local VERBOSE=$5
local tess_o=
if [ “$VERBOSE” = true ]; then
echo tesseract “$IN” “${IN%.*}_tess” -l $TESS_LANG -psm 3 $TESS_PARAMS $TESS_CONFIG
fi

tess_o=$(tesseract “$IN” “${IN%.*}_tess” -l $TESS_LANG -psm 3 $TESS_PARAMS $TESS_CONFIG 2>&1)
try “Error while performing ocr!” “$tess_o”

if [ “$VERBOSE” = true ]; then
echo $tess_o
echo Tesseract input $1
echo Tesseract output ${1%.*}_tess.pdf
fi
}

function run_preproc {
local PAGE_NUM=`echo $2|gawk ‘match($0,/.+([0-9]{4})_gs.*/,arr) { print arr[1]}’`
$1 “$2” $PAGE_NUM
try “Error while preprocessing!” “Preprocessor: $1” “Input file: $2”
}

function init {

local INPUT=$1

INPUT_BASENAME=$(basename “$INPUT”)
INPUT_BASENAME=${INPUT_BASENAME%.*}

if [ -z “$OUTPUT” ]; then

local INPUT_DIR=$(dirname “$INPUT”)
OUT_FINAL=”$INPUT_DIR/$INPUT_BASENAME”_ocr.pdf

else

if [[ “$OUTPUT” =~ INPUT_BASENAME ]]; then
OUT_FINAL=”${OUTPUT/INPUT_BASENAME/$INPUT_BASENAME}”
elif [[ “${OUTPUT: -1}” == “/” ]]; then
OUT_FINAL=”$OUTPUT$INPUT_BASENAME”_ocr.pdf
OUTPUT_DIR=$OUTPUT
else
OUT_FINAL=$OUTPUT
fi

if [ -z $OUTPUT_DIR ]; then
local OUTPUT_DIR=$(dirname $OUTPUT)
fi

if [ ! -d “$OUTPUT_DIR” ]; then
mkdir -p “$OUTPUT_DIR”
try “Failed creating the output directory: $OUTPUT_DIR”
fi

if [ ! -d “TMPDIR_PATH” ]; then
mkdir -p “$TMPDIR_PATH”
try “Failed creating the temporary directory: $TMPDIR_PATH”
fi

case $GSDEV in
jpeg)
IMG_FMT=jpg
;;
ppm)
;;
png*)
IMG_FMT=png
;;
tiff*)
IMG_FMT=tiff
;;
*)
throw “$IMG_FMT not supported”
;;
esac
}

function parse_args {

local let in_c=0
local let v_c=0

local LANGS_ARR=()
local IN_ARG=

while [[ $v_c -lt ${#ARGUMENTS[@]} ]]; do

key=${ARGUMENTS[$v_c]}

if (( v_c < ${#ARGUMENTS[@]} ));then
local val=”${ARGUMENTS[$((v_c+1))]}”
if [[ $val == -* ]]; then
val=
fi

fi
case $key in
-*)
IN_ARG=
let in_c=0
;;&
-i|–input)
IN_ARG=INPUT
;;
-o|–output)
OUTPUT=”$val”
;;
-t|–tempdir)
TMPDIR_PATH=”$val”
;;
–tess-config)
TESS_CONFIG=”$val”
;;
-l|–language)
IN_ARG=TESS_LANG
;;
-r|–resolution)
RES=”$val”
;;
-f|–img-format)
GSDEV=”$val”
;;
-c|–tess-param)
TESS_PARAMS=”$TESS_PARAMS -c $val”
;;
–keep-tmp)
KEEP_TMP=true
;;
-m|–mode)
MODE=”$val”
;;
-p|–parallel)
if hash parallel 2>/dev/null; then
PARALLEL=true
if [[ -n $val ]]; then
JOBS=$val
else
JOBS=`parallel –number-of-cores`
fi
fi
;;
-s|–preprocessor)
PREPROCESSOR=$val
;;
-v|–verbose)
VERBOSE=true
;;
-h|–help)
print_help
exit
;;
*)
case $IN_ARG in
INPUT)
INPUT_FILES[$in_c]=$key
let in_c++
;;
TESS_LANG)
LANGS_ARR[$in_c]=$key
let in_c++
esac
;;
esac
let v_c++
done

if [[ ${#LANGS_ARR} -gt 0 ]]; then
TESS_LANG=”${LANGS_ARR[@]}”
TESS_LANG=`echo “$TESS_LANG”|sed ‘s/ /+/’`
fi

if ! [[ $MODE = @(split|full|ocr|merge|clean) ]];then
echo “Invalid mode: $MODE”
print_usage
exit
fi

}

function check_req {

if ! hash pdfunite 2> /dev/null; then
throw “pdfunite missing!”
fi

if ! hash tesseract 2> /dev/null; then
throw “tesseract missing!”
fi

if ! hash gs 2> /dev/null; then
throw “ghostscript missing!”
fi

if [ ${#INPUT_FILES[@]} -eq 0 ]; then
throw “pdf input path is missing!”
fi
}

function check_file {
if ! [ -f “$1” ]; then
throw “No such file: $1”
fi
}

function try {
if [[ $? -ne 0 ]] ; then
throw “$@”
fi
return 0
}

function throw {
print_errors “$@”
exit 1
}

function print_errors {
for msg; do
echo -e “$ERROR$msg$RESET”
done
}

function print_help {

cat << END
pdfocr script

Description:

A convenience script that implements a pipeline for creating a searchable
pdf file. The pipeline involves three parts:
-pdf splitting and conversion to images
-character recognition
-merging into the final pdf
Each step can be performed separately of others. You can split pdf, preprocess
images (e.g. with ImageMagick) and then perform ocr and merging.

Requirements:

The following software applications are required:
gs (Ghostscript)
tesseract
pdfunite

Optional:
parallel (speeds up ocr on multiple cores)

Usage:

pdfocr -i|–input input.pdf [options…]

Options:

-l, –lang LANGS set the language(s) for tesseract; check available
languages with:
tesseract –list-langs

-o, –output OUTPUT_PATH set the output path; it can be explicit or use the
INPUT_BASENAME variable to construct it dynamically
e.g. , -o INPUT_BASENAME_ocr.pdf
Default: INPUT_BASENAME_ocr.pdf

-t, –tempdir TMPDIR_PATH set the path to directory with intermediate
files; Default: ~/tmp

-s, –preprocessor PROC_PATH set the path to an image preprocessor;
the preprocessor shall be executed for the image of
each page like this:
preprocessor img_path page_num
the preprocessor should overwrite the original image

-m, –mode MODE set the mode to perform only the part of processing;
MODE can be one of:
split
ocr
merge
full
Default: full
note: it is assumed that required files are in the
TMPDIR_PATH; modes ‘split’ and ‘ocr’ don’t delete
their output intermediate files

–tess-config CONFIG set the tesseract configuration; default: pdf

-p, –parallel [JOBS] use GNU parallel if available; limit the number
of jobs to JOBS

-v, –verbose allow verbose output

–keep-tmp keep the intermediate files; deleted by default

-f, –img-format the format of the intermediate images;
possible values:
jpeg png* ppm tiff*
any format supported by Ghostscript that matches
the above values is valid;
note: the image format makes a difference for
tesseract, so experiment with different values
Default: jpeg

-r, –resolution RES set the resolution of intermediate images to RES;
default: 300

-c, –tess-param key=val set a tesseract parameter
e.g., -c textord_min_linesize=2.5

-h, –help print this
END
}

function print_usage {
cat << END
pdfocr script

Usage:

pdfocr -i|–input input.pdf [options…]

Options:

-h, –help print help
-l, –lang LANGS set the language(s) for tesseract
-o, –output OUTPUT_PATH set the output path
-t, –tempdir TMPDIR_PATH set the path to tempdir (def: ~/tmp)
-m, –mode MODE set the mode (split,ocr,merge,full)
–tess-config CONFIG set the tesseract configuration; default: pdf
-f, –img-format FMT the format of the intermediate images;
jpeg png* ppm tiff*
-r, –resolution NUM set the resolution of the intermediate images
-c, –tess-param key=val set a tesseract parameter
–keep-tmp keep the intermediate files; deleted by default
-p, –parallel [JOBS] use GNU parallel if available
-v, –verbose allow verbose output
-s, –preprocessor PROC_PATH set the path to images preprocessor

END

}

function wait_anim {

trap “__STOP_PRINT=true” SIGINT
local move=r
local let length=5
local let pos=0
while [[ $__STOP_PRINT != true ]]; do

echo -en ” |”

for ((i=0;i<pos;i++));do
echo -n ” ”
done

echo -en “$PENDING*$RESET”

for ((i=0;i<length-pos;i++));do
echo -n ” ”
done

echo -en “|”
echo -ne \\r
if [[ $move == r ]]; then
let pos++
if ((pos>=length)); then
move=l
fi
else
let pos–
if ((pos<=0)); then
move=r
fi
fi

sleep 0.07
done
}

function end_jobs {

local job_n=`jobs -p | wc -l`
if ((job_n>0)); then
jobs -p | xargs kill
fi

}

pdfocr
trap ” EXIT SIGINT

# vim: ts=2 sw=2

Salvatore Crapanzano

Non solo Informatica e fisco, ma anche altro..

Rendere PDF che contiene del testo inserito in una immagine ricercabile con Tesseract e script bash