Un amico mi ha chiesto un'applicazione per cercare testo all'interno dei file PDF. Ecco 2 soluzioni:
a) recoll http://www.lesbonscomptes.com/recoll/download.html#general
b) pdfgrep https://gitlab.com/pdfgrep/pdfgrep
Entrambe sono disponibili sia per Linux sia per Windows.
Lo script seguente estrae alcune pagine da un file PDF più grande; il file si chiama pdfselect.sh:
#!/bin/bash
# extract pages from a big pdf file using gs
# Usage:
# pdfselect.sh pdffile startPageNumber endPageNumber
# Print the page count of a PDF file on stdout.
# Arguments: $1 - path to the PDF file
# Outputs:   the value of the "Pages:" line reported by pdfinfo, or 0 when
#            the file does not exist or no page count is reported
getPDFPages() {
  local pdf=$1
  local pages=
  if [ -e "$pdf" ]; then
    # Let awk both select the "Pages:" line and pick its value; anchoring
    # the match keeps other lines that merely mention pages out of it.
    pages=$(pdfinfo "$pdf" | awk '/^Pages:/ {print $2; exit}')
  fi
  # Fall back to 0 so callers can always compare the result numerically.
  printf '%s\n' "${pages:-0}"
}
# At least the PDF file and the start page are required; otherwise print
# the usage text (lines 3-4 of this script) and stop.
if [ $# -lt 2 ]; then
  # "$0" rather than `which pdfselect.sh`: works even when the script is
  # not installed on PATH (sed would otherwise wait on stdin).
  sed -n '3,4p' "$0"
  exit 1
fi

oriPdf=$1
if [ ! -e "${oriPdf}" ]; then
  echo "${oriPdf} does not exist! " >&2
  exit 1
fi

# An empty result is treated like 0 so the numeric test cannot fail.
numPage=$(getPDFPages "${oriPdf}")
if [ "${numPage:-0}" -eq 0 ]; then
  echo "ZERO page is found in ${oriPdf}" >&2
  exit 1
fi

# Clamp the requested start page to the last page of the document.
sPage=$2
if [ "${sPage}" -gt "${numPage}" ]; then
  sPage=${numPage}
fi
# ... (the listing of pdfselect.sh breaks off here; the rest of the script is missing)
---------- SCRIPT PER ESTRARRE DA UN FILE PDF LE PAGINE CHE CONTENGONO IL TESTO CERCATO (extract_pdf_results.sh) ----------
#!/bin/bash
# Copy the pages of a PDF that contain a search string into a new PDF
# written next to the original file.
# Usage:
#   ./extract_pdf_results.sh Lagrange ./test.pdf
#   ./extract_pdf_results.sh "Text" "$file"
# Arguments:
#   $1 - pattern handed to pdfgrep
#   $2 - PDF file to search
if [ $# -lt 2 ]; then
  echo "Usage: ${0##*/} STRING FILE" >&2
  exit 1
fi

STRING="$1"
FILE="$2"
FILENAME="${FILE##*/}"
BASENAME="${FILENAME%.*}"
# A bare file name has no directory part: use the current directory
# instead of letting the suffix strip return the file name itself.
case "$FILE" in
  */*) DIRNAME="${FILE%/*}" ;;
  *)   DIRNAME="." ;;
esac

echo "Processing $FILE..."

## find pages that contain string, remove duplicates
echo "Looking for $STRING..."
# pdfgrep -n prefixes each match with "page:"; keep only the page number.
# Collected in an array so every page reaches pdftk as its own argument.
PAGES=()
while IFS= read -r page; do
  PAGES+=("$page")
done < <(pdfgrep -n "$STRING" "$FILE" | cut -f1 -d ":" | uniq)

if [ "${#PAGES[@]}" -eq 0 ]; then
  # Without page ranges "pdftk ... cat" would copy the whole document.
  echo "No matching pages found." >&2
  exit 1
fi

echo "Matching pages:
${PAGES[*]}"

## extract pages to new file in original directory
echo "Extracting result pages..."
pdftk "$FILE" cat "${PAGES[@]}" output "${DIRNAME}/${BASENAME}_pages_with_${STRING}.pdf" || exit 1
echo "Done."
---------- SCRIPT CHE COPIA IN UNA DIRECTORY DI DESTINAZIONE I FILE PDF CHE CONTENGONO UNA STRINGA (pdfgrepandmv.sh) ----------
#!/bin/bash
# Search PDF files for a text and copy every file that contains it to a
# destination directory.
# Usage:
#   bash pdfgrepandmv.sh "Text" "./target/*" "./destination/"
# Arguments:
#   $1 - pattern handed to pdfgrep
#   $2 - search path(s) for find; may be a quoted glob, expanded here
#   $3 - destination the matching files are copied to
if [ $# -lt 3 ]; then
  echo "Usage: ${0##*/} TEXT SEARCH_PATH DESTINATION" >&2
  exit 1
fi

# The find output is read on fd 3 so that stdin stays free for the
# overwrite prompt of "cp -i"; otherwise cp would consume result lines
# as its answer.
# $2 is left unquoted on purpose: the usage passes a quoted glob that has
# to be expanded when find is invoked.
# shellcheck disable=SC2086
while IFS= read -r line <&3; do
  # pdfgrep -Hc prints "file:count"; split on the LAST colon so that
  # colons inside the file name are preserved.
  file=${line%:*}
  count=${line##*:}
  # The file name is data, never the printf format string.
  printf '%s: ' "$file"
  if [ "$count" = "0" ]; then
    echo "STRINGA NON TROVATA!"
    continue
  fi
  echo "STRINGA TROVATA"
  cp -i -- "$file" "$3"
done 3< <(find $2 -type f -iname '*.pdf' -exec pdfgrep -Hc "$1" {} \;)