-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkilldoubles.sh
executable file
·55 lines (52 loc) · 2.5 KB
/
killdoubles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash
#
# killdoubles.sh - remove duplicate pages from a PDF.
#
# Usage: ./killdoubles.sh NAMEOFPDF.pdf
#
# How it works:
#   1. Ghostscript splits the PDF into one PDF per page and, in parallel,
#      renders one bitmap per page.
#   2. dupehunter.rb (must sit in the directory the script is started from)
#      is asked which bitmaps are unique.
#   3. The single-page PDFs that belong to those bitmaps are joined again with
#      pdfunite and the result is re-written by Ghostscript.
#
# Output:   <NAMEOFPDF.pdf>_withoutdupes.pdf next to the input file.
# Requires: gs, ruby, pdfunite.
# Scratch:  ./.tmp in the current directory; it is wiped on start and removed
#           again on every exit path (success or failure).

# Let a failure anywhere in a pipeline surface as the pipeline's status.
set -o pipefail

# Number of PDFs handed to one pdfunite call in the first join step.
readonly BATCH_SIZE=100

# Print an error message to stderr and abort with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

FILENAME=${1:-}
echo "Usage: ./killdoubles.sh NAMEOFPDF.pdf"
echo "Note: The pdf file must be in the same directory as the script itself!"

# Validate everything up front, before any existing ./.tmp is touched.
[[ -n "$FILENAME" ]] || die "no PDF file given"
[[ -f "$FILENAME" ]] || die "PDF file not found: $FILENAME"
[[ -f "./dupehunter.rb" ]] || die "dupehunter.rb not found in $PWD"
for tool in gs ruby pdfunite; do
  command -v "$tool" >/dev/null || die "required tool not found: $tool"
done

# Work with absolute paths so that changing directories cannot break them.
start_dir=$PWD
case "$FILENAME" in
  /*) input_pdf=$FILENAME ;;
  *) input_pdf=$start_dir/$FILENAME ;;
esac
output_pdf=${input_pdf}_withoutdupes.pdf
work_dir=$start_dir/.tmp

# Remove the working directory; safe to call more than once.
cleanup() {
  cd "$start_dir" 2>/dev/null
  rm -rf -- "${work_dir:?}"
}

# Start from a clean working dir (drops leftovers of an aborted earlier run).
rm -rf -- "${work_dir:?}"
trap cleanup EXIT
mkdir -p -- "$work_dir/temp" || die "cannot create $work_dir"
cd "$work_dir" || die "cannot enter $work_dir"

echo "Converting the PDF file to single pages… (this will take a while!)"
# Split the PDF into new PDFs, page by page.
gs -o doube%04d.pdf -dCompatibilityLevel=1.4 -dPDFSETTINGS=/prepress -sDEVICE=pdfwrite "$input_pdf" >/dev/null &
split_pid=$!
# Render every page to a bmp file at Ghostscript's default resolution
# (no -r flag is given).
gs -o doube%04d.bmp -sDEVICE=bmp256 "$input_pdf" >/dev/null &
render_pid=$!
# Wait for both background jobs first, then judge their exit codes, so that
# no Ghostscript process is left running when the script aborts.
split_status=0
render_status=0
wait "$split_pid" || split_status=$?
wait "$render_pid" || render_status=$?
((split_status == 0)) || die "splitting the PDF failed (gs exit $split_status)"
((render_status == 0)) || die "rendering the pages failed (gs exit $render_status)"
echo "Conversion of files done."

echo "Looking for duplicate pages…"
# dupehunter.rb is not shown here; it is expected to print one path per unique
# bmp file in the given directory, e.g. "./doube0001.bmp". The pipeline then:
#   cut -d "/" -f 2  keeps the part after the first '/'
#   cut -d "." -f 1  keeps the part before the first '.' (drops ".bmp")
#   awk              appends ".pdf" to every name
#   sort             orders the names so the pages are assembled correctly
ruby ../dupehunter.rb ./ bmp \
  | cut -d "/" -f 2 \
  | cut -d "." -f 1 \
  | awk '{print $1".pdf"}' \
  | sort > listofpdfs.txt \
  || die "duplicate detection failed"
echo "Search done."

# Read the page list once instead of re-reading and rewriting it per batch.
unique_pages=()
while IFS= read -r page || [[ -n "$page" ]]; do
  if [[ -n "$page" ]]; then
    unique_pages+=("$page")
  fi
done < listofpdfs.txt
total_pages=${#unique_pages[@]}
((total_pages > 0)) || die "no unique pages were reported by dupehunter.rb"

echo "Joining unique pages… (Step 1/2)"
# Join the pages in batches of BATCH_SIZE into ./temp/1.pdf, ./temp/2.pdf, …
# The original author introduced the batching to work around a
# "too many open files" failure when all pages are joined at once.
chunk_files=()
chunk_no=1
offset=0
while ((offset < total_pages)); do
  chunk_file=./temp/${chunk_no}.pdf
  pdfunite "${unique_pages[@]:offset:BATCH_SIZE}" "$chunk_file" \
    || die "pdfunite failed on batch $chunk_no"
  chunk_files+=("$chunk_file")
  offset=$((offset + BATCH_SIZE))
  chunk_no=$((chunk_no + 1))
done

echo "Joining unique pages… (Step 2/2)"
# chunk_files already holds the batches in creation order, so there is no need
# to list and numerically sort the directory.
pdfunite "${chunk_files[@]}" unoptimized.pdf >/dev/null \
  || die "pdfunite failed while joining the batches"
echo "Joining of PDF done."

echo "Optimizing PDF…"
gs -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/prepress -sOutputFile=foo.pdf unoptimized.pdf >/dev/null \
  || die "optimizing the joined PDF failed"
mv -- foo.pdf "$output_pdf" || die "cannot write $output_pdf"
echo "PDF optimized. Cleaning up…"
cleanup
echo "Job done. C'ya!"