#!/bin/bash
# Call this script like
# Springerbook http://www.springerlink.com/content/978-3-540-74119-0 or
# Springerbook http://link.springer.com/book/10.1007/0-387-32995-1
# where the final digits are the book's ISBN
# Parse the arguments
if [ "$#" -ne 1 ]; then
echo "Usage: $(basename $0) URL-to-Springer-content"
echo "Example: $(basename $0) http://www.springerlink.com/content/978-3-540-74119-0"
echo "Example: $(basename $0) http://link.springer.com/book/10.1007/0-387-32995-1"
exit 1
fi
# First (and only) command line argument is the URL of the online resource
URL=$1
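# The regex below pulls a hyphenated ISBN (e.g. 978-3-540-74119-0) straight out of the URL;
# the pattern also accepts X as the final check digit.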
ISBN=$(echo "$URL" | grep -o '[0-9]\{3\}-[0-9]\{1\}-[0-9]\{3,5\}-[0-9]\{3,5\}-[0-9X]\{1\}')
if [ -z "$ISBN" ]; then
# Alternative: just try to download $URL and grep the ISBN there.
download=$(mktemp) # Generate temp filename
wget "$URL" -O "$download"
ISBN=$(grep -o '[0-9]\{3\}-[0-9]\{1,2\}-[0-9]\{3,5\}-[0-9]\{3,5\}-[0-9X]\{1\} (Print)' "$download" | grep -o '[0-9-]*')
rm "$download"
if [ -z "$ISBN" ]; then
echo "$(basename $0): Failed to parse ISBN."
exit 1
fi
fi
echo "Looking up $URL (ISBN $ISBN)"
# Remember the current directory
pushd .
# Make a temporary directory
echo "Making directory /tmp/$ISBN"
rm -rf /tmp/$ISBN
mkdir /tmp/$ISBN
# Get the source
cd /tmp/$ISBN
echo "$(basename $0): Getting the HTML source"
# wget http://www.springerlink.com/content/$ISBN/contents -O source
wget https://link.springer.com/book/10.1007/$ISBN -O source
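# The downloaded landing page lists the book's chapters and, when access is granted,
# a link to a single PDF of the whole book; both are parsed out of "source" below.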
# If resource was secured, print an error message and exit
if grep --silent "Get Access" source; then
echo "$(basename $0): Resource seems to be secured."
exit 1
fi
# Try to find the title of the book.
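# Take the text of the page's <h1> heading, strip the surrounding tags and
# replace spaces with underscores so the title can be used in a filename.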
TITLE=$(grep '<h1>' source | grep -o ">.*<" | grep -o "[^<>]*" | awk '//{gsub(" ","_"); print;}')
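# Pull each author out of the "authors__name" spans; only the first word of
# each (ASCII-filtered) name is kept, and the names are joined with underscores.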
AUTHORS=$(grep -P '(?<="authors__name">)[^<]*(?=<)' -o < source | grep -o '[-[:alnum:]. ]*$'| awk '//{printf $1 "_";}')
BOOKTITLE=${AUTHORS}_${TITLE}_BOOK
# Find link to full book download
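# If a whole-book download is offered, it appears as an anchor tagged with the
# "bookpdf-link" class; take the first .pdf href it points to.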
FULLBOOKPATH=$(grep "bookpdf-link" < source | head -n 1 | grep 'href="[^\"]*"' -o | grep "\/[^\"]*.pdf" -o)
if [[ "X$FULLBOOKPATH" != "X" ]]; then
echo "$(basename $0): Getting the full book"
# Download full book
wget http://link.springer.com$FULLBOOKPATH -o logfile
# Find name of PDF
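# wget logs a line of the form  Saving to: '<filename>'  -- the quote characters
# vary with wget version and locale, so the regex accepts several variants.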
PDF=$(perl -ne '/^Saving to: (‘|\`|“)(.+?)(’|'\''|”)$/ and print "$2\n";' < logfile)
# Rename
mv "$PDF" "$BOOKTITLE.pdf"
else
# Get all pages
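# The chapter listing may be split across several pages; keep following the
# "next" link and append each page's HTML to "source" until no such link is left.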
while grep 'class="next"' source | tail -n 1 | grep -q '<a class="next"'; do
nextpage=$(grep 'class="next"' source | tail -n 1 | grep -o '/book/[^"]*')
wget http://link.springer.com$nextpage -O - >> source
done
# Parse the source to retrieve an ordered list of download URLs
# Old method (pre 2012/10/16, does not work with http://www.springerlink.com/content/978-3-642-17817-7/contents)
# grep "Download PDF" source | grep -o -e '"[^"]*\.pdf"' | grep -o '[^"]*' > list
# # Filter out lines which do not end in ".pdf"
# grep -e '\.pdf$' list > list2
# New method (post 2012/11/06)
# awk 'BEGIN{a=0}/<span class="content-type-list__action-label"/{a=1}//{if(a==1)print;}/<\/span/{a=0;}' source | grep -o -e 'href="/content/[^"]*"' | grep -o -e '/content/[^"]*' > list
# New method (post 2018/04/05)
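# Chapter PDFs are linked from anchors carrying the "content-type-list__action-label"
# class; collect their .pdf hrefs, in page order, into "list".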
grep '<a class="content-type-list__action-label' < source | grep 'href="[^\"]*"' -o | grep "\/[^\"]*.pdf" -o > list
# Retrieve the individual files from Springer
echo "$(basename $0): Getting the individual chapter files"
wget -i list --base="http://link.springer.com" -o logfile
# If resource was secured, print an error message and exit
if grep --silent resource-secured logfile; then
echo "$(basename $0): Resource seems to be secured."
exit 1
fi
# Create a list of files to concatenate
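# Reuse the "Saving to:" parse from above to list the downloaded chapter files
# in the order wget saved them.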
perl -ne '/^Saving to: (‘|\`|“)(.+?)(’|'\''|”)$/ and print "$2\n";' < logfile > download_list
# qpdf is confused by the name of the downloaded files.
# Therefore, we rename them.
j=0;
rm -f concat_list;
while IFS= read -r i; do
mv "$i" $((++j)).pdf;
echo $j.pdf >> concat_list
done < download_list
# Concatenate the pdf files to a book
echo "$(basename $0): Concatenating files"
qpdf --empty "$BOOKTITLE.pdf" --pages $(cat concat_list) --
fi
# Return to the current directory
popd
# Inform the user
echo "The book has been compiled and now resides in /tmp/$ISBN/$BOOKTITLE.pdf"