Skip to content
This repository has been archived by the owner on Sep 4, 2024. It is now read-only.

Commit

Permalink
add utils to download bod images - for #9
Browse files Browse the repository at this point in the history
  • Loading branch information
duhaime committed Apr 18, 2019
1 parent b5602c4 commit 781df59
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 3 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
smithsonian-images/
biodivlibrary-images/
*-images/
.ipynb_checkpoints
.DS_Store
*.swp
Expand Down
56 changes: 55 additions & 1 deletion download-images.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,63 @@
"out_dir = 'smithsonian-images'\n",
"if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
"\n",
"# fetch images\n",
"# get images\n",
"get_page_images()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Bodleian Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.keys import Keys\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import requests\n",
"import os\n",
"\n",
"def download_query_results():\n",
"    '''Load the search page in Chrome and download every child image.\n",
"\n",
"    Reads the module-level `url`, selects each `.result` element in the\n",
"    rendered page, splits its comma-separated `data-children` attribute\n",
"    into ids and passes each id to `download_image`. A failed download is\n",
"    printed and skipped so one bad id does not stop the run.\n",
"    '''\n",
"    driver = webdriver.Chrome()\n",
"    try:\n",
"        driver.get(url)\n",
"        # presumably waits for the results to be rendered client-side\n",
"        time.sleep(2)\n",
"        text = driver.page_source\n",
"    finally:\n",
"        # always shut the browser down, even if loading the page failed\n",
"        driver.quit()\n",
"    # name the parser so the result does not depend on what is installed\n",
"    soup = BeautifulSoup(text, 'html.parser')\n",
"    collections = soup.select('.result')\n",
"    print(' * got', len(collections), 'collections')\n",
"    for i in collections:\n",
"        # a result without the attribute yields no ids instead of a KeyError\n",
"        children = i.get('data-children', '').strip().split(',')\n",
"        for j in children:\n",
"            if not j.strip():\n",
"                # nothing to download for an empty id\n",
"                continue\n",
"            try:\n",
"                download_image(j)\n",
"            except Exception as exc:\n",
"                # best effort: report the failure and move on to the next id\n",
"                print(' ! err', j, exc)\n",
" \n",
"def download_image(_id):\n",
"    '''Request image `_id` from the Bodleian resolver and save it as a .jpg.\n",
"\n",
"    The file is written to `<out_dir>/<_id>.jpg`, where `out_dir` is the\n",
"    module-level output directory.\n",
"\n",
"    Raises requests.HTTPError on an error status so that an error page is\n",
"    never written to disk as if it were an image.\n",
"    '''\n",
"    print(' * downloading', _id)\n",
"    # a timeout keeps one stalled request from hanging the whole run\n",
"    r = requests.get('https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(_id), timeout=30)\n",
"    r.raise_for_status()\n",
"    # the context manager closes the file handle once the bytes are written\n",
"    with open(os.path.join(out_dir, _id + '.jpg'), 'wb') as out:\n",
"        out.write(r.content)\n",
" \n",
"# make output directory (exist_ok avoids a check-then-create race)\n",
"out_dir = 'bodleian-images'\n",
"os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"# build the search url read by download_query_results, then get images\n",
"root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'\n",
"url = root_url + 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'\n",
"\n",
"download_query_results()"
]
}
],
"metadata": {
Expand Down

0 comments on commit 781df59

Please sign in to comment.