Skip to content
This repository has been archived by the owner on Sep 4, 2024. It is now read-only.

Commit

Permalink
Merge pull request #16 from YaleDHLab/bodleian-images
Browse files Browse the repository at this point in the history
Bodleian images
  • Loading branch information
duhaime authored Apr 18, 2019
2 parents 7d3e3f7 + 781df59 commit b97469e
Show file tree
Hide file tree
Showing 3 changed files with 249 additions and 182 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
smithsonian-images/
biodivlibrary-images/
*-images/
.ipynb_checkpoints
.DS_Store
*.swp
Expand Down
248 changes: 248 additions & 0 deletions download-images.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Voynichese Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from os.path import join\n",
"import requests\n",
"import os\n",
"\n",
"def download_from_url(url, out_path):\n",
"  '''Download the file at `url` and write it to `out_path` (no-op if the file exists)'''\n",
"  if not os.path.exists(out_path):\n",
"    r = requests.get(url, allow_redirects=True)\n",
"    # use a context manager so the file handle is closed promptly\n",
"    with open(out_path, 'wb') as out:\n",
"      out.write(r.content)\n",
"\n",
"def download_voynichese_coords(page_id):\n",
"  '''Download the page coords for `page_id` from voynichese.com'''\n",
"  url = 'http://www.voynichese.com/2/data/folio/script/' + page_id + '.js'\n",
"  download_from_url(url, join('voynichese', 'coords', page_id + '.js'))\n",
"\n",
"def download_voynichese_page(page_id):\n",
"  '''Download a page image with page id `page_id` from voynichese.com'''\n",
"  url = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + page_id + '.jpg'\n",
"  download_from_url(url, join('voynichese', 'images', page_id + '.jpg'))\n",
"\n",
"def download_voynichese_data(page_ids):\n",
"  '''Download page images and word coords from voynichese.com for each id in `page_ids`'''\n",
"  # the subdirectory names must match the paths used by the helpers above;\n",
"  # the original created 'voynichese-images' while images were written to\n",
"  # 'voynichese/images', so every image write failed with FileNotFoundError\n",
"  for subdir in ['coords', 'images']:\n",
"    if not os.path.exists(join('voynichese', subdir)):\n",
"      os.makedirs(join('voynichese', subdir))\n",
"  for page_id in page_ids:\n",
"    download_voynichese_coords(page_id)\n",
"    download_voynichese_page(page_id)\n",
"\n",
"# NOTE(review): `pages` is not defined anywhere in this notebook; it presumably\n",
"# came from a removed cell -- confirm it is defined before running\n",
"download_voynichese_data(pages.keys())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Biodiversity Heritage Library Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from os.path import join\n",
"import flickr_api\n",
"import os\n",
"\n",
"# never hardcode credentials in a committed notebook -- the original embedded a\n",
"# live api key/secret; read them from the environment instead\n",
"flickr_api.set_keys(api_key=os.environ['FLICKR_API_KEY'], api_secret=os.environ['FLICKR_API_SECRET'])\n",
"if os.path.exists('flickr.credentials'):\n",
"  # reuse cached credentials from an earlier authorization run\n",
"  flickr_api.set_auth_handler('flickr.credentials')\n",
"else:\n",
"  a = flickr_api.auth.AuthHandler() # creates a new AuthHandler object\n",
"  perms = 'read' # set the required permissions\n",
"  url = a.get_authorization_url(perms)\n",
"  print(url) # open the printed url in a browser and agree; paste verifier code in xml response below"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): `a` must still exist from the auth cell above -- hidden\n",
"# cross-cell state; this cell fails on a fresh kernel if credentials are absent\n",
"if not os.path.exists('flickr.credentials'):\n",
"  a.set_verifier('5b58510bb6f0641b')  # one-time verifier code pasted from the browser\n",
"  flickr_api.set_auth_handler(a)\n",
"  a.save('flickr.credentials')  # cache so later runs skip the browser step"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"user = flickr_api.Person.findByUserName('biodivlibrary')\n",
"errored = []  # photos that fail to save; appended to by the download loop below\n",
"photos = user.getPhotos(page=2, perpage=100)\n",
"photos.info  # bare last expression -- displays pagination metadata (used below)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n",
"\n",
"# resume point -- presumably pages below 6 were fetched in an earlier run; TODO confirm\n",
"page_num = 6\n",
"while page_num < photos.info.pages:\n",
"  print(' * page num', page_num)\n",
"  page_photos = user.getPhotos(page=page_num, perpage=100)\n",
"  for photo in page_photos:\n",
"    try:\n",
"      out_path = join('biodivlibrary-images', photo.id)\n",
"      if os.path.exists(out_path): continue  # already downloaded; skip\n",
"      photo.save(out_path)\n",
"      time.sleep(2)  # throttle requests to be polite to the API\n",
"    except Exception as exc:\n",
"      # catch Exception, not bare except -- a bare except would also swallow\n",
"      # KeyboardInterrupt, making the loop impossible to stop cleanly\n",
"      print(' * could not save', photo, exc)\n",
"      errored.append(photo)\n",
"  page_num += 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Smithsonian Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import os\n",
"\n",
"def get_page_images(page_number=0):\n",
"  '''Save each herbal image in page number `page_number` to disk, then recurse\n",
"  into the next page while a pagination link for it is present in the markup.\n",
"  NOTE(review): very deep pagination could hit Python's recursion limit'''\n",
"  print(' * fetching page', page_number)\n",
"  r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))\n",
"  text = r.content.decode('utf8')\n",
"  # name the parser explicitly; bs4 warns (and results may vary) when it guesses\n",
"  soup = BeautifulSoup(text, 'html.parser')\n",
"  imgs = soup.select('.dams-image')\n",
"  for i in imgs:\n",
"    try:\n",
"      src = i.find('img')['src']\n",
"      image_id = os.path.basename(src).split('id=')[1]\n",
"      download_image(image_id)\n",
"    except Exception as exc:\n",
"      print(' ! err', i, exc)\n",
"  if '/topic/botany/images?page=' + str(page_number+1) in text:\n",
"    get_page_images(page_number+1)\n",
"\n",
"def download_image(_id):\n",
"  '''Download an image by SI image id into the module-level `out_dir`'''\n",
"  r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True)\n",
"  # context manager closes the handle even if the write raises\n",
"  with open(os.path.join(out_dir, _id + '.jpg'), 'wb') as out:\n",
"    out.write(r.content)\n",
"\n",
"# make output directory\n",
"out_dir = 'smithsonian-images'\n",
"if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
"\n",
"# get images\n",
"get_page_images()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Bodleian Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.keys import Keys\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import requests\n",
"import os\n",
"\n",
"def download_query_results():\n",
"  '''Render the module-level `url` search page in Chrome and download every\n",
"  child image listed in each result collection's `data-children` attribute'''\n",
"  driver = webdriver.Chrome()\n",
"  try:\n",
"    driver.get(url)\n",
"    time.sleep(2)  # give the client-side search app time to render results\n",
"    text = driver.page_source\n",
"  finally:\n",
"    # always release the browser -- the original leaked a Chrome process\n",
"    # on every run (and on every exception)\n",
"    driver.quit()\n",
"  # name the parser explicitly; bs4 warns (and results may vary) when it guesses\n",
"  soup = BeautifulSoup(text, 'html.parser')\n",
"  collections = soup.select('.result')\n",
"  print(' * got', len(collections), 'collections')\n",
"  for collection in collections:\n",
"    children = collection['data-children'].strip().split(',')\n",
"    for child_id in children:\n",
"      try:\n",
"        download_image(child_id)\n",
"      except Exception as exc:\n",
"        print(' ! err', child_id, exc)\n",
"\n",
"def download_image(_id):\n",
"  '''Download a Bodleian image by id via the IIP image server into `out_dir`'''\n",
"  print(' * downloading', _id)\n",
"  r = requests.get('https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(_id))\n",
"  # context manager closes the handle even if the write raises\n",
"  with open(os.path.join(out_dir, _id + '.jpg'), 'wb') as out:\n",
"    out.write(r.content)\n",
"\n",
"# make output directory\n",
"out_dir = 'bodleian-images'\n",
"if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
"\n",
"# get images\n",
"root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'\n",
"url = root_url + 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'\n",
"\n",
"download_query_results()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit b97469e

Please sign in to comment.