This repository has been archived by the owner on Sep 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from YaleDHLab/bodleian-images
Bodleian images
- Loading branch information
Showing
3 changed files
with
249 additions
and
182 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,4 @@ | ||
smithsonian-images/ | ||
biodivlibrary-images/ | ||
*-images/ | ||
.ipynb_checkpoints | ||
.DS_Store | ||
*.swp | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,248 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Download Voynichese Images" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from os.path import join\n", | ||
"import requests\n", | ||
"import os\n", | ||
"\n", | ||
"def download_from_url(url, out_path):\n", | ||
" '''Download a file at location `url` and write to `out_path`'''\n", | ||
" if not os.path.exists(out_path):\n", | ||
" r = requests.get(url, allow_redirects=True)\n", | ||
" open(out_path, 'wb').write(r.content)\n", | ||
"\n", | ||
"def download_voynichese_coords(page_id):\n", | ||
" '''Download the page coords for `page_id` from voynichese.com'''\n", | ||
" url = 'http://www.voynichese.com/2/data/folio/script/' + page_id + '.js'\n", | ||
" download_from_url(url, join('voynichese', 'coords', page_id + '.js'))\n", | ||
"\n", | ||
"def download_voynichese_page(page_id):\n", | ||
" '''Download a page image with page id `page_id` from voynichese.com'''\n", | ||
" url = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + page_id + '.jpg'\n", | ||
" download_from_url(url, join('voynichese', 'images', page_id + '.jpg'))\n", | ||
"\n", | ||
"def download_voynichese_data(page_ids):\n", | ||
" '''Download page images and word coords from voynichese.com'''\n", | ||
" for i in ['coords', 'voynichese-images']:\n", | ||
" if not os.path.exists(join('voynichese', i)):\n", | ||
" os.makedirs(join('voynichese', i))\n", | ||
" for page_id in page_ids:\n", | ||
" download_voynichese_coords(page_id)\n", | ||
" download_voynichese_page(page_id)\n", | ||
"\n", | ||
"download_voynichese_data(pages.keys())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Download Biodiversity Heritage Library Images" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from os.path import join\n", | ||
"import flickr_api\n", | ||
"import os\n", | ||
"\n", | ||
"flickr_api.set_keys(api_key='a704ce9732b363a9caece2d65f7d041a', api_secret ='f3f5e1d5baaf4d38')\n", | ||
"if os.path.exists('flickr.credentials'):\n", | ||
" flickr_api.set_auth_handler('flickr.credentials')\n", | ||
"else:\n", | ||
" a = flickr_api.auth.AuthHandler() # creates a new AuthHandler object\n", | ||
" perms = 'read' # set the required permissions\n", | ||
" url = a.get_authorization_url(perms)\n", | ||
" print(url) # open the printed url in a browser and agree; paste verifier code in xml response below" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"if not os.path.exists('flickr.credentials'):\n", | ||
" a.set_verifier('5b58510bb6f0641b')\n", | ||
" flickr_api.set_auth_handler(a)\n", | ||
" a.save('flickr.credentials')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"user = flickr_api.Person.findByUserName('biodivlibrary')\n", | ||
"errored = []\n", | ||
"photos = user.getPhotos(page=2, perpage=100)\n", | ||
"photos.info" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import time\n", | ||
"\n", | ||
"if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n", | ||
"\n", | ||
"page_num = 6\n", | ||
"while page_num < photos.info.pages:\n", | ||
" print(' * page num', page_num)\n", | ||
" page_photos = user.getPhotos(page=page_num, perpage=100) \n", | ||
" for i in page_photos:\n", | ||
" try:\n", | ||
" out_path = join('biodivlibrary-images', i.id)\n", | ||
" if os.path.exists(out_path): continue\n", | ||
" i.save(out_path)\n", | ||
" time.sleep(2)\n", | ||
" except:\n", | ||
" print(' * could not save', i)\n", | ||
" errored.append(i)\n", | ||
" page_num += 1" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Download Smithsonian Images" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from bs4 import BeautifulSoup\n", | ||
"import requests\n", | ||
"import os\n", | ||
"\n", | ||
"def get_page_images(page_number=0):\n", | ||
" '''Save each herbal image in page number `page_number` to disk'''\n", | ||
" print(' * fetching page', page_number)\n", | ||
" r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))\n", | ||
" text = r.content.decode('utf8')\n", | ||
" soup = BeautifulSoup(text)\n", | ||
" imgs = soup.select('.dams-image')\n", | ||
" for i in imgs:\n", | ||
" try:\n", | ||
" src = i.find('img')['src']\n", | ||
" image_id = os.path.basename(src).split('id=')[1]\n", | ||
" download_image(image_id)\n", | ||
" except Exception as exc:\n", | ||
" print(' ! err', i, exc)\n", | ||
" if '/topic/botany/images?page=' + str(page_number+1) in text:\n", | ||
" get_page_images(page_number+1)\n", | ||
" \n", | ||
"def download_image(_id):\n", | ||
" '''Download an image by SI image id'''\n", | ||
" r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True)\n", | ||
" open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n", | ||
"\n", | ||
"# make output directory\n", | ||
"out_dir = 'smithsonian-images'\n", | ||
"if not os.path.exists(out_dir): os.makedirs(out_dir)\n", | ||
"\n", | ||
"# get images\n", | ||
"get_page_images()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Download Bodleian Images" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"scrolled": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from selenium import webdriver\n", | ||
"from selenium.webdriver.common.keys import Keys\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"import time\n", | ||
"import requests\n", | ||
"import os\n", | ||
"\n", | ||
"def download_query_results():\n", | ||
" driver = webdriver.Chrome()\n", | ||
" driver.get(url)\n", | ||
" time.sleep(2)\n", | ||
" text = driver.page_source\n", | ||
" soup = BeautifulSoup(text)\n", | ||
" collections = soup.select('.result')\n", | ||
" print(' * got', len(collections), 'collections')\n", | ||
" for i in collections:\n", | ||
" children = i['data-children'].strip().split(',')\n", | ||
" for j in children:\n", | ||
" try:\n", | ||
" download_image(j)\n", | ||
" except Exception as exc:\n", | ||
" print(' ! err', j, exc)\n", | ||
" \n", | ||
"def download_image(_id):\n", | ||
" print(' * downloading', _id)\n", | ||
" r = requests.get('https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(_id))\n", | ||
" open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n", | ||
" \n", | ||
"# make output directory\n", | ||
"out_dir = 'bodleian-images'\n", | ||
"if not os.path.exists(out_dir): os.makedirs(out_dir)\n", | ||
"\n", | ||
"# get images\n", | ||
"root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'\n", | ||
"url = root_url + 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'\n", | ||
"\n", | ||
"download_query_results()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.