Skip to content
This repository has been archived by the owner on Sep 4, 2024. It is now read-only.

Commit

Permalink
Merge pull request #16 from YaleDHLab/bodleian-images
Browse files Browse the repository at this point in the history
Bodleian images
  • Loading branch information
duhaime authored Apr 18, 2019
2 parents 7d3e3f7 + 781df59 commit b97469e
Show file tree
Hide file tree
Showing 3 changed files with 249 additions and 182 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
smithsonian-images/
biodivlibrary-images/
*-images/
.ipynb_checkpoints
.DS_Store
*.swp
Expand Down
248 changes: 248 additions & 0 deletions download-images.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Voynichese Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from os.path import join\n",
"import requests\n",
"import os\n",
"\n",
"def download_from_url(url, out_path):\n",
"  '''Download the file at `url` and write it to `out_path` (no-op if the file exists)'''\n",
"  if not os.path.exists(out_path):\n",
"    r = requests.get(url, allow_redirects=True)\n",
"    # use a context manager so the file handle is closed promptly\n",
"    with open(out_path, 'wb') as out:\n",
"      out.write(r.content)\n",
"\n",
"def download_voynichese_coords(page_id):\n",
"  '''Download the page coords for `page_id` from voynichese.com'''\n",
"  url = 'http://www.voynichese.com/2/data/folio/script/' + page_id + '.js'\n",
"  download_from_url(url, join('voynichese', 'coords', page_id + '.js'))\n",
"\n",
"def download_voynichese_page(page_id):\n",
"  '''Download a page image with page id `page_id` from voynichese.com'''\n",
"  url = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + page_id + '.jpg'\n",
"  download_from_url(url, join('voynichese', 'images', page_id + '.jpg'))\n",
"\n",
"def download_voynichese_data(page_ids):\n",
"  '''Download page images and word coords from voynichese.com for each id in `page_ids`'''\n",
"  # the subdirectory names must match the paths used by the helpers above;\n",
"  # the original created 'voynichese-images' while images were written to\n",
"  # 'voynichese/images', so every image write failed with FileNotFoundError\n",
"  for subdir in ['coords', 'images']:\n",
"    if not os.path.exists(join('voynichese', subdir)):\n",
"      os.makedirs(join('voynichese', subdir))\n",
"  for page_id in page_ids:\n",
"    download_voynichese_coords(page_id)\n",
"    download_voynichese_page(page_id)\n",
"\n",
"# NOTE(review): `pages` is not defined anywhere in this notebook; it presumably\n",
"# came from a removed cell -- confirm it is defined before running\n",
"download_voynichese_data(pages.keys())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Biodiversity Heritage Library Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from os.path import join\n",
"import flickr_api\n",
"import os\n",
"\n",
"# never hardcode credentials in a committed notebook -- the original embedded a\n",
"# live api key/secret; read them from the environment instead\n",
"flickr_api.set_keys(api_key=os.environ['FLICKR_API_KEY'], api_secret=os.environ['FLICKR_API_SECRET'])\n",
"if os.path.exists('flickr.credentials'):\n",
"  # reuse cached credentials from an earlier authorization run\n",
"  flickr_api.set_auth_handler('flickr.credentials')\n",
"else:\n",
"  a = flickr_api.auth.AuthHandler() # creates a new AuthHandler object\n",
"  perms = 'read' # set the required permissions\n",
"  url = a.get_authorization_url(perms)\n",
"  print(url) # open the printed url in a browser and agree; paste verifier code in xml response below"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): `a` must still exist from the auth cell above -- hidden\n",
"# cross-cell state; this cell fails on a fresh kernel if credentials are absent\n",
"if not os.path.exists('flickr.credentials'):\n",
"  a.set_verifier('5b58510bb6f0641b')  # one-time verifier code pasted from the browser\n",
"  flickr_api.set_auth_handler(a)\n",
"  a.save('flickr.credentials')  # cache so later runs skip the browser step"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"user = flickr_api.Person.findByUserName('biodivlibrary')\n",
"errored = []  # photos that fail to save; appended to by the download loop below\n",
"photos = user.getPhotos(page=2, perpage=100)\n",
"photos.info  # bare last expression -- displays pagination metadata (used below)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n",
"\n",
"# resume point -- presumably pages below 6 were fetched in an earlier run; TODO confirm\n",
"page_num = 6\n",
"while page_num < photos.info.pages:\n",
"  print(' * page num', page_num)\n",
"  page_photos = user.getPhotos(page=page_num, perpage=100)\n",
"  for photo in page_photos:\n",
"    try:\n",
"      out_path = join('biodivlibrary-images', photo.id)\n",
"      if os.path.exists(out_path): continue  # already downloaded; skip\n",
"      photo.save(out_path)\n",
"      time.sleep(2)  # throttle requests to be polite to the API\n",
"    except Exception as exc:\n",
"      # catch Exception, not bare except -- a bare except would also swallow\n",
"      # KeyboardInterrupt, making the loop impossible to stop cleanly\n",
"      print(' * could not save', photo, exc)\n",
"      errored.append(photo)\n",
"  page_num += 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Smithsonian Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import os\n",
"\n",
"def get_page_images(page_number=0):\n",
"  '''Save each herbal image in page number `page_number` to disk, then recurse\n",
"  into the next page while a pagination link for it is present in the markup.\n",
"  NOTE(review): very deep pagination could hit Python's recursion limit'''\n",
"  print(' * fetching page', page_number)\n",
"  r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))\n",
"  text = r.content.decode('utf8')\n",
"  # name the parser explicitly; bs4 warns (and results may vary) when it guesses\n",
"  soup = BeautifulSoup(text, 'html.parser')\n",
"  imgs = soup.select('.dams-image')\n",
"  for i in imgs:\n",
"    try:\n",
"      src = i.find('img')['src']\n",
"      image_id = os.path.basename(src).split('id=')[1]\n",
"      download_image(image_id)\n",
"    except Exception as exc:\n",
"      print(' ! err', i, exc)\n",
"  if '/topic/botany/images?page=' + str(page_number+1) in text:\n",
"    get_page_images(page_number+1)\n",
"\n",
"def download_image(_id):\n",
"  '''Download an image by SI image id into the module-level `out_dir`'''\n",
"  r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True)\n",
"  # context manager closes the handle even if the write raises\n",
"  with open(os.path.join(out_dir, _id + '.jpg'), 'wb') as out:\n",
"    out.write(r.content)\n",
"\n",
"# make output directory\n",
"out_dir = 'smithsonian-images'\n",
"if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
"\n",
"# get images\n",
"get_page_images()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Bodleian Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.keys import Keys\n",
"from bs4 import BeautifulSoup\n",
"import time\n",
"import requests\n",
"import os\n",
"\n",
"def download_query_results():\n",
"  '''Render the module-level `url` search page in Chrome and download every\n",
"  child image listed in each result collection's `data-children` attribute'''\n",
"  driver = webdriver.Chrome()\n",
"  try:\n",
"    driver.get(url)\n",
"    time.sleep(2)  # give the client-side search app time to render results\n",
"    text = driver.page_source\n",
"  finally:\n",
"    # always release the browser -- the original leaked a Chrome process\n",
"    # on every run (and on every exception)\n",
"    driver.quit()\n",
"  # name the parser explicitly; bs4 warns (and results may vary) when it guesses\n",
"  soup = BeautifulSoup(text, 'html.parser')\n",
"  collections = soup.select('.result')\n",
"  print(' * got', len(collections), 'collections')\n",
"  for collection in collections:\n",
"    children = collection['data-children'].strip().split(',')\n",
"    for child_id in children:\n",
"      try:\n",
"        download_image(child_id)\n",
"      except Exception as exc:\n",
"        print(' ! err', child_id, exc)\n",
"\n",
"def download_image(_id):\n",
"  '''Download a Bodleian image by id via the IIP image server into `out_dir`'''\n",
"  print(' * downloading', _id)\n",
"  r = requests.get('https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(_id))\n",
"  # context manager closes the handle even if the write raises\n",
"  with open(os.path.join(out_dir, _id + '.jpg'), 'wb') as out:\n",
"    out.write(r.content)\n",
"\n",
"# make output directory\n",
"out_dir = 'bodleian-images'\n",
"if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
"\n",
"# get images\n",
"root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'\n",
"url = root_url + 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'\n",
"\n",
"download_query_results()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit b97469e

Please sign in to comment.