From b5602c4e58fa2a23a702e13d8923081258e59aff Mon Sep 17 00:00:00 2001
From: duhaime
Date: Thu, 18 Apr 2019 13:34:06 -0400
Subject: [PATCH 1/2] factor data fetching utils into separate nb

---
 download-images.ipynb | 194 ++++++++++++++++++++++++++++++++++++++++++
 voynich.ipynb         | 180 ---------------------------------------
 2 files changed, 194 insertions(+), 180 deletions(-)
 create mode 100644 download-images.ipynb

diff --git a/download-images.ipynb b/download-images.ipynb
new file mode 100644
index 0000000..176ecd6
--- /dev/null
+++ b/download-images.ipynb
@@ -0,0 +1,194 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Voynichese Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from os.path import join\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "def download_from_url(url, out_path):\n",
+    "    '''Download a file at location `url` and write to `out_path`'''\n",
+    "    if not os.path.exists(out_path):\n",
+    "        r = requests.get(url, allow_redirects=True)\n",
+    "        open(out_path, 'wb').write(r.content)\n",
+    "\n",
+    "def download_voynichese_coords(page_id):\n",
+    "    '''Download the page coords for `page_id` from voynichese.com'''\n",
+    "    url = 'http://www.voynichese.com/2/data/folio/script/' + page_id + '.js'\n",
+    "    download_from_url(url, join('voynichese', 'coords', page_id + '.js'))\n",
+    "\n",
+    "def download_voynichese_page(page_id):\n",
+    "    '''Download a page image with page id `page_id` from voynichese.com'''\n",
+    "    url = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + page_id + '.jpg'\n",
+    "    download_from_url(url, join('voynichese', 'images', page_id + '.jpg'))\n",
+    "\n",
+    "def download_voynichese_data(page_ids):\n",
+    "    '''Download page images and word coords from voynichese.com'''\n",
+    "    # create the two output directories the helpers above write into\n",
+    "    for i in ['coords', 'images']:\n",
+    "        if not os.path.exists(join('voynichese', i)):\n",
+    "            os.makedirs(join('voynichese', i))\n",
+    "    for page_id in page_ids:\n",
+    "        download_voynichese_coords(page_id)\n",
+    "        download_voynichese_page(page_id)\n",
+    "\n",
+    "# NB: `pages` is the page_id dict built in voynich.ipynb (pages = line_map[annotator])\n",
+    "download_voynichese_data(pages.keys())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Biodiversity Heritage Library Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from os.path import join\n",
+    "import flickr_api\n",
+    "import os\n",
+    "\n",
+    "flickr_api.set_keys(api_key='a704ce9732b363a9caece2d65f7d041a', api_secret='f3f5e1d5baaf4d38')\n",
+    "if os.path.exists('flickr.credentials'):\n",
+    "    flickr_api.set_auth_handler('flickr.credentials')\n",
+    "else:\n",
+    "    a = flickr_api.auth.AuthHandler()  # create a new AuthHandler object\n",
+    "    perms = 'read'  # set the required permissions\n",
+    "    url = a.get_authorization_url(perms)\n",
+    "    print(url)  # open this url in a browser, authorize the app, then paste the verifier code from the XML response into the next cell"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists('flickr.credentials'):\n",
+    "    a.set_verifier('5b58510bb6f0641b')\n",
+    "    flickr_api.set_auth_handler(a)\n",
+    "    a.save('flickr.credentials')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user = flickr_api.Person.findByUserName('biodivlibrary')\n",
+    "errored = []\n",
+    "photos = user.getPhotos(page=2, perpage=100)\n",
+    "photos.info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n",
+    "\n",
+    "page_num = 6  # resume point; earlier pages were fetched in a previous run\n",
+    "while page_num <= photos.info.pages:\n",
+    "    print(' * page num', page_num)\n",
+    "    page_photos = user.getPhotos(page=page_num, perpage=100)\n",
+    "    for i in page_photos:\n",
+    "        try:\n",
+    "            out_path = join('biodivlibrary-images', i.id)\n",
+    "            if os.path.exists(out_path): continue\n",
+    "            i.save(out_path)\n",
+    "            time.sleep(2)\n",
+    "        except Exception as exc:\n",
+    "            print(' * could not save', i, exc)\n",
+    "            errored.append(i)\n",
+    "    page_num += 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Smithsonian Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bs4 import BeautifulSoup\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "def get_page_images(page_number=0):\n",
+    "    '''Save each herbal image in page number `page_number` to disk'''\n",
+    "    print(' * fetching page', page_number)\n",
+    "    r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))\n",
+    "    text = r.content.decode('utf8')\n",
+    "    soup = BeautifulSoup(text, 'html.parser')\n",
+    "    imgs = soup.select('.dams-image')\n",
+    "    for i in imgs:\n",
+    "        try:\n",
+    "            src = i.find('img')['src']\n",
+    "            image_id = os.path.basename(src).split('id=')[1]\n",
+    "            download_image(image_id)\n",
+    "        except Exception as exc:\n",
+    "            print(' ! err', i, exc)\n",
+    "    # recurse while the page links to a next page of results\n",
+    "    if '/topic/botany/images?page=' + str(page_number+1) in text:\n",
+    "        get_page_images(page_number+1)\n",
+    "\n",
+    "def download_image(_id):\n",
+    "    '''Download an image by SI image id'''\n",
+    "    r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True)\n",
+    "    open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n",
+    "\n",
+    "# make output directory\n",
+    "out_dir = 'smithsonian-images'\n",
+    "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
+    "\n",
+    "# fetch images\n",
+    "get_page_images()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/voynich.ipynb b/voynich.ipynb
index 5f2e9ae..d00b7c7 100644
--- a/voynich.ipynb
+++ b/voynich.ipynb
@@ -94,186 +94,6 @@
     "pages = line_map[annotator]"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Download Voynichese Images"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from os.path import join\n",
-    "import requests\n",
-    "import os\n",
-    "\n",
-    "def download_from_url(url, out_path):\n",
-    "    '''Download a file at location `url` and write to `out_path`'''\n",
-    "    if not os.path.exists(out_path):\n",
-    "        r = requests.get(url, allow_redirects=True)\n",
-    "        open(out_path, 'wb').write(r.content)\n",
-    "\n",
-    "def download_voynichese_coords(page_id):\n",
-    "    '''Download the page coords for `page_id` from voynichese.com'''\n",
-    "    url = 'http://www.voynichese.com/2/data/folio/script/' + page_id + '.js'\n",
-    "    download_from_url(url, join('voynichese', 'coords', page_id + '.js'))\n",
-    "\n",
-    "def download_voynichese_page(page_id):\n",
-    "    '''Download a page image with page id `page_id` from voynichese.com'''\n",
-    "    url = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + page_id + '.jpg'\n",
-    "    download_from_url(url, join('voynichese', 'images', page_id + '.jpg'))\n",
-    "\n",
-    "def download_voynichese_data(page_ids):\n",
-    "    '''Download page images and word coords from voynichese.com'''\n",
-    "    for i in ['coords', 'voynichese-images']:\n",
-    "        if not os.path.exists(join('voynichese', i)):\n",
-    "            os.makedirs(join('voynichese', i))\n",
-    "    for page_id in page_ids:\n",
-    "        download_voynichese_coords(page_id)\n",
-    "        download_voynichese_page(page_id)\n",
-    "\n",
-    "download_voynichese_data(pages.keys())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Download Biodiversity Heritage Library Images"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from os.path import join\n",
-    "import flickr_api\n",
-    "import os\n",
-    "\n",
-    "flickr_api.set_keys(api_key='a704ce9732b363a9caece2d65f7d041a', api_secret ='f3f5e1d5baaf4d38')\n",
-    "if os.path.exists('flickr.credentials'):\n",
-    "    flickr_api.set_auth_handler('flickr.credentials')\n",
-    "else:\n",
-    "    a = flickr_api.auth.AuthHandler() # creates a new AuthHandler object\n",
-    "    perms = 'read' # set the required permissions\n",
-    "    url = a.get_authorization_url(perms)\n",
-    "    print(url) # open the printed url in a browser and agree; paste verifier code in xml response below"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if not os.path.exists('flickr.credentials'):\n",
-    "    a.set_verifier('5b58510bb6f0641b')\n",
-    "    flickr_api.set_auth_handler(a)\n",
-    "    a.save('flickr.credentials')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Info(page=2, perpage=100, pages=1342, total=134143)"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "user = flickr_api.Person.findByUserName('biodivlibrary')\n",
-    "errored = []\n",
-    "photos = user.getPhotos(page=2, perpage=100)\n",
-    "photos.info"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import time\n",
-    "\n",
-    "if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n",
-    "\n",
-    "page_num = 6\n",
-    "while page_num < photos.info.pages:\n",
-    "    print(' * page num', page_num)\n",
-    "    page_photos = user.getPhotos(page=page_num, perpage=100)\n",
-    "    for i in page_photos:\n",
-    "        try:\n",
-    "            out_path = join('biodivlibrary-images', i.id)\n",
-    "            if os.path.exists(out_path): continue\n",
-    "            i.save(out_path)\n",
-    "            time.sleep(2)\n",
-    "        except:\n",
-    "            print(' * could not save', i)\n",
-    "            errored.append(i)\n",
-    "    page_num += 1"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Download Smithsonian Images"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from bs4 import BeautifulSoup\n",
-    "import requests\n",
-    "import os\n",
-    "\n",
-    "def get_page_images(page_number=0):\n",
-    "    '''Save each herbal image in page number `page_number` to disk'''\n",
-    "    print(' * fetching page', page_number)\n",
-    "    r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))\n",
-    "    text = r.content.decode('utf8')\n",
-    "    soup = BeautifulSoup(text)\n",
-    "    imgs = soup.select('.dams-image')\n",
-    "    for i in imgs:\n",
-    "        try:\n",
-    "            src = i.find('img')['src']\n",
-    "            image_id = os.path.basename(src).split('id=')[1]\n",
-    "            download_image(image_id)\n",
-    "        except Exception as exc:\n",
-    "            print(' ! err', i, exc)\n",
-    "    if '/topic/botany/images?page=' + str(page_number+1) in text:\n",
-    "        get_page_images(page_number+1)\n",
-    "    \n",
-    "def download_image(_id):\n",
-    "    '''Download an image by SI image id'''\n",
-    "    r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True)\n",
-    "    open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n",
-    "\n",
-    "# make output directory\n",
-    "out_dir = 'smithsonian-images'\n",
-    "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
-    "\n",
-    "# fetch images\n",
-    "get_page_images()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},

From 781df591a2a01c04d3f6c857d77c397098920881 Mon Sep 17 00:00:00 2001
From: duhaime
Date: Thu, 18 Apr 2019 13:48:47 -0400
Subject: [PATCH 2/2] add utils to download Bodleian images - for #9

---
 .gitignore            |  3 +--
 download-images.ipynb | 56 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 10f42c4..e276a35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
-smithsonian-images/
-biodivlibrary-images/
+*-images/
 .ipynb_checkpoints
 .DS_Store
 *.swp
diff --git a/download-images.ipynb b/download-images.ipynb
index 176ecd6..c592380 100644
--- a/download-images.ipynb
+++ b/download-images.ipynb
@@ -165,9 +165,63 @@
     "out_dir = 'smithsonian-images'\n",
     "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
     "\n",
-    "# fetch images\n",
+    "# get images\n",
     "get_page_images()"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Bodleian Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "from selenium import webdriver\n",
+    "from bs4 import BeautifulSoup\n",
+    "import time\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "def download_query_results(url):\n",
+    "    '''Render the search results at `url` in a browser and save each child image'''\n",
+    "    driver = webdriver.Chrome()\n",
+    "    driver.get(url)\n",
+    "    time.sleep(2)  # wait for the results to render\n",
+    "    text = driver.page_source\n",
+    "    driver.quit()\n",
+    "    soup = BeautifulSoup(text, 'html.parser')\n",
+    "    collections = soup.select('.result')\n",
+    "    print(' * got', len(collections), 'collections')\n",
+    "    for i in collections:\n",
+    "        children = i['data-children'].strip().split(',')\n",
+    "        for j in children:\n",
+    "            try:\n",
+    "                download_image(j)\n",
+    "            except Exception as exc:\n",
+    "                print(' ! err', j, exc)\n",
+    "\n",
+    "def download_image(_id):\n",
+    "    '''Download a jpg derivative of the Bodleian image with id `_id`'''\n",
+    "    print(' * downloading', _id)\n",
+    "    r = requests.get('https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(_id))\n",
+    "    open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n",
+    "\n",
+    "# make output directory\n",
+    "out_dir = 'bodleian-images'\n",
+    "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
+    "\n",
+    "# get images\n",
+    "root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'\n",
+    "url = root_url + 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'\n",
+    "\n",
+    "download_query_results(url)"
+   ]
+  }
  ],
 "metadata": {