add utils to download Bodleian images - for #9

YaleDHLab · Apr 18, 2019 · 781df59 · 781df59
1 parent b5602c4
commit 781df59
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,4 @@
-smithsonian-images/
-biodivlibrary-images/
+*-images/
 .ipynb_checkpoints
 .DS_Store
 *.swp

diff --git a/download-images.ipynb b/download-images.ipynb
@@ -165,9 +165,63 @@
     "out_dir = 'smithsonian-images'\n",
     "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
     "\n",
-    "# fetch images\n",
+    "# get images\n",
     "get_page_images()"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Bodleian Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.keys import Keys\n",
+    "from bs4 import BeautifulSoup\n",
+    "import time\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "def download_query_results():\n",
+    "  driver = webdriver.Chrome()\n",
+    "  driver.get(url)\n",
+    "  time.sleep(2)\n",
+    "  text = driver.page_source\n",
+    "  soup = BeautifulSoup(text)\n",
+    "  collections = soup.select('.result')\n",
+    "  print(' * got', len(collections), 'collections')\n",
+    "  for i in collections:\n",
+    "    children = i['data-children'].strip().split(',')\n",
+    "    for j in children:\n",
+    "      try:\n",
+    "        download_image(j)\n",
+    "      except Exception as exc:\n",
+    "        print(' ! err', j, exc)\n",
+    "    \n",
+    "def download_image(_id):\n",
+    "  print(' * downloading', _id)\n",
+    "  r = requests.get('https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(_id))\n",
+    "  open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n",
+    "  \n",
+    "# make output directory\n",
+    "out_dir = 'bodleian-images'\n",
+    "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
+    "\n",
+    "# get images\n",
+    "root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'\n",
+    "url = root_url + 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'\n",
+    "\n",
+    "download_query_results()"
+   ]
   }
  ],
  "metadata": {