From b5602c4e58fa2a23a702e13d8923081258e59aff Mon Sep 17 00:00:00 2001
From: duhaime
Date: Thu, 18 Apr 2019 13:34:06 -0400
Subject: [PATCH 1/2] factor data fetching utils into separate nb

---
 download-images.ipynb | 194 ++++++++++++++++++++++++++++++++++++++++++
 voynich.ipynb         | 180 ---------------------------------------
 2 files changed, 194 insertions(+), 180 deletions(-)
 create mode 100644 download-images.ipynb

diff --git a/download-images.ipynb b/download-images.ipynb
new file mode 100644
index 0000000..176ecd6
--- /dev/null
+++ b/download-images.ipynb
@@ -0,0 +1,194 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Voynichese Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from os.path import join\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "def download_from_url(url, out_path):\n",
+    "    '''Download a file at location `url` and write to `out_path`'''\n",
+    "    if not os.path.exists(out_path):\n",
+    "        r = requests.get(url, allow_redirects=True)\n",
+    "        open(out_path, 'wb').write(r.content)\n",
+    "\n",
+    "def download_voynichese_coords(page_id):\n",
+    "    '''Download the page coords for `page_id` from voynichese.com'''\n",
+    "    url = 'http://www.voynichese.com/2/data/folio/script/' + page_id + '.js'\n",
+    "    download_from_url(url, join('voynichese', 'coords', page_id + '.js'))\n",
+    "\n",
+    "def download_voynichese_page(page_id):\n",
+    "    '''Download a page image with page id `page_id` from voynichese.com'''\n",
+    "    url = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + page_id + '.jpg'\n",
+    "    download_from_url(url, join('voynichese', 'images', page_id + '.jpg'))\n",
+    "\n",
+    "def download_voynichese_data(page_ids):\n",
+    "    '''Download page images and word coords from voynichese.com'''\n",
+    "    # create the two output directories the helpers above write into\n",
+    "    for i in ['coords', 'images']:\n",
+    "        if not os.path.exists(join('voynichese', i)):\n",
+    "            os.makedirs(join('voynichese', i))\n",
+    "    for page_id in page_ids:\n",
+    "        download_voynichese_coords(page_id)\n",
+    "        download_voynichese_page(page_id)\n",
+    "\n",
+    "# NB: `pages` is the page_id dict built in voynich.ipynb (pages = line_map[annotator])\n",
+    "download_voynichese_data(pages.keys())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Biodiversity Heritage Library Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from os.path import join\n",
+    "import flickr_api\n",
+    "import os\n",
+    "\n",
+    "flickr_api.set_keys(api_key='a704ce9732b363a9caece2d65f7d041a', api_secret='f3f5e1d5baaf4d38')\n",
+    "if os.path.exists('flickr.credentials'):\n",
+    "    flickr_api.set_auth_handler('flickr.credentials')\n",
+    "else:\n",
+    "    a = flickr_api.auth.AuthHandler()  # create a new AuthHandler object\n",
+    "    perms = 'read'  # set the required permissions\n",
+    "    url = a.get_authorization_url(perms)\n",
+    "    print(url)  # open this url in a browser, authorize the app, then paste the verifier code from the XML response into the next cell"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists('flickr.credentials'):\n",
+    "    a.set_verifier('5b58510bb6f0641b')\n",
+    "    flickr_api.set_auth_handler(a)\n",
+    "    a.save('flickr.credentials')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "user = flickr_api.Person.findByUserName('biodivlibrary')\n",
+    "errored = []\n",
+    "photos = user.getPhotos(page=2, perpage=100)\n",
+    "photos.info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n",
+    "\n",
+    "page_num = 6  # resume point; earlier pages were fetched in a previous run\n",
+    "while page_num <= photos.info.pages:\n",
+    "    print(' * page num', page_num)\n",
+    "    page_photos = user.getPhotos(page=page_num, perpage=100)\n",
+    "    for i in page_photos:\n",
+    "        try:\n",
+    "            out_path = join('biodivlibrary-images', i.id)\n",
+    "            if os.path.exists(out_path): continue\n",
+    "            i.save(out_path)\n",
+    "            time.sleep(2)\n",
+    "        except Exception as exc:\n",
+    "            print(' * could not save', i, exc)\n",
+    "            errored.append(i)\n",
+    "    page_num += 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Smithsonian Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bs4 import BeautifulSoup\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "def get_page_images(page_number=0):\n",
+    "    '''Save each herbal image in page number `page_number` to disk'''\n",
+    "    print(' * fetching page', page_number)\n",
+    "    r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))\n",
+    "    text = r.content.decode('utf8')\n",
+    "    soup = BeautifulSoup(text, 'html.parser')\n",
+    "    imgs = soup.select('.dams-image')\n",
+    "    for i in imgs:\n",
+    "        try:\n",
+    "            src = i.find('img')['src']\n",
+    "            image_id = os.path.basename(src).split('id=')[1]\n",
+    "            download_image(image_id)\n",
+    "        except Exception as exc:\n",
+    "            print(' ! err', i, exc)\n",
+    "    # recurse while the page links to a next page of results\n",
+    "    if '/topic/botany/images?page=' + str(page_number+1) in text:\n",
+    "        get_page_images(page_number+1)\n",
+    "\n",
+    "def download_image(_id):\n",
+    "    '''Download an image by SI image id'''\n",
+    "    r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True)\n",
+    "    open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n",
+    "\n",
+    "# make output directory\n",
+    "out_dir = 'smithsonian-images'\n",
+    "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
+    "\n",
+    "# fetch images\n",
+    "get_page_images()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/voynich.ipynb b/voynich.ipynb
index 5f2e9ae..d00b7c7 100644
--- a/voynich.ipynb
+++ b/voynich.ipynb
@@ -94,186 +94,6 @@
     "pages = line_map[annotator]"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Download Voynichese Images"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from os.path import join\n",
-    "import requests\n",
-    "import os\n",
-    "\n",
-    "def download_from_url(url, out_path):\n",
-    "    '''Download a file at location `url` and write to `out_path`'''\n",
-    "    if not os.path.exists(out_path):\n",
-    "        r = requests.get(url, allow_redirects=True)\n",
-    "        open(out_path, 'wb').write(r.content)\n",
-    "\n",
-    "def download_voynichese_coords(page_id):\n",
-    "    '''Download the page coords for `page_id` from voynichese.com'''\n",
-    "    url = 'http://www.voynichese.com/2/data/folio/script/' + page_id + '.js'\n",
-    "    download_from_url(url, join('voynichese', 'coords', page_id + '.js'))\n",
-    "\n",
-    "def download_voynichese_page(page_id):\n",
-    "    '''Download a page image with page id `page_id` from voynichese.com'''\n",
-    "    url = 'http://www.voynichese.com/2/data/folio/image/glance/color/large/' + page_id + '.jpg'\n",
-    "    download_from_url(url, join('voynichese', 'images', page_id + '.jpg'))\n",
-    "\n",
-    "def download_voynichese_data(page_ids):\n",
-    "    '''Download page images and word coords from voynichese.com'''\n",
-    "    for i in ['coords', 'voynichese-images']:\n",
-    "        if not os.path.exists(join('voynichese', i)):\n",
-    "            os.makedirs(join('voynichese', i))\n",
-    "    for page_id in page_ids:\n",
-    "        download_voynichese_coords(page_id)\n",
-    "        download_voynichese_page(page_id)\n",
-    "\n",
-    "download_voynichese_data(pages.keys())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Download Biodiversity Heritage Library Images"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from os.path import join\n",
-    "import flickr_api\n",
-    "import os\n",
-    "\n",
-    "flickr_api.set_keys(api_key='a704ce9732b363a9caece2d65f7d041a', api_secret ='f3f5e1d5baaf4d38')\n",
-    "if os.path.exists('flickr.credentials'):\n",
-    "    flickr_api.set_auth_handler('flickr.credentials')\n",
-    "else:\n",
-    "    a = flickr_api.auth.AuthHandler() # creates a new AuthHandler object\n",
-    "    perms = 'read' # set the required permissions\n",
-    "    url = a.get_authorization_url(perms)\n",
-    "    print(url) # open the printed url in a browser and agree; paste verifier code in xml response below"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if not os.path.exists('flickr.credentials'):\n",
-    "    a.set_verifier('5b58510bb6f0641b')\n",
-    "    flickr_api.set_auth_handler(a)\n",
-    "    a.save('flickr.credentials')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Info(page=2, perpage=100, pages=1342, total=134143)"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "user = flickr_api.Person.findByUserName('biodivlibrary')\n",
-    "errored = []\n",
-    "photos = user.getPhotos(page=2, perpage=100)\n",
-    "photos.info"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import time\n",
-    "\n",
-    "if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n",
-    "\n",
-    "page_num = 6\n",
-    "while page_num < photos.info.pages:\n",
-    "    print(' * page num', page_num)\n",
-    "    page_photos = user.getPhotos(page=page_num, perpage=100)\n",
-    "    for i in page_photos:\n",
-    "        try:\n",
-    "            out_path = join('biodivlibrary-images', i.id)\n",
-    "            if os.path.exists(out_path): continue\n",
-    "            i.save(out_path)\n",
-    "            time.sleep(2)\n",
-    "        except:\n",
-    "            print(' * could not save', i)\n",
-    "            errored.append(i)\n",
-    "    page_num += 1"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Download Smithsonian Images"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from bs4 import BeautifulSoup\n",
-    "import requests\n",
-    "import os\n",
-    "\n",
-    "def get_page_images(page_number=0):\n",
-    "    '''Save each herbal image in page number `page_number` to disk'''\n",
-    "    print(' * fetching page', page_number)\n",
-    "    r = requests.get('https://library.si.edu/topic/botany/images?page=' + str(page_number))\n",
-    "    text = r.content.decode('utf8')\n",
-    "    soup = BeautifulSoup(text)\n",
-    "    imgs = soup.select('.dams-image')\n",
-    "    for i in imgs:\n",
-    "        try:\n",
-    "            src = i.find('img')['src']\n",
-    "            image_id = os.path.basename(src).split('id=')[1]\n",
-    "            download_image(image_id)\n",
-    "        except Exception as exc:\n",
-    "            print(' ! err', i, exc)\n",
-    "    if '/topic/botany/images?page=' + str(page_number+1) in text:\n",
-    "        get_page_images(page_number+1)\n",
-    "    \n",
-    "def download_image(_id):\n",
-    "    '''Download an image by SI image id'''\n",
-    "    r = requests.get('https://ids.si.edu/ids/deliveryService?id=' + _id, allow_redirects=True)\n",
-    "    open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n",
-    "\n",
-    "# make output directory\n",
-    "out_dir = 'smithsonian-images'\n",
-    "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
-    "\n",
-    "# fetch images\n",
-    "get_page_images()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},

From 781df591a2a01c04d3f6c857d77c397098920881 Mon Sep 17 00:00:00 2001
From: duhaime
Date: Thu, 18 Apr 2019 13:48:47 -0400
Subject: [PATCH 2/2] add utils to download Bodleian images - for #9

---
 .gitignore            |  3 +--
 download-images.ipynb | 56 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 10f42c4..e276a35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
-smithsonian-images/
-biodivlibrary-images/
+*-images/
 .ipynb_checkpoints
 .DS_Store
 *.swp
diff --git a/download-images.ipynb b/download-images.ipynb
index 176ecd6..c592380 100644
--- a/download-images.ipynb
+++ b/download-images.ipynb
@@ -165,9 +165,63 @@
     "out_dir = 'smithsonian-images'\n",
     "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
     "\n",
-    "# fetch images\n",
+    "# get images\n",
     "get_page_images()"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Bodleian Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "from selenium import webdriver\n",
+    "from bs4 import BeautifulSoup\n",
+    "import time\n",
+    "import requests\n",
+    "import os\n",
+    "\n",
+    "def download_query_results(url):\n",
+    "    '''Render the search results at `url` in a browser and save each child image'''\n",
+    "    driver = webdriver.Chrome()\n",
+    "    driver.get(url)\n",
+    "    time.sleep(2)  # wait for the results to render\n",
+    "    text = driver.page_source\n",
+    "    driver.quit()\n",
+    "    soup = BeautifulSoup(text, 'html.parser')\n",
+    "    collections = soup.select('.result')\n",
+    "    print(' * got', len(collections), 'collections')\n",
+    "    for i in collections:\n",
+    "        children = i['data-children'].strip().split(',')\n",
+    "        for j in children:\n",
+    "            try:\n",
+    "                download_image(j)\n",
+    "            except Exception as exc:\n",
+    "                print(' ! err', j, exc)\n",
+    "\n",
+    "def download_image(_id):\n",
+    "    '''Download a jpg derivative of the Bodleian image with id `_id`'''\n",
+    "    print(' * downloading', _id)\n",
+    "    r = requests.get('https://digital.bodleian.ox.ac.uk/inquire/resolver.iip?FIF={0}.jp2&HEI=514&RGN=0,0,1,1&CVT=jpeg'.format(_id))\n",
+    "    open(os.path.join(out_dir, _id + '.jpg'), 'wb').write(r.content)\n",
+    "\n",
+    "# make output directory\n",
+    "out_dir = 'bodleian-images'\n",
+    "if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
+    "\n",
+    "# get images\n",
+    "root_url = 'https://digital.bodleian.ox.ac.uk/inquire/Discover/Search/#/?'\n",
+    "url = root_url + 'p=c+NaN,t+herbal,rsrs+0,rsps+100,fa+,so+ox%3Asort%5Easc,scids+,pid+,vi+'\n",
+    "\n",
+    "download_query_results(url)"
+   ]
+  }
  ],
 "metadata": {