Skip to content
This repository has been archived by the owner on Sep 4, 2024. It is now read-only.

Commit

Permalink
Merge pull request #35 from YaleDHLab/setup
Browse files Browse the repository at this point in the history
Setup
  • Loading branch information
duhaime authored Aug 29, 2019
2 parents b97469e + e05cacf commit 81a46fe
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 83 deletions.
7 changes: 5 additions & 2 deletions download-images.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
"import requests\n",
"import os\n",
"\n",
"from helpers import pages\n",
"\n",
"def download_from_url(url, out_path):\n",
" '''Download a file at location `url` and write to `out_path`'''\n",
" if not os.path.exists(out_path):\n",
Expand All @@ -42,6 +44,7 @@
" download_voynichese_coords(page_id)\n",
" download_voynichese_page(page_id)\n",
"\n",
"print(' * preparing to download', pages.keys())\n",
"download_voynichese_data(pages.keys())"
]
},
Expand Down Expand Up @@ -106,7 +109,7 @@
"\n",
"if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n",
"\n",
"page_num = 6\n",
"page_num = 0\n",
"while page_num < photos.info.pages:\n",
" print(' * page num', page_num)\n",
" page_photos = user.getPhotos(page=page_num, perpage=100) \n",
Expand Down Expand Up @@ -240,7 +243,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down
39 changes: 39 additions & 0 deletions helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from collections import defaultdict, Counter
import codecs, warnings
warnings.filterwarnings('ignore')

def parse_pages(path='text16e6.evt'):
    '''
    Parse the interlinear Voynich transcription file and group the
    transcribed lines by annotator and page.
    @kwarg str path: the path to the voynich full text (EVT format)
    @returns
      [str] page_order: a list of the page keys in the order encountered
      dict d: d[annotator][page] = ['line_one', 'line_two', ...]
    '''
    page_order = []
    d = defaultdict(lambda: defaultdict(list))
    # BUG FIX: the original opened the hard-coded filename 'text16e6.evt'
    # instead of the `path` argument, silently ignoring the kwarg
    with codecs.open(path, 'r', 'latin1') as f:
        text = f.read()
    for line in text.split('\n'):
        if not line.strip(): continue
        if line[0] != '<': continue # skip paratextual lines
        # metadata lives between the first '<' and '>' pair
        meta = line.split('<')[1].split('>')[0]
        if '.' not in meta: # indicates the start of a new page (e.g. <f1r>)
            page_order.append(meta)
            continue
        # e.g. <f1r.P.1;H> -> page 'f1r', sheet 'P', line 1, annotator 'H'
        page, _sheet, line_num_and_annotator = meta.split('.')
        _line_num, annotator = line_num_and_annotator.split(';')
        if '>' not in line: continue
        if not page: continue # skip empty page ids
        line_text = line.split('>')[1].strip()
        d[annotator][page].append(line_text)
    return page_order, d

# NOTE(review): runs at import time — importing this module requires
# 'text16e6.evt' to exist in the working directory (the default path),
# so `from helpers import pages` fails without the transcription file
page_order, line_map = parse_pages()

# select the annotator to use (Takahashi)
annotator = 'H'

# pages maps page id -> list of transcribed line strings for that annotator
pages = line_map[annotator]
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
beautifulsoup4>=4.5.3
flickr_api>=0.7.3
matplotlib>=2.2.2
numpy>=1.17.0
requests>=2.11.1
scikit-image>=0.15.0
scipy>=1.1.0
selenium>=3.14.0

84 changes: 3 additions & 81 deletions voynich.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,86 +14,6 @@
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parse Transcriptions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict, Counter\n",
"import codecs\n",
"\n",
"def parse_pages(path='text16e6.evt'):\n",
" '''\n",
" Return a mapping from single character representation of transcriptor to\n",
" array of line strings.\n",
" @kwarg str path: the path to the voynich full text\n",
" @returns\n",
" [str] page_order: a list of the page keys in order\n",
" dict d: # d[annotator][page] = ['line_one', 'line_two']\n",
" '''\n",
" page_order = []\n",
" d = defaultdict(lambda: defaultdict(list))\n",
" with codecs.open('text16e6.evt', 'r', 'latin1') as f:\n",
" f = f.read()\n",
" for line_idx, line in enumerate(f.split('\\n')):\n",
" if not line.strip(): continue\n",
" if line[0] != '<': continue # skip paratextual lines\n",
" meta = line.split('<')[1].split('>')[0]\n",
" if '.' not in meta: # indicates the start of a new page (e.g. <f1r>)\n",
" page_order.append(meta)\n",
" continue \n",
" page, sheet, line_num_and_annotator = meta.split('.')\n",
" line_num, annotator = line_num_and_annotator.split(';')\n",
" if '>' not in line: continue\n",
" if not page: continue # skip the page id 0\n",
" line_text = line.split('>')[1].strip()\n",
" d[annotator][page].append(line_text)\n",
" return page_order, d\n",
" \n",
"page_order, line_map = parse_pages()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Select Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# each key in line_map is a transcriber\n",
"line_map.keys()\n",
"\n",
"# show how many pages each transcriber transcribed\n",
"for k in line_map: print(k, len(line_map[k]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# select the annotator to use (Takahashi)\n",
"annotator = 'H'\n",
"\n",
"# set page array\n",
"pages = line_map[annotator]"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -117,6 +37,8 @@
"import numpy as np\n",
"import os, re, glob, json\n",
"\n",
"from helpers import pages\n",
"\n",
"##\n",
"# Show a single page image\n",
"##\n",
Expand Down Expand Up @@ -296,7 +218,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 81a46fe

Please sign in to comment.