Skip to content
This repository has been archived by the owner on Sep 4, 2024. It is now read-only.

Commit

Permalink
Merge pull request #35 from YaleDHLab/setup
Browse files Browse the repository at this point in the history
Setup
  • Loading branch information
duhaime authored Aug 29, 2019
2 parents b97469e + e05cacf commit 81a46fe
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 83 deletions.
7 changes: 5 additions & 2 deletions download-images.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
"import requests\n",
"import os\n",
"\n",
"from helpers import pages\n",
"\n",
"def download_from_url(url, out_path):\n",
" '''Download a file at location `url` and write to `out_path`'''\n",
" if not os.path.exists(out_path):\n",
Expand All @@ -42,6 +44,7 @@
" download_voynichese_coords(page_id)\n",
" download_voynichese_page(page_id)\n",
"\n",
"print(' * preparing to download', pages.keys())\n",
"download_voynichese_data(pages.keys())"
]
},
Expand Down Expand Up @@ -106,7 +109,7 @@
"\n",
"if not os.path.exists('biodivlibrary-images'): os.makedirs('biodivlibrary-images')\n",
"\n",
"page_num = 6\n",
"page_num = 0\n",
"while page_num < photos.info.pages:\n",
" print(' * page num', page_num)\n",
" page_photos = user.getPhotos(page=page_num, perpage=100) \n",
Expand Down Expand Up @@ -240,7 +243,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down
39 changes: 39 additions & 0 deletions helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from collections import defaultdict, Counter
import codecs, warnings
warnings.filterwarnings('ignore')

def parse_pages(path='text16e6.evt'):
    '''
    Parse the interlinear Voynich transcription file and group the
    transcribed lines by annotator and page.
    @kwarg str path: the path to the voynich full text (EVT format)
    @returns
      [str] page_order: a list of the page keys in the order encountered
      dict d: d[annotator][page] = ['line_one', 'line_two', ...]
    '''
    page_order = []
    d = defaultdict(lambda: defaultdict(list))
    # BUG FIX: the original opened the hard-coded filename 'text16e6.evt'
    # instead of the `path` argument, silently ignoring the kwarg
    with codecs.open(path, 'r', 'latin1') as f:
        text = f.read()
    for line in text.split('\n'):
        if not line.strip(): continue
        if line[0] != '<': continue # skip paratextual lines
        # metadata lives between the first '<' and '>' pair
        meta = line.split('<')[1].split('>')[0]
        if '.' not in meta: # indicates the start of a new page (e.g. <f1r>)
            page_order.append(meta)
            continue
        # e.g. <f1r.P.1;H> -> page 'f1r', sheet 'P', line 1, annotator 'H'
        page, _sheet, line_num_and_annotator = meta.split('.')
        _line_num, annotator = line_num_and_annotator.split(';')
        if '>' not in line: continue
        if not page: continue # skip empty page ids
        line_text = line.split('>')[1].strip()
        d[annotator][page].append(line_text)
    return page_order, d

# NOTE(review): runs at import time — importing this module requires
# 'text16e6.evt' to exist in the working directory (the default path),
# so `from helpers import pages` fails without the transcription file
page_order, line_map = parse_pages()

# select the annotator to use (Takahashi)
annotator = 'H'

# pages maps page id -> list of transcribed line strings for that annotator
pages = line_map[annotator]
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
beautifulsoup4>=4.5.3
flickr_api>=0.7.3
matplotlib>=2.2.2
numpy>=1.17.0
requests>=2.11.1
scikit-image>=0.15.0
scipy>=1.1.0
selenium>=3.14.0

84 changes: 3 additions & 81 deletions voynich.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,86 +14,6 @@
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parse Transcriptions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict, Counter\n",
"import codecs\n",
"\n",
"def parse_pages(path='text16e6.evt'):\n",
" '''\n",
" Return a mapping from single character representation of transcriptor to\n",
" array of line strings.\n",
" @kwarg str path: the path to the voynich full text\n",
" @returns\n",
" [str] page_order: a list of the page keys in order\n",
" dict d: # d[annotator][page] = ['line_one', 'line_two']\n",
" '''\n",
" page_order = []\n",
" d = defaultdict(lambda: defaultdict(list))\n",
" with codecs.open('text16e6.evt', 'r', 'latin1') as f:\n",
" f = f.read()\n",
" for line_idx, line in enumerate(f.split('\\n')):\n",
" if not line.strip(): continue\n",
" if line[0] != '<': continue # skip paratextual lines\n",
" meta = line.split('<')[1].split('>')[0]\n",
" if '.' not in meta: # indicates the start of a new page (e.g. <f1r>)\n",
" page_order.append(meta)\n",
" continue \n",
" page, sheet, line_num_and_annotator = meta.split('.')\n",
" line_num, annotator = line_num_and_annotator.split(';')\n",
" if '>' not in line: continue\n",
" if not page: continue # skip the page id 0\n",
" line_text = line.split('>')[1].strip()\n",
" d[annotator][page].append(line_text)\n",
" return page_order, d\n",
" \n",
"page_order, line_map = parse_pages()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Select Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# each key in line_map is a transcriber\n",
"line_map.keys()\n",
"\n",
"# show how many pages each transcriber transcribed\n",
"for k in line_map: print(k, len(line_map[k]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# select the annotator to use (Takahashi)\n",
"annotator = 'H'\n",
"\n",
"# set page array\n",
"pages = line_map[annotator]"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -117,6 +37,8 @@
"import numpy as np\n",
"import os, re, glob, json\n",
"\n",
"from helpers import pages\n",
"\n",
"##\n",
"# Show a single page image\n",
"##\n",
Expand Down Expand Up @@ -296,7 +218,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 81a46fe

Please sign in to comment.