This repository has been archived by the owner on Sep 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #35 from YaleDHLab/setup
Setup
- Loading branch information
Showing
4 changed files
with
56 additions
and
83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from collections import defaultdict, Counter | ||
import codecs, warnings | ||
warnings.filterwarnings('ignore') | ||
|
||
def parse_pages(path='text16e6.evt'):
  '''
  Parse an interlinear transcription file into per-annotator, per-page lines.

  Only lines that begin with a `<...>` tag are considered. A tag without a
  `.` (e.g. `<f1r>`) marks the start of a new page and is recorded in
  `page_order`. Any other tag is expected to look like
  `<page.sheet.line_num;annotator>`; the text that follows the tag is
  stored under that annotator and page.

  @kwarg str path: the path to the voynich full text (read as latin1)
  @returns
    [str] page_order: a list of the page keys in order
    dict d: # d[annotator][page] = ['line_one', 'line_two']
  @raises ValueError: if a dotted tag does not have exactly three `.`
    separated fields, or its last field does not have exactly one `;`
  '''
  page_order = []
  d = defaultdict(lambda: defaultdict(list))
  # honor the `path` argument instead of a hard-coded file name
  with codecs.open(path, 'r', 'latin1') as f:
    content = f.read()
  for line in content.split('\n'):
    if not line.strip(): continue
    if line[0] != '<': continue # skip paratextual lines
    meta = line.split('<')[1].split('>')[0]
    if '.' not in meta: # indicates the start of a new page (e.g. <f1r>)
      page_order.append(meta)
      continue
    # sheet and line number are parsed for validation only; they are unused
    page, _sheet, line_num_and_annotator = meta.split('.')
    _line_num, annotator = line_num_and_annotator.split(';')
    if '>' not in line: continue # unterminated tag: no text to extract
    if not page: continue # skip lines whose page id is empty
    # keep only the text between the first '>' and the next '>' (if any)
    line_text = line.split('>')[1].strip()
    d[annotator][page].append(line_text)
  return page_order, d
|
||
# parse the transcription file once, when this module is loaded
page_order, line_map = parse_pages()

# single-character key of the annotator whose lines are used (Takahashi)
annotator = 'H'

# lines of the chosen annotator, keyed by page
pages = line_map[annotator]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
beautifulsoup4>=4.5.3 | ||
flickr_api>=0.7.3 | ||
matplotlib>=2.2.2 | ||
numpy>=1.17.0 | ||
requests>=2.11.1 | ||
scikit-image>=0.15.0 | ||
scipy>=1.1.0 | ||
selenium>=3.14.0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters