-
Notifications
You must be signed in to change notification settings - Fork 0
/
ursusMarkDown.py
301 lines (268 loc) · 13.6 KB
/
ursusMarkDown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This script turns my own 'ursusmarkdown' conventions into XML/TEI for the digital edition
# of manuscript Casanatensis 1086.
# It's written in Python 3.4, but also works with Python 2.7.
# It uses xml.etree.ElementTree from the standard library (the lxml import below is commented out).
#
# Warning: when writing in 'ursusmarkdown', note that every string at the beginning
# of the line that does not start with '<' or end with '>'
# will be substituted by this script with
# <w n="string">string</w>
# So, if this is not intended, every alphanumerical string that must not be substituted
# should start with whitespace, such as tab or space(s)
from __future__ import print_function
import datetime
import shutil
import zipfile
import os
import re
import xml.etree.ElementTree as ET
#from lxml import etree as ET
###################
# My own markdown #
###################
# Clear screen
os.system('clear')

# Set working files and folders
# Input
c = '/home/ilbuonme/siti/paolo.monella/ursus/'  # working directory for input file
inBaseFN = 'casanatensis.xml'                   # input base file name
inFP = c + inBaseFN                             # input file path (folder + base file name)
# Output
outBaseFN = 'temp_' + inBaseFN                  # output base file name
outFP = c + outBaseFN                           # (temporary) output file path (folder + base file name)
# Archive
t = '/home/ilbuonme/voluminosi/ursus/old_versions_of_xml_source_file/'  # directory where old versions are archived

outFH = open(outFP, 'w')                        # file handle for the output file

# These lists collect the lines that are copied through without substitution,
# so that they can be reported once the conversion is finished
startWList = []     # not substituted because starting with whitespace
endWList = []       # not substituted because ending with whitespace
startEndWList = []  # not substituted because starting and ending with whitespace
nofitList = []      # not substituted because not fitting any of the regexes

# Regex patterns for my own 'markdown' conventions.
# They are raw strings: '\w' and '\s' are not valid escape sequences in a
# normal string literal and recent Python versions warn about them.
# The pattern values are exactly the same as before.
abPatt = r"(.*),(.*),(.*),(.*),(.*)"                           # Single abbreviation
dAbPatt = r"(.*),(.*),(.*),(.*),(.*),(.*),(.*),(.*),(.*)"     # Double abbreviation
stPatt = r"<.*"                                                # Lines that [s]tart with a [t]ag
etPatt = r".*>"                                                # Lines that [e]nd with a [t]ag
#utPatt = ".*>_"        # Lines that end with a tag and a final underscore (old version: no longer in use)
#wPatt = "[\wþŋ¢÷].*"   # Regular word, possibly starting with non-ASCII brevigraph/logograph (not in use)
wPatt = r"\w.*"                                                # Regular word (no abbreviations)
sewPatt = r"\s+\w*\s+"                                         # Starts and ends with whitespace, with chars in between
swPatt = r"\s+\w*"                                             # Starts with whitespace, then has characters
ewPatt = r"\w*\s+"                                             # Ends with whitespace, and has chars before

# Different spellings between medieval and contemporary Latin
# (each key is replaced by its value, see reSpell)
sd = {
    'ae': 'e',
    'oe': 'e',
}
def reSpell(stringToParse, spellingDict):
    """Return stringToParse with each key of spellingDict replaced by its value.

    The replacements are applied one after the other, in the dictionary's
    iteration order, each one working on the result of the previous one.
    """
    respelled = stringToParse
    for old, new in spellingDict.items():
        respelled = respelled.replace(old, new)
    return respelled
# Perform substitutions

def _abbrChoice(abBase, abAm, abAlph):
    """Return the <choice> markup (as text) for one abbreviation.

    abBase -- the abbreviated letters (content of <abbr>, re-spelled with sd)
    abAm   -- the abbreviation mark: '' gives type 'brevigraph' and no <am>,
              ';' gives type 'after', anything else gives 'superscription'
    abAlph -- the expansion (content of <expan>, re-spelled with sd)

    The result starts with a tab and ends with a newline, so it can be placed
    directly between the other pieces of a <w> element.
    """
    if abAm == '':
        abType = 'brevigraph'
        abAmTag = ''
    elif abAm == ';':
        abType = 'after'
        abAmTag = '<am>' + abAm + '</am>'
    else:
        abType = 'superscription'
        abAmTag = '<am>' + abAm + '</am>'
    return ('\t<choice>\n'
            + '\t\t<abbr type="' + abType + '">' + reSpell(abBase, sd) + abAmTag + '</abbr>\n'
            + '\t\t<expan>' + reSpell(abAlph, sd) + '</expan>\n'
            + '\t</choice>\n')


# Read the input line by line and write the converted line to outFH.
# The try/finally guarantees that the output handle is closed even if a line
# makes the conversion fail; the input handle is closed by the 'with' block.
try:
    with open(inFP) as inFH:
        for line in inFH:
            # Work on the line without its newline. Stripping it once here
            # (instead of slicing with [:-1] in every branch) avoids cropping
            # a real character when the newline is already gone, i.e. after
            # the '0' marker has been removed or on a last line that has no
            # trailing newline.
            text = line.rstrip('\n')

            # Check if the word has a space after it in the MS: a final '0'
            # marks words that do not
            if text.endswith('0'):
                wSpace = ''
                text = text[:-1]    # Remove the final '0'
            else:
                wSpace = '\n\t<pc type="space"> </pc>'

            if re.match(stPatt, text) or re.match(etPatt, text):
                # Starts with '<' or contains a '>': XML line, copied as it is
                nL = text

            elif re.match(dAbPatt, text):   # Double abbreviations
                text = text.replace('v', 'u')   # Must be applied to all layers (GL, AL, LL)
                text = text.replace('-', '¯')   # Allows typing ,st,-,sunt, instead of ,st,¯,sunt,
                dAbM = re.match(dAbPatt, text)
                # NOTE(review): this prints the first expansion to the console;
                # it looks like leftover debug output, kept to preserve behavior
                print(dAbM.group(4))
                (abPre, abBase1, abAm1, abAlph1, abMiddle,
                 abBase2, abAm2, abAlph2, abPost) = dAbM.groups()
                # (L)inguistic (L)ayer form for the n attribute: expansions in
                # place of the abbreviations, with 'æ' written out as 'ae'
                abWordLL = (abPre + abAlph1 + abMiddle + abAlph2 + abPost).replace('æ', 'ae')
                nL = ('<w n="' + abWordLL + '">' + reSpell(abPre, sd) + '\n'
                      + _abbrChoice(abBase1, abAm1, abAlph1)
                      + '\t' + reSpell(abMiddle, sd) + '\n'
                      + _abbrChoice(abBase2, abAm2, abAlph2)
                      + reSpell(abPost, sd) + '</w>' + wSpace)

            elif re.match(abPatt, text):    # Single abbreviations
                text = text.replace('v', 'u')   # Must be applied to all layers (GL, AL, LL)
                text = text.replace('-', '¯')   # Allows typing ,st,-,sunt, instead of ,st,¯,sunt,
                abPre, abBase, abAm, abAlph, abPost = re.match(abPatt, text).groups()
                # Abbreviated word at the Linguistic Layer, 'æ' written out as 'ae'
                abWordLL = (abPre + abAlph + abPost).replace('æ', 'ae')
                nL = ('<w n="' + abWordLL + '">' + reSpell(abPre, sd) + '\n'
                      + _abbrChoice(abBase, abAm, abAlph)
                      + reSpell(abPost, sd) + '</w>' + wSpace)

            elif line == '\n':              # Empty line
                nL = ''

            elif re.match(sewPatt, text):   # Starts and ends with whitespace
                print('This line starts and ends with whitespace!')
                nL = text
                startEndWList.append(nL)

            elif re.match(swPatt, text[:-1]):   # Starts with whitespace
                # The test ignores the last character, as the original test
                # on line[:-2] did for newline-terminated lines
                nL = text
                startWList.append(nL)

            elif re.match(ewPatt, text):    # Ends with whitespace
                nL = text
                endWList.append(nL)

            elif re.match(wPatt, text):     # Regular words (no abbreviations)
                text = text.replace('v', 'u')   # Must be applied to all layers (GL, AL, LL)
                # (G)raphematic (L)ayer vs. (L)inguistic (L)ayer
                wWordGL = wWordLL = re.match(wPatt, text).group(0)
                wWordGL = reSpell(wWordGL, sd)          # e.g. romae -> rome
                wWordLL = wWordLL.replace('æ', 'ae')    # 'æ' written out as 'ae'
                nL = '<w n="' + wWordLL + '">' + wWordGL + '</w>' + wSpace

            else:                           # Doesn't fit any regex
                nL = text
                nofitList.append(nL)

            print(nL, file=outFH)
finally:
    # Close the output file handle
    outFH.close()
def outputListAsTable(disclaimer, listName):
    """Print a heading followed by the items of a list, one per line.

    disclaimer -- the heading printed before the items
    listName   -- the list of strings to print; each one is printed on its
                  own line, preceded by a tab and enclosed in « »

    A blank line closes the table. Nothing at all is printed when the list
    is empty.
    """
    if len(listName) == 0:
        return
    print(disclaimer)
    for entry in listName:
        print('\t«' + entry + '»')
    print()     # Blank line after the table
# Report the lines that were copied through without being substituted.
# Each (heading, list) pair is printed as a table; empty lists print nothing.
for _heading, _lines in (
        ('Lines not substituted because they start with whitespace:', startWList),
        ('ಠ_ಠ Lines not substituted because they start and end with whitespace:', startEndWList),
        ('ಠ_ಠ Lines not substituted because they end with whitespace:', endWList),
        ('ಠ_ಠ Lines not substituted because they do not fit any regex:', nofitList),
):
    outputListAsTable(_heading, _lines)

# Lines that merely start with whitespace are not counted as parsing errors
if not (startEndWList or endWList or nofitList):
    print(' ͡° ͜ʖ ͡° No parsing errors')
#################
# Insert xml:id #
#################
# The namespaces
n = '{http://www.tei-c.org/ns/1.0}'             # for XML/TEI
nx = '{http://www.w3.org/XML/1998/namespace}'   # for attributes like xml:id

# xml.etree.ElementTree (the module imported as ET in this file) writes
# namespaced elements with an auto-generated 'ns0:' prefix unless the
# namespace is registered. Registering TEI as the default namespace keeps
# the element names unprefixed in the output file.
# NOTE(review): re-check this call if the import is switched back to lxml.
ET.register_namespace('', 'http://www.tei-c.org/ns/1.0')

# Parse the tree: it works on the temp output file and adds xml:ids to it
tree = ET.parse(outFP)
words = tree.findall('.//' + n + 'w')

# IDs already present in the file, in document order. The set holds the same
# IDs and is only used for fast duplicate checks.
existing_w_ids = [word.get(nx + 'id') for word in words if word.get(nx + 'id')]
known_ids = set(existing_w_ids)

# The following variables are used to check that there are no unordered IDs.
# When the file has no ID at all there is no "last existing ID" to compare
# with, so the unordered check is switched off instead of failing on an
# empty list.
if existing_w_ids:
    last_existing_id = existing_w_ids[-1]
    last_existing_count = int(last_existing_id[1:])    # numeric part of e.g. 'w123'
    reached_last_id = False
else:
    last_existing_id = None
    last_existing_count = None
    reached_last_id = True

# Counter of the most recent ID seen or assigned. It starts at 0 so that a
# word without ID at the very beginning of the file can be numbered too.
idcount = 0

# If a word has no xml:id, set one
for word in words:
    idstring = word.get(nx + 'id')
    if idstring:
        # Existing ID: remember its number so that new IDs continue from it.
        # IDs are expected to be 'w' + digits; anything else (e.g. an ID
        # still carrying a '_duplicate' suffix) raises ValueError here.
        idcount = int(idstring[1:])
        if idstring == last_existing_id:
            reached_last_id = True
        continue

    # New IDs advance in steps of 3 (presumably to leave room for words
    # inserted later - TODO confirm)
    idcount = idcount + 3
    idstring = 'w' + str(idcount)
    if idstring in known_ids:   # If it's a duplicate ID, warn me and let me manage it
        idstring = idstring + '_duplicate'
        print('\nWARNING: DUPLICATE ID "' + idstring + '"')
    if not reached_last_id and idcount > last_existing_count:
        idstring = idstring + '_unordered'
        print('\nWARNING: UNORDERED ID "' + idstring + '"')
    word.set(nx + 'id', idstring)
    existing_w_ids.append(idstring)
    known_ids.add(idstring)

tree.write(outFP, encoding="UTF-8", method="xml", xml_declaration=True)
############################
# Zip and archive old file #
############################
def zipArchiveFile(inputFileName, outputFileName, workingFolder, archiveFolder):
    """Zip and archive the old input file, then rename the output file after it.

    The old input file is moved to the archive folder under a name prefixed
    with the current date and time, compressed to '<that name>.zip', and the
    uncompressed copy is removed. Finally the output file takes the name the
    input file had.

    Arguments:
    inputFileName = base file name (not complete path) of old input file, including extension
    outputFileName = base file name (not complete path) of new output file, including extension
    workingFolder = the folder where both the input file and the output file are
    archiveFolder = the folder where the old input file will be archived

    The folders may be given with or without a trailing path separator.
    """
    inputPath = os.path.join(workingFolder, inputFileName)
    outputPath = os.path.join(workingFolder, outputFileName)

    # Archived name: current date and time + '_' + original base name
    timeTag = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
    stored = os.path.join(archiveFolder, timeTag + '_' + inputFileName)

    # Zip and store the old input file. The 'with' block closes the archive
    # even if writing to it fails.
    shutil.move(inputPath, stored)
    with zipfile.ZipFile(stored + '.zip', mode='w') as zf:
        zf.write(stored, compress_type=zipfile.ZIP_DEFLATED)
    os.remove(stored)   # Only the zipped copy is kept

    # Give the output file the same filename as the original input file
    shutil.move(outputPath, inputPath)
# Archive the previous input file (zipped, in folder t) and give the temporary
# output file the name of the input file in the working folder c
zipArchiveFile(inBaseFN, outBaseFN, c, t)