Skip to content

Commit

Permalink
Fix bug in textbook
Browse files: browse the repository at this point in the history
  • Loading branch information
patriotyk committed Dec 15, 2023
1 parent fb11cd3 commit b03e400
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions narizaka/textbook.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,10 +26,11 @@ def _is_fb2(self, filename: pathlib.Path)-> bool:
mimetype = magic.from_file(filename=filename, mime=True)
return mimetype == 'text/xml'

def remove_beginning_dashes(self, text):
def norm(self, text):
text = regex.sub(r'[᠆‐‑‒–—―⁻₋−⸺⸻]', '-', text)
text = regex.sub(r'^\s*?\-', '', text)
text = regex.sub(r'\[.*?\]', '', text)
text = regex.sub(r'\s+\.', '. ', text)
return text

def _get_text(self, el):
Expand All @@ -41,10 +42,9 @@ def _get_text(self, el):
text += el.text

for e in el:
if e.tag.endswith('}a'):
continue
for s in self._get_text(e):
text += s
if not e.tag.endswith('}a'):
for s in self._get_text(e):
text += s
if e.tail:
text += e.tail
return text
Expand All @@ -54,7 +54,7 @@ def more_text(self):
text = ''
for i in self.iter:
if i.tag.endswith('}p'):
text += self.remove_beginning_dashes(self._get_text(i)) + ' '
text += self.norm(self._get_text(i)) + ' '
if len(text) >= self.min_text_length:
break
return text

0 comments on commit b03e400

Please sign in to comment.