Skip to content

Commit

Permalink
Preserves URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
javadr committed Apr 2, 2022
1 parent 4367163 commit a1b5663
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 5 deletions.
3 changes: 3 additions & 0 deletions Changelog.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
0.9.7 - 2022-04-02
-- Preserves URLs; numbers inside URLs are left unchanged

0.9.6 - 2022-04-02
-- Fixed issue #21, suffix اش
-- Untouchable words updated
Expand Down
51 changes: 48 additions & 3 deletions negar/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path

__version__ = "0.9.6"
__version__ = "0.9.7"

# Bundled resources are resolved relative to the directory holding this module,
# so the paths do not depend on the process's working directory.
# LOGO is a POSIX-style string; DATAFILE stays a Path object.
LOGO = Path(__file__).parent.absolute().joinpath("logo.png").as_posix()
DATAFILE = Path(__file__).parent.absolute().joinpath("data", "untouchable.dat")
Expand All @@ -18,5 +18,50 @@
* کلماتی که به اشتباه بدون فاصله به صورت ' میشود '، ' بیشک ' , یا ' کمکتان ' نوشته شده‌، به صورت درست فاصله گذاری میشوند.
* از استفاده ی بیش از یک علامت ؟؟؟؟ یا !!! جلوگیری می کند.
* کلماتی که به صورت کشیـــــــــده نوشته شده اند را به صورت درست می نویسد.
* از فاصله گذاری بیش از حد جلوگیری می کند.
"""
* از فاصله گذاری بیش از حد جلوگیری می کند."""

# Verbose, case-insensitive pattern that captures a whole URL (or a naked
# domain such as ``example.com``) in group 1.  Adapted from
# https://gist.github.com/gruber/8891611
#
# The inline ``(?xi)`` flags MUST be the very first thing in the pattern:
# Python 3.11+ raises ``re.error`` for global flags placed anywhere else.
URLREGX = r"""(?xi)
\b
(                                   # Capture 1: entire matched URL
  (?:
    (?:https?|ftp):                 # URL protocol and colon
    (?:
      /{1,3}                        # 1-3 slashes
      |                             #   or
      [a-z0-9%]                     # Single letter or digit or '%'
                                    # (Trying not to match e.g. "URI::Escape")
    )
    |                               #   or
                                    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
    [/?]
  )
  (?:                               # One or more:
    [^\s()<>{}\[\]]+                # Run of non-space, non-()<>{}[]
    |                               #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\)                     # balanced parens, non-recursive: (…)
  )+
  (?:                               # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (…(…)…)
    |
    \([^\s]+?\)                     # balanced parens, non-recursive: (…)
    |                               #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]  # not a space or one of these punct chars
  )
  |                                 # OR, the following to match naked domains:
  (?:
    (?<!@)                          # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
    \b
    /?
    (?!@)                           # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
  )
)
"""
21 changes: 19 additions & 2 deletions negar/virastar.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@
#!/usr/bin/env python

import re
from constants import DATAFILE, USERFILE
import enum
from constants import DATAFILE, USERFILE, URLREGX

@enum.unique
class State(enum.Enum):
    """Phase selector passed to ``PersianEditor._handle_urls``.

    ``@enum.unique`` guarantees the two phases can never silently become
    aliases of one another if a value is edited by mistake.
    """

    save = 1     # pull URLs out of the text before the clean-up passes run
    restore = 2  # put the saved URLs back once the clean-up passes are done

class PersianEditor:
"""
Expand Down Expand Up @@ -38,6 +43,7 @@ def __init__(self, text, *args):
self.cleanup()

def cleanup(self):
self._handle_urls(State.save)
if self._fix_dashes: self.fix_dashes()
if self._fix_three_dots: self.fix_three_dots()
if self._fix_english_quotes: self.fix_english_quotes()
Expand All @@ -57,14 +63,25 @@ def cleanup(self):
if self._trim_leading_trailing_whitespaces:
self.text = '\n'.join([line.strip() for line in self.text.split('\n')])
self.cleanup_redundant_zwnj()

self._handle_urls(State.restore)
return self.text

def __str__(self):
    """Return the editor's current text (``self.text``)."""
    return self.text

# repr() yields exactly the same text as str().
__repr__ = __str__

def _handle_urls(self, state):
    """Shield URLs from the clean-up passes, then put them back.

    Args:
        state: ``State.save`` replaces every URL matched by ``URLREGX``
            in ``self.text`` with a ``__URL__#<n>__`` placeholder and
            records the URLs, in order of appearance, in ``self.urls``.
            ``State.restore`` swaps each placeholder back for its URL.

    Calling with ``State.restore`` before any ``State.save`` is a no-op.
    """
    if state == State.save:
        self.urls = []

        def _stash(match):
            # Substituting at the exact match position avoids searching for
            # the URL a second time; that second search missed URLs ending
            # in a non-word character such as '/'.
            self.urls.append(match.group(0))
            return f'__URL__#{len(self.urls) - 1}__'

        self.text = re.sub(URLREGX, _stash, self.text, flags=re.M)
    elif state == State.restore:
        for i, url in enumerate(getattr(self, 'urls', [])):
            # Literal replacement: a URL used as an re.sub() template would
            # have its backslashes interpreted as escapes / group references.
            self.text = self.text.replace(f'__URL__#{i}__', url)

def fix_dashes(self):
"""Replaces double and triple dashes with `ndash` and `mdash`, respectively."""
self.text = re.sub(r'-{3}', r'—', self.text)
Expand Down

0 comments on commit a1b5663

Please sign in to comment.