Preserves URLs

shahinism · Apr 2, 2022 · a1b5663 · a1b5663
1 parent 4367163
commit a1b5663
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 5 deletions.
diff --git a/Changelog.txt b/Changelog.txt
@@ -1,3 +1,6 @@
+0.9.7 - 2022-04-02
+-- Preserves URLs; numbers inside URLs are left unchanged
+
 0.9.6 - 2022-04-02
 -- Fixed issue #21, suffix اش
 -- Untouchable words updated

diff --git a/negar/constants.py b/negar/constants.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-__version__ = "0.9.6"
+__version__ = "0.9.7"
 
 LOGO = (Path(__file__).parent.absolute()/"logo.png").as_posix()
 DATAFILE = Path(__file__).parent.absolute()/"data/untouchable.dat"
@@ -18,5 +18,50 @@
 * کلماتی که به اشتباه بدون فاصله به صورت ' میشود '، ' بیشک ' , یا ' کمکتان ' نوشته شده‌، به صورت درست فاصله گذاری میشوند.
 * از استفاده ی بیش از یک علامت ؟؟؟؟ یا !!! جلوگیری می کند.
 * کلماتی که به صورت کشیـــــــــده نوشته شده اند را به صورت درست می نویسد.
-* از فاصله گذاری     بیش از حد    جلوگیری می کند.
-"""
+* از فاصله گذاری     بیش از حد    جلوگیری می کند."""
+
+# Verbose, case-insensitive pattern matching URLs and bare domain names;
+# capture group 1 holds the entire match.  Adapted from John Gruber's
+# "liberal URL regex": https://gist.github.com/gruber/8891611
+# The inline flags must be the very first characters of the pattern:
+# Python 3.11+ raises re.error for global flags placed anywhere else.
+URLREGX = r"""(?xi)
+\b
+(							# Capture 1: entire matched URL
+  (?:
+    (?:https?|ftp):				# URL protocol (http, https or ftp) and colon
+    (?:
+      /{1,3}						# 1-3 slashes
+      |								#   or
+      [a-z0-9%]						# Single letter or digit or '%'
+      								# (Trying not to match e.g. "URI::Escape")
+    )
+    |							#   or
+    							# looks like domain name followed by a slash or '?':
+    [a-z0-9.\-]+[.]
+    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
+    [/?]
+  )
+  (?:							# One or more:
+    [^\s()<>{}\[\]]+						# Run of non-space, non-()<>{}[]
+    |								#   or
+    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (…(…)…)
+    |
+    \([^\s]+?\)							# balanced parens, non-recursive: (…)
+  )+
+  (?:							# End with:
+    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (…(…)…)
+    |
+    \([^\s]+?\)							# balanced parens, non-recursive: (…)
+    |									#   or
+    [^\s`!()\[\]{};:'".,<>?«»“”‘’]		# not a space or one of these punct chars
+  )
+  |					# OR, the following to match naked domains:
+  (?:
+  	(?<!@)			# not preceded by a @, avoid matching foo@_gmail.com_
+    [a-z0-9]+
+    (?:[.\-][a-z0-9]+)*
+    [.]
+    (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
+    \b
+    /?
+    (?!@)			# not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
+  )
+)
+"""
diff --git a/negar/virastar.py b/negar/virastar.py
@@ -2,7 +2,12 @@
 #!/usr/bin/env python
 
 import re
-from constants import DATAFILE, USERFILE
+import enum
+from constants import DATAFILE, USERFILE, URLREGX
+
+class State(enum.Enum):
+    """Phase selector passed to ``PersianEditor._handle_urls``."""
+
+    # auto() numbers members from 1 in definition order: save == 1, restore == 2.
+    save = enum.auto()
+    restore = enum.auto()
 
 class PersianEditor:
     """
@@ -38,6 +43,7 @@ def __init__(self, text, *args):
         self.cleanup()
 
     def cleanup(self):
+        self._handle_urls(State.save)
         if self._fix_dashes: self.fix_dashes()
         if self._fix_three_dots: self.fix_three_dots()
         if self._fix_english_quotes: self.fix_english_quotes()
@@ -57,14 +63,25 @@ def cleanup(self):
         if self._trim_leading_trailing_whitespaces:
             self.text = '\n'.join([line.strip() for line in self.text.split('\n')])
         self.cleanup_redundant_zwnj()
-
+        self._handle_urls(State.restore)
         return self.text
 
     def __str__(self):
         return self.text
 
     __repr__ = __str__
 
+    def _handle_urls(self, state):
+        """Shield URLs from the editing passes and put them back afterwards.
+
+        With ``State.save`` every match of ``URLREGX`` in ``self.text`` is
+        recorded in ``self.urls`` and replaced by a ``__URL__#<index>__``
+        placeholder.  With ``State.restore`` each placeholder is replaced by
+        the URL recorded under that index.  ``State.save`` must have run
+        first, since it is what creates ``self.urls``.
+        """
+        if state == State.save:
+            self.urls = []
+
+            def stash(match):
+                # Substitute at the exact position the pattern matched.
+                # Re-searching for the URL text wrapped in \b...\b misses
+                # URLs ending in a non-word character (e.g. a trailing "/")
+                # and can hit occurrences the pattern rejected on purpose,
+                # such as the domain part of an e-mail address.
+                self.urls.append(match.group(0))
+                return f'__URL__#{len(self.urls) - 1}__'
+
+            # NOTE(review): the placeholder contains ASCII digits and
+            # underscores; confirm no later editing pass rewrites them.
+            self.text = re.sub(URLREGX, stash, self.text)
+        elif state == State.restore:
+            for i, url in enumerate(self.urls):
+                # Plain string replacement: used as an re.sub template, a URL
+                # containing a backslash would be read as an escape sequence.
+                self.text = self.text.replace(f'__URL__#{i}__', url)
+
     def fix_dashes(self):
         """Replaces double and triple dashes with `ndash` and `mdash`, respectively."""
         self.text = re.sub(r'-{3}', r'—', self.text)