
Merge pull request #35 from am9zZWY/tokenization
Improve text processing
am9zZWY authored Jul 17, 2024
2 parents f51d8bd + bbbed16 commit b302f9c
Showing 6 changed files with 320 additions and 176 deletions.
30 changes: 18 additions & 12 deletions engine/README.md
@@ -10,39 +10,45 @@

1. **Install Python 3:**

- Download and install the latest version of Python 3 from the official website.

2. **Install virtualenv:**

- Open a terminal and run the following command:

```
pip install virtualenv
```

3. **Create a virtual environment:**

- Create the virtual environment:

```
virtualenv --python=3.11 .venv
```

- Activate the virtual environment:

```
source .venv/bin/activate
```

4. **Install requirements:**

- Ensure you have a requirements.txt file in your project directory.
- Run the following command to install the dependencies:

```
pip install -r requirements.txt
```

- For the text-processing part, we use `spaCy`. You need to download the English model by running the following command (a short verification sketch follows this list):

```
python -m spacy download en_core_web_sm
```

5. **Start developing the project**
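
Once the English model is downloaded, the install can be checked with a few lines of Python — a minimal sketch, assuming the default `en_core_web_sm` pipeline:

```
import spacy

# Load the small English pipeline installed above
nlp = spacy.load("en_core_web_sm")

# Tokenize and lemmatize a short sample sentence
doc = nlp("Crawling and tokenizing web pages with spaCy.")
print([(token.text, token.lemma_) for token in doc])
```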

## Usage
26 changes: 22 additions & 4 deletions engine/crawl.py
@@ -148,6 +148,12 @@ def user_agent(self):
return self.user_agents[self._page_count % len(self.user_agents)]

async def process(self):
"""
Starts the crawling process.
Called from the Pipeline.
Returns: None
"""
async with ClientSession(connector=self._connector, timeout=self._timeout) as session:
tasks = set()
while not self.is_shutdown() and len(self.urls_crawled) < self.max_size:
@@ -179,7 +185,16 @@ async def process(self):

print("Crawler finished processing")

async def _process_url_with_semaphore(self, session, url: str):
"""
Wrapper for _process_url that uses a semaphore to limit the number of concurrent requests.
Args:
session: aiohttp ClientSession
url: URL to crawl
Returns: None
"""
async with self._semaphore:
await self._process_url(session, url)
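
The wrapper above pairs an `asyncio.Semaphore` with a shared `aiohttp` session to cap how many requests are in flight at once. A minimal, self-contained sketch of the same pattern (the limit of 5 and the `fetch` helper are illustrative, not the repository's actual names or values):

```
import asyncio
from aiohttp import ClientSession

async def fetch(session: ClientSession, semaphore: asyncio.Semaphore, url: str) -> str:
    # The semaphore guarantees that at most N requests run concurrently
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()

async def main() -> None:
    semaphore = asyncio.Semaphore(5)  # illustrative concurrency limit
    urls = ["https://example.com", "https://example.org"]  # placeholder URLs
    async with ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, semaphore, u) for u in urls))
        print([len(page) for page in pages])

if __name__ == "__main__":
    asyncio.run(main())
```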

@@ -190,7 +205,7 @@ async def _process_url(self, session, url: str):
session: aiohttp ClientSession
url: URL to crawl
Returns: None
"""
if url in self.currently_crawled:
log_warning(f"Ignoring {url} because it is already being crawled")
@@ -265,15 +280,15 @@ async def _process_url(self, session, url: str):
if not self.is_shutdown():
await self.call_next(soup, url)

async def _handle_links(self, soup: BeautifulSoup, url: str):
"""
Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the
found list, and not in the to_crawl_set.
Args:
soup: BeautifulSoup object
url: URL of the page
Returns: None
"""
for a_tag in soup.find_all("a", href=True):
@@ -283,6 +298,9 @@ async def _handle_links(self, soup, url):
if found_link.startswith("#"):
continue

# Strip out the fragment
found_link = found_link.split("#")[0]

# Check if link is relative
if found_link.startswith("/"):
base_url = get_base_url(url)
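
The new fragment-stripping step above drops the `#section` part of a link before it is queued, and relative links are resolved against the page's base URL. A minimal sketch of the same normalization using only the standard library and BeautifulSoup (`urljoin` stands in for the repository's `get_base_url` helper, and the HTML snippet is a placeholder):

```
from urllib.parse import urldefrag, urljoin

from bs4 import BeautifulSoup

html = '<a href="#top">Top</a> <a href="/about#team">About</a> <a href="https://example.com/docs">Docs</a>'
soup = BeautifulSoup(html, "html.parser")

page_url = "https://example.com/index.html"  # placeholder for the crawled page's URL
links = []
for a_tag in soup.find_all("a", href=True):
    found_link = a_tag["href"]
    if found_link.startswith("#"):
        continue  # pure in-page anchors are skipped
    found_link, _fragment = urldefrag(found_link)   # strip the #fragment part
    links.append(urljoin(page_url, found_link))     # resolve relative links
print(links)  # ['https://example.com/about', 'https://example.com/docs']
```
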
159 changes: 0 additions & 159 deletions engine/custom_tokenizer.py

This file was deleted.

3 changes: 2 additions & 1 deletion engine/requirements.txt
@@ -7,7 +7,8 @@ Flask==3.0.3
Flask-Cors==4.0.1
nest-asyncio==1.6.0
nltk==3.8.1
numpy==1.26.4
pandas==2.2.2
scikit-learn==1.5.1
aiohttp==3.9.5
spacy==3.7.5
15 changes: 15 additions & 0 deletions engine/summarize.py
@@ -0,0 +1,15 @@
from gensim.summarization import summarize
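# NOTE: gensim.summarization was removed in Gensim 4.0; this import assumes gensim < 4.0 (e.g. 3.8.x) is installed.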

# Input text to be summarized
input_text = """
Your input text goes here. It can be a long paragraph or multiple paragraphs.
"""

# Generate the summary using TextRank algorithm
summary = summarize(input_text, ratio=0.3) # You can adjust the ratio parameter based on the summary length you desire

# Output the summary
print("Original Text:")
print(input_text)
print("\nSummary:")
print(summary)
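
Assuming the gensim 3.x API noted above, the summary length can also be capped by an absolute number of words instead of a ratio — a small illustrative variation:

```
# Alternative: cap the summary at roughly 50 words rather than 30% of the input
summary_50_words = summarize(input_text, word_count=50)
print(summary_50_words)
```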
