Add comments and improve anchor handling
am9zZWY committed Jul 17, 2024
1 parent 58e1eff commit 3d18322
Showing 1 changed file with 22 additions and 4 deletions.
26 changes: 22 additions & 4 deletions engine/crawl.py
@@ -148,6 +148,12 @@ def user_agent(self):
         return self.user_agents[self._page_count % len(self.user_agents)]
 
     async def process(self):
+        """
+        Starts the crawling process.
+        Is called in the Pipeline.
+        Returns: None
+        """
         async with ClientSession(connector=self._connector, timeout=self._timeout) as session:
             tasks = set()
             while not self.is_shutdown() and len(self.urls_crawled) < self.max_size:
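
For context, the pattern visible in this hunk (one shared aiohttp ClientSession, a set of in-flight tasks, and a cap on how many pages are collected) can be sketched in isolation roughly as follows. This is an illustrative rewrite, not the repository's code: the seed URLs, the fetch body, and the limits are made up, and link discovery is omitted.

import asyncio
from aiohttp import ClientSession, ClientTimeout, TCPConnector

MAX_PAGES = 10  # assumed cap, standing in for self.max_size

async def fetch(session: ClientSession, url: str) -> str:
    # Stand-in for the real per-URL processing done by the crawler.
    async with session.get(url) as response:
        return await response.text()

async def crawl(seed_urls: list[str]) -> list[str]:
    crawled: list[str] = []
    connector = TCPConnector(limit=10)
    timeout = ClientTimeout(total=10)
    async with ClientSession(connector=connector, timeout=timeout) as session:
        tasks: set[asyncio.Task] = set()
        for url in seed_urls:
            if len(crawled) >= MAX_PAGES:
                break
            task = asyncio.create_task(fetch(session, url))
            tasks.add(task)
            task.add_done_callback(tasks.discard)
            crawled.append(url)
        if tasks:
            # Wait for whatever is still in flight before the session closes.
            await asyncio.gather(*tasks, return_exceptions=True)
    return crawled

if __name__ == "__main__":
    print(asyncio.run(crawl(["https://example.com"])))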
@@ -179,7 +185,16 @@ async def process(self):
 
         print("Crawler finished processing")
 
-    async def _process_url_with_semaphore(self, session, url):
+    async def _process_url_with_semaphore(self, session, url: str):
+        """
+        Wrapper for _process_url that uses a semaphore to limit the number of concurrent requests.
+        Args:
+            session: aiohttp ClientSession
+            url: URL to crawl
+        Returns: None
+        """
         async with self._semaphore:
             await self._process_url(session, url)

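The docstring added here says the wrapper uses a semaphore to limit the number of concurrent requests. A minimal, self-contained sketch of that pattern (with an invented worker body and an assumed limit of 5, which may differ from the crawler's setting) looks like this:

import asyncio

async def process_url(url: str) -> None:
    # Stand-in for the real fetch/parse work done per URL.
    await asyncio.sleep(0.1)
    print(f"processed {url}")

async def process_url_with_semaphore(semaphore: asyncio.Semaphore, url: str) -> None:
    # At most `limit` coroutines get past this point at the same time.
    async with semaphore:
        await process_url(url)

async def main() -> None:
    semaphore = asyncio.Semaphore(5)  # assumed limit for illustration
    urls = [f"https://example.com/page/{i}" for i in range(20)]
    await asyncio.gather(*(process_url_with_semaphore(semaphore, u) for u in urls))

if __name__ == "__main__":
    asyncio.run(main())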
@@ -190,7 +205,7 @@ async def _process_url(self, session, url: str):
             session: aiohttp ClientSession
             url: URL to crawl
-        Returns:
+        Returns: None
         """
         if url in self.currently_crawled:
             log_warning(f"Ignoring {url} because it is already being crawled")
@@ -265,15 +280,15 @@ async def _process_url(self, session, url: str):
             if not self.is_shutdown():
                 await self.call_next(soup, url)
 
-    async def _handle_links(self, soup, url):
+    async def _handle_links(self, soup: BeautifulSoup, url: str):
         """
         Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the
         found list, and not in the to_crawl_set.
         Args:
             soup: BeautifulSoup object
             url: URL of the page
-        Returns:
+        Returns: None
         """
         for a_tag in soup.find_all("a", href=True):
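
The docstring above describes the bookkeeping _handle_links relies on: a link is only queued if it is not in the ignore list, not already found, and not already queued. A hedged sketch of that check, using container names borrowed from the docstring (the actual attributes in engine/crawl.py may differ), could look like:

from collections import deque

ignore_list = {"https://example.com/logout"}   # illustrative contents
found = set()
to_crawl_set = set()
to_crawl_queue = deque()

def enqueue_link(link: str) -> None:
    # Skip links that are ignored, already found, or already queued.
    if link in ignore_list or link in found or link in to_crawl_set:
        return
    to_crawl_set.add(link)
    to_crawl_queue.append(link)

enqueue_link("https://example.com/a")
enqueue_link("https://example.com/a")  # duplicate, silently skipped
print(list(to_crawl_queue))  # ['https://example.com/a']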
@@ -283,6 +298,9 @@ async def _handle_links(self, soup, url):
             if found_link.startswith("#"):
                 continue
 
+            # Strip out the fragment
+            found_link = found_link.split("#")[0]
+
             # Check if link is relative
             if found_link.startswith("/"):
                 base_url = get_base_url(url)
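
The new lines drop the URL fragment with split("#")[0], and the surrounding context handles links that start with "/". For comparison, the standard library's urldefrag and urljoin cover the same cases plus relative links that do not start with "/". This is an alternative sketch, not what the commit does; the page URL and hrefs are invented:

from urllib.parse import urldefrag, urljoin

page_url = "https://example.com/docs/index.html"

for href in ["#section-2", "/about", "guide.html#intro", "https://example.com/contact"]:
    link, _fragment = urldefrag(href)   # strip "#..." from the link
    if not link:                        # pure in-page anchor such as "#section-2"
        continue
    absolute = urljoin(page_url, link)  # resolve relative links against the page URL
    print(absolute)
# https://example.com/about
# https://example.com/docs/guide.html
# https://example.com/contact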
