Add comments and improve anchor handling
am9zZWY committed Jul 17, 2024
1 parent 58e1eff commit 3d18322
Showing 1 changed file with 22 additions and 4 deletions.
26 changes: 22 additions & 4 deletions engine/crawl.py
@@ -148,6 +148,12 @@ def user_agent(self):
         return self.user_agents[self._page_count % len(self.user_agents)]
 
     async def process(self):
+        """
+        Starts the crawling process.
+        Is called in the Pipeline.
+        Returns: None
+        """
         async with ClientSession(connector=self._connector, timeout=self._timeout) as session:
             tasks = set()
             while not self.is_shutdown() and len(self.urls_crawled) < self.max_size:
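
For context, the pattern visible in this hunk (one shared aiohttp ClientSession, a set of in-flight tasks, and a cap on how many pages are collected) can be sketched in isolation roughly as follows. This is an illustrative rewrite, not the repository's code: the seed URLs, the fetch body, and the limits are made up, and link discovery is omitted.

import asyncio
from aiohttp import ClientSession, ClientTimeout, TCPConnector

MAX_PAGES = 10  # assumed cap, standing in for self.max_size

async def fetch(session: ClientSession, url: str) -> str:
    # Stand-in for the real per-URL processing done by the crawler.
    async with session.get(url) as response:
        return await response.text()

async def crawl(seed_urls: list[str]) -> list[str]:
    crawled: list[str] = []
    connector = TCPConnector(limit=10)
    timeout = ClientTimeout(total=10)
    async with ClientSession(connector=connector, timeout=timeout) as session:
        tasks: set[asyncio.Task] = set()
        for url in seed_urls:
            if len(crawled) >= MAX_PAGES:
                break
            task = asyncio.create_task(fetch(session, url))
            tasks.add(task)
            task.add_done_callback(tasks.discard)
            crawled.append(url)
        if tasks:
            # Wait for whatever is still in flight before the session closes.
            await asyncio.gather(*tasks, return_exceptions=True)
    return crawled

if __name__ == "__main__":
    print(asyncio.run(crawl(["https://example.com"])))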
@@ -179,7 +185,16 @@ async def process(self):
 
         print("Crawler finished processing")
 
-    async def _process_url_with_semaphore(self, session, url):
+    async def _process_url_with_semaphore(self, session, url: str):
+        """
+        Wrapper for _process_url that uses a semaphore to limit the number of concurrent requests.
+        Args:
+            session: aiohttp ClientSession
+            url: URL to crawl
+        Returns: None
+        """
         async with self._semaphore:
             await self._process_url(session, url)

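The docstring added here says the wrapper uses a semaphore to limit the number of concurrent requests. A minimal, self-contained sketch of that pattern (with an invented worker body and an assumed limit of 5, which may differ from the crawler's setting) looks like this:

import asyncio

async def process_url(url: str) -> None:
    # Stand-in for the real fetch/parse work done per URL.
    await asyncio.sleep(0.1)
    print(f"processed {url}")

async def process_url_with_semaphore(semaphore: asyncio.Semaphore, url: str) -> None:
    # At most `limit` coroutines get past this point at the same time.
    async with semaphore:
        await process_url(url)

async def main() -> None:
    semaphore = asyncio.Semaphore(5)  # assumed limit for illustration
    urls = [f"https://example.com/page/{i}" for i in range(20)]
    await asyncio.gather(*(process_url_with_semaphore(semaphore, u) for u in urls))

if __name__ == "__main__":
    asyncio.run(main())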
@@ -190,7 +205,7 @@ async def _process_url(self, session, url: str):
             session: aiohttp ClientSession
             url: URL to crawl
-        Returns:
+        Returns: None
         """
         if url in self.currently_crawled:
             log_warning(f"Ignoring {url} because it is already being crawled")
@@ -265,15 +280,15 @@ async def _process_url(self, session, url: str):
             if not self.is_shutdown():
                 await self.call_next(soup, url)
 
-    async def _handle_links(self, soup, url):
+    async def _handle_links(self, soup: BeautifulSoup, url: str):
         """
         Checks the links in the soup and adds them to the to_crawl_queue if they are not in the ignore list, not in the
         found list, and not in the to_crawl_set.
         Args:
             soup: BeautifulSoup object
             url: URL of the page
-        Returns:
+        Returns: None
         """
         for a_tag in soup.find_all("a", href=True):
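
The docstring above describes the bookkeeping _handle_links relies on: a link is only queued if it is not in the ignore list, not already found, and not already queued. A hedged sketch of that check, using container names borrowed from the docstring (the actual attributes in engine/crawl.py may differ), could look like:

from collections import deque

ignore_list = {"https://example.com/logout"}   # illustrative contents
found = set()
to_crawl_set = set()
to_crawl_queue = deque()

def enqueue_link(link: str) -> None:
    # Skip links that are ignored, already found, or already queued.
    if link in ignore_list or link in found or link in to_crawl_set:
        return
    to_crawl_set.add(link)
    to_crawl_queue.append(link)

enqueue_link("https://example.com/a")
enqueue_link("https://example.com/a")  # duplicate, silently skipped
print(list(to_crawl_queue))  # ['https://example.com/a']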
@@ -283,6 +298,9 @@ async def _handle_links(self, soup, url):
             if found_link.startswith("#"):
                 continue
 
+            # Strip out the fragment
+            found_link = found_link.split("#")[0]
+
             # Check if link is relative
             if found_link.startswith("/"):
                 base_url = get_base_url(url)
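
The new lines drop the URL fragment with split("#")[0], and the surrounding context handles links that start with "/". For comparison, the standard library's urldefrag and urljoin cover the same cases plus relative links that do not start with "/". This is an alternative sketch, not what the commit does; the page URL and hrefs are invented:

from urllib.parse import urldefrag, urljoin

page_url = "https://example.com/docs/index.html"

for href in ["#section-2", "/about", "guide.html#intro", "https://example.com/contact"]:
    link, _fragment = urldefrag(href)   # strip "#..." from the link
    if not link:                        # pure in-page anchor such as "#section-2"
        continue
    absolute = urljoin(page_url, link)  # resolve relative links against the page URL
    print(absolute)
# https://example.com/about
# https://example.com/docs/guide.html
# https://example.com/contact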
