add config option: external URLs for feeds and sitemaps
adbar committed Nov 7, 2023
1 parent 2aa3197 commit fcb6a97
Showing 4 changed files with 25 additions and 16 deletions.
3 changes: 2 additions & 1 deletion trafilatura/cli_utils.py
@@ -236,10 +236,11 @@ def cli_discovery(args):
     input_urls = url_store.dump_urls()
     if args.list:
         url_store.reset()
+    ext = use_config(filename=args.config_file).getboolean('DEFAULT', 'EXTERNAL_URLS')
 
     # link discovery and storage
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
-        futures = (executor.submit(func, url, target_lang=args.target_language) for url in input_urls)
+        futures = (executor.submit(func, url, target_lang=args.target_language, external=ext) for url in input_urls)
         # process results from the parallel threads and add them
         # to the compressed URL dictionary for further processing
         for future in as_completed(futures):
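For library use, the same lookup can be reproduced directly. A minimal sketch (the path my_settings.cfg and the start URL are placeholders, and the file is assumed to set EXTERNAL_URLS = on):

    from trafilatura.feeds import find_feed_urls
    from trafilatura.settings import use_config

    # read the new option from a custom configuration file, as cli_discovery() does above
    external = use_config(filename="my_settings.cfg").getboolean("DEFAULT", "EXTERNAL_URLS")

    # pass the boolean on to feed discovery
    links = find_feed_urls("https://example.org/", target_lang="en", external=external)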
20 changes: 11 additions & 9 deletions trafilatura/feeds.py
@@ -27,7 +27,7 @@
 BLACKLIST = re.compile(r'\bcomments\b') # no comment feed
 
 
-def handle_link_list(linklist, domainname, baseurl, target_lang=None):
+def handle_link_list(linklist, domainname, baseurl, target_lang=None, external=False):
     '''Examine links to determine if they are valid and
        lead to a web page'''
     output_links = []
@@ -38,7 +38,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None):
         # control output for validity
         checked = check_url(link, language=target_lang)
         if checked is not None:
-            if not is_similar_domain(domainname, checked[1]) and not "feed" in link:
+            if not external and not is_similar_domain(domainname, checked[1]) and not "feed" in link:
                 LOGGER.warning('Rejected, diverging domain names: %s %s', domainname, checked[1])
             else:
                 output_links.append(checked[0])
@@ -48,7 +48,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None):
     return output_links
 
 
-def extract_links(feed_string, domainname, baseurl, reference, target_lang=None):
+def extract_links(feed_string, domainname, baseurl, reference, target_lang=None, external=False):
     '''Extract links from Atom and RSS feeds'''
     feed_links = []
     # check if it's a feed
@@ -91,7 +91,7 @@ def extract_links(feed_string, domainname, baseurl, reference, target_lang=None)
     )
 
     # refine
-    output_links = handle_link_list(feed_links, domainname, baseurl, target_lang)
+    output_links = handle_link_list(feed_links, domainname, baseurl, target_lang, external)
     output_links = [l for l in output_links if l != reference and l.count('/') > 2]
     # log result
     if feed_links:
@@ -145,14 +145,16 @@ def determine_feed(htmlstring, baseurl, reference):
     return output_urls
 
 
-def find_feed_urls(url, target_lang=None):
+def find_feed_urls(url, target_lang=None, external=False):
     """Try to find feed URLs.
 
     Args:
         url: Webpage or feed URL as string.
              Triggers URL-based filter if the webpage isn't a homepage.
         target_lang: Define a language to filter URLs based on heuristics
-                     (two-letter string, ISO 639-1 format).
+                     (two-letter string, ISO 639-1 format).
+        external: Similar hosts only or external URLs
+                  (boolean, defaults to False).
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -166,12 +168,12 @@ def find_feed_urls(url, target_lang=None):
     downloaded = fetch_url(url)
     if downloaded is not None:
         # assume it's a feed
-        feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
+        feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang, external)
         if len(feed_links) == 0:
             # assume it's a web page
             for feed in determine_feed(downloaded, baseurl, url):
                 feed_string = fetch_url(feed)
-                feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang))
+                feed_links.extend(extract_links(feed_string, domainname, baseurl, url, target_lang, external))

Codecov / codecov/patch check warning: added line trafilatura/feeds.py#L176 was not covered by tests.
         # filter triggered, prepare it
         if len(url) > len(baseurl) + 2:
             urlfilter = url
@@ -191,7 +193,7 @@ def find_feed_urls(url, target_lang=None):
             f'https://news.google.com/rss/search?q=site:{baseurl}&hl={target_lang}&scoring=n&num=100'
         )
         if downloaded is not None:
-            feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang)
+            feed_links = extract_links(downloaded, domainname, baseurl, url, target_lang, external)

Codecov / codecov/patch check warning: added line trafilatura/feeds.py#L196 was not covered by tests.
             feed_links = filter_urls(feed_links, urlfilter)
             LOGGER.debug('%s Google news links found for %s', len(feed_links), domainname)
     return feed_links
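A rough usage sketch of the new parameter (example.org stands in for a real site): with the default external=False the domain check above still discards diverging hosts, while external=True keeps them.

    from trafilatura.feeds import find_feed_urls

    # default behaviour: only links on hosts similar to the start page are kept
    same_host_links = find_feed_urls("https://example.org/")

    # new option: feed entries pointing to other domains are kept as well
    all_links = find_feed_urls("https://example.org/", external=True)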
3 changes: 3 additions & 0 deletions trafilatura/settings.cfg
@@ -28,3 +28,6 @@ MAX_REPETITIONS = 2
 
 # Extraction option for Htmldate
 EXTENSIVE_DATE_SEARCH = on
+
+# URLs in feeds and sitemaps
+EXTERNAL_URLS = off
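The value is read with configparser, so the usual on/off (or yes/no, 1/0) strings map to booleans. A quick sketch, assuming DEFAULT_CONFIG is the parser loaded from this file:

    from copy import deepcopy
    from trafilatura.settings import DEFAULT_CONFIG

    my_config = deepcopy(DEFAULT_CONFIG)
    my_config.getboolean("DEFAULT", "EXTERNAL_URLS")  # False: off by default

    my_config["DEFAULT"]["EXTERNAL_URLS"] = "on"
    my_config.getboolean("DEFAULT", "EXTERNAL_URLS")  # True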
15 changes: 9 additions & 6 deletions trafilatura/sitemaps.py
@@ -43,12 +43,13 @@
 
 class SitemapObject:
     "Store all necessary information on sitemap download and processing."
-    __slots__ = ["base_url", "content", "domain", "sitemap_url", "sitemap_urls", "target_lang", "urls"]
+    __slots__ = ["base_url", "content", "domain", "external", "sitemap_url", "sitemap_urls", "target_lang", "urls"]
 
-    def __init__(self, base_url: str, domain: str, sitemap_url: str, target_lang: Optional[str] = None) -> None:
+    def __init__(self, base_url: str, domain: str, sitemap_url: str, target_lang: Optional[str] = None, external: bool = False) -> None:
         self.base_url: str = base_url
         self.content: str = ""
         self.domain: str = domain
+        self.external: bool = external
         self.sitemap_url: str = sitemap_url
         self.sitemap_urls: List[str] = []
         self.target_lang: Optional[str] = target_lang
@@ -78,7 +79,7 @@ def handle_link(self, link: str) -> None:
 
         # don't take links from another domain and make an exception for main platforms
         # also bypass: subdomains vs. domains
-        if not is_similar_domain(self.domain, newdomain) and not WHITELISTED_PLATFORMS.search(newdomain):
+        if not self.external and not is_similar_domain(self.domain, newdomain) and not WHITELISTED_PLATFORMS.search(newdomain):
             LOGGER.warning('link discarded, diverging domain names: %s %s', self.domain, newdomain)
             return

@@ -128,14 +129,16 @@ def process(self) -> None:
         self.extract_sitemap_links()
 
 
-def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
+def sitemap_search(url: str, target_lang: Optional[str] = None, external: bool = False) -> List[str]:
     """Look for sitemaps for the given URL and gather links.
 
     Args:
         url: Webpage or sitemap URL as string.
              Triggers URL-based filter if the webpage isn't a homepage.
         target_lang: Define a language to filter URLs based on heuristics
-                     (two-letter string, ISO 639-1 format).
+                     (two-letter string, ISO 639-1 format).
+        external: Similar hosts only or external URLs
+                  (boolean, defaults to False).
 
     Returns:
         The extracted links as a list (sorted list of unique links).
@@ -159,7 +162,7 @@ def sitemap_search(url: str, target_lang: Optional[str] = None) -> List[str]:
     if len(url) > len(baseurl) + 2:
         urlfilter = url
 
-    sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang)
+    sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang, external)
     sitemap.fetch()
     sitemap.process()

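Sitemap discovery mirrors the feeds API. A minimal usage sketch (example.org is a placeholder):

    from trafilatura.sitemaps import sitemap_search

    # default: only URLs on similar hosts (plus whitelisted platforms) are kept
    links = sitemap_search("https://example.org/")

    # with the new flag, off-domain sitemap entries are kept too
    links = sitemap_search("https://example.org/", target_lang="de", external=True)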
