Skip to content

Commit

Permalink
Merge pull request #158 from openzim/fix_various
Browse files Browse the repository at this point in the history
Move to cssbeautifier and fix delays in category scraping
  • Loading branch information
benoit74 authored Feb 19, 2024
2 parents e814919 + ed80431 commit 4cd8498
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,6 @@ dmypy.json

output/
wikihow2zim/assets/vendor/
+
+ # Visual Studio Code configuration which is not maintained
+ .vscode
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
requests>=2.28.0,<3.0
Jinja2>=3.1.2,<4.0
zimscraperlib>=2.0.0,<2.1
- css-beautify>=1.10.3<2.0
- six>=1.16.0,<2.0 # css-beautify dependency
+ cssbeautifier>=1.10.3,<2.0
kiwixstorage>=0.8.1,<0.9
pif>=0.8.2,<0.9
tld>=0.12.6,<0.13
Expand Down
4 changes: 3 additions & 1 deletion wikihow2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,15 +512,17 @@ def scrape_categories(self):
logger.info("Scraping expected category pages")
for category in self.expected_categories:
self.scrape_category(category)
+ if self.conf.delay:
+ time.sleep(self.conf.delay)

def scrape_category(self, category: str):
logger.info(f"> Category:{category}")
nb_pages = self.scrape_category_page(category, page_num=1)
if nb_pages > 1:
for page_num in range(2, nb_pages + 1):
- self.scrape_category_page(category, page_num=page_num)
if self.conf.delay:
time.sleep(self.conf.delay)
+ self.scrape_category_page(category, page_num=page_num)

def scrape_category_page(self, category: str, page_num: int):
category_url = f"/{self.metadata['category_prefix']}:{category}"
Expand Down
2 changes: 1 addition & 1 deletion wikihow2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def write(line):
end = line.index(")")

# check whether it's quoted or not
- if line[start + 1] in ("'", '"'):
+ if line[start] in ("'", '"'):
start += 1
end -= 1

Expand Down

0 comments on commit 4cd8498

Please sign in to comment.