diff --git a/README.md b/README.md
index adb450f..4cdad48 100644
--- a/README.md
+++ b/README.md
@@ -28,10 +28,11 @@ At this point your bot should run daily and publish a static website. You can te
 
 **Optional but highly recommended**:
 
-6. [Set up a slack bot](https://api.slack.com/start/quickstart), get the OAuth key, set it to `SLACK_KEY` as a github secret
-7. Make a channel for the bot (and invite it to the channel), get its [Slack channel id](https://stackoverflow.com/questions/40940327/what-is-the-simplest-way-to-find-a-slack-team-id-and-a-channel-id), set it as `SLACK_CHANNEL_ID` in a github secret.
-8. Take a look at `configs/config.ini` to tweak how things are filtered.
-9. Set the github repo private to avoid github actions being [set to inactive after 60 days](https://docs.github.com/en/actions/using-workflows/disabling-and-enabling-a-workflow)
+6. Get a semantic scholar API key and set it as the `S2_KEY` github secret. Otherwise, the author search step will be very slow.
+7. [Set up a slack bot](https://api.slack.com/start/quickstart), get the OAuth key, and set it as the `SLACK_KEY` github secret.
+8. Make a channel for the bot (and invite it to the channel), get its [Slack channel id](https://stackoverflow.com/questions/40940327/what-is-the-simplest-way-to-find-a-slack-team-id-and-a-channel-id), and set it as the `SLACK_CHANNEL_ID` github secret.
+9. Take a look at `configs/config.ini` to tweak how things are filtered.
+10. Set the github repo to private to avoid the github action being [set to inactive after 60 days](https://docs.github.com/en/actions/using-workflows/disabling-and-enabling-a-workflow).
 
 Each day at 1pm UTC, the bot will run and post to slack and publish a github pages website (see the publish_md and cron_runs actions for details).
diff --git a/main.py b/main.py
index 45012df..849fd62 100644
--- a/main.py
+++ b/main.py
@@ -7,6 +7,8 @@ from requests import Session
 from typing import TypeVar, Generator
 import io
 
+
+from retry import retry
 from tqdm import tqdm
 
 from arxiv_scraper import get_papers_from_arxiv_rss_api
@@ -94,6 +96,7 @@ def get_author_batch(
     return response.json()
 
 
+@retry(tries=3, delay=2.0)
 def get_one_author(session, author: str, S2_API_KEY: str) -> str:
     # query the right endpoint https://api.semanticscholar.org/graph/v1/author/search?query=adam+smith
     params = {"query": author, "fields": "authorId,name,hIndex", "limit": "10"}
@@ -142,8 +145,12 @@ def get_authors(
             auth_map = get_one_author(session, author, S2_API_KEY)
             if auth_map is not None:
                 author_metadata_dict[author] = auth_map
-            # add a 10ms wait time to avoid rate limiting
-            time.sleep(0.01)
+            # with an API key, a 20ms wait is enough to avoid rate limiting
+            # without a key, semantic scholar rate limits aggressively, so wait 1s
+            if S2_API_KEY is not None:
+                time.sleep(0.02)
+            else:
+                time.sleep(1.0)
 
     return author_metadata_dict
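
Note on the retry change: the new `@retry(tries=3, delay=2.0)` decorator (from the PyPI `retry` package) re-invokes `get_one_author` on any exception, up to 3 attempts with a 2-second pause between them. Below is a minimal, self-contained sketch of a retried author search in the same style; the `search_author` name is illustrative, and the `x-api-key` header and example author name are assumptions not shown in this diff.

```python
from typing import Optional

from requests import Session
from retry import retry  # pip install retry


@retry(tries=3, delay=2.0)  # on any exception, retry up to 3 times, 2s apart
def search_author(session: Session, name: str, api_key: Optional[str]) -> list:
    # same endpoint and fields as get_one_author in this diff
    params = {"query": name, "fields": "authorId,name,hIndex", "limit": "10"}
    headers = {"x-api-key": api_key} if api_key else {}
    resp = session.get(
        "https://api.semanticscholar.org/graph/v1/author/search",
        params=params,
        headers=headers,
    )
    resp.raise_for_status()  # turn 429/5xx responses into exceptions so @retry fires
    return resp.json().get("data", [])


if __name__ == "__main__":
    with Session() as session:
        print(search_author(session, "adam smith", api_key=None))
```

Raising on bad status inside the decorated function is what makes the decorator useful here: a transient 429 or 5xx becomes an exception, which `@retry` absorbs and retries instead of crashing the run.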
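Note on the sleep change: the new branch in `get_authors` paces requests at roughly 50/s when `S2_API_KEY` is set and 1/s when it is not, since keyless clients are throttled much harder. A hypothetical standalone version of that pacing choice (`polite_delay` and the author list are illustrative, not from the repo):

```python
import time
from typing import Optional


def polite_delay(s2_api_key: Optional[str]) -> float:
    # mirrors the diff: 20ms between requests with a key, 1s without one
    return 0.02 if s2_api_key is not None else 1.0


for author in ["adam smith", "david hume"]:  # placeholder author names
    # ... fetch author metadata here, e.g. via search_author above ...
    time.sleep(polite_delay(None))  # keyless run: wait 1s per request
```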