From 02c7d366e3808269ca3c1dda82aeee1cedf39ef4 Mon Sep 17 00:00:00 2001
From: John Andersen
Date: Fri, 5 Jan 2024 20:10:21 +0000
Subject: [PATCH] scripts: dump discussion: Incorporate advanced patterns for
 GitHub GraphQL API

Youtube: https://www.youtube.com/watch?v=i5pIszu9MeM&t=719s

Signed-off-by: John Andersen
---
 scripts/discussion_dump_to_markdown.py |   3 +-
 scripts/dump_discussion.py             | 120 +++++++++++++++++--------
 2 files changed, 83 insertions(+), 40 deletions(-)

diff --git a/scripts/discussion_dump_to_markdown.py b/scripts/discussion_dump_to_markdown.py
index 108c0a8dd3..35afe42216 100644
--- a/scripts/discussion_dump_to_markdown.py
+++ b/scripts/discussion_dump_to_markdown.py
@@ -175,12 +175,13 @@ async def main():
     for comment in input_data["comments"]:
         discussion.comments.append(
             Comment(
+                id=comment["id"],
                 body=comment["body"],
                 replies=[],
             )
         )
         for reply in comment["replies"]:
-            discussion.comments[-1].replies.append(Reply(body=reply["body"]))
+            discussion.comments[-1].replies.append(Reply(id=reply["id"], body=reply["body"]))

     output_markdown(discussion, pathlib.Path(__file__).parents[1])
     # os.system(f"rm -rf 'docs/tutorials/alice/'")
diff --git a/scripts/dump_discussion.py b/scripts/dump_discussion.py
index 98de3eb069..b0089f2be0 100644
--- a/scripts/dump_discussion.py
+++ b/scripts/dump_discussion.py
@@ -1,4 +1,4 @@
-"""
+r"""
 Usage
 *****

@@ -12,14 +12,19 @@
 import json
 from dataclasses import dataclass
 from typing import List
+import logging
 import argparse

+logger = logging.getLogger(__file__)
+
 @dataclass
 class Reply:
+    id: str
     body: str

 @dataclass
 class Comment:
+    id: str
     body: str
     replies: List[Reply]

@@ -29,34 +34,47 @@ class Discussion:
     title: str
     comments: List[Comment]

-async def fetch_discussion_data(session, token, owner, repo, discussion_number):
+async def fetch_discussion_data(session, graphql_url, token, owner, repo, discussion_number):
     headers = {
         "Authorization": f"Bearer {token}",
         "Content-Type": "application/json"
     }
-    query = """
-    query($owner: String!, $repo: String!, $discussionNumber: Int!, $commentsCursor: String, $repliesCursor: String) {
+    comments_query = """
+    query($owner: String!, $repo: String!, $discussionNumber: Int!, $commentsCursor: String) {
       repository(owner: $owner, name: $repo) {
         discussion(number: $discussionNumber) {
           title
           body
           comments(first: 100, after: $commentsCursor) {
+            totalCount
+            pageInfo {
+              hasNextPage
+              endCursor
+            }
+            nodes {
+              id
+              body
+            }
+          }
+        }
+      }
+    }
+    """
+    replies_query = """
+    query($discussionCommentIds: [ID!]!){
+      nodes(ids: $discussionCommentIds) {
+        ... on DiscussionComment {
+          id
+          replies(first: 10) {
+            totalCount
             pageInfo {
               hasNextPage
               endCursor
             }
             nodes {
+              id
               body
-              replies(first: 100, after: $repliesCursor) {
-                pageInfo {
-                  hasNextPage
-                  endCursor
-                }
-                nodes {
-                  body
-                }
-              }
             }
           }
         }
@@ -72,15 +90,43 @@ async def fetch_discussion_data(session, token, owner, repo, discussion_number):
     discussion_data = []

     has_next_page = True
-    comments_cursor = None
+    comments_by_id = {}
+    comments_by_id_lock = asyncio.Lock()
+    discussion_title = None
+    discussion_body = None

-    while has_next_page:
-        variables["commentsCursor"] = comments_cursor
-        response = await session.post("https://api.github.com/graphql", headers=headers, json={"query": query, "variables": variables})
+    async def paginate_replies(tg, batch_comment_ids):
+        nonlocal comments_by_id
+        nonlocal comments_by_id_lock
+
+        logger.debug("Sending nested replies pagination query: %r: %s", variables, replies_query)
+        response = await session.post(graphql_url, headers=headers, json={"query": replies_query, "variables": {"discussionCommentIds": batch_comment_ids}})
         result = await response.json()
+        logger.debug("Received nested replies comments pagination query result: %s", json.dumps(result, indent=4, sort_keys=True))

-        if "data" not in result:
-            raise Exception(json.dumps(result, indent=4, sort_keys=True))
+        for comment in result["data"]["nodes"]:
+            reply_nodes = comment["replies"]["nodes"]
+            has_next_page = comment["replies"]["pageInfo"]["hasNextPage"]
+            replies_cursor = comment["replies"]["pageInfo"]["endCursor"]
+
+            async with comments_by_id_lock:
+                for reply in reply_nodes:
+                    comments_by_id[comment["id"]].replies.append(Reply(id=reply["id"], body=reply["body"]))
+
+            if has_next_page:
+                raise NotImplementedError()
+
+    async def paginate_comments(tg, comments_cursor = None):
+        nonlocal comments_by_id
+        nonlocal comments_by_id_lock
+        nonlocal discussion_title
+        nonlocal discussion_body
+
+        variables["commentsCursor"] = comments_cursor
+        logger.debug("Sending top level comments pagination query: %r: %s", variables, comments_query)
+        response = await session.post(graphql_url, headers=headers, json={"query": comments_query, "variables": variables})
+        result = await response.json()
+        logger.debug("Received top level comments pagination query result: %s", json.dumps(result, indent=4, sort_keys=True))

         discussion_title = result["data"]["repository"]["discussion"]["title"]
         discussion_body = result["data"]["repository"]["discussion"]["body"]
@@ -88,29 +134,22 @@
         has_next_page = result["data"]["repository"]["discussion"]["comments"]["pageInfo"]["hasNextPage"]
         comments_cursor = result["data"]["repository"]["discussion"]["comments"]["pageInfo"]["endCursor"]

-        for comment in comments:
-            comment_body = comment["body"]
-            replies = []
-
-            has_next_reply_page = True
-            replies_cursor = None
+        batch_comment_ids = []

-            while has_next_reply_page:
-                variables["repliesCursor"] = replies_cursor
-                response = await session.post("https://api.github.com/graphql", headers=headers, json={"query": query, "variables": variables})
-                reply_result = await response.json()
+        async with comments_by_id_lock:
+            for comment in comments:
+                comment = Comment(id=comment["id"], body=comment["body"], replies=[])
+                comments_by_id[comment.id] = comment
+                batch_comment_ids.append(comment.id)
+                discussion_data.append(comment)

-                if "replies" not in reply_result:
-                    raise Exception(json.dumps(reply_result, indent=4, sort_keys=True))
+        tg.create_task(paginate_replies(tg, batch_comment_ids))

-                reply_nodes = comment["replies"]["nodes"]
-                has_next_reply_page = comment["replies"]["pageInfo"]["hasNextPage"]
-                replies_cursor = comment["replies"]["pageInfo"]["endCursor"]
+        if has_next_page:
+            tg.create_task(paginate_comments(tg, comments_cursor))

-                for reply in reply_nodes:
-                    replies.append(Reply(body=reply["body"]))
-
-            discussion_data.append(Comment(body=comment_body, replies=replies))
+    async with asyncio.TaskGroup() as tg:
+        tg.create_task(paginate_comments(tg, None))

     return Discussion(title=discussion_title, body=discussion_body, comments=discussion_data)

@@ -120,10 +159,13 @@ async def main():
     parser.add_argument("--owner", help="GitHub Repository Owner")
     parser.add_argument("--repo", help="GitHub Repository Name")
     parser.add_argument("--discussion-number", type=int, help="GitHub Discussion Number")
+    parser.add_argument("--api", help="GitHub GraphQL endpoint", default="https://api.github.com/graphql")
     args = parser.parse_args()

+    logging.basicConfig(level=logging.DEBUG)
+
     async with aiohttp.ClientSession(trust_env=True) as session:
-        discussion_data = await fetch_discussion_data(session, args.token, args.owner, args.repo, args.discussion_number)
+        discussion_data = await fetch_discussion_data(session, args.api, args.token, args.owner, args.repo, args.discussion_number)
     print(json.dumps(discussion_data, default=lambda x: x.__dict__, indent=2))

 if __name__ == "__main__":