From 02c7d366e3808269ca3c1dda82aeee1cedf39ef4 Mon Sep 17 00:00:00 2001
From: John Andersen
Date: Fri, 5 Jan 2024 20:10:21 +0000
Subject: [PATCH] scripts: dump discussion: Incorporate advanced patterns for
 GitHub GraphQL API

Youtube: https://www.youtube.com/watch?v=i5pIszu9MeM&t=719s

Signed-off-by: John Andersen
---
 scripts/discussion_dump_to_markdown.py |   3 +-
 scripts/dump_discussion.py             | 120 +++++++++++++++++--------
 2 files changed, 83 insertions(+), 40 deletions(-)

diff --git a/scripts/discussion_dump_to_markdown.py b/scripts/discussion_dump_to_markdown.py
index 108c0a8dd3..35afe42216 100644
--- a/scripts/discussion_dump_to_markdown.py
+++ b/scripts/discussion_dump_to_markdown.py
@@ -175,12 +175,13 @@ async def main():
     for comment in input_data["comments"]:
         discussion.comments.append(
             Comment(
+                id=comment["id"],
                 body=comment["body"],
                 replies=[],
             )
         )
         for reply in comment["replies"]:
-            discussion.comments[-1].replies.append(Reply(body=reply["body"]))
+            discussion.comments[-1].replies.append(Reply(id=reply["id"], body=reply["body"]))

     output_markdown(discussion, pathlib.Path(__file__).parents[1])
     # os.system(f"rm -rf 'docs/tutorials/alice/'")
diff --git a/scripts/dump_discussion.py b/scripts/dump_discussion.py
index 98de3eb069..b0089f2be0 100644
--- a/scripts/dump_discussion.py
+++ b/scripts/dump_discussion.py
@@ -1,4 +1,4 @@
-"""
+r"""
 Usage
 *****

@@ -12,14 +12,19 @@
 import json
 from dataclasses import dataclass
 from typing import List
+import logging
 import argparse

+logger = logging.getLogger(__file__)
+
 @dataclass
 class Reply:
+    id: str
     body: str

 @dataclass
 class Comment:
+    id: str
     body: str
     replies: List[Reply]

@@ -29,34 +34,47 @@ class Discussion:
     title: str
     comments: List[Comment]

-async def fetch_discussion_data(session, token, owner, repo, discussion_number):
+async def fetch_discussion_data(session, graphql_url, token, owner, repo, discussion_number):
     headers = {
         "Authorization": f"Bearer {token}",
         "Content-Type": "application/json"
     }
-    query = """
-    query($owner: String!, $repo: String!, $discussionNumber: Int!, $commentsCursor: String, $repliesCursor: String) {
+    comments_query = """
+    query($owner: String!, $repo: String!, $discussionNumber: Int!, $commentsCursor: String) {
       repository(owner: $owner, name: $repo) {
         discussion(number: $discussionNumber) {
           title
           body
           comments(first: 100, after: $commentsCursor) {
+            totalCount
+            pageInfo {
+              hasNextPage
+              endCursor
+            }
+            nodes {
+              id
+              body
+            }
+          }
+        }
+      }
+    }
+    """
+    replies_query = """
+    query($discussionCommentIds: [ID!]!){
+      nodes(ids: $discussionCommentIds) {
+        ... on DiscussionComment {
+          id
+          replies(first: 10) {
+            totalCount
             pageInfo {
               hasNextPage
               endCursor
             }
             nodes {
+              id
               body
-              replies(first: 100, after: $repliesCursor) {
-                pageInfo {
-                  hasNextPage
-                  endCursor
-                }
-                nodes {
-                  body
-                }
-              }
             }
           }
         }
@@ -72,15 +90,43 @@ async def fetch_discussion_data(session, token, owner, repo, discussion_number):
     discussion_data = []

     has_next_page = True
-    comments_cursor = None
+    comments_by_id = {}
+    comments_by_id_lock = asyncio.Lock()
+    discussion_title = None
+    discussion_body = None

-    while has_next_page:
-        variables["commentsCursor"] = comments_cursor
-        response = await session.post("https://api.github.com/graphql", headers=headers, json={"query": query, "variables": variables})
+    async def paginate_replies(tg, batch_comment_ids):
+        nonlocal comments_by_id
+        nonlocal comments_by_id_lock
+
+        logger.debug("Sending nested replies pagination query: %r: %s", variables, replies_query)
+        response = await session.post(graphql_url, headers=headers, json={"query": replies_query, "variables": {"discussionCommentIds": batch_comment_ids}})
         result = await response.json()
+        logger.debug("Received nested replies comments pagination query result: %s", json.dumps(result, indent=4, sort_keys=True))

-        if "data" not in result:
-            raise Exception(json.dumps(result, indent=4, sort_keys=True))
+        for comment in result["data"]["nodes"]:
+            reply_nodes = comment["replies"]["nodes"]
+            has_next_page = comment["replies"]["pageInfo"]["hasNextPage"]
+            replies_cursor = comment["replies"]["pageInfo"]["endCursor"]
+
+            async with comments_by_id_lock:
+                for reply in reply_nodes:
+                    comments_by_id[comment["id"]].replies.append(Reply(id=reply["id"], body=reply["body"]))
+
+            if has_next_page:
+                raise NotImplementedError()
+
+    async def paginate_comments(tg, comments_cursor = None):
+        nonlocal comments_by_id
+        nonlocal comments_by_id_lock
+        nonlocal discussion_title
+        nonlocal discussion_body
+
+        variables["commentsCursor"] = comments_cursor
+        logger.debug("Sending top level comments pagination query: %r: %s", variables, comments_query)
+        response = await session.post(graphql_url, headers=headers, json={"query": comments_query, "variables": variables})
+        result = await response.json()
+        logger.debug("Received top level comments pagination query result: %s", json.dumps(result, indent=4, sort_keys=True))

         discussion_title = result["data"]["repository"]["discussion"]["title"]
         discussion_body = result["data"]["repository"]["discussion"]["body"]
@@ -88,29 +134,22 @@
         has_next_page = result["data"]["repository"]["discussion"]["comments"]["pageInfo"]["hasNextPage"]
         comments_cursor = result["data"]["repository"]["discussion"]["comments"]["pageInfo"]["endCursor"]

-        for comment in comments:
-            comment_body = comment["body"]
-            replies = []
-
-            has_next_reply_page = True
-            replies_cursor = None
+        batch_comment_ids = []

-            while has_next_reply_page:
-                variables["repliesCursor"] = replies_cursor
-                response = await session.post("https://api.github.com/graphql", headers=headers, json={"query": query, "variables": variables})
-                reply_result = await response.json()
+        async with comments_by_id_lock:
+            for comment in comments:
+                comment = Comment(id=comment["id"], body=comment["body"], replies=[])
+                comments_by_id[comment.id] = comment
+                batch_comment_ids.append(comment.id)
+                discussion_data.append(comment)

-                if "replies" not in reply_result:
-                    raise Exception(json.dumps(reply_result, indent=4, sort_keys=True))
+        tg.create_task(paginate_replies(tg, batch_comment_ids))

-                reply_nodes = comment["replies"]["nodes"]
-                has_next_reply_page = comment["replies"]["pageInfo"]["hasNextPage"]
-                replies_cursor = comment["replies"]["pageInfo"]["endCursor"]
+        if has_next_page:
+            tg.create_task(paginate_comments(tg, comments_cursor))

-                for reply in reply_nodes:
-                    replies.append(Reply(body=reply["body"]))
-
-            discussion_data.append(Comment(body=comment_body, replies=replies))
+    async with asyncio.TaskGroup() as tg:
+        tg.create_task(paginate_comments(tg, None))

     return Discussion(title=discussion_title, body=discussion_body, comments=discussion_data)

@@ -120,10 +159,13 @@ async def main():
     parser.add_argument("--owner", help="GitHub Repository Owner")
     parser.add_argument("--repo", help="GitHub Repository Name")
     parser.add_argument("--discussion-number", type=int, help="GitHub Discussion Number")
+    parser.add_argument("--api", help="GitHub GraphQL endpoint", default="https://api.github.com/graphql")
     args = parser.parse_args()

+    logging.basicConfig(level=logging.DEBUG)
+
     async with aiohttp.ClientSession(trust_env=True) as session:
-        discussion_data = await fetch_discussion_data(session, args.token, args.owner, args.repo, args.discussion_number)
+        discussion_data = await fetch_discussion_data(session, args.api, args.token, args.owner, args.repo, args.discussion_number)
     print(json.dumps(discussion_data, default=lambda x: x.__dict__, indent=2))

 if __name__ == "__main__":