
Commit

Merge pull request #35 from PerfectThymeTech/marvinbuss/add_language_hint

Add Language Hint
marvinbuss authored Jul 11, 2024
2 parents 12b3389 + d5197ed commit 23d6c19
Showing 3 changed files with 24 additions and 12 deletions.
code/durablefunction/models/newstagextraction.py (2 changes: 2 additions & 0 deletions)
@@ -87,6 +87,7 @@ class VideoIndexerTranscriptItem(BaseModel):
 
 
 class LoadVideoindexerContentResponse(BaseModel):
+    language: str
     transcript_text: str
     transcript: List[VideoIndexerTranscriptItem]
 
@@ -102,6 +103,7 @@ def from_json(data: str):
 class InvokeLlmRequest(BaseModel):
     content_text: str
     content_details: str
+    content_language: str
     instance_id: str
 
     @staticmethod
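
For illustration only, a minimal sketch of how the extended InvokeLlmRequest model could be constructed once the new language field is populated. The pydantic class below is re-declared from the fields visible in this diff, and all values are invented; they are not taken from the repository.

from pydantic import BaseModel

class InvokeLlmRequest(BaseModel):
    content_text: str
    content_details: str
    content_language: str
    instance_id: str

# Hypothetical values; only the field names come from the diff above.
request = InvokeLlmRequest(
    content_text="...transcript text...",
    content_details="This is a tv news show.",
    content_language="en-US",
    instance_id="0123456789abcdef",
)
print(request.model_dump_json())  # pydantic v2; use request.json() on v1
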
code/durablefunction/newstagextraction/llm.py (8 changes: 7 additions & 1 deletion)
@@ -48,6 +48,7 @@ def __create_llm_chain(
         prompt.input_variables = [
             "format_sample",
             "news_content",
+            "language",
             "news_show_details",
         ]
 
@@ -113,8 +114,13 @@ def invoke_llm_chain(
         self,
         news_content: str,
         news_show_details: str,
+        language: str,
     ) -> InvokeLlmResponse:
         result: InvokeLlmResponse = self.__llm_chain.invoke(
-            {"news_content": news_content, "news_show_details": news_show_details}
+            {
+                "news_content": news_content,
+                "news_show_details": news_show_details,
+                "language": language,
+            },
         )
         return result
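
The diff only registers "language" as an additional prompt input variable; the prompt text itself is not part of this commit. As a rough sketch of what consuming that variable might look like with a LangChain PromptTemplate (the template wording and the langchain_core import are assumptions, not the repository's actual prompt):

from langchain_core.prompts import PromptTemplate

# Invented template for illustration; only the input variable names
# ("format_sample", "news_content", "language", "news_show_details") match the diff.
prompt = PromptTemplate.from_template(
    "You are tagging a news show.\n"
    "Show details: {news_show_details}\n"
    "Write all tags in this language: {language}\n"
    "Expected output format: {format_sample}\n"
    "Transcript:\n{news_content}"
)

print(
    prompt.format(
        news_show_details="This is a tv news show.",
        language="en-US",
        format_sample='{"tags": []}',
        news_content="...",
    )
)
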
code/durablefunction/newstagextraction/orchestration.py (26 changes: 15 additions & 11 deletions)
@@ -92,6 +92,7 @@ def newstag_extraction_orchestrator(context: df.DurableOrchestrationContext):
     input_invoke_llm: InvokeLlmRequest = InvokeLlmRequest(
         content_text=result_load_videoindexer_content.transcript_text,
         content_details="This is a tv news show.",
+        content_language=result_load_videoindexer_content.language,
         instance_id=context.instance_id,
     )
     result_invoke_llm: InvokeLlmResponse = yield context.call_activity_with_retry(
@@ -147,23 +148,25 @@ async def load_videoindexer_content(
     data_json = json.loads(data)
     logging.info(f"Loaded json data from storage: {data_json}")
 
-    # TODO: Handle errors
-
-    # Generate Transcript fom JSON
-    transcript_text_list = []
-    transcript_list = []
     # Pop video from list
     try:
-        transcript = (
-            data_json.get("videos", [{"insights": {"transcript": []}}])
-            .pop(0)
-            .get("insights", {"transcript": []})
-            .get("transcript", [])
+        video = data_json.get("videos", [{"insights": {"transcript": []}}]).pop(0)
+        transcript = video.get("insights", {"transcript": []}).get("transcript", [])
+        language = video.get("insights", {"sourceLanguage": "Unknown"}).get(
+            "sourceLanguage", "Unknown"
         )
     except IndexError as e:
         logging.error(
             f"Index error when loading the video indexer data, so setting empty transcript: '{e}'"
         )
         transcript = []
+        language = "Unknown"
 
+    # TODO: Handle errors
+
+    # Generate Transcript fom JSON
+    transcript_text_list = []
+    transcript_list = []
 
     # Filter items in transcript
     index_start = 0
@@ -189,7 +192,7 @@
     logging.info(f"Loaded transcript text: {transcript_text}")
     logging.info(f"Loaded transcript items: {len(transcript_list)}")
     response: LoadVideoindexerContentResponse = LoadVideoindexerContentResponse(
-        transcript_text=transcript_text, transcript=transcript_list
+        language=language, transcript_text=transcript_text, transcript=transcript_list
     )
 
     # Upload result
@@ -220,6 +223,7 @@ async def invoke_llm(inputData: InvokeLlmRequest) -> InvokeLlmResponse:
     llm_result: Dict[Any] = llm_ineractor.invoke_llm_chain(
         news_content=inputData.content_text,
         news_show_details=inputData.content_details,
+        language=inputData.content_language,
    )
     logging.info(f"LLM response: {json.dumps(llm_result)}")
 
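
To see the fallback behaviour of the parsing above in isolation, here is a small self-contained sketch run against a hand-written payload shaped like Azure Video Indexer output. The sample values, and any structure beyond the "videos", "insights", "transcript" and "sourceLanguage" keys, are invented for illustration.

# Hand-written sample payload; only the key names come from the diff above.
data_json = {
    "videos": [
        {
            "insights": {
                "sourceLanguage": "de-DE",
                "transcript": [{"text": "Guten Abend."}],
            }
        }
    ]
}

# Same extraction pattern as in load_videoindexer_content above;
# an empty "videos" list triggers IndexError and the "Unknown" fallback.
try:
    video = data_json.get("videos", [{"insights": {"transcript": []}}]).pop(0)
    transcript = video.get("insights", {"transcript": []}).get("transcript", [])
    language = video.get("insights", {"sourceLanguage": "Unknown"}).get(
        "sourceLanguage", "Unknown"
    )
except IndexError:
    transcript = []
    language = "Unknown"

print(language)    # de-DE
print(transcript)  # [{'text': 'Guten Abend.'}]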
