Merge pull request #62 from smcazares/649-txt-files2

Broaden multimodal pipeline to process TXT files
GoogleCloudPlatform · Oct 24, 2024 · 16e2983 · 16e2983
2 parents 3445de7 + 451ea9d
commit 16e2983
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 15 deletions.
diff --git a/components/frontend_streamlit/src/pages/4_Query.py b/components/frontend_streamlit/src/pages/4_Query.py
@@ -119,7 +119,6 @@ def chat_content():
               st.image(chunk_url)
             else:
               logging.error("Reference modality unknown")
-              st.write("Reference modality unkown")
             query_index = query_index + 1
           st.divider()
 

diff --git a/components/llm_service/src/services/query/data_source.py b/components/llm_service/src/services/query/data_source.py
@@ -286,8 +286,11 @@ def chunk_document_multimodal(self,
     try:
       doc_extension = doc_name.split(".")[-1]
       doc_extension = doc_extension.lower()
-      if doc_extension != "pdf" and doc_extension not in allowed_image_types:
-        raise ValueError(f"{doc_name} must be a PDF, PNG, JPG, BMP, or GIF")
+      if (doc_extension != "pdf" and
+          doc_extension != "txt" and
+          doc_extension not in allowed_image_types):
+        raise ValueError(f"{doc_name} must be a PDF, TXT, "
+                         f"PNG, JPG, BMP, or GIF")
       # TODO: Insert elif statements to check for additional types of
       # videos (AVI, MP4, MOV, etc), and audio (MP3, WAV, etc)
     except Exception as e:
@@ -379,6 +382,24 @@ def chunk_document_multimodal(self,
         }
         doc_chunks.append(chunk_obj)
 
+      elif doc_extension == "txt":
+        # Chunk text in document
+        text_chunks = self.chunk_document(doc_name,
+                                          doc_url,
+                                          doc_filepath,
+                                          )
+        for text_chunk in text_chunks:
+          #TODO: Consider all characters in text_chunk,
+          #not just the first 1023 (the slice [0:1023] stops before index 1023)
+          text_chunk = text_chunk[0:1023]
+          # Push chunk object into chunk array
+          chunk_obj = {
+            "image": None,
+            "image_url": None,
+            "text": text_chunk,
+          }
+          doc_chunks.append(chunk_obj)
+
       # TODO: Insert elif statements to chunk additional types of
       # videos (AVI, MP4, MOV, etc), and audio (MP3, WAV, etc)
       # - For images, set "image" and "text" fields of chunk_obj

diff --git a/components/llm_service/src/services/query/query_service.py b/components/llm_service/src/services/query/query_service.py
@@ -189,15 +189,14 @@ async def query_generate(
   # (from non-text info in query_references)
   context_files = []
   for ref in query_references:
-    if hasattr(ref, "modality") and ref.modality != "text":
-      if hasattr(ref, "chunk_url"):
-        ref_filename = ref.chunk_url
-        ref_mimetype = validate_multimodal_file_type(file_name=ref_filename,
-                                                     file_b64=None)
-        context_files.append(DataSourceFile(gcs_path=ref_filename,
-                                            mime_type=ref_mimetype))
-        # TODO: If ref is a video chunk, then update new element of
-        # context_files according to ref.timestamp_start and ref.timestamp_stop
+    if ref.modality != "text" and ref.chunk_url:
+      ref_filename = ref.chunk_url
+      ref_mimetype = validate_multimodal_file_type(file_name=ref_filename,
+                                                   file_b64=None)
+      context_files.append(DataSourceFile(gcs_path=ref_filename,
+                                          mime_type=ref_mimetype))
+      # TODO: If ref is a video chunk, then update new element of
+      # context_files according to ref.timestamp_start and ref.timestamp_stop
 
   # send prompt and additional context to model
   question_response = await llm_chat(question_prompt, llm_type,

diff --git a/components/llm_service/src/services/query/vector_store.py b/components/llm_service/src/services/query/vector_store.py
@@ -446,18 +446,20 @@ async def index_document_multimodal(self,
           doc[modality] = None
 
       # Get chunk embeddings
+      user_file_bytes = None
+      if doc["image"]:
+        user_file_bytes = b64decode(doc["image"])
       chunk_embedding = \
         await embeddings.get_multimodal_embeddings(
           user_text=doc["text"],
-          user_file_bytes=b64decode(doc["image"]),
+          user_file_bytes=user_file_bytes,
           embedding_type=self.embedding_type)
       # TODO: Also embed doc["video"] (video chunk) and
       # potentially doc["audio"] (audio chunk)
 
       # Check to make sure that embeddings for available modalities exist
       for modality in modality_list_sorted:
-        if modality in chunk_embedding.keys() and \
-          isinstance(chunk_embedding[modality][0], float):
+        if modality in chunk_embedding:
           chunk_texts.append(doc["text"])
           chunk_embeddings.append(chunk_embedding[modality])
           # Increment counter