Skip to content

Commit

Permalink
Merge pull request #62 from smcazares/649-txt-files2
Browse files Browse the repository at this point in the history
Broaden multimodal pipeline to process TXT files
  • Loading branch information
smcazares authored Oct 24, 2024
2 parents 3445de7 + 451ea9d commit 16e2983
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 15 deletions.
1 change: 0 additions & 1 deletion components/frontend_streamlit/src/pages/4_Query.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def chat_content():
st.image(chunk_url)
else:
logging.error("Reference modality unknown")
st.write("Reference modality unkown")
query_index = query_index + 1
st.divider()

Expand Down
25 changes: 23 additions & 2 deletions components/llm_service/src/services/query/data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,11 @@ def chunk_document_multimodal(self,
try:
doc_extension = doc_name.split(".")[-1]
doc_extension = doc_extension.lower()
if doc_extension != "pdf" and doc_extension not in allowed_image_types:
raise ValueError(f"{doc_name} must be a PDF, PNG, JPG, BMP, or GIF")
if (doc_extension != "pdf" and
doc_extension != "txt" and
doc_extension not in allowed_image_types):
raise ValueError(f"{doc_name} must be a PDF, TXT, "
f"PNG, JPG, BMP, or GIF")
# TODO: Insert elif statements to check for additional file types:
# video (AVI, MP4, MOV, etc.) and audio (MP3, WAV, etc.)
except Exception as e:
Expand Down Expand Up @@ -379,6 +382,24 @@ def chunk_document_multimodal(self,
}
doc_chunks.append(chunk_obj)

elif doc_extension == "txt":
# Chunk text in document
text_chunks = self.chunk_document(doc_name,
doc_url,
doc_filepath,
)
for text_chunk in text_chunks:
#TODO: Consider all characters in text_chunk, not just the first
#1023 (the [0:1023] slice below stops one short of 1024 characters)
text_chunk = text_chunk[0:1023]
# Push chunk object into chunk array
chunk_obj = {
"image": None,
"image_url": None,
"text": text_chunk,
}
doc_chunks.append(chunk_obj)

# TODO: Insert elif statements to chunk additional file types:
# video (AVI, MP4, MOV, etc.) and audio (MP3, WAV, etc.)
# - For images, set "image" and "text" fields of chunk_obj
Expand Down
17 changes: 8 additions & 9 deletions components/llm_service/src/services/query/query_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,15 +189,14 @@ async def query_generate(
# (from non-text info in query_references)
context_files = []
for ref in query_references:
if hasattr(ref, "modality") and ref.modality != "text":
if hasattr(ref, "chunk_url"):
ref_filename = ref.chunk_url
ref_mimetype = validate_multimodal_file_type(file_name=ref_filename,
file_b64=None)
context_files.append(DataSourceFile(gcs_path=ref_filename,
mime_type=ref_mimetype))
# TODO: If ref is a video chunk, then update new element of
# context_files according to ref.timestamp_start and ref.timestamp_stop
if ref.modality != "text" and ref.chunk_url:
ref_filename = ref.chunk_url
ref_mimetype = validate_multimodal_file_type(file_name=ref_filename,
file_b64=None)
context_files.append(DataSourceFile(gcs_path=ref_filename,
mime_type=ref_mimetype))
# TODO: If ref is a video chunk, then update new element of
# context_files according to ref.timestamp_start and ref.timestamp_stop

# send prompt and additional context to model
question_response = await llm_chat(question_prompt, llm_type,
Expand Down
8 changes: 5 additions & 3 deletions components/llm_service/src/services/query/vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,18 +446,20 @@ async def index_document_multimodal(self,
doc[modality] = None

# Get chunk embeddings
user_file_bytes = None
if doc["image"]:
user_file_bytes = b64decode(doc["image"])
chunk_embedding = \
await embeddings.get_multimodal_embeddings(
user_text=doc["text"],
user_file_bytes=b64decode(doc["image"]),
user_file_bytes=user_file_bytes,
embedding_type=self.embedding_type)
# TODO: Also embed doc["video"] (video chunk) and
# potentially doc["audio"] (audio chunk)

# Check to make sure that embeddings for available modalities exist
for modality in modality_list_sorted:
if modality in chunk_embedding.keys() and \
isinstance(chunk_embedding[modality][0], float):
if modality in chunk_embedding:
chunk_texts.append(doc["text"])
chunk_embeddings.append(chunk_embedding[modality])
# Increment counter
Expand Down

0 comments on commit 16e2983

Please sign in to comment.