Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Proof-of-concept] Added configurable reduction step after longform chunk transcript generation #205

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions podcastfy/content_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from langchain_community.chat_models import ChatLiteLLM
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.llms.llamafile import Llamafile
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from podcastfy.utils.config_conversation import load_conversation_config
Expand Down Expand Up @@ -503,6 +503,7 @@ def clean(self,
# Then apply additional long-form specific cleaning
return self._clean_transcript_response(standard_clean, config)


def _clean_transcript_response(self, transcript: str, config: Dict[str, Any]) -> str:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this rewriting code should not live in the `_clean_transcript_response` function.

"""
Clean transcript using a two-step process with LLM-based cleaning.
Expand All @@ -522,7 +523,40 @@ def _clean_transcript_response(self, transcript: str, config: Dict[str, Any]) ->
"""
logger.debug("Starting transcript cleaning process")

final_transcript = self._fix_alternating_tags(transcript)
# Run rewriting chain
llm = self.llm

analysis_prompt = PromptTemplate(
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This rewriting step should probably be off by default.

input_variables=["transcript"],
template=config.get("analysis_prompt_template", "You are a podcast editor. Analyze this podcast transcript and identify duplicated/repeated lines and recommendations to improve flow. Do not remove too many facts or add any new facts: \n\n{transcript} \n\nAnalysis (bullet-points, with line numbers referring to problematic lines.):")
)
analysis_chain = analysis_prompt | llm | StrOutputParser()

rewrite_prompt = PromptTemplate(
input_variables=["transcript", "analysis"],
template=config.get("rewrite_prompt_template", "Rewrite the podcast transcript by applying only the following recommendations. Refrain from shortening the transcript too much.\n\nRecommendations: \n\n{analysis}\n\nOriginal Transcript: \n\n{transcript}\n\nRewritten Transcript:")
)
rewrite_chain = rewrite_prompt | llm | StrOutputParser()

try:
logger.debug("Executing analysis chain")
analysis = analysis_chain.invoke({"transcript": transcript})
logger.debug(f"Successfully analyzed transcript: \n\n{analysis}")

logger.debug("Executing rewriting chain")
rewritten_response = rewrite_chain.invoke({"analysis": analysis, "transcript": transcript})
if not rewritten_response:
logger.warning("Rewriting chain returned empty response")
# Fall back to original
rewritten_response = transcript
logger.debug("Successfully rewrote transcript")
logger.debug(f"Successfully rewrote transcript, BEFORE = \n\n{transcript}")
logger.debug(f"Successfully rewrote transcript, AFTER = \n\n{rewritten_response}")
except Exception as e:
logger.error(f"Error in rewriting chain: {str(e)}")
rewritten_response = transcript # Fall back to original

final_transcript = self._fix_alternating_tags(rewritten_response)

logger.debug("Completed transcript cleaning process")

Expand Down