-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_youtube_transcript_tool.py
120 lines (97 loc) · 4.16 KB
/
get_youtube_transcript_tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import sys
from langchain.tools import tool
from pytube import YouTube # for video title
import re
from youtube_transcript_api import YouTubeTranscriptApi
# Script-level switches and output settings.
show_raw_transcript = True  # when true, the title and transcript are echoed to stdout
output_as_file = True  # when true, the title and transcript are also written to output_file_name
delimiter = "; "  # separator inserted between transcript segments when they are joined
output_file_name = "transcript_output.txt"  # path of the file the transcript is written to
# Tool for grabbing a YouTube title from the provided url
@tool
def get_youtube_video_title(video_url: str) -> str:
    """
    Grabs a specified YouTube video's title.

    :param video_url: URL of the YouTube video.
    :return: The title
    """
    # The parameter is annotated as str to match get_youtube_transcript_segments,
    # so both tools declare the same typed input.
    # Title lookup itself is delegated to pytube's YouTube object.
    return YouTube(video_url).title
# Tool for extracting YouTube transcript segments
@tool
def get_youtube_transcript_segments(video_url: str) -> str:
    """
    Extracts the transcript of a YouTube video and returns it as delimiter-joined segments.

    The transcript text is fetched, "[Music]" markers are removed, the text is cut into
    segments of at most 800 characters (at a space where possible), and the segments are
    joined into one string with the module-level ``delimiter``.

    :param video_url: URL of the YouTube video.
    :return: A single string of transcript segments joined by ``delimiter``. If the video ID
             cannot be extracted, or fetching the transcript fails, the error message is
             returned as the string instead.
    """
    # Extract the video ID from the URL
    id_match = re.search(r"((?<=(v|V)/)|(?<=be/)|(?<=(\?|\&)v=)|(?<=embed/))([\w-]+)", video_url)
    if not id_match:
        # Errors are returned as plain strings so every path yields the same type as success.
        return "Unable to extract video ID from the URL."
    video_id = id_match.group(0)

    # Fetch the transcript of the video
    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Deliberately broad: any fetch failure is reported back as text rather than raised.
        return str(e)

    # Remove all instances of "[Music]" and combine the text of the transcript into a single string
    full_transcript = ' '.join(entry['text'].replace("[Music]", "") for entry in transcript_list)

    # Split transcript into segments using the nearest space approach
    segment_length = 800  # Adjust the segment length: Experiment with this number to find a balance
    segments = split_transcript_at_nearest_space(full_transcript, segment_length)

    # Use a unique delimiter to join segments into a single string
    return delimiter.join(segments)
def split_transcript_at_nearest_space(full_transcript: str, max_segment_length: int) -> list:
    """
    Splits the transcript into segments, trying to avoid cutting off in the middle of a word.

    Each segment is at most ``max_segment_length`` characters long. The cut is made at the
    last space inside that window; when the window contains no space (one very long word),
    the text is cut hard at ``max_segment_length``. Surrounding whitespace is stripped from
    every segment, and empty segments are never produced.

    :param full_transcript: The complete transcript as a single string.
    :param max_segment_length: Maximum character length for each segment (must be >= 1).
    :return: A list of transcript segments.
    :raises ValueError: If ``max_segment_length`` is less than 1.
    """
    if max_segment_length < 1:
        # A non-positive limit can never consume any text, so the loop below would never end.
        raise ValueError("max_segment_length must be at least 1")

    segments = []
    # Strip up front: a leading space could otherwise be picked as the split point
    # (index 0) and produce an empty first segment.
    remaining = full_transcript.strip()

    while len(remaining) > max_segment_length:
        # Find the nearest space before the max_segment_length
        split_index = remaining.rfind(' ', 0, max_segment_length)
        # If no space is found, default to the max length
        if split_index == -1:
            split_index = max_segment_length
        segments.append(remaining[:split_index].rstrip())
        # Drop the processed part together with the whitespace that separated it
        remaining = remaining[split_index:].lstrip()

    # Add the remaining part of the transcript
    if remaining:
        segments.append(remaining)
    return segments
# Main Flow
# User Input (YouTube Video URL): first command-line argument, otherwise an interactive prompt
default_url = 'https://www.youtube.com/watch?v=Lty7RAHKT9E'
if len(sys.argv) > 1:
    user_input_url = sys.argv[1]
else:
    user_input_url = input(">>> Please enter the YouTube video URL: ")

# Fall back to the default video when no URL was supplied
using_default_url = not user_input_url
if using_default_url:
    user_input_url = default_url
    print('>>> Paging Dr. Steve...')

# NOTE(review): both helpers are @tool-decorated objects; calling them directly relies on
# the installed LangChain version supporting that — confirm.
# Fetch the title first so the status message can actually include it.
video_title = get_youtube_video_title(user_input_url)
if not using_default_url:
    print(f'Grabbing transcript for {video_title}')

transcript_segments = get_youtube_transcript_segments(user_input_url)
transcript_title = "TITLE: " + video_title + "\nTRANSCRIPT: "
# Works whether transcript_segments is one string or a list of strings: joining with ""
# produces the same text as emitting its items one after another.
transcript_text = "".join(transcript_segments)

# Raw Transcript Print setup for Debugging/Clarity
if show_raw_transcript:
    print(transcript_title, end="")
    print(transcript_text, end="")

# Open a file in write mode
if output_as_file:
    # Explicit UTF-8 so non-ASCII transcript text does not depend on the platform default encoding
    with open(output_file_name, 'w', encoding='utf-8') as file:
        file.write(transcript_title)
        file.write(transcript_text)
        file.write('\n')
    print(f'\n.....\n{output_file_name} written!')