-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_many_ytCC.py
46 lines (36 loc) · 1.63 KB
/
get_many_ytCC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""Python script to extract the closed captions generated by
automated speech recognition on Youtube videos, and save each transcript
to the hard drive as CSV files.
Uses the easy-to-use YouTubeTranscriptApi module created by Jonas Depoix.
Earl K. Brown, ekbrown byu edu (add appropriate characters to create email)
Note: pip install youtube_transcript_api (v. 0.1.8+) before running.
"""
def get_ytcc(video_id, lang=None):
    """Get closed caption transcript of Youtube video created by ASR.

    param video_id: Youtube video ID, visible in the URL after "v=".
    param lang: optional language code of the speech in the video (e.g. "en").
        When omitted, no language preference is passed and the library's own
        default selection applies, exactly as in a plain one-argument call.
    return value: Pandas DataFrame built from the transcript entries returned
        by the library. When the transcript cannot be retrieved, a one-row
        DataFrame with the columns text, start and duration is returned
        instead; its text column explains the failure.
    """
    try:
        if lang is None:
            txt = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            # the library takes a list of language codes; pass the single
            # requested one
            txt = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
    # NOTE(review): this looks the exception up as an attribute of the API
    # class; newer releases of youtube_transcript_api appear to export it at
    # package level instead -- confirm against the installed version.
    except YouTubeTranscriptApi.CouldNotRetrieveTranscript:
        # placeholder row so callers always receive a DataFrame they can save
        txt = [{
            'text': (
                "Couldn't get text for this video. Double check that the "
                "uploader has enabled closed captions for it."
            ),
            'start': 0.0,
            'duration': 0.0,
        }]
    return pd.DataFrame(txt)
### test the function
# load modules
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import os
# directory where the transcripts should be saved; the CSV files below are
# written relative to it, so the script changes into it first
OUTPUT_DIR = "/Users/ekb5/Downloads/delete"
os.chdir(OUTPUT_DIR)

# put in the YouTube video IDs in this Python list
all_ids = ["dj9RR4BSqvM", "Y8Tko2YC5hA", "o4grwzpdl38", "vjci1lf0fj4"]

# loop over the list (the loop variable is named video_id so that the
# builtin id() is not shadowed)
for video_id in all_ids:
    # progress report
    print(f"\tWorking on {video_id}...")
    # call get_ytcc (defined earlier in this file) during each iteration
    res = get_ytcc(video_id)
    # save transcript to hard drive as <video_id>.csv, without the index column
    res.to_csv(f"{video_id}.csv", index=False)