# get_many_ytCC.py

"""Python script to extract the closed captions generated by
automated speech recognition on Youtube videos, and save each transcript
to the hard drive as CSV files.
Uses the easy-to-use YouTubeTranscriptApi module created by Jonas Depoix.
Earl K. Brown, ekbrown byu edu (add appropriate characters to create email)
Note: pip install youtube_transcript_api (v. 0.1.8+) before running.
"""

# def get_ytcc(video_id, lang = "en"):
def get_ytcc(video_id):
    """Fetch the closed-caption transcript of a YouTube video.

    param video_id: YouTube video ID, visible in the URL after "v=".
    return value: Pandas DataFrame built from the transcript entries; the
        placeholder row returned on failure has the columns text, start
        and duration.
    """

    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id)
    except YouTubeTranscriptApi.CouldNotRetrieveTranscript:
        # no transcript available: hand back a single explanatory row so
        # the caller still receives a DataFrame
        placeholder = {
            'text': "Couldn't get text for this video. Double check that the uploader has enabled closed captions for it.",
            'start': 0.0,
            'duration': 0.0,
        }
        return pd.DataFrame([placeholder])

    return pd.DataFrame(entries)

### test the function

# load modules
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import os

# change directory to where the transcripts should be saved
# NOTE: this path is hard-coded and machine-specific; edit it before running
os.chdir("/Users/ekb5/Downloads/delete")

# put in the YouTube video IDs in this Python list
all_ids = ["dj9RR4BSqvM", "Y8Tko2YC5hA", "o4grwzpdl38", "vjci1lf0fj4"]

# loop over the list; the loop variable is named video_id rather than id
# so that the builtin id() is not shadowed at module level
for video_id in all_ids:

    # progress report
    print(f"\tWorking on {video_id}...")

    # call the function defined above during each iteration
    res = get_ytcc(video_id)

    # save transcript to hard drive as <video_id>.csv in the current directory
    res.to_csv(f"{video_id}.csv", index=False)