-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
111 lines (85 loc) · 3.29 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import requests
import time
import pandas as pd
# AssemblyAI v2 URL that upload_file() POSTs the audio bytes to.
upload_endpoint = "https://api.assemblyai.com/v2/upload"
# AssemblyAI v2 URL that request_transcript() POSTs new transcription jobs to.
transcript_endpoint = "https://api.assemblyai.com/v2/transcript"
# Helper for `upload_file()`
def _read_file(filename, chunk_size=5242880):
with open(filename, "rb") as f:
while True:
data = f.read(chunk_size)
if not data:
break
yield data
# Uploads a file to AAI servers
def upload_file(audio_file, header):
    """POST a local file to ``upload_endpoint`` and return the decoded JSON reply.

    ``audio_file`` is the path handed to ``_read_file``; the chunks it yields
    are passed to ``requests.post`` as the request body. ``header`` is passed
    through unchanged as the request headers.
    """
    chunked_body = _read_file(audio_file)
    reply = requests.post(upload_endpoint, headers=header, data=chunked_body)
    return reply.json()
# Request transcript for file uploaded to AAI servers
def request_transcript(upload_url, header):
    """POST a transcription job to ``transcript_endpoint`` and return the JSON reply.

    ``upload_url`` is a mapping whose ``'upload_url'`` entry becomes the job's
    ``audio_url`` (presumably the value returned by ``upload_file`` - confirm
    against the caller). ``header`` is passed through as the request headers.
    """
    # Audio location first, then the analysis features switched on for the job.
    # NOTE: 'auto_chapters' is not requested here (it was commented out).
    job = {'audio_url': upload_url['upload_url']}
    job.update({
        "summarization": True,
        "summary_model": "informative",
        "summary_type": "paragraph",
        "entity_detection": True,
        "sentiment_analysis": True,
    })
    reply = requests.post(transcript_endpoint, json=job, headers=header)
    return reply.json()
# Make a polling endpoint
def make_polling_endpoint(transcript_response):
    """Return the transcript URL for the job described by ``transcript_response``.

    The job's ``'id'`` entry is appended to the v2 transcript base URL.
    """
    return "https://api.assemblyai.com/v2/transcript/" + transcript_response['id']
# Wait for the transcript to finish
def wait_for_completion(polling_endpoint, header, poll_interval=5):
    """Poll a transcript until it finishes and return its final JSON.

    Args:
        polling_endpoint: URL to GET for the transcript's current state.
        header: Request headers passed to every poll.
        poll_interval: Seconds to sleep between polls (defaults to 5).

    Returns:
        The decoded JSON response whose ``'status'`` is ``'completed'``.

    Raises:
        RuntimeError: If the response reports ``'status' == 'error'``.
            Without this check a failed job would be polled forever,
            because it never reaches ``'completed'``.
    """
    while True:
        polling_response = requests.get(polling_endpoint, headers=header).json()
        status = polling_response['status']
        if status == 'completed':
            return polling_response
        if status == 'error':
            # A failed job is terminal: stop polling and surface the reason.
            reason = polling_response.get('error', 'unknown error')
            raise RuntimeError("Transcription failed: " + str(reason))
        time.sleep(poll_interval)
# Get the paragraphs of the transcript
def get_paragraphs(polling_endpoint, header):
    """Fetch ``<polling_endpoint>/paragraphs`` and return its paragraphs as a list.

    The reply is decoded as JSON and the items under its ``'paragraphs'`` key
    are copied into a new list, in order.
    """
    reply = requests.get(polling_endpoint + "/paragraphs", headers=header)
    return list(reply.json()['paragraphs'])
# Return the chapters of the transcript
def get_chapters(polling_response):
    """Return the value stored under ``'chapters'`` in ``polling_response``.

    Nothing is printed; a missing key raises ``KeyError``.
    """
    chapters = polling_response['chapters']
    return chapters
# Print the summary of the transcript
def get_summary(polling_response):
    """Print the transcript's ``'summary'`` under a heading, followed by a blank line.

    The function only writes to standard output and returns nothing.
    """
    # One print with a newline separator emits the same three lines:
    # heading, summary text, empty line.
    print(
        'Here is the summary for the input transcript:',
        polling_response['summary'],
        '',
        sep='\n',
    )
# Print the entities of the transcript
def get_entities(polling_response):
    """Print a table counting each (entity_type, text) pair in the transcript.

    Args:
        polling_response: Mapping with an ``'entities'`` entry holding a list
            of records that each carry ``'entity_type'`` and ``'text'``.

    The function prints a heading, the count table and a blank line, and
    returns nothing. When ``'entities'`` is empty or ``None`` an empty table
    with the same columns is printed; previously the column selection raised
    ``KeyError`` in that case.
    """
    entities = polling_response['entities'] or []
    if entities:
        df = pd.DataFrame(entities)[['entity_type', 'text']]
        # Count occurrences of every distinct (entity_type, text) pair.
        df = (
            df.groupby(['entity_type', 'text'])
            .text.agg('count')
            .to_frame('count')
            .reset_index()
        )
    else:
        # Nothing detected: keep the output shape instead of crashing on the
        # missing columns of an empty frame.
        df = pd.DataFrame(columns=['entity_type', 'text', 'count'])
    print('The following table represents all entities present in the transcript:')
    print(df)
    print()
# Print the sentiment statistics of the transcript
def get_sentiments(polling_response):
    """Print how many analysed sentences fall under each sentiment label.

    Args:
        polling_response: Mapping with a ``'sentiment_analysis_results'``
            entry holding a list of records that each carry ``'sentiment'``.

    The function prints a heading, the count table and a blank line, and
    returns nothing. When the results are empty or ``None`` an empty table
    with the same columns is printed; previously the column selection raised
    ``KeyError`` in that case.
    """
    sentiments = polling_response['sentiment_analysis_results'] or []
    if sentiments:
        df = pd.DataFrame(sentiments)[['sentiment']]
        # Count the records carrying each distinct sentiment label.
        df = (
            df.groupby(['sentiment'])
            .sentiment.agg('count')
            .to_frame('count')
            .reset_index()
        )
    else:
        # No results: keep the output shape instead of crashing on the
        # missing column of an empty frame.
        df = pd.DataFrame(columns=['sentiment', 'count'])
    print('The following represents a count of sentences in the transcript with a sentiment:')
    print(df)
    print()