From 4969293af279e1164a638342903afb9f69efc38a Mon Sep 17 00:00:00 2001 From: aserfass Date: Thu, 14 Sep 2023 13:58:04 -0700 Subject: [PATCH 1/4] added python script to download and convert AWS TS file to MP3 --- .../awsAudioTS2Mp3DownloadandConvert.py | 197 ++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 ModelTraining/awsAudioTS2Mp3DownloadandConvert.py diff --git a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py new file mode 100644 index 00000000..7adcbdb5 --- /dev/null +++ b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py @@ -0,0 +1,197 @@ +# Get Started +# pip install ffmpeg-python python-dateutil +# AND +# winget install Gyan.FFmpeg +# you may need to add the FFmpeg path to system/user PATH variable +# "C:\Users\User\AppData\Local\Microsoft\WinGet\Links\ffmpeg.exe" +# or +# "C:\Users\User\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-6.0-full_build\bin\ffmpeg.exe" + +# USAGE +# python .\awsAudioTS2Mp3DownloadandConvert.py --date '2020-09-11 22:14:00 PST' --node rpi_orcasound_lab +# AWS Bucket defaults to: streaming-orcasound-net. Otherwise use --awsBucket streaming-orcasound-net to change bucket name + +# ffmpeg -i '.\live.m3u8' -c copy -bsf:a aac_adtstoasc demo.mp4 + +import datetime, argparse, os, requests, ffmpeg, shutil +from dateutil import parser +from glob import glob + +import boto3 +from botocore import UNSIGNED +from botocore.client import Config + +locations = [ + 'rpi_bush_point', + 'rpi_mast_center', + 'rpi_north_sjc', + 'rpi_orcasound_lab', + 'rpi_port_townsend', + 'rpi_sunset_bay' +] + +CLI=argparse.ArgumentParser() +CLI.add_argument( + "--date", + nargs=1, + type=str, + default=None, + help="The data of the audio data of interest" +) +CLI.add_argument( + "--node", + nargs=1, + type=str, + default='rpi_orcasound_lab', + help="The name of the node where the hydrophone is located. 
Default: rpi_orcasound_lab" +) +CLI.add_argument( + "--awsBucket", + nargs=1, + type=str, + default='streaming-orcasound-net', + help="The name of the AWS audio data is stored. Default: streaming-orcasound-net" +) +args = CLI.parse_args() + +assert(args.node in locations) + +s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED, region_name='us-east-1')) + +# test_epoch = 1539203407 +# '1543804333' 'rpi_orcasound_lab' +user_directory = os.environ['USERPROFILE'] +temp_dir_name = os.path.join('AppData\Local\Temp', 'orca_ffmpeg_temp') +temp_directory_path = os.path.join(user_directory, temp_dir_name) +os.makedirs(temp_directory_path, exist_ok=True) +streamingBucketURL = "https://streaming-orcasound-net.s3.amazonaws.com/" + +if args.date and len(args.date) == 1: + user_input = args.date[0] +elif len(args.date) > 1: + assert(len(args.date) == 1) # just placeholder failure +else: + user_input = input('Enter Date in format: YYYY-MM-DD HH:MM:SS TZ. Ex: 2020-09-11 22:14:00 PST') + # user_input = '2020-09-11 22:14:00 PST' #'2023-09-12 02:12:00' +if args.node: + node_name = args.node +else: + node_name = 'rpi_orcasound_lab' +if args.awsBucket: + my_bucket = args.awsBucket +else: + my_bucket = 'streaming-orcasound-net' +user_input_formatted = parser.parse(user_input) +# user_input_formatted = time.strptime(user_input, '%Y-%m-%d %H:%M:%S %Z') +print('time_formatted', user_input_formatted) +# user_input_formatted2 = time.strftime(user_input, '%Y-%m-%d %H:%M:%S') +# print('time_formatted2', user_input_formatted2) +user_input_epoch = user_input_formatted.timestamp()#time.mktime(user_input_formatted) +print('user_input_epoch', user_input_epoch) + + +def get_all_aws_objects(bucket_name, filename_prefix): + '''Get all files containing the prefix provided''' + all_object_keys = [e['Key'] for p in s3_client.get_paginator("list_objects_v2")\ + .paginate(Bucket=bucket_name, Prefix=filename_prefix) + for e in p['Contents']] + + return all_object_keys + + +def 
download(url: str, dest_folder: str): + # https://stackoverflow.com/questions/56950987/download-file-from-url-and-save-it-in-a-folder-python + if not os.path.exists(dest_folder): + os.makedirs(dest_folder) # create folder if it does not exist + + filename = url.split('/')[-1].replace(" ", "_") # be careful with file names + file_path = os.path.join(dest_folder, filename) + + r = requests.get(url, stream=True) + if r.ok: + print("saving to", os.path.abspath(file_path)) + with open(file_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024 * 8): + if chunk: + f.write(chunk) + f.flush() + os.fsync(f.fileno()) + else: # HTTP status code 4XX/5XX + print("Download failed: status code {}\n{}".format(r.status_code, r.text)) + +filename_prefix = node_name + '/hls/' + str(user_input_epoch)[:4] + +# Get all files containing the prefix provided +aws_files = get_all_aws_objects(bucket_name=my_bucket, filename_prefix=filename_prefix) + +# Filter out files to only retain directories (epoch folders) +aws_epochs = [aws_file.split('hls/')[-1].split('/')[0] for aws_file in aws_files] +aws_epochs = sorted(list(set(aws_epochs))) +print('aws_epochs', aws_epochs) +current_epoch = '' +epoch_needed = '' +# find the epoch folder that contains the epoch (i.e. 
timestamp) being requested +for aws_epoch in aws_epochs: + if current_epoch: + last_epoch = current_epoch + current_epoch = aws_epoch + if int(current_epoch) > int(user_input_epoch): + print(current_epoch, '>', str(int(user_input_epoch))) + epoch_needed = last_epoch + break + +if epoch_needed: + print('Epoch needed', epoch_needed) +else: + print("No Matching Epoch Found") +assert(epoch_needed) +epoch_folder = datetime.datetime.fromtimestamp(int(epoch_needed))#.strftime('%Y-%m-%d %H:%M:%S %Z') +print('Conversion to datetime', epoch_folder) +print(int(user_input_epoch) - int(epoch_needed)) + +# get files from desired epoch folder +filename_prefix = node_name + '/hls/' + epoch_needed +aws_files = get_all_aws_objects(bucket_name=my_bucket, filename_prefix=filename_prefix) + +# download the files locally to a temp folder +for aws_file in aws_files: + if '.' in aws_file: + download(streamingBucketURL+aws_file, dest_folder=temp_directory_path) + +# print("You provided $# arguments: $1, $2, $3, and $4") +# aws s3 sync s3://streaming-orcasound-net/rpi_$1/hls/$2/ . 
+ +glob_path = os.path.join(temp_directory_path, '*.ts') +glob_list_files = glob(glob_path) +for glob_list_file in glob_list_files: + filename = os.path.basename(glob_list_file) + if len(filename) <= 10: + new_filename = filename.replace('live', 'live0') + new_path = os.path.join(temp_directory_path, new_filename) + shutil.move(glob_list_file, new_path) +glob_list_files = glob(glob_path) +# for file in live*; do mv "$file" "${file#live}"; done; +# for i in *.ts ; do +# mv $i `printf '%04d' ${i%.ts}`.ts +# done +# with(mylist.txt +# printf "file '%s'\n" ./*.ts > mylist.txt +mylist_path = os.path.join(temp_directory_path, 'mylist.txt') +with open(mylist_path, "w", encoding="utf-8") as f: + for glob_list_file in glob_list_files: + f.write("file '" + glob_list_file + "'\n") +# assert(os.path.exists(mylist_path)) + +allTS_path = os.path.join(temp_directory_path, 'all.ts') +outputMp4_path = os.path.join(temp_directory_path, 'output.mp4') +outputMp3_path = os.path.join(temp_directory_path, 'output.mp3') +ffmpeg.input(mylist_path, f='concat', safe='0').output(allTS_path, c='copy').run() +# ffmpeg -f concat -safe 0 -i mylist.txt -c copy all.ts +ffmpeg.input(allTS_path).output(outputMp4_path, acodec='copy', bsf='aac_adtstoasc').run() +# ffmpeg -i all.ts -c:v libx264 -c:a copy -bsf:a aac_adtstoasc output.mp4 +ffmpeg.input(outputMp4_path).output(outputMp3_path).run() +# ffmpeg -i output.mp4 output.mp3 +# rm *.ts output.mp4 mylist.txt + +print('Epoch needed', epoch_needed) +print('Conversion to datetime', epoch_folder) \ No newline at end of file From 897c549adfb66f21669e650cc16e8cd569401eb9 Mon Sep 17 00:00:00 2001 From: aserfass Date: Thu, 14 Sep 2023 19:32:46 -0700 Subject: [PATCH 2/4] added awsAudioTS2Mp3DownloadandConvert.py instructions --- ModelTraining/README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ModelTraining/README.md b/ModelTraining/README.md index 4fd4df98..f968482b 100644 --- a/ModelTraining/README.md +++ 
b/ModelTraining/README.md @@ -78,3 +78,26 @@ The inference.py returns a dictionary - - [Pydub] (https://github.com/jiaaro/pydub/) - [tqdm] (https://github.com/tqdm/tqdm) - [sklearn](https://scikit-learn.org/stable/install.html) + +## AWS Audio Data (.ts files) +Use the `awsAudioTS2Mp3DownloadandConvert.py` Python script to retrieve the original audio data streamed from the hydrophones and convert that audio data to an mp3 file. + +This script will: +- convert the provided datetime stamp to epoch format +- look in AWS for the latest epoch folder that starts at or before the desired datetime +- download the `.ts` files to the user's temp (%temp%) folder. Ex: C:\Users\User\AppData\Local\Temp\orca_ffmpeg_temp +- use ffmpeg to combine the `.ts` files into one `.ts` file (`all.ts`), convert that one ts file to an mp4 file (`output.mp4`), and then convert the mp4 file to the final mp3 file (`output.mp3`). +- delete the `.ts` files, `all.ts`, and `output.mp4`. + +### Setup/dependencies +1. Run `pip install ffmpeg-python python-dateutil` to install the required Python dependencies. The script also imports `boto3` and `requests`; install them as well if they are not already present in your environment. +1. Run `winget install Gyan.FFmpeg`. You may need to add the FFmpeg path to the system/user PATH variable. These are example paths: +"C:\Users\User\AppData\Local\Microsoft\WinGet\Links\ffmpeg.exe" or "C:\Users\User\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-6.0-full_build\bin\ffmpeg.exe". Instructions to add an exe to path can be found here: https://medium.com/@kevinmarkvi/how-to-add-executables-to-your-path-in-windows-5ffa4ce61a53 + +### Usage +`python .\awsAudioTS2Mp3DownloadandConvert.py --date '2020-09-11 22:14:00 PST' --node rpi_orcasound_lab` + +- `--date` must be in the format shown and include the timezone. 
+- `--node` defaults to `rpi_orcasound_lab`, but the other hydrophones can be selected: 'rpi_bush_point', 'rpi_mast_center', 'rpi_north_sjc', 'rpi_orcasound_lab', 'rpi_port_townsend', or 'rpi_sunset_bay' + +The `output.mp3` file contains all of the audio data from the individual `.ts` files. \ No newline at end of file From 805aebd339c834c42c06d73ea7a803c82538dded Mon Sep 17 00:00:00 2001 From: aserfass Date: Thu, 14 Sep 2023 19:33:41 -0700 Subject: [PATCH 3/4] removed commented test code --- .../awsAudioTS2Mp3DownloadandConvert.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py index 7adcbdb5..31ed8a5b 100644 --- a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py +++ b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py @@ -11,6 +11,10 @@ # python .\awsAudioTS2Mp3DownloadandConvert.py --date '2020-09-11 22:14:00 PST' --node rpi_orcasound_lab # AWS Bucket defaults to: streaming-orcasound-net. Otherwise use --awsBucket streaming-orcasound-net to change bucket name +# To-Do +# Select a subset of the ts files to download based on the delta from the epoch datetime and user input datetime. This reduces the download and output file size. 
+# Rename the output file + # ffmpeg -i '.\live.m3u8' -c copy -bsf:a aac_adtstoasc demo.mp4 import datetime, argparse, os, requests, ffmpeg, shutil @@ -58,8 +62,7 @@ s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED, region_name='us-east-1')) -# test_epoch = 1539203407 -# '1543804333' 'rpi_orcasound_lab' +DELETE_TEMP_FILES = True user_directory = os.environ['USERPROFILE'] temp_dir_name = os.path.join('AppData\Local\Temp', 'orca_ffmpeg_temp') temp_directory_path = os.path.join(user_directory, temp_dir_name) @@ -84,9 +87,7 @@ user_input_formatted = parser.parse(user_input) # user_input_formatted = time.strptime(user_input, '%Y-%m-%d %H:%M:%S %Z') print('time_formatted', user_input_formatted) -# user_input_formatted2 = time.strftime(user_input, '%Y-%m-%d %H:%M:%S') -# print('time_formatted2', user_input_formatted2) -user_input_epoch = user_input_formatted.timestamp()#time.mktime(user_input_formatted) +user_input_epoch = user_input_formatted.timestamp() print('user_input_epoch', user_input_epoch) @@ -145,7 +146,7 @@ def download(url: str, dest_folder: str): else: print("No Matching Epoch Found") assert(epoch_needed) -epoch_folder = datetime.datetime.fromtimestamp(int(epoch_needed))#.strftime('%Y-%m-%d %H:%M:%S %Z') +epoch_folder = datetime.datetime.fromtimestamp(int(epoch_needed)) print('Conversion to datetime', epoch_folder) print(int(user_input_epoch) - int(epoch_needed)) @@ -158,9 +159,6 @@ def download(url: str, dest_folder: str): if '.' in aws_file: download(streamingBucketURL+aws_file, dest_folder=temp_directory_path) -# print("You provided $# arguments: $1, $2, $3, and $4") -# aws s3 sync s3://streaming-orcasound-net/rpi_$1/hls/$2/ . 
- glob_path = os.path.join(temp_directory_path, '*.ts') glob_list_files = glob(glob_path) for glob_list_file in glob_list_files: @@ -174,7 +172,6 @@ def download(url: str, dest_folder: str): # for i in *.ts ; do # mv $i `printf '%04d' ${i%.ts}`.ts # done -# with(mylist.txt # printf "file '%s'\n" ./*.ts > mylist.txt mylist_path = os.path.join(temp_directory_path, 'mylist.txt') with open(mylist_path, "w", encoding="utf-8") as f: @@ -193,5 +190,12 @@ def download(url: str, dest_folder: str): # ffmpeg -i output.mp4 output.mp3 # rm *.ts output.mp4 mylist.txt +# delete temp files +if DELETE_TEMP_FILES: + for f in glob(os.path.join(temp_directory_path, "*.ts")): + os.remove(f) + os.remove(allTS_path) + os.remove(outputMp4_path) + print('Epoch needed', epoch_needed) print('Conversion to datetime', epoch_folder) \ No newline at end of file From bfbb1a13c7a7d8ac22c66353082b4b1f9ca05251 Mon Sep 17 00:00:00 2001 From: aserfass Date: Fri, 15 Sep 2023 17:27:30 -0700 Subject: [PATCH 4/4] removed delete All.ts since it is deleted with all *.ts files --- ModelTraining/awsAudioTS2Mp3DownloadandConvert.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py index 31ed8a5b..2e361eb6 100644 --- a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py +++ b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py @@ -194,7 +194,6 @@ def download(url: str, dest_folder: str): if DELETE_TEMP_FILES: for f in glob(os.path.join(temp_directory_path, "*.ts")): os.remove(f) - os.remove(allTS_path) os.remove(outputMp4_path) print('Epoch needed', epoch_needed)