From 4969293af279e1164a638342903afb9f69efc38a Mon Sep 17 00:00:00 2001
From: aserfass <aserfass@microsoft.com>
Date: Thu, 14 Sep 2023 13:58:04 -0700
Subject: [PATCH 1/4] added python script to download and convert AWS TS file
 to MP3

---
 .../awsAudioTS2Mp3DownloadandConvert.py       | 197 ++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 ModelTraining/awsAudioTS2Mp3DownloadandConvert.py

diff --git a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py
new file mode 100644
index 00000000..7adcbdb5
--- /dev/null
+++ b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py
@@ -0,0 +1,197 @@
+# Get Started
+# pip install ffmpeg-python python-dateutil
+# AND
+# winget install Gyan.FFmpeg
+# you may need to add the FFmpeg path to system/user PATH variable
+# "C:\Users\User\AppData\Local\Microsoft\WinGet\Links\ffmpeg.exe"
+# or
+# "C:\Users\User\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-6.0-full_build\bin\ffmpeg.exe"
+
+# USAGE
+# python .\awsAudioTS2Mp3DownloadandConvert.py --date '2020-09-11 22:14:00 PST' --node rpi_orcasound_lab
+# AWS Bucket defaults to: streaming-orcasound-net. Otherwise use --awsBucket streaming-orcasound-net to change bucket name
+
+# ffmpeg -i '.\live.m3u8' -c copy -bsf:a aac_adtstoasc demo.mp4
+
+import datetime, argparse, os, requests, ffmpeg, shutil
+from dateutil import parser
+from glob import glob
+
+import boto3
+from botocore import UNSIGNED
+from botocore.client import Config
+
+locations = [
+    'rpi_bush_point',
+    'rpi_mast_center',
+    'rpi_north_sjc',
+    'rpi_orcasound_lab',
+    'rpi_port_townsend',
+    'rpi_sunset_bay'
+]
+
+CLI=argparse.ArgumentParser()
+CLI.add_argument(
+    "--date",
+    nargs=1,
+    type=str,
+    default=None,
+    help="The data of the audio data of interest"
+)
+CLI.add_argument(
+    "--node",
+    nargs=1,
+    type=str,
+    default='rpi_orcasound_lab',
+    help="The name of the node where the hydrophone is located. Default: rpi_orcasound_lab"
+)
+CLI.add_argument(
+    "--awsBucket",
+    nargs=1,
+    type=str,
+    default='streaming-orcasound-net',
+    help="The name of the AWS audio data is stored. Default: streaming-orcasound-net"
+)
+args = CLI.parse_args()
+
+assert(args.node in locations)
+
+s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED, region_name='us-east-1'))
+
+# test_epoch = 1539203407
+# '1543804333' 'rpi_orcasound_lab'
+user_directory = os.environ['USERPROFILE']
+temp_dir_name = os.path.join('AppData\Local\Temp', 'orca_ffmpeg_temp')
+temp_directory_path = os.path.join(user_directory, temp_dir_name)
+os.makedirs(temp_directory_path, exist_ok=True)
+streamingBucketURL = "https://streaming-orcasound-net.s3.amazonaws.com/"
+
+if args.date and len(args.date) == 1:
+    user_input = args.date[0]
+elif len(args.date) > 1:
+    assert(len(args.date) == 1) # just placeholder failure
+else:
+    user_input = input('Enter Date in format: YYYY-MM-DD HH:MM:SS TZ. Ex: 2020-09-11 22:14:00 PST')
+    # user_input = '2020-09-11 22:14:00 PST' #'2023-09-12 02:12:00'
+if args.node:
+    node_name = args.node
+else:
+    node_name = 'rpi_orcasound_lab'
+if args.awsBucket:
+    my_bucket = args.awsBucket
+else:
+    my_bucket = 'streaming-orcasound-net'
+user_input_formatted = parser.parse(user_input)
+# user_input_formatted = time.strptime(user_input, '%Y-%m-%d %H:%M:%S %Z')
+print('time_formatted', user_input_formatted)
+# user_input_formatted2 = time.strftime(user_input, '%Y-%m-%d %H:%M:%S')
+# print('time_formatted2', user_input_formatted2)
+user_input_epoch = user_input_formatted.timestamp()#time.mktime(user_input_formatted)
+print('user_input_epoch', user_input_epoch)
+
+
+def get_all_aws_objects(bucket_name, filename_prefix):
+    '''Get all files containing the prefix provided'''
+    all_object_keys = [e['Key'] for p in s3_client.get_paginator("list_objects_v2")\
+        .paginate(Bucket=bucket_name, Prefix=filename_prefix)
+        for e in p['Contents']]
+    
+    return all_object_keys
+
+
+def download(url: str, dest_folder: str):
+    # https://stackoverflow.com/questions/56950987/download-file-from-url-and-save-it-in-a-folder-python
+    if not os.path.exists(dest_folder):
+        os.makedirs(dest_folder)  # create folder if it does not exist
+
+    filename = url.split('/')[-1].replace(" ", "_")  # be careful with file names
+    file_path = os.path.join(dest_folder, filename)
+
+    r = requests.get(url, stream=True)
+    if r.ok:
+        print("saving to", os.path.abspath(file_path))
+        with open(file_path, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=1024 * 8):
+                if chunk:
+                    f.write(chunk)
+                    f.flush()
+                    os.fsync(f.fileno())
+    else:  # HTTP status code 4XX/5XX
+        print("Download failed: status code {}\n{}".format(r.status_code, r.text))
+
+filename_prefix = node_name + '/hls/' + str(user_input_epoch)[:4]
+
+# Get all files containing the prefix provided
+aws_files = get_all_aws_objects(bucket_name=my_bucket, filename_prefix=filename_prefix)
+
+# Filter out files to only retain directories (epoch folders)
+aws_epochs = [aws_file.split('hls/')[-1].split('/')[0] for aws_file in aws_files]
+aws_epochs = sorted(list(set(aws_epochs)))
+print('aws_epochs', aws_epochs)
+current_epoch = ''
+epoch_needed = ''
+# find the epoch folder that contains the epoch (i.e. timestamp) being requested
+for aws_epoch in aws_epochs:
+    if current_epoch:
+        last_epoch = current_epoch
+    current_epoch = aws_epoch
+    if int(current_epoch) > int(user_input_epoch):
+        print(current_epoch, '>', str(int(user_input_epoch)))
+        epoch_needed = last_epoch
+        break
+    
+if epoch_needed:
+    print('Epoch needed', epoch_needed)
+else:
+    print("No Matching Epoch Found")
+assert(epoch_needed)
+epoch_folder = datetime.datetime.fromtimestamp(int(epoch_needed))#.strftime('%Y-%m-%d %H:%M:%S %Z')
+print('Conversion to datetime', epoch_folder)
+print(int(user_input_epoch) - int(epoch_needed))
+
+# get files from desired epoch folder
+filename_prefix = node_name + '/hls/' + epoch_needed
+aws_files = get_all_aws_objects(bucket_name=my_bucket, filename_prefix=filename_prefix)
+
+# download the files locally to a temp folder
+for aws_file in aws_files:
+    if '.' in aws_file:
+        download(streamingBucketURL+aws_file, dest_folder=temp_directory_path)
+
+# print("You provided $# arguments: $1, $2, $3, and $4")
+# aws s3 sync s3://streaming-orcasound-net/rpi_$1/hls/$2/ .
+
+glob_path = os.path.join(temp_directory_path, '*.ts')
+glob_list_files = glob(glob_path)
+for glob_list_file in glob_list_files:
+    filename = os.path.basename(glob_list_file)
+    if len(filename) <= 10:
+        new_filename = filename.replace('live', 'live0')
+        new_path = os.path.join(temp_directory_path, new_filename)
+        shutil.move(glob_list_file, new_path)
+glob_list_files = glob(glob_path)
+# for file in live*; do mv "$file" "${file#live}"; done;
+# for i in *.ts ; do
+#     mv $i `printf '%04d' ${i%.ts}`.ts
+# done
+# with(mylist.txt
+# printf "file '%s'\n" ./*.ts > mylist.txt
+mylist_path = os.path.join(temp_directory_path, 'mylist.txt')
+with open(mylist_path, "w", encoding="utf-8") as f:
+    for glob_list_file in glob_list_files:
+        f.write("file '" + glob_list_file + "'\n")
+# assert(os.path.exists(mylist_path))
+
+allTS_path = os.path.join(temp_directory_path, 'all.ts')
+outputMp4_path = os.path.join(temp_directory_path, 'output.mp4')
+outputMp3_path = os.path.join(temp_directory_path, 'output.mp3')
+ffmpeg.input(mylist_path, f='concat', safe='0').output(allTS_path, c='copy').run()
+# ffmpeg -f concat -safe 0 -i mylist.txt -c copy all.ts
+ffmpeg.input(allTS_path).output(outputMp4_path, acodec='copy', bsf='aac_adtstoasc').run()
+# ffmpeg -i all.ts -c:v libx264 -c:a copy -bsf:a aac_adtstoasc output.mp4
+ffmpeg.input(outputMp4_path).output(outputMp3_path).run()
+# ffmpeg -i output.mp4 output.mp3
+# rm *.ts output.mp4 mylist.txt
+
+print('Epoch needed', epoch_needed)
+print('Conversion to datetime', epoch_folder)
\ No newline at end of file

From 897c549adfb66f21669e650cc16e8cd569401eb9 Mon Sep 17 00:00:00 2001
From: aserfass <aserfass@microsoft.com>
Date: Thu, 14 Sep 2023 19:32:46 -0700
Subject: [PATCH 2/4] added awsAudioTS2Mp3DownloadandConvert.py instructions

---
 ModelTraining/README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/ModelTraining/README.md b/ModelTraining/README.md
index 4fd4df98..f968482b 100644
--- a/ModelTraining/README.md
+++ b/ModelTraining/README.md
@@ -78,3 +78,26 @@ The inference.py returns a dictionary -
 - [Pydub] (https://github.com/jiaaro/pydub/)
 - [tqdm] (https://github.com/tqdm/tqdm)
 - [sklearn](https://scikit-learn.org/stable/install.html)
+
+## AWS Audio Data (.ts files)
+Use the `awsAudioTS2Mp3DownloadandConvert.py` python script to retrieve the original audio data streamed from the hydrophones and convert that audio data to an mp3 file.
+
+This script will:
+- convert the provided datetime stamp to epoch format
+- look in aws for the epoch closest (less than) the desired datetime
+- download the `.ts` files to the users temp (%temp%) folder. Ex: C:\Users\User\AppData\Local\Temp\orca_ffmpeg_temp
+- use ffmpeg to convert the `.ts` files into one `.ts` file (`all.ts`), convert that one ts file to an mp4 file, and then convert the mp4 file (`output.mp4`) to the final mp3 file (`output.mp3`).
+- delete the `.ts` files, `all.ts`, and `output.mp4`.
+
+### Setup/dependencies
+1. Run `pip install ffmpeg-python python-dateutil` to install the required python dependencies
+1. Run `winget install Gyan.FFmpeg`.  You may need to add the FFmpeg path to system/user PATH variable. These are example paths:
+"C:\Users\User\AppData\Local\Microsoft\WinGet\Links\ffmpeg.exe" or "C:\Users\User\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-6.0-full_build\bin\ffmpeg.exe". Instructions to add an exe to path can be found here: https://medium.com/@kevinmarkvi/how-to-add-executables-to-your-path-in-windows-5ffa4ce61a53
+
+### Usage
+`python .\awsAudioTS2Mp3DownloadandConvert.py --date '2020-09-11 22:14:00 PST' --node rpi_orcasound_lab`
+
+- date must be in the format shown and include the timezone.
+- node defaults to `rpi_orcasound_lab` but the other hydrophones can be selected: 'rpi_bush_point', 'rpi_mast_center' 'rpi_north_sjc', 'rpi_orcasound_lab', 'rpi_port_townsend', or 'rpi_sunset_bay'
+
+The `output.mp3` file contains all of the audio data from the individual `.ts` files.
\ No newline at end of file

From 805aebd339c834c42c06d73ea7a803c82538dded Mon Sep 17 00:00:00 2001
From: aserfass <aserfass@microsoft.com>
Date: Thu, 14 Sep 2023 19:33:41 -0700
Subject: [PATCH 3/4] removed commented test code

---
 .../awsAudioTS2Mp3DownloadandConvert.py       | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py
index 7adcbdb5..31ed8a5b 100644
--- a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py
+++ b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py
@@ -11,6 +11,10 @@
 # python .\awsAudioTS2Mp3DownloadandConvert.py --date '2020-09-11 22:14:00 PST' --node rpi_orcasound_lab
 # AWS Bucket defaults to: streaming-orcasound-net. Otherwise use --awsBucket streaming-orcasound-net to change bucket name
 
+# To-Do
+# Select a subset of the ts files to download based on the delta from the epoch datetime and user input datetime. This reduces the download and output file size.
+# Rename the output file
+
 # ffmpeg -i '.\live.m3u8' -c copy -bsf:a aac_adtstoasc demo.mp4
 
 import datetime, argparse, os, requests, ffmpeg, shutil
@@ -58,8 +62,7 @@
 
 s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED, region_name='us-east-1'))
 
-# test_epoch = 1539203407
-# '1543804333' 'rpi_orcasound_lab'
+DELETE_TEMP_FILES = True
 user_directory = os.environ['USERPROFILE']
 temp_dir_name = os.path.join('AppData\Local\Temp', 'orca_ffmpeg_temp')
 temp_directory_path = os.path.join(user_directory, temp_dir_name)
@@ -84,9 +87,7 @@
 user_input_formatted = parser.parse(user_input)
 # user_input_formatted = time.strptime(user_input, '%Y-%m-%d %H:%M:%S %Z')
 print('time_formatted', user_input_formatted)
-# user_input_formatted2 = time.strftime(user_input, '%Y-%m-%d %H:%M:%S')
-# print('time_formatted2', user_input_formatted2)
-user_input_epoch = user_input_formatted.timestamp()#time.mktime(user_input_formatted)
+user_input_epoch = user_input_formatted.timestamp()
 print('user_input_epoch', user_input_epoch)
 
 
@@ -145,7 +146,7 @@ def download(url: str, dest_folder: str):
 else:
     print("No Matching Epoch Found")
 assert(epoch_needed)
-epoch_folder = datetime.datetime.fromtimestamp(int(epoch_needed))#.strftime('%Y-%m-%d %H:%M:%S %Z')
+epoch_folder = datetime.datetime.fromtimestamp(int(epoch_needed))
 print('Conversion to datetime', epoch_folder)
 print(int(user_input_epoch) - int(epoch_needed))
 
@@ -158,9 +159,6 @@ def download(url: str, dest_folder: str):
     if '.' in aws_file:
         download(streamingBucketURL+aws_file, dest_folder=temp_directory_path)
 
-# print("You provided $# arguments: $1, $2, $3, and $4")
-# aws s3 sync s3://streaming-orcasound-net/rpi_$1/hls/$2/ .
-
 glob_path = os.path.join(temp_directory_path, '*.ts')
 glob_list_files = glob(glob_path)
 for glob_list_file in glob_list_files:
@@ -174,7 +172,6 @@ def download(url: str, dest_folder: str):
 # for i in *.ts ; do
 #     mv $i `printf '%04d' ${i%.ts}`.ts
 # done
-# with(mylist.txt
 # printf "file '%s'\n" ./*.ts > mylist.txt
 mylist_path = os.path.join(temp_directory_path, 'mylist.txt')
 with open(mylist_path, "w", encoding="utf-8") as f:
@@ -193,5 +190,12 @@ def download(url: str, dest_folder: str):
 # ffmpeg -i output.mp4 output.mp3
 # rm *.ts output.mp4 mylist.txt
 
+# delete temp files
+if DELETE_TEMP_FILES:
+    for f in glob(os.path.join(temp_directory_path, "*.ts")):
+        os.remove(f)
+    os.remove(allTS_path)
+    os.remove(outputMp4_path)
+
 print('Epoch needed', epoch_needed)
 print('Conversion to datetime', epoch_folder)
\ No newline at end of file

From bfbb1a13c7a7d8ac22c66353082b4b1f9ca05251 Mon Sep 17 00:00:00 2001
From: aserfass <aserfass@microsoft.com>
Date: Fri, 15 Sep 2023 17:27:30 -0700
Subject: [PATCH 4/4] removed delete All.ts since it is deleted with all *.ts
 files

---
 ModelTraining/awsAudioTS2Mp3DownloadandConvert.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py
index 31ed8a5b..2e361eb6 100644
--- a/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py
+++ b/ModelTraining/awsAudioTS2Mp3DownloadandConvert.py
@@ -194,7 +194,6 @@ def download(url: str, dest_folder: str):
 if DELETE_TEMP_FILES:
     for f in glob(os.path.join(temp_directory_path, "*.ts")):
         os.remove(f)
-    os.remove(allTS_path)
     os.remove(outputMp4_path)
 
 print('Epoch needed', epoch_needed)