Skip to content

Commit

Permalink
Feature/blended TROPOMI+GOSAT download script (#205)
Browse files Browse the repository at this point in the history
* blended TROPOMI+GOSAT automated download script, plus change name of default satellite data directory
  • Loading branch information
laestrada authored May 1, 2024
1 parent 5729e75 commit aac774a
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 9 deletions.
20 changes: 15 additions & 5 deletions run_imi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ export PYTHONPATH=${PYTHONPATH}:${InversionPath}
## Download the TROPOMI data
##=======================================================================

# Download TROPOMI data from AWS. You will be charged if your ec2 instance is not in the eu-central-1 region.
# Download TROPOMI or blended dataset from AWS
mkdir -p -v ${RunDirs}
tropomiCache=${RunDirs}/data_TROPOMI
tropomiCache=${RunDirs}/satellite_data
if "$isAWS"; then
{ # test if instance has access to TROPOMI bucket
stdout=`aws s3 ls s3://meeo-s5p`
Expand All @@ -133,9 +133,19 @@ if "$isAWS"; then
exit 1
}
mkdir -p -v $tropomiCache
printf "Downloading TROPOMI data from S3\n"
python src/utilities/download_TROPOMI.py $StartDate $EndDate $tropomiCache
printf "\nFinished TROPOMI download\n"

if "$BlendedTROPOMI"; then
sbatch --mem $SimulationMemory \
-c $SimulationCPUs \
-t $RequestedTime \
-p $SchedulerPartition \
-o imi_output.tmp \
-W src/utilities/download_blended_TROPOMI.py $StartDate $EndDate $tropomiCache; wait;
cat imi_output.tmp >> ${InversionPath}/imi_output.log
rm imi_output.tmp
else
python src/utilities/download_TROPOMI.py $StartDate $EndDate $tropomiCache
fi
else
# use existing tropomi data and create a symlink to it
if [[ ! -L $tropomiCache ]]; then
Expand Down
2 changes: 1 addition & 1 deletion src/components/inversion_component/inversion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ run_inversion() {
if "$KalmanMode"; then
cd ${RunDirs}/kf_inversions/period${period_i}
# Modify inversion driver script to reflect current inversion period
sed -i "s|data_TROPOMI\"|data_TROPOMI\"\n\n# Defined via run_kf.sh:\nStartDate=${StartDate_i}\nEndDate=${EndDate_i}|g" run_inversion.sh
sed -i "s|satellite_data\"|satellite_data\"\n\n# Defined via run_kf.sh:\nStartDate=${StartDate_i}\nEndDate=${EndDate_i}|g" run_inversion.sh
if (( period_i > 1 )); then
FirstSimSwitch=false
fi
Expand Down
2 changes: 1 addition & 1 deletion src/components/preview_component/preview.sh
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ run_preview() {
config_path=${InversionPath}/${ConfigFile}
state_vector_path=${RunDirs}/StateVector.nc
preview_dir=${RunDirs}/${runDir}
tropomi_cache=${RunDirs}/data_TROPOMI
tropomi_cache=${RunDirs}/satellite_data
preview_file=${InversionPath}/src/inversion_scripts/imi_preview.py

# Run preview script
Expand Down
2 changes: 1 addition & 1 deletion src/components/statevector_component/statevector.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ reduce_dimension() {
native_state_vector_path=${RunDirs}/NativeStateVector.nc

preview_dir=${RunDirs}/preview_run
tropomi_cache=${RunDirs}/data_TROPOMI
tropomi_cache=${RunDirs}/satellite_data
aggregation_file=${InversionPath}/src/components/statevector_component/aggregation.py

if [[ ! -f ${RunDirs}/NativeStateVector.nc ]]; then
Expand Down
2 changes: 1 addition & 1 deletion src/inversion_scripts/run_inversion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ StateVectorFile={STATE_VECTOR_PATH}
GCDir="./data_geoschem"
JacobianDir="./data_converted"
sensiCache="./data_sensitivities"
tropomiCache="${OutputPath}/${RunName}/data_TROPOMI"
tropomiCache="${OutputPath}/${RunName}/satellite_data"

# For Kalman filter: assume first inversion period (( period_i = 1 )) by default
# Switch is flipped to false automatically if (( period_i > 1 ))
Expand Down
2 changes: 2 additions & 0 deletions src/utilities/download_TROPOMI.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,10 @@ def download_TROPOMI(startdate, enddate, Sat_datadir):
# Run the data download script
# Remove the file afterwards
os.chmod(DATA_DOWNLOAD_SCRIPT, 0o755)
print("=============Downloading TROPOMI Operational Data=============")
status = subprocess.call(DATA_DOWNLOAD_SCRIPT)
os.remove(DATA_DOWNLOAD_SCRIPT)
print("==================Finished TROPOMI Download ==================")


if __name__ == "__main__":
Expand Down
141 changes: 141 additions & 0 deletions src/utilities/download_blended_TROPOMI.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import boto3
import multiprocessing
from datetime import datetime, timedelta
from botocore import UNSIGNED
from botocore.client import Config

# Description: Download blended TROPOMI+GOSAT data from the AWS S3 bucket
#              for desired dates. Can be called from another script or run
#              directly as a script.
# Example usage as a script:
#     $ python download_blended_TROPOMI.py 20190101 20190214 TROPOMI_data


def initialize_boto3():
    """
    Create an anonymous (unsigned) boto3 S3 client for public-bucket access.
    Returns
        s3 [object] : boto3 s3 client
    """
    return boto3.client("s3", config=Config(signature_version=UNSIGNED))


def download_from_s3(args):
    """
    Fetch one object from s3 into the local storage directory, skipping
    the transfer when a file of the same name is already on disk.
    Arguments
        args [tuple] : (s3_path, bucket, storage_dir)
    """
    s3_path, bucket, storage_dir = args
    # Build an anonymous client inside this call — it runs in pool worker
    # processes, which receive only the picklable `args` tuple.
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
    local_file_path = os.path.join(storage_dir, os.path.basename(s3_path))

    if os.path.exists(local_file_path):
        # Already cached locally — nothing to do.
        print(f"File {local_file_path} already exists locally. Skipping download.")
    else:
        s3.download_file(bucket, s3_path, local_file_path)


def get_file_prefixes(start_date, end_date):
    """
    Build the S3 month-directory prefixes and per-day file prefixes for
    the supplied date range (end_date excluded).
    Arguments
        start_date [datetime] : start date of data download
        end_date [datetime] : end date of data download
    Returns
        months_list [list] : prefix dirs of all months
        days_list [list] : acceptable file path prefixes
    """
    # Layout conventions of the blended-dataset bucket
    subdir = "data/"
    file_prefix = "S5P_BLND_L2__CH4____"

    # Sets de-duplicate the month entries while walking day by day
    month_prefixes = set()
    day_prefixes = set()

    day = start_date
    while day < end_date:
        month_dir = subdir + day.strftime("%Y-%m") + "/"
        month_prefixes.add(month_dir)
        day_prefixes.add(month_dir + file_prefix + day.strftime("%Y%m%d"))
        day += timedelta(days=1)

    return sorted(month_prefixes), sorted(day_prefixes)


def get_s3_paths(start_date, end_date, bucket):
    """
    Gets s3 paths for download.
    Arguments
        start_date [datetime] : start date of data download (yyyymmdd)
        end_date [datetime] : end date of data download (yyyymmdd)
        bucket [str] : s3 bucket name
    Returns
        s3_paths [list] : list of s3 paths for download
    Raises
        Exception : if a month prefix has no objects in the bucket
    """
    month_prefix_list, file_prefix_list = get_file_prefixes(start_date, end_date)
    s3 = initialize_boto3()
    # str.startswith accepts a tuple of prefixes — one C-level call per key
    file_prefixes = tuple(file_prefix_list)
    s3_paths = []
    # Use a paginator: a plain list_objects call truncates at 1000 keys per
    # response, which would silently drop files for months with many granules.
    paginator = s3.get_paginator("list_objects_v2")
    for month_prefix in month_prefix_list:
        found_any = False
        for page in paginator.paginate(Bucket=bucket, Prefix=month_prefix):
            for obj in page.get("Contents", []):
                found_any = True
                if obj["Key"].startswith(file_prefixes):
                    s3_paths.append(obj["Key"])
        if not found_any:
            raise Exception(
                f"s3://{bucket}/{month_prefix} does not exist. Check if blended TROPOMI+GOSAT data exists for time period."
            )
    return s3_paths


def download_blended(start_date, end_date, storage_dir):
    """
    Download blended TROPOMI+GOSAT dataset from s3 to desired
    directory.
    Arguments
        start_date [datetime] : start date of data download (yyyymmdd)
        end_date [datetime] : end date of data download (yyyymmdd)
        storage_dir [str] : local directory to store downloaded files
    """
    bucket = "blended-tropomi-gosat-methane"
    s3_paths = get_s3_paths(start_date, end_date, bucket)
    os.makedirs(storage_dir, exist_ok=True)

    print("=============Downloading Blended TROPOMI+GOSAT Data=============")
    # Bound the worker count by the host's CPUs and the number of files so a
    # small instance is not asked to fork 112 processes (the previous
    # hard-coded pool size) for a handful of downloads.
    num_workers = max(1, min(112, os.cpu_count() or 1, len(s3_paths)))
    # `with` closes the pool on exit; map() blocks until all downloads finish.
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(
            download_from_s3, [(s3_path, bucket, storage_dir) for s3_path in s3_paths]
        )

    print("==============Finished Downloading Blended Dataset==============")


if __name__ == "__main__":
    # Command-line entry point. Dates are yyyymmdd; the end date is
    # excluded from the download range.
    if len(sys.argv) != 4:
        # Fail with a usage message instead of a bare IndexError
        sys.exit(
            "Usage: python download_blended_TROPOMI.py <startdate yyyymmdd> <enddate yyyymmdd> <Sat_datadir>"
        )
    start = sys.argv[1]
    end = sys.argv[2]
    Sat_datadir = sys.argv[3]
    start_date = datetime.strptime(start, "%Y%m%d")
    end_date = datetime.strptime(end, "%Y%m%d")
    download_blended(start_date, end_date, Sat_datadir)

0 comments on commit aac774a

Please sign in to comment.