From d9e2076885055ec24c97c365632b9781a1df80a9 Mon Sep 17 00:00:00 2001 From: Kilian Vos Date: Thu, 23 Jul 2020 18:45:01 +1000 Subject: [PATCH 1/3] Update SDS_download.py Sentinel-2 geometric quality property has changed named in latest part of the archive --- coastsat/SDS_download.py | 276 +++++++++++++++++++-------------------- 1 file changed, 135 insertions(+), 141 deletions(-) diff --git a/coastsat/SDS_download.py b/coastsat/SDS_download.py index ca7fe00d..4c4fa279 100644 --- a/coastsat/SDS_download.py +++ b/coastsat/SDS_download.py @@ -1,7 +1,7 @@ """ -This module contains all the functions needed to download the satellite images +This module contains all the functions needed to download the satellite images from the Google Earth Engine server - + Author: Kilian Vos, Water Research Laboratory, University of New South Wales """ @@ -77,26 +77,26 @@ def retrieve_images(inputs): date, filename, georeferencing accuracy and image coordinate reference system """ - + # initialise connection with GEE server ee.Initialize() - + # check image availabiliy and retrieve list of images im_dict_T1, im_dict_T2 = check_images_available(inputs) - + # if user also wants to download T2 images, merge both lists if 'include_T2' in inputs.keys(): for key in inputs['sat_list']: if key == 'S2': continue else: im_dict_T1[key] += im_dict_T2[key] - + # remove UTM duplicates in S2 collections (they provide several projections for same images) - if 'S2' in inputs['sat_list'] and len(im_dict_T1['S2'])>0: + if 'S2' in inputs['sat_list'] and len(im_dict_T1['S2'])>0: im_dict_T1['S2'] = filter_S2_collection(im_dict_T1['S2']) - + # create a new directory for this site with the name of the site im_folder = os.path.join(inputs['filepath'],inputs['sitename']) - if not os.path.exists(im_folder): os.makedirs(im_folder) + if not os.path.exists(im_folder): os.makedirs(im_folder) print('\nDownloading images:') suffix = '.tif' @@ -107,17 +107,17 @@ def retrieve_images(inputs): # initialise variables and loop through images georef_accs = []; filenames = []; all_names = []; im_epsg = [] for i in range(len(im_dict_T1[satname])): - + im_meta = im_dict_T1[satname][i] - + # get time of acquisition (UNIX time) and convert to datetime t = im_meta['properties']['system:time_start'] im_timestamp = datetime.fromtimestamp(t/1000, tz=pytz.utc) - im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S') + im_date = im_timestamp.strftime('%Y-%m-%d-%H-%M-%S') # get epsg code im_epsg.append(int(im_meta['bands'][0]['crs'][5:])) - + # get geometric accuracy if satname in ['L5','L7','L8']: if 'GEOMETRIC_RMSE_MODEL' in im_meta['properties'].keys(): @@ -128,31 +128,26 @@ def retrieve_images(inputs): # Sentinel-2 products don't provide a georeferencing accuracy (RMSE as in Landsat) # but they have a flag indicating if the geometric quality control was passed or failed # if passed a value of 1 is stored if failed a value of -1 is stored in the metadata - skip_geo_check = False - if 'GEOMETRIC_QUALITY_FLAG' in im_meta['properties'].keys(): - key = 'GEOMETRIC_QUALITY_FLAG' - elif 'quality_check' in im_meta['properties'].keys(): - key = 'quality_check' - else: - acc_georef = -1 - skip_geo_check = True - if not skip_geo_check: - if im_meta['properties'][key] == 'PASSED': acc_georef = 1 - else: acc_georef = -1 + # the name of the property containing the flag changes across the S2 archive + # check which flag name is used for the image and store the 1/-1 for acc_georef + flag_names = ['GEOMETRIC_QUALITY_FLAG', 'GEOMETRIC_QUALITY', 'quality_check'] + for key in 
flag_names: if key in im_meta['properties'].keys(): break + if im_meta['properties'][key] == 'PASSED': acc_georef = 1 + else: acc_georef = -1 georef_accs.append(acc_georef) - + bands = dict([]) im_fn = dict([]) # first delete dimensions key from dictionnary # otherwise the entire image is extracted (don't know why) im_bands = im_meta['bands'] for j in range(len(im_bands)): del im_bands[j]['dimensions'] - + # Landsat 5 download if satname == 'L5': bands[''] = [im_bands[0], im_bands[1], im_bands[2], im_bands[3], - im_bands[4], im_bands[7]] - im_fn[''] = im_date + '_' + satname + '_' + inputs['sitename'] + suffix + im_bands[4], im_bands[7]] + im_fn[''] = im_date + '_' + satname + '_' + inputs['sitename'] + suffix # if two images taken at the same date add 'dup' to the name (duplicate) if any(im_fn[''] in _ for _ in all_names): im_fn[''] = im_date + '_' + satname + '_' + inputs['sitename'] + '_dup' + suffix @@ -176,8 +171,8 @@ def retrieve_images(inputs): filename_txt = im_fn[''].replace('.tif','') metadict = {'filename':im_fn[''],'acc_georef':georef_accs[i], 'epsg':im_epsg[i]} - - # Landsat 7 and 8 download + + # Landsat 7 and 8 download elif satname in ['L7', 'L8']: if satname == 'L7': bands['pan'] = [im_bands[8]] # panchromatic band @@ -192,9 +187,9 @@ def retrieve_images(inputs): # if two images taken at the same date add 'dup' to the name (duplicate) if any(im_fn['pan'] in _ for _ in all_names): for key in bands.keys(): - im_fn[key] = im_date + '_' + satname + '_' + inputs['sitename'] + '_' + key + '_dup' + suffix + im_fn[key] = im_date + '_' + satname + '_' + inputs['sitename'] + '_' + key + '_dup' + suffix all_names.append(im_fn['pan']) - filenames.append(im_fn['pan']) + filenames.append(im_fn['pan']) # download .tif from EE (panchromatic band and multispectral bands) while True: try: @@ -209,18 +204,18 @@ def retrieve_images(inputs): os.rename(local_data_pan, os.path.join(filepaths[1], im_fn['pan'])) except: # overwrite if already exists os.remove(os.path.join(filepaths[1], im_fn['pan'])) - os.rename(local_data_pan, os.path.join(filepaths[1], im_fn['pan'])) + os.rename(local_data_pan, os.path.join(filepaths[1], im_fn['pan'])) try: # multispectral os.rename(local_data_ms, os.path.join(filepaths[2], im_fn['ms'])) except: # overwrite if already exists os.remove(os.path.join(filepaths[2], im_fn['ms'])) - os.rename(local_data_ms, os.path.join(filepaths[2], im_fn['ms'])) + os.rename(local_data_ms, os.path.join(filepaths[2], im_fn['ms'])) # metadata for .txt file filename_txt = im_fn['pan'].replace('_pan','').replace('.tif','') metadict = {'filename':im_fn['pan'],'acc_georef':georef_accs[i], 'epsg':im_epsg[i]} - - # Sentinel-2 download + + # Sentinel-2 download elif satname in ['S2']: bands['10m'] = [im_bands[1], im_bands[2], im_bands[3], im_bands[7]] # multispectral bands bands['20m'] = [im_bands[11]] # SWIR band @@ -230,13 +225,13 @@ def retrieve_images(inputs): # if two images taken at the same date add 'dup' to the name (duplicate) if any(im_fn['10m'] in _ for _ in all_names): for key in bands.keys(): - im_fn[key] = im_date + '_' + satname + '_' + inputs['sitename'] + '_' + key + '_dup' + suffix + im_fn[key] = im_date + '_' + satname + '_' + inputs['sitename'] + '_' + key + '_dup' + suffix # also check for triplicates (only on S2 imagery) and add 'tri' to the name if im_fn['10m'] in all_names: for key in bands.keys(): - im_fn[key] = im_date + '_' + satname + '_' + inputs['sitename'] + '_' + key + '_tri' + suffix + im_fn[key] = im_date + '_' + satname + '_' + inputs['sitename'] + '_' + key 
+ '_tri' + suffix all_names.append(im_fn['10m']) - filenames.append(im_fn['10m']) + filenames.append(im_fn['10m']) # download .tif from EE (multispectral bands at 3 different resolutions) while True: try: @@ -252,12 +247,12 @@ def retrieve_images(inputs): os.rename(local_data_10m, os.path.join(filepaths[1], im_fn['10m'])) except: # overwrite if already exists os.remove(os.path.join(filepaths[1], im_fn['10m'])) - os.rename(local_data_10m, os.path.join(filepaths[1], im_fn['10m'])) + os.rename(local_data_10m, os.path.join(filepaths[1], im_fn['10m'])) try: # 20m os.rename(local_data_20m, os.path.join(filepaths[2], im_fn['20m'])) except: # overwrite if already exists os.remove(os.path.join(filepaths[2], im_fn['20m'])) - os.rename(local_data_20m, os.path.join(filepaths[2], im_fn['20m'])) + os.rename(local_data_20m, os.path.join(filepaths[2], im_fn['20m'])) try: # 60m os.rename(local_data_60m, os.path.join(filepaths[3], im_fn['60m'])) except: # overwrite if already exists @@ -267,19 +262,19 @@ def retrieve_images(inputs): filename_txt = im_fn['10m'].replace('_10m','').replace('.tif','') metadict = {'filename':im_fn['10m'],'acc_georef':georef_accs[i], 'epsg':im_epsg[i]} - + # write metadata with open(os.path.join(filepaths[0],filename_txt + '.txt'), 'w') as f: for key in metadict.keys(): - f.write('%s\t%s\n'%(key,metadict[key])) + f.write('%s\t%s\n'%(key,metadict[key])) # print percentage completion for user - print('\r%d%%' %int((i+1)/len(im_dict_T1[satname])*100), end='') - + print('\r%d%%' %int((i+1)/len(im_dict_T1[satname])*100), end='') + print('') - + # once all images have been downloaded, load metadata from .txt files metadata = get_metadata(inputs) - + # merge overlapping images (necessary only if the polygon is at the boundary of an image) if 'S2' in metadata.keys(): if int(ee.__version__[-3:]) <= 201: @@ -303,40 +298,40 @@ def retrieve_images(inputs): def check_images_available(inputs): """ Create the structure of subfolders for each satellite mission - - KV WRL 2018 - + + KV WRL 2018 + Arguments: ----------- - inputs: dict + inputs: dict inputs dictionnary - + Returns: ----------- im_dict_T1: list of dict list of images in Tier 1 and Level-1C im_dict_T2: list of dict - list of images in Tier 2 (Landsat only) + list of images in Tier 2 (Landsat only) """ - + # check if EE was initialised or not try: ee.ImageCollection('LANDSAT/LT05/C01/T1_TOA') except: ee.Initialize() - - print('Images available between %s and %s:'%(inputs['dates'][0],inputs['dates'][1]), end='\n') + + print('Images available between %s and %s:'%(inputs['dates'][0],inputs['dates'][1]), end='\n') # check how many images are available in Tier 1 and Sentinel Level-1C col_names_T1 = {'L5':'LANDSAT/LT05/C01/T1_TOA', 'L7':'LANDSAT/LE07/C01/T1_TOA', 'L8':'LANDSAT/LC08/C01/T1_TOA', 'S2':'COPERNICUS/S2'} - + print('- In Landsat Tier 1 & Sentinel-2 Level-1C:') im_dict_T1 = dict([]) sum_img = 0 for satname in inputs['sat_list']: - + # get list of images in EE collection while True: try: @@ -352,13 +347,13 @@ def check_images_available(inputs): sum_img = sum_img + len(im_list_upt) print(' %s: %d images'%(satname,len(im_list_upt))) im_dict_T1[satname] = im_list_upt - + print(' Total: %d images'%sum_img) - + # in only S2 is in sat_list, stop here if len(inputs['sat_list']) == 1 and inputs['sat_list'][0] == 'S2': return im_dict_T1, [] - + # otherwise check how many images are available in Landsat Tier 2 col_names_T2 = {'L5':'LANDSAT/LT05/C01/T2_TOA', 'L7':'LANDSAT/LE07/C01/T2_TOA', @@ -384,21 +379,21 @@ def 
check_images_available(inputs): print(' %s: %d images'%(satname,len(im_list_upt))) im_dict_T2[satname] = im_list_upt - print(' Total: %d images'%sum_img) + print(' Total: %d images'%sum_img) return im_dict_T1, im_dict_T2 def download_tif(image, polygon, bandsId, filepath): """ - Downloads a .TIF image from the ee server. The image is downloaded as a - zip file then moved to the working directory, unzipped and stacked into a + Downloads a .TIF image from the ee server. The image is downloaded as a + zip file then moved to the working directory, unzipped and stacked into a single .TIF file. - + Two different codes based on which version of the earth-engine-api is being used. - - KV WRL 2018 + + KV WRL 2018 Arguments: ----------- @@ -410,13 +405,13 @@ def download_tif(image, polygon, bandsId, filepath): bandsId: list of dict list of bands to be downloaded filepath: location where the temporary file should be saved - + Returns: ----------- - Downloads an image in a file named data.tif - + Downloads an image in a file named data.tif + """ - + # for the old version of ee only if int(ee.__version__[-3:]) <= 201: url = ee.data.makeDownloadUrl(ee.data.getDownloadId({ @@ -444,7 +439,7 @@ def download_tif(image, polygon, bandsId, filepath): # move zipfile from temp folder to data folder dest_file = os.path.join(filepath, 'imagezip') shutil.move(local_zip,dest_file) - # unzip file + # unzip file with zipfile.ZipFile(dest_file) as local_zipfile: for fn in local_zipfile.namelist(): local_zipfile.extract(fn, filepath) @@ -469,22 +464,22 @@ def download_tif(image, polygon, bandsId, filepath): def create_folder_structure(im_folder, satname): """ Create the structure of subfolders for each satellite mission - - KV WRL 2018 - + + KV WRL 2018 + Arguments: ----------- - im_folder: str + im_folder: str folder where the images are to be downloaded satname: name of the satellite mission - + Returns: ----------- filepaths: list of str filepaths of the folders that were created - """ - + """ + # one folder for the metadata (common to all satellites) filepaths = [os.path.join(im_folder, satname, 'meta')] # subfolders depending on satellite mission @@ -493,38 +488,38 @@ def create_folder_structure(im_folder, satname): elif satname in ['L7','L8']: filepaths.append(os.path.join(im_folder, satname, 'pan')) filepaths.append(os.path.join(im_folder, satname, 'ms')) - elif satname in ['S2']: + elif satname in ['S2']: filepaths.append(os.path.join(im_folder, satname, '10m')) filepaths.append(os.path.join(im_folder, satname, '20m')) filepaths.append(os.path.join(im_folder, satname, '60m')) # create the subfolders if they don't exist already - for fp in filepaths: + for fp in filepaths: if not os.path.exists(fp): os.makedirs(fp) - - return filepaths + + return filepaths def remove_cloudy_images(im_list, satname, prc_cloud_cover=95): """ Removes from the EE collection very cloudy images (>95% cloud cover) - KV WRL 2018 - + KV WRL 2018 + Arguments: ----------- - im_list: list + im_list: list list of images in the collection satname: name of the satellite mission prc_cloud_cover: int percentage of cloud cover acceptable on the images - + Returns: ----------- im_list_upt: list updated list of images """ - + # remove very cloudy images from the collection (>95% cloud) if satname in ['L5','L7','L8']: cloud_property = 'CLOUD_COVER' @@ -536,7 +531,7 @@ def remove_cloudy_images(im_list, satname, prc_cloud_cover=95): im_list_upt = [x for k,x in enumerate(im_list) if k not in idx_delete] else: im_list_upt = im_list - + return im_list_upt 
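For reference, a minimal sketch of the Sentinel-2 geometric-quality lookup introduced above (PATCH 2/3 below reformats the one-line loop into valid Python). The `get_s2_georef_flag` helper name and the explicit no-flag fallback are illustrative additions, not part of the patch:

```python
def get_s2_georef_flag(properties):
    """Return 1 if the S2 geometric quality control passed, -1 otherwise."""
    # the name of the property holding the flag changes across the S2 archive
    flag_names = ['GEOMETRIC_QUALITY_FLAG', 'GEOMETRIC_QUALITY', 'quality_check']
    for key in flag_names:
        if key in properties:
            # a value of 'PASSED' means the geometric quality control succeeded
            return 1 if properties[key] == 'PASSED' else -1
    return -1  # no flag found: treat as failed (illustrative fallback)

# hypothetical property dicts from different parts of the S2 archive
assert get_s2_georef_flag({'GEOMETRIC_QUALITY_FLAG': 'PASSED'}) == 1
assert get_s2_georef_flag({'quality_check': 'FAILED'}) == -1
assert get_s2_georef_flag({}) == -1
```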
@@ -544,14 +539,14 @@ def filter_S2_collection(im_list): """ Removes duplicates from the EE collection of Sentinel-2 images (many duplicates) Finds the images that were acquired at the same time but have different utm zones. - - KV WRL 2018 + + KV WRL 2018 Arguments: ----------- - im_list: list + im_list: list list of images in the collection - + Returns: ----------- im_list_flt: list @@ -563,7 +558,7 @@ def filter_S2_collection(im_list): tz=pytz.utc) for _ in im_list] # get utm zone projections utm_zones = np.array([int(_['bands'][0]['crs'][5:]) for _ in im_list]) - if len(np.unique(utm_zones)) == 1: + if len(np.unique(utm_zones)) == 1: return im_list else: utm_zone_selected = np.max(np.unique(utm_zones)) @@ -589,23 +584,23 @@ def filter_S2_collection(im_list): i = np.where(idx_covered)[0][0] else: break - # update the collection by deleting all those images that have same timestamp + # update the collection by deleting all those images that have same timestamp # and different utm projection im_list_flt = [x for k,x in enumerate(im_list) if k not in idx_delete] - + return im_list_flt def merge_overlapping_images(metadata,inputs): """ Merge simultaneous overlapping images that cover the area of interest. - When the area of interest is located at the boundary between 2 images, there + When the area of interest is located at the boundary between 2 images, there will be overlap between the 2 images and both will be downloaded from Google - Earth Engine. This function merges the 2 images, so that the area of interest + Earth Engine. This function merges the 2 images, so that the area of interest is covered by only 1 image. - + KV WRL 2018 - + Arguments: ----------- metadata: dict @@ -622,27 +617,27 @@ def merge_overlapping_images(metadata,inputs): [151.3, -33.7]]] ``` 'dates': list of str - list that contains 2 strings with the initial and final dates in + list that contains 2 strings with the initial and final dates in format 'yyyy-mm-dd': ``` dates = ['1987-01-01', '2018-01-01'] ``` 'sat_list': list of str - list that contains the names of the satellite missions to include: + list that contains the names of the satellite missions to include: ``` sat_list = ['L5', 'L7', 'L8', 'S2'] ``` 'filepath_data': str filepath to the directory where the images are downloaded - + Returns: ----------- metadata_updated: dict updated metadata - + """ - - # only for Sentinel-2 at this stage (not sure if this is needed for Landsat images) + + # only for Sentinel-2 at this stage (not sure if this is needed for Landsat images) sat = 'S2' filepath = os.path.join(inputs['filepath'], inputs['sitename']) filenames = metadata[sat]['filenames'] @@ -663,27 +658,27 @@ def merge_overlapping_images(metadata,inputs): idx_dup = np.where(boolvec)[0][0] pairs.append([i,idx_dup]) # because they could be triplicates in S2 images, adjust the for consecutive merges - for i in range(1,len(pairs)): - if pairs[i-1][1] == pairs[i][0]: + for i in range(1,len(pairs)): + if pairs[i-1][1] == pairs[i][0]: pairs[i][0] = pairs[i-1][0] - + # for each pair of image, create a mask and add no_data into the .tif file (this is needed before merging .tif files) for i,pair in enumerate(pairs): fn_im = [] - for index in range(len(pair)): + for index in range(len(pair)): # get filenames of all the files corresponding to the each image in the pair fn_im.append([os.path.join(filepath, 'S2', '10m', filenames[pair[index]]), os.path.join(filepath, 'S2', '20m', filenames[pair[index]].replace('10m','20m')), os.path.join(filepath, 'S2', '60m', 
filenames[pair[index]].replace('10m','60m')), os.path.join(filepath, 'S2', 'meta', filenames[pair[index]].replace('_10m','').replace('.tif','.txt'))]) # read that image - im_ms, georef, cloud_mask, im_extra, im_QA, im_nodata = SDS_preprocess.preprocess_single(fn_im[index], sat, False) - # im_RGB = SDS_preprocess.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9) - - # in Sentinel2 images close to the edge of the image there are some artefacts, - # that are squares with constant pixel intensities. They need to be masked in the - # raster (GEOTIFF). It can be done using the image standard deviation, which - # indicates values close to 0 for the artefacts. + im_ms, georef, cloud_mask, im_extra, im_QA, im_nodata = SDS_preprocess.preprocess_single(fn_im[index], sat, False) + # im_RGB = SDS_preprocess.rescale_image_intensity(im_ms[:,:,[2,1,0]], cloud_mask, 99.9) + + # in Sentinel2 images close to the edge of the image there are some artefacts, + # that are squares with constant pixel intensities. They need to be masked in the + # raster (GEOTIFF). It can be done using the image standard deviation, which + # indicates values close to 0 for the artefacts. if len(im_ms) > 0: # calculate image std for the first 10m band im_std = SDS_tools.image_std(im_ms[:,:,0],1) @@ -699,34 +694,34 @@ def merge_overlapping_images(metadata,inputs): # create another mask for the 20m band (SWIR1) im_std = SDS_tools.image_std(im_extra,1) im_binary = np.logical_or(im_std < 1e-6, np.isnan(im_std)) - mask20 = morphology.dilation(im_binary, morphology.square(3)) + mask20 = morphology.dilation(im_binary, morphology.square(3)) im_extra[mask20] = np.nan # mask the 20m .tif file (im_extra) - SDS_tools.mask_raster(fn_im[index][1], mask20) + SDS_tools.mask_raster(fn_im[index][1], mask20) # use the 20m mask to create a mask for the 60m QA band (by resampling) mask60 = ndimage.zoom(mask20,zoom=1/3,order=0) mask60 = transform.resize(mask60, im_QA.shape, mode='constant', order=0, preserve_range=True) mask60 = mask60.astype(bool) # mask the 60m .tif file (im_QA) - SDS_tools.mask_raster(fn_im[index][2], mask60) + SDS_tools.mask_raster(fn_im[index][2], mask60) else: continue - + # make a figure for quality control # fig,ax= plt.subplots(2,2,tight_layout=True) # ax[0,0].imshow(im_RGB) # ax[0,0].set_title('RGB original') # ax[1,0].imshow(mask10) # ax[1,0].set_title('Mask 10m') - # ax[0,1].imshow(mask20) + # ax[0,1].imshow(mask20) # ax[0,1].set_title('Mask 20m') # ax[1,1].imshow(mask60) # ax[1,1].set_title('Mask 60 m') - + # once all the pairs of .tif files have been masked with no_data, merge the using gdal_merge fn_merged = os.path.join(filepath, 'merged.tif') - + # merge masked 10m bands and remove duplicate file gdal_merge.main(['', '-o', fn_merged, '-n', '0', fn_im[0][0], fn_im[1][0]]) os.chmod(fn_im[0][0], 0o777) @@ -735,7 +730,7 @@ def merge_overlapping_images(metadata,inputs): os.remove(fn_im[1][0]) os.chmod(fn_merged, 0o777) os.rename(fn_merged, fn_im[0][0]) - + # merge masked 20m band (SWIR band) gdal_merge.main(['', '-o', fn_merged, '-n', '0', fn_im[0][1], fn_im[1][1]]) os.chmod(fn_im[0][1], 0o777) @@ -744,7 +739,7 @@ def merge_overlapping_images(metadata,inputs): os.remove(fn_im[1][1]) os.chmod(fn_merged, 0o777) os.rename(fn_merged, fn_im[0][1]) - + # merge QA band (60m band) gdal_merge.main(['', '-o', fn_merged, '-n', '0', fn_im[0][2], fn_im[1][2]]) os.chmod(fn_im[0][2], 0o777) @@ -753,13 +748,13 @@ def merge_overlapping_images(metadata,inputs): os.remove(fn_im[1][2]) os.chmod(fn_merged, 0o777) os.rename(fn_merged, 
fn_im[0][2]) - + # remove the metadata .txt file of the duplicate image os.chmod(fn_im[1][3], 0o777) os.remove(fn_im[1][3]) - + print('%d Sentinel-2 images were merged (overlapping or duplicate)' % len(pairs)) - + # update the metadata dict metadata_updated = copy.deepcopy(metadata) idx_removed = [] @@ -769,17 +764,17 @@ def merge_overlapping_images(metadata,inputs): if not idx in idx_removed: idx_kept.append(idx) for key in metadata_updated[sat].keys(): metadata_updated[sat][key] = [metadata_updated[sat][key][_] for _ in idx_kept] - - return metadata_updated + + return metadata_updated def get_metadata(inputs): """ - Gets the metadata from the downloaded images by parsing .txt files located - in the \meta subfolder. - + Gets the metadata from the downloaded images by parsing .txt files located + in the \meta subfolder. + KV WRL 2018 - + Arguments: ----------- inputs: dict with the following fields @@ -787,13 +782,13 @@ def get_metadata(inputs): name of the site 'filepath_data': str filepath to the directory where the images are downloaded - + Returns: ----------- metadata: dict contains the information about the satellite images that were downloaded: - date, filename, georeferencing accuracy and image coordinate reference system - + date, filename, georeferencing accuracy and image coordinate reference system + """ # directory containing the images filepath = os.path.join(inputs['filepath'],inputs['sitename']) @@ -827,10 +822,9 @@ def get_metadata(inputs): metadata[satname]['acc_georef'].append(acc_georef) metadata[satname]['epsg'].append(epsg) metadata[satname]['dates'].append(date) - + # save a .pkl file containing the metadata dict with open(os.path.join(filepath, inputs['sitename'] + '_metadata' + '.pkl'), 'wb') as f: pickle.dump(metadata, f) - + return metadata - \ No newline at end of file From 06aedb1bfd2e82da0f417f35a51cb9bbd733dcca Mon Sep 17 00:00:00 2001 From: Kilian Vos Date: Thu, 23 Jul 2020 19:13:46 +1000 Subject: [PATCH 2/3] add tools added tools to remove duplicates and images with bad georeferencing --- coastsat/SDS_download.py | 3 +- coastsat/SDS_tools.py | 79 ++++++++++++++++++++++++++++++++++++++++ example.py | 5 +++ example_jupyter.ipynb | 19 +++++++++- 4 files changed, 104 insertions(+), 2 deletions(-) diff --git a/coastsat/SDS_download.py b/coastsat/SDS_download.py index 4c4fa279..b57d1384 100644 --- a/coastsat/SDS_download.py +++ b/coastsat/SDS_download.py @@ -131,7 +131,8 @@ def retrieve_images(inputs): # the name of the property containing the flag changes across the S2 archive # check which flag name is used for the image and store the 1/-1 for acc_georef flag_names = ['GEOMETRIC_QUALITY_FLAG', 'GEOMETRIC_QUALITY', 'quality_check'] - for key in flag_names: if key in im_meta['properties'].keys(): break + for key in flag_names: + if key in im_meta['properties'].keys(): break if im_meta['properties'][key] == 'PASSED': acc_georef = 1 else: acc_georef = -1 georef_accs.append(acc_georef) diff --git a/coastsat/SDS_tools.py b/coastsat/SDS_tools.py index 478795c5..657a4d9b 100644 --- a/coastsat/SDS_tools.py +++ b/coastsat/SDS_tools.py @@ -413,6 +413,85 @@ def merge_output(output): return output_all + +def remove_duplicates(output): + """ + Function to remove from the output dictionnary entries containing shorelines for + the same date and satellite mission. This happens when there is an overlap between + adjacent satellite images. 
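+    For example, when the area of interest spans two adjacent Sentinel-2 images,
+    the same acquisition appears twice with the same date and satellite name;
+    only one of the two entries is kept.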
+
+    Arguments:
+    -----------
+    output: dict
+        contains the extracted shorelines and corresponding metadata
+
+    Returns:
+    -----------
+    output_no_duplicates: dict
+        contains the updated dict where duplicates have been removed
+
+    """
+
+    # nested function
+    def duplicates_dict(lst):
+        "return duplicates and indices"
+        def duplicates(lst, item):
+            return [i for i, x in enumerate(lst) if x == item]
+        return dict((x, duplicates(lst, x)) for x in set(lst) if lst.count(x) > 1)
+
+    dates = output['dates']
+    # make a list with year/month/day
+    dates_str = [_.strftime('%Y%m%d') for _ in dates]
+    # create a dictionary with the duplicates
+    dupl = duplicates_dict(dates_str)
+    # if duplicates are found, remove the first image of each set and keep the rest
+    if dupl:
+        output_no_duplicates = dict([])
+        idx_remove = []
+        for k,v in dupl.items():
+            idx_remove.append(v[0])
+        idx_remove = sorted(idx_remove)
+        idx_all = np.linspace(0, len(dates_str)-1, len(dates_str))
+        idx_keep = list(np.where(~np.isin(idx_all,idx_remove))[0])
+        for key in output.keys():
+            output_no_duplicates[key] = [output[key][i] for i in idx_keep]
+        print('%d duplicates' % len(idx_remove))
+        return output_no_duplicates
+    else:
+        print('0 duplicates')
+        return output
+
+def remove_inaccurate_georef(output, accuracy):
+    """
+    Function to remove from the output dictionary entries containing shorelines
+    that were mapped on images with inaccurate georeferencing:
+        - RMSE > accuracy for Landsat images
+        - failed geometric test for Sentinel images (flagged with -1)
+
+    Arguments:
+    -----------
+    output: dict
+        contains the extracted shorelines and corresponding metadata
+    accuracy: int
+        minimum horizontal georeferencing accuracy (metres) for a shoreline to be accepted
+
+    Returns:
+    -----------
+    output_filtered: dict
+        contains the updated dictionary
+
+    """
+
+    # find the indices of the shorelines to keep (accurate georeferencing)
+    idx = np.where(~np.logical_or(np.array(output['geoaccuracy']) == -1,
+                                  np.array(output['geoaccuracy']) >= accuracy))[0]
+    output_filtered = dict([])
+    for key in output.keys():
+        output_filtered[key] = [output[key][i] for i in idx]
+    print('%d bad georef' % (len(output['geoaccuracy']) - len(idx)))
+    return output_filtered
+
 ###################################################################################################
 # CONVERSIONS FROM DICT TO GEODATAFRAME AND READ/WRITE GEOJSON
 ###################################################################################################
diff --git a/example.py b/example.py
index 3541c7ff..6464f0b0 100644
--- a/example.py
+++ b/example.py
@@ -91,6 +91,11 @@
 # extract shorelines from all images (also saves output.pkl and shorelines.kml)
 output = SDS_shoreline.extract_shorelines(metadata, settings)
 
+# remove duplicates (images taken on the same date by the same satellite)
+output = SDS_tools.remove_duplicates(output)
+# remove inaccurate georeferencing (set threshold to 10 m)
+output = SDS_tools.remove_inaccurate_georef(output, 10)
+
 # plot the mapped shorelines
 fig = plt.figure()
 plt.axis('equal')
diff --git a/example_jupyter.ipynb b/example_jupyter.ipynb
index 09c25116..5d5c46b0 100644
--- a/example_jupyter.ipynb
+++ b/example_jupyter.ipynb
@@ -207,6 +207,23 @@
    "output = SDS_shoreline.extract_shorelines(metadata, settings)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then remove duplicates and images with inaccurate georeferencing (threshold set at 10 m)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output = SDS_tools.remove_duplicates(output) # removes duplicates (images taken on the same date by the same satellite)\n",
+    "output = SDS_tools.remove_inaccurate_georef(output, 10) # remove inaccurate georeferencing (set threshold to 10 m)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -387,7 +404,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.7"
+   "version": "3.7.4"
   },
   "toc": {
    "base_numbering": 1,

From f67e22cf872ebc64d0e1eb580375f4ec058387a8 Mon Sep 17 00:00:00 2001
From: Kilian Vos
Date: Thu, 23 Jul 2020 19:47:12 +1000
Subject: [PATCH 3/3] export geojson as multipoint

The linestring output of the shorelines was somewhat confusing; the new
output is a multipoint, which is better for visualisation purposes.
---
 coastsat/SDS_tools.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/coastsat/SDS_tools.py b/coastsat/SDS_tools.py
index 657a4d9b..cfb014e5 100644
--- a/coastsat/SDS_tools.py
+++ b/coastsat/SDS_tools.py
@@ -580,7 +580,9 @@ def output_to_gdf(output):
             continue
         else:
             # save the geometry + attributes
-            geom = geometry.LineString(output['shorelines'][i])
+            coords = output['shorelines'][i]
+            geom = geometry.MultiPoint([(coords[_,0], coords[_,1]) for _ in range(coords.shape[0])])
+            # geom = geometry.LineString(output['shorelines'][i])
             gdf = gpd.GeoDataFrame(geometry=gpd.GeoSeries(geom))
             gdf.index = [i]
             gdf.loc[i,'date'] = output['dates'][i].strftime('%Y-%m-%d %H:%M:%S')
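For reference, a minimal sketch of what PATCH 3/3 changes in `output_to_gdf`: each shoreline, previously exported as a `LineString`, is now exported as a `MultiPoint` (one point per shoreline vertex). The toy `coords` array stands in for `output['shorelines'][i]`; the imports follow what `SDS_tools.py` already uses:

```python
import numpy as np
import geopandas as gpd
from shapely import geometry

# toy shoreline: three (lon, lat) vertices standing in for output['shorelines'][i]
coords = np.array([[151.30, -33.70], [151.31, -33.71], [151.32, -33.72]])

# new behaviour: one point per shoreline vertex (better for visualisation)
geom = geometry.MultiPoint([(coords[k, 0], coords[k, 1]) for k in range(coords.shape[0])])
# old behaviour, kept as a comment in the patched code:
# geom = geometry.LineString(coords)

gdf = gpd.GeoDataFrame(geometry=gpd.GeoSeries(geom))
print(gdf.geometry.iloc[0].geom_type)  # 'MultiPoint'
```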