Skip to content

Commit

Permalink
First pass of confidence column from max local score array values
Browse files Browse the repository at this point in the history
- Demonstrated in tutorial notebook
- added into IsoAutio
- Handled in annotation_chunker()
  • Loading branch information
JacobGlennAyers committed Jan 20, 2024
1 parent 42da684 commit cec1213
Show file tree
Hide file tree
Showing 3 changed files with 460 additions and 357 deletions.
34 changes: 34 additions & 0 deletions PyHa/IsoAutio.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,36 @@ def build_isolation_parameters_microfaune(

return isolation_parameters

def write_confidence(local_score_arr, automated_labels_df):
    """
    Adds a "CONFIDENCE" column to a dataframe of automated annotations.

    Goes through all of the annotations and assigns each row a confidence
    metric: the maximum value of the local score array over that
    annotation's time span.

    Args:
        local_score_arr (np.ndarray or list of floats)
            - Array of small predictions of bird presence

        automated_labels_df (pd.DataFrame)
            - labels derived from the local_score_arr from the def isolate() method
              for the "IN FILE" column clip

    Returns:
        Pandas DataFrame with an additional "CONFIDENCE" column of the
        confidence scores from the local score array. The input dataframe
        is modified in place and also returned.
    """
    assert isinstance(local_score_arr, (np.ndarray, list))
    assert isinstance(automated_labels_df, pd.DataFrame)
    assert len(automated_labels_df) > 0

    # Conversion factor from seconds to local-score-array indices.
    # Use positional .iloc[0] rather than label-based [0] so dataframes
    # whose index is not zero-based (e.g. after filtering) don't raise.
    time_ratio = len(local_score_arr) / automated_labels_df["CLIP LENGTH"].iloc[0]
    confidences = []
    for row in automated_labels_df.index:
        start_ndx = int(automated_labels_df["OFFSET"][row] * time_ratio)
        end_ndx = start_ndx + int(automated_labels_df["DURATION"][row] * time_ratio)
        # Very short annotations can round down to a zero-length slice;
        # floor the window at one sample so np.max never sees an empty array.
        end_ndx = max(end_ndx, start_ndx + 1)
        cur_confidence = np.max(local_score_arr[start_ndx:end_ndx])
        confidences.append(cur_confidence)

    automated_labels_df["CONFIDENCE"] = confidences
    return automated_labels_df


def isolate(
local_scores,
Expand Down Expand Up @@ -225,6 +255,10 @@ def isolate(
filename,
isolation_parameters,
manual_id=manual_id)

if "write_confidence" in isolation_parameters.keys():
if isolation_parameters["write_confidence"]:
isolation_df = write_confidence(local_scores, isolation_df)

return isolation_df

Expand Down
20 changes: 16 additions & 4 deletions PyHa/annotation_post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,23 @@ def annotation_chunker(kaleidoscope_df, chunk_length):
kaleidoscope_df (Dataframe)
- Dataframe of annotations in kaleidoscope format
chunk_length (int)
chunk_length (int, float)
- duration to set all annotation chunks
Returns:
Dataframe of labels with chunk_length duration
(elements in "OFFSET" are divisible by chunk_length).
"""

assert isinstance(kaleidoscope_df, pd.DataFrame)
assert isinstance(chunk_length, int) or isinstance(chunk_length, float)
assert chunk_length > 0
#Init list of clips to cycle through and output dataframe
clips = kaleidoscope_df["IN FILE"].unique()
df_columns = {'IN FILE' :'str', 'CLIP LENGTH' : 'float64', 'CHANNEL' : 'int64', 'OFFSET' : 'float64',
'DURATION' : 'float64', 'SAMPLE RATE' : 'int64','MANUAL ID' : 'str'}
set_confidence = False
if "CONFIDENCE" in kaleidoscope_df.keys():
df_columns["CONFIDENCE"] = 'float64'
set_confidence = True
output_df = pd.DataFrame({c: pd.Series(dtype=t) for c, t in df_columns.items()})

# going through each clip
Expand Down Expand Up @@ -57,14 +63,18 @@ def annotation_chunker(kaleidoscope_df, chunk_length):
1000,
0))
# Placing the label relative to the clip
human_arr[minval:maxval] = 1
if set_confidence:
human_arr[minval:maxval] = species_df["CONFIDENCE"][annotation]
else:
human_arr[minval:maxval] = 1
# performing the chunk isolation technique on the human array

for index in range(potential_annotation_count):
chunk_start = index * (chunk_length*1000)
chunk_end = min((index+1)*chunk_length*1000,arr_len)
chunk = human_arr[int(chunk_start):int(chunk_end)]
if max(chunk) >= 0.5:
chunk_max = max(chunk)
if chunk_max > 1e-4:
row = pd.DataFrame(index = [0])
annotation_start = chunk_start / 1000
#updating the dictionary
Expand All @@ -75,5 +85,7 @@ def annotation_chunker(kaleidoscope_df, chunk_length):
row["SAMPLE RATE"] = sr
row["MANUAL ID"] = bird
row["CHANNEL"] = 0
if set_confidence:
row["CONFIDENCE"] = chunk_max
output_df = pd.concat([output_df,row], ignore_index=True)
return output_df
Loading

0 comments on commit cec1213

Please sign in to comment.