Refactor data processing in packet loss and throughput model training; update datetime handling and improve file writing methods
sand-ci · Dec 12, 2024 · e358e5a
1 parent b23957e
commit e358e5a
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 21 deletions.
diff --git a/src/ml/packet_loss_one_month_onehot.py b/src/ml/packet_loss_one_month_onehot.py
@@ -2,27 +2,22 @@
 import datetime
 
 def one_month_data(plsDf_custom):
-
-    #Preprocessing
-    plsDf_custom = plsDf_custom.drop(['src','dest','pair','src_host','dest_host'], axis=1)
-    plsDf_custom['dt'] = plsDf_custom['to']
+    # Preprocessing
+    plsDf_custom = plsDf_custom.drop(['src', 'dest', 'pair', 'src_host', 'dest_host'], axis=1)
+    plsDf_custom['dt'] = pd.to_datetime(plsDf_custom['to'], utc=True)
 
     plsDf_custom['tests_done'] = plsDf_custom['tests_done'].str.rstrip('%').astype('float') / 100.0
 
-    #ONE HOT encoding
-    plsDf_onehot = pd.get_dummies(plsDf_custom,dtype=int)
+    # ONE HOT encoding
+    plsDf_onehot = pd.get_dummies(plsDf_custom, dtype=int)
 
-    # taking the index of the first 28 days for further training
-    date_s = list(pd.to_datetime(plsDf_onehot['dt'], utc=True)[:1])[0]
-    date_s = date_s.date()
-    date_s = (date_s + datetime.timedelta(days=28))
-    try:
-        end_index = plsDf_onehot.loc[(pd.to_datetime(plsDf_onehot['dt'], utc=True).dt.date == date_s)][:1].index[0]
-        percentile = plsDf_onehot.index.get_loc(end_index) / len(plsDf_onehot)
-    except:
-        percentile = 0.8
+    # Determine the number of days in the dataframe
+    num_days = (plsDf_custom['dt'].max() - plsDf_custom['dt'].min()).days
 
-    first_month_n = round(len(plsDf_onehot.index)*percentile)
+    if num_days >= 60:
+        first_month_n = (plsDf_custom['dt'] < (plsDf_custom['dt'].min() + pd.Timedelta(days=28))).sum()
+    else:
+        first_month_n = int(len(plsDf_custom) * 0.8)
 
     return plsDf_onehot.iloc[:first_month_n], plsDf_onehot
 
diff --git a/src/ml/thrpt_dataset_model_train.py b/src/ml/thrpt_dataset_model_train.py
@@ -22,7 +22,7 @@ def preprocess(rawDf_custom):
 
     #onehot encoding the dataset
     rawDf_onehot = pd.get_dummies(rawDf_custom,dtype=int)
-    rawDf_onehot.ipv6 = rawDf_onehot.ipv6.replace({True: 1, False: 0})
+    rawDf_onehot.ipv6 = rawDf_onehot.ipv6.replace({True: 1, False: 0}).infer_objects(copy=False)
     return rawDf_onehot
 
 def trainMLmodel(rawDf):

diff --git a/src/model/Updater.py b/src/model/Updater.py
@@ -242,7 +242,7 @@ def storePathChangeDescDf(self):
 
             df['jumpedFrom'] = df['jumpedFrom'].astype(int)
             df['diff'] = df['diff'].astype(int)
-            self.pq.writeToFile(df, f"parquet/prev_next_asn.parquet")
+            self.pq.writeToFile(df, f"{self.location}prev_next_asn.parquet")
 
 
     def createLocation(self, required_folders):
@@ -291,7 +291,6 @@ def storePacketLossDataAndModel(self):
 
         plsDf = createPcktDataset(start_date, end_date)
         self.pq.writeToFile(plsDf, f'{self.location}ml-datasets/packet_loss_Df.parquet')
-        print('packet_loss_Df.parquet created', len(plsDf))
 
         # onehot encode the whole dataset and leave only one month for further ML training
         plsDf_onehot_month, plsDf_onehot = one_month_data(plsDf)

diff --git a/src/utils/parquet.py b/src/utils/parquet.py
@@ -18,9 +18,9 @@ def writeToFile(df, filename):
                 df[col] = df[col].dt.strftime(DATE_FORMAT)
             table = pa.Table.from_pandas(df, preserve_index=True)
             pq.write_table(table, filename)
-            logging.info(f"Successfully wrote to file: {filename}")
+            print(f"Successfully wrote to file: {filename}")
         except Exception as e:
-            logging.error(f"Error writing to file: {filename}, Exception: {e}")
+            print(f"Error writing to file: {filename}, Exception: {e}")
 
     @staticmethod
     def readSequenceOfFiles(location, prefix):