
Commit

Refactor packet loss data loading and model training to support batch processing

petya-vasileva committed Dec 12, 2024
1 parent a5fe306 commit a9f3394
Showing 2 changed files with 34 additions and 33 deletions.
46 changes: 26 additions & 20 deletions src/ml/create_packet_loss_dataset.py
@@ -5,15 +5,18 @@
from utils.helpers import timer

@timer
def loadPacketLossData(dateFrom, dateTo):
def loadPacketLossData(dateFrom, dateTo, batch_size=10000):
    data = []
    intv = int(hp.CalcMinutes4Period(dateFrom, dateTo) / 60)
    time_list = hp.GetTimeRanges(dateFrom, dateTo, intv)
    for i in range(len(time_list) - 1):
        print(f' {i+1}/{len(time_list)-1} packetloss query', time_list[i], time_list[i + 1])
        data.extend(qrs.query4Avg('ps_packetloss', time_list[i], time_list[i + 1]))

    return pd.DataFrame(data)
        if len(data) >= batch_size:
            yield pd.DataFrame(data)
            data = []
    if data:
        yield pd.DataFrame(data)
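
For context, a minimal standalone sketch of how a generator that yields DataFrame batches like the one above can be consumed; fetch_rows and the toy time list are hypothetical stand-ins for qrs.query4Avg and the real time ranges:

import pandas as pd

def fetch_rows(t_from, t_to):
    # hypothetical stand-in for qrs.query4Avg('ps_packetloss', t_from, t_to)
    return [{'src_site': 'site-a', 'dest_site': 'site-b', 'value': 0.01}] * 7

def load_in_batches(time_list, batch_size=5):
    data = []
    for t_from, t_to in zip(time_list, time_list[1:]):
        data.extend(fetch_rows(t_from, t_to))
        if len(data) >= batch_size:
            yield pd.DataFrame(data)  # flush a full batch as soon as it is ready
            data = []
    if data:
        yield pd.DataFrame(data)      # flush whatever is left over

for i, batch in enumerate(load_in_batches(['t0', 't1', 't2'])):
    print(i, len(batch))              # prints two batches of 7 rows each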


def getPercentageMeasuresDone(df, dateFrom, dateTo):
@@ -34,24 +37,22 @@ def findRatio(row, total_minutes):
return df


def setFlag(x):
    if x >= 0 and x < 0.02:
        return 0
    elif x >= 0.02 and x < 1:
        return 1
    elif x == 1:
        return 2
    return 'something is wrong'
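
As a quick check of the thresholds this encodes (below 2% loss maps to flag 0, between 2% and 100% to 1, exactly 100% to 2), with made-up sample values:

for v in [0.0, 0.019, 0.02, 0.5, 1.0]:
    print(v, setFlag(v))  # prints 0, 0, 1, 1, 2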


@timer
def markPairs(dateFrom, dateTo):
    dataDf = loadPacketLossData(dateFrom, dateTo)
    dataDf = dataDf[~dataDf['value'].isnull()]
    df = getPercentageMeasuresDone(dataDf, dateFrom, dateTo)

    # set value to 0 - we consider there is no issue below 2% loss
    # set value to 1 - the pair is marked problematic between 2% and 100% loss
    # set value to 2 - the pair shows 100% loss
    def setFlag(x):
        if x >= 0 and x < 0.02:
            return 0
        elif x >= 0.02 and x < 1:
            return 1
        elif x == 1:
            return 2
        return 'something is wrong'

    df['flag'] = df['value'].apply(lambda val: setFlag(val))
    df.rename(columns={'value': 'avg_value'}, inplace=True)
    df = df.round({'avg_value': 3})
@@ -67,12 +68,17 @@ def setFlag(x):
that there should be 1 measure per minute
"""
@timer
def createPcktDataset(dateFrom, dateTo):
    # dateFrom, dateTo = ['2023-10-01 03:00', '2023-10-03 03:00']
    plsDf = markPairs(dateFrom, dateTo)
def createPcktDataset(dateFrom, dateTo, batch_size=10000):
    all_data = []
    for batch in loadPacketLossData(dateFrom, dateTo, batch_size):
        batch = batch[~batch['value'].isnull()]
        batch = getPercentageMeasuresDone(batch, dateFrom, dateTo)
        batch['flag'] = batch['value'].apply(lambda val: setFlag(val))
        batch.rename(columns={'value': 'avg_value'}, inplace=True)
        batch = batch.round({'avg_value': 3})
        all_data.append(batch)
    plsDf = pd.concat(all_data)
    plsDf = plsDf[plsDf['tests_done'] != '0%']

    plsDf['src_site'] = plsDf['src_site'].str.upper()
    plsDf['dest_site'] = plsDf['dest_site'].str.upper()

    return plsDf
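
An illustrative call to the refactored entry point; the date range and batch size are placeholder values, and the module's own helpers (hp, qrs) still need to be importable:

# illustrative only - dates and batch size are placeholders
dateFrom, dateTo = '2024-12-01 03:00', '2024-12-03 03:00'
plsDf = createPcktDataset(dateFrom, dateTo, batch_size=5000)
print(plsDf[['src_site', 'dest_site', 'avg_value', 'flag']].head())
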
21 changes: 8 additions & 13 deletions src/ml/packet_loss_train_model.py
@@ -1,37 +1,32 @@
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt

def packet_loss_train_model(plsDf_onehot):

def packet_loss_train_model(plsDf_onehot, batch_size=10000):
    plsDf_custom_y = plsDf_onehot['flag']
    plsDf_custom_x = plsDf_onehot.drop(['flag'], axis=1)
    del plsDf_onehot

    # Train test split (training on one month)
    X_train, X_test, y_train, y_test = train_test_split(plsDf_custom_x, plsDf_custom_y, test_size=0.20, random_state=0,
                                                        shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(plsDf_custom_x, plsDf_custom_y, test_size=0.20, random_state=0, shuffle=False)

    del plsDf_custom_y
    del plsDf_custom_x

    #Training the XGB Classifier
    # Training the XGB Classifier in batches
    model = xgb.XGBClassifier(random_state=0, objective='multi:softmax')
    model.fit(X_train, y_train)
    for i in range(0, len(X_train), batch_size):
        end = i + batch_size
        model.fit(X_train[i:end], y_train[i:end], xgb_model=model if i > 0 else None)

    y_pred = model.predict(X_test)

    # Evaluation metrics
    print("Accuracy of the XGB Classifier:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
    # print("F1 score of the XGB Classifier:", f1_score(y_test, y_pred), "\n")
    print(classification_report(y_test, y_pred))
    confusion_matrix_data = confusion_matrix(y_test, y_pred, labels=model.classes_)
    print(confusion_matrix_data, "\n")
    # labels = ['0','1']
    # disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_data, display_labels=model.classes_)
    # disp = disp.plot(cmap=plt.cm.YlGnBu, values_format='g')
    # plt.show()

    del X_train, X_test, y_train, y_test
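
The batched fit above relies on XGBoost's training-continuation support (the xgb_model argument of fit). A self-contained sketch of the same pattern on synthetic data follows; the array sizes and feature values are made up, and the booster is passed via get_booster() since some XGBoost versions expect a Booster rather than the sklearn wrapper itself:

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(3000, 5))      # synthetic features, arbitrary shape
y = rng.integers(0, 3, size=3000)   # three classes, matching multi:softmax

batch_size = 1000
model = xgb.XGBClassifier(random_state=0, objective='multi:softmax')
for i in range(0, len(X), batch_size):
    end = i + batch_size
    # after the first batch, continue training from the existing booster
    prev = model.get_booster() if i > 0 else None
    model.fit(X[i:end], y[i:end], xgb_model=prev)

print(model.predict(X[:5]))         # predictions from the incrementally trained model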

