
Commit

Refactor packet loss data loading and model training to support batch processing

petya-vasileva committed Dec 12, 2024
1 parent a5fe306 commit a9f3394
Showing 2 changed files with 34 additions and 33 deletions.
46 changes: 26 additions & 20 deletions src/ml/create_packet_loss_dataset.py
@@ -5,15 +5,18 @@
from utils.helpers import timer

@timer
def loadPacketLossData(dateFrom, dateTo):
def loadPacketLossData(dateFrom, dateTo, batch_size=10000):
    data = []
    intv = int(hp.CalcMinutes4Period(dateFrom, dateTo) / 60)
    time_list = hp.GetTimeRanges(dateFrom, dateTo, intv)
    for i in range(len(time_list) - 1):
        print(f' {i+1}/{len(time_list)-1} packetloss query', time_list[i], time_list[i + 1])
        data.extend(qrs.query4Avg('ps_packetloss', time_list[i], time_list[i + 1]))

    return pd.DataFrame(data)
        if len(data) >= batch_size:
            yield pd.DataFrame(data)
            data = []
    if data:
        yield pd.DataFrame(data)
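
For context, a minimal standalone sketch of how a generator that yields DataFrame batches like the one above can be consumed; fetch_rows and the toy time list are hypothetical stand-ins for qrs.query4Avg and the real time ranges:

import pandas as pd

def fetch_rows(t_from, t_to):
    # hypothetical stand-in for qrs.query4Avg('ps_packetloss', t_from, t_to)
    return [{'src_site': 'site-a', 'dest_site': 'site-b', 'value': 0.01}] * 7

def load_in_batches(time_list, batch_size=5):
    data = []
    for t_from, t_to in zip(time_list, time_list[1:]):
        data.extend(fetch_rows(t_from, t_to))
        if len(data) >= batch_size:
            yield pd.DataFrame(data)  # flush a full batch as soon as it is ready
            data = []
    if data:
        yield pd.DataFrame(data)      # flush whatever is left over

for i, batch in enumerate(load_in_batches(['t0', 't1', 't2'])):
    print(i, len(batch))              # prints two batches of 7 rows each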


def getPercentageMeasuresDone(df, dateFrom, dateTo):
@@ -34,24 +37,22 @@ def findRatio(row, total_minutes):
return df


def setFlag(x):
    if x >= 0 and x < 0.02:
        return 0
    elif x >= 0.02 and x < 1:
        return 1
    elif x == 1:
        return 2
    return 'something is wrong'
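
As a quick check of the thresholds this encodes (below 2% loss maps to flag 0, between 2% and 100% to 1, exactly 100% to 2), with made-up sample values:

for v in [0.0, 0.019, 0.02, 0.5, 1.0]:
    print(v, setFlag(v))  # prints 0, 0, 1, 1, 2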


@timer
def markPairs(dateFrom, dateTo):
    dataDf = loadPacketLossData(dateFrom, dateTo)
    dataDf = dataDf[~dataDf['value'].isnull()]
    df = getPercentageMeasuresDone(dataDf, dateFrom, dateTo)

    # set value to 0 - we consider there is no issue below 2% loss
    # set value to 1 - the pair is marked problematic between 2% and 100% loss
    # set value to 2 - the pair shows 100% loss
    def setFlag(x):
        if x >= 0 and x < 0.02:
            return 0
        elif x >= 0.02 and x < 1:
            return 1
        elif x == 1:
            return 2
        return 'something is wrong'

    df['flag'] = df['value'].apply(lambda val: setFlag(val))
    df.rename(columns={'value': 'avg_value'}, inplace=True)
    df = df.round({'avg_value': 3})
@@ -67,12 +68,17 @@ def setFlag(x):
that there should be 1 measure per minute
"""
@timer
def createPcktDataset(dateFrom, dateTo):
    # dateFrom, dateTo = ['2023-10-01 03:00', '2023-10-03 03:00']
    plsDf = markPairs(dateFrom, dateTo)
def createPcktDataset(dateFrom, dateTo, batch_size=10000):
    all_data = []
    for batch in loadPacketLossData(dateFrom, dateTo, batch_size):
        batch = batch[~batch['value'].isnull()]
        batch = getPercentageMeasuresDone(batch, dateFrom, dateTo)
        batch['flag'] = batch['value'].apply(lambda val: setFlag(val))
        batch.rename(columns={'value': 'avg_value'}, inplace=True)
        batch = batch.round({'avg_value': 3})
        all_data.append(batch)
    plsDf = pd.concat(all_data)
    plsDf = plsDf[plsDf['tests_done'] != '0%']

    plsDf['src_site'] = plsDf['src_site'].str.upper()
    plsDf['dest_site'] = plsDf['dest_site'].str.upper()

    return plsDf
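
An illustrative call to the refactored entry point; the date range and batch size are placeholder values, and the module's own helpers (hp, qrs) still need to be importable:

# illustrative only - dates and batch size are placeholders
dateFrom, dateTo = '2024-12-01 03:00', '2024-12-03 03:00'
plsDf = createPcktDataset(dateFrom, dateTo, batch_size=5000)
print(plsDf[['src_site', 'dest_site', 'avg_value', 'flag']].head())
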
21 changes: 8 additions & 13 deletions src/ml/packet_loss_train_model.py
@@ -1,37 +1,32 @@
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt

def packet_loss_train_model(plsDf_onehot):

def packet_loss_train_model(plsDf_onehot, batch_size=10000):
    plsDf_custom_y = plsDf_onehot['flag']
    plsDf_custom_x = plsDf_onehot.drop(['flag'], axis=1)
    del plsDf_onehot

    # Train test split (training on one month)
    X_train, X_test, y_train, y_test = train_test_split(plsDf_custom_x, plsDf_custom_y, test_size=0.20, random_state=0,
                                                        shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(plsDf_custom_x, plsDf_custom_y, test_size=0.20, random_state=0, shuffle=False)

    del plsDf_custom_y
    del plsDf_custom_x

    #Training the XGB Classifier
    # Training the XGB Classifier in batches
    model = xgb.XGBClassifier(random_state=0, objective='multi:softmax')
    model.fit(X_train, y_train)
    for i in range(0, len(X_train), batch_size):
        end = i + batch_size
        model.fit(X_train[i:end], y_train[i:end], xgb_model=model if i > 0 else None)

    y_pred = model.predict(X_test)

    # Evaluation metrics
    print("Accuracy of the XGB Classifier:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
    # print("F1 score of the XGB Classifier:", f1_score(y_test, y_pred), "\n")
    print(classification_report(y_test, y_pred))
    confusion_matrix_data = confusion_matrix(y_test, y_pred, labels=model.classes_)
    print(confusion_matrix_data, "\n")
    # labels = ['0','1']
    # disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_data, display_labels=model.classes_)
    # disp = disp.plot(cmap=plt.cm.YlGnBu, values_format='g')
    # plt.show()

    del X_train, X_test, y_train, y_test
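
The batched fit above relies on XGBoost's training-continuation support (the xgb_model argument of fit). A self-contained sketch of the same pattern on synthetic data follows; the array sizes and feature values are made up, and the booster is passed via get_booster() since some XGBoost versions expect a Booster rather than the sklearn wrapper itself:

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(3000, 5))      # synthetic features, arbitrary shape
y = rng.integers(0, 3, size=3000)   # three classes, matching multi:softmax

batch_size = 1000
model = xgb.XGBClassifier(random_state=0, objective='multi:softmax')
for i in range(0, len(X), batch_size):
    end = i + batch_size
    # after the first batch, continue training from the existing booster
    prev = model.get_booster() if i > 0 else None
    model.fit(X[i:end], y[i:end], xgb_model=prev)

print(model.predict(X[:5]))         # predictions from the incrementally trained model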

