Skip to content

Commit

Permalink
Refactor data processing in packet loss and throughput model training; update datetime handling and improve file writing methods
Browse files Browse the repository at this point in the history
  • Loading branch information
petya-vasileva committed Dec 12, 2024
1 parent b23957e commit e358e5a
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 21 deletions.
27 changes: 11 additions & 16 deletions src/ml/packet_loss_one_month_onehot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,22 @@
import datetime

def one_month_data(plsDf_custom):

#Preprocessing
plsDf_custom = plsDf_custom.drop(['src','dest','pair','src_host','dest_host'], axis=1)
plsDf_custom['dt'] = plsDf_custom['to']
# Preprocessing
plsDf_custom = plsDf_custom.drop(['src', 'dest', 'pair', 'src_host', 'dest_host'], axis=1)
plsDf_custom['dt'] = pd.to_datetime(plsDf_custom['to'], utc=True)

plsDf_custom['tests_done'] = plsDf_custom['tests_done'].str.rstrip('%').astype('float') / 100.0

#ONE HOT encoding
plsDf_onehot = pd.get_dummies(plsDf_custom,dtype=int)
# ONE HOT encoding
plsDf_onehot = pd.get_dummies(plsDf_custom, dtype=int)

# taking the index of the first 28 days for further training
date_s = list(pd.to_datetime(plsDf_onehot['dt'], utc=True)[:1])[0]
date_s = date_s.date()
date_s = (date_s + datetime.timedelta(days=28))
try:
end_index = plsDf_onehot.loc[(pd.to_datetime(plsDf_onehot['dt'], utc=True).dt.date == date_s)][:1].index[0]
percentile = plsDf_onehot.index.get_loc(end_index) / len(plsDf_onehot)
except:
percentile = 0.8
# Determine the number of days in the dataframe
num_days = (plsDf_custom['dt'].max() - plsDf_custom['dt'].min()).days

first_month_n = round(len(plsDf_onehot.index)*percentile)
if num_days >= 60:
first_month_n = (plsDf_custom['dt'] < (plsDf_custom['dt'].min() + pd.Timedelta(days=28))).sum()
else:
first_month_n = int(len(plsDf_custom) * 0.8)

return plsDf_onehot.iloc[:first_month_n], plsDf_onehot

2 changes: 1 addition & 1 deletion src/ml/thrpt_dataset_model_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def preprocess(rawDf_custom):

#onehot encoding the dataset
rawDf_onehot = pd.get_dummies(rawDf_custom,dtype=int)
rawDf_onehot.ipv6 = rawDf_onehot.ipv6.replace({True: 1, False: 0})
rawDf_onehot.ipv6 = rawDf_onehot.ipv6.replace({True: 1, False: 0}).infer_objects(copy=False)
return rawDf_onehot

def trainMLmodel(rawDf):
Expand Down
3 changes: 1 addition & 2 deletions src/model/Updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def storePathChangeDescDf(self):

df['jumpedFrom'] = df['jumpedFrom'].astype(int)
df['diff'] = df['diff'].astype(int)
self.pq.writeToFile(df, f"parquet/prev_next_asn.parquet")
self.pq.writeToFile(df, f"{self.location}prev_next_asn.parquet")


def createLocation(self, required_folders):
Expand Down Expand Up @@ -291,7 +291,6 @@ def storePacketLossDataAndModel(self):

plsDf = createPcktDataset(start_date, end_date)
self.pq.writeToFile(plsDf, f'{self.location}ml-datasets/packet_loss_Df.parquet')
print('packet_loss_Df.parquet created', len(plsDf))

# onehot encode the whole dataset and leave only one month for further ML training
plsDf_onehot_month, plsDf_onehot = one_month_data(plsDf)
Expand Down
4 changes: 2 additions & 2 deletions src/utils/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ def writeToFile(df, filename):
df[col] = df[col].dt.strftime(DATE_FORMAT)
table = pa.Table.from_pandas(df, preserve_index=True)
pq.write_table(table, filename)
logging.info(f"Successfully wrote to file: {filename}")
print(f"Successfully wrote to file: {filename}")
except Exception as e:
logging.error(f"Error writing to file: {filename}, Exception: {e}")
print(f"Error writing to file: {filename}, Exception: {e}")

@staticmethod
def readSequenceOfFiles(location, prefix):
Expand Down

0 comments on commit e358e5a

Please sign in to comment.