Skip to content

Commit

Permalink
clean up classifier refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
simplymathematics committed Nov 30, 2024
1 parent bd51593 commit 97a851b
Showing 1 changed file with 24 additions and 178 deletions.
202 changes: 24 additions & 178 deletions examples/gzip/classifier_refactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups, make_classification
from sklearn.preprocessing import LabelEncoder


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -373,8 +369,8 @@ class DistanceMatrixKernelizer(BaseEstimator, TransformerMixin):
# From https://pdfs.semanticscholar.org/a9ee/f3769fe3686591a88cc831f9f685632f1b95.pdf
def __init__(
self,
coef0=0,
degree=0,
coef0=None,
degree=None,
gamma=1,
form: Literal[
"exp",
Expand Down Expand Up @@ -415,7 +411,7 @@ def fit(self, X, y=None):
lambda x: (self.gamma * x + self.coef0) ** self.degree
)
elif self.form == "quadratic":
assert self.degree in [2], "Degree must be 2 for quadratic form"
assert self.degree == 2, "Degree must be 2 for quadratic form"
assert self.gamma == 1, "Gamma must be 1 for quadratic form"
self.kernel_function = lambda x: (x + self.coef0) ** self.degree
elif self.form == "rational":
Expand All @@ -437,170 +433,6 @@ def fit_transform(self, X, y=None):
return self.transform(X)


def load_data(dataset, **kwargs):
    """Load a supported dataset and render every sample as a string.

    Samples are stringified so compression-based (e.g. gzip) distance
    metrics can be applied uniformly to text and tabular data.

    Parameters
    ----------
    dataset : str
        One of: "20newsgroups", "kdd_nsl", "make_classification",
        "truthseeker", "sms-spam", "ddos".
    **kwargs
        If non-empty, forwarded to ``train_test_split`` to subsample the
        data; only the "train" portion of the split is returned.

    Returns
    -------
    X : np.ndarray of str
        One string per sample.
    y : array-like
        Corresponding labels.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names, or the loaded
        features are of an unexpected type.
    """
    # CSV-backed datasets share one load path: name -> (csv path, label column).
    csv_datasets = {
        "kdd_nsl": ("raw_data/kdd_nsl_undersampled_5000.csv", "label"),
        "truthseeker": ("raw_data/truthseeker_undersampled_8000.csv", "BotScoreBinary"),
        "sms-spam": ("raw_data/sms-spam_undersampled_1450.csv", "label"),
        "ddos": ("raw_data/ddos.csv", "Label"),
    }
    if dataset == "20newsgroups":
        X, y = fetch_20newsgroups(
            subset="train",
            categories=["alt.atheism", "talk.religion.misc"],
            shuffle=True,
            random_state=42,
            return_X_y=True,
        )
        # Encode "alt.atheism"/"talk.religion.misc" as 0/1.
        y = LabelEncoder().fit_transform(y)
    elif dataset == "make_classification":
        X, y = make_classification(
            n_samples=1000,
            n_features=20,
            n_classes=2,
            random_state=42,
        )
        y = LabelEncoder().fit_transform(y)
    elif dataset in csv_datasets:
        path, label_col = csv_datasets[dataset]
        df = pd.read_csv(path)
        y = df[label_col]
        X = df.drop(label_col, axis=1)
    else:
        raise ValueError(
            f"Dataset {dataset} not found. Options are: 20newsgroups, kdd_nsl, make_classification, truthseeker, sms-spam, ddos.",
        )
    # Stringify each sample (whole DataFrame row or array element).
    if isinstance(X, pd.DataFrame):
        X = [str(x) for x in X.values]
    elif isinstance(X, (list, np.ndarray)):
        X = [str(x) for x in X]
    else:
        raise ValueError(f"Unknown type {type(X)}")
    X = np.array(X)
    if len(kwargs) > 0:
        # Subsample by splitting and keeping only the train portion.
        X, _, y, _ = train_test_split(X, y, **kwargs)
    return X, y


# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report
# from sklearn.metrics import roc_curve, auc
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV

# from sklearn.model_selection import ParameterGrid


# model1 = LogisticRegression(max_iter=1000)
# model2 = KNeighborsClassifier()
# model3 = SVC(kernel="precomputed")


# logistic_params = {
# "model__tol": [1e-4, 1e-3, 1e-2],
# "model__C": [0.1, 1, 10, 100],
# "model__penalty": ["l1", "l2"],
# "model__solver" : ["saga"]
# }

# knn_params = {
# "model__n_neighbors": [1, 3, 5, 7, 9],
# "model__weights": ["uniform", "distance"],
# }

# svc_params = {
# "model__C": [0.1, 1, 10, 100],
# }

# exp_form = {
# "kernelizer__degree" : [1, 2],
# "kernelizer__gamma": [.0001, .001, .01, .1, 1, 10, 100, 1000],
# "kernelizer__coef0" : [0],
# }
# exp_neg_form = {
# "kernelizer__degree" : [1, 2],
# "kernelizer__gamma": [.0001, .001, .01, .1, 1, 10, 100, 1000],
# "kernelizer__coef0" : [0],
# }
# poly_form = {
# "kernelizer__degree" : [1, 2, 3],
# "kernelizer__gamma": [.0001, .001, .01, .1, 1, 10, 100, 1000],
# "kernelizer__coef0": [0, 1, 10, 100],
# }
# quadratic_form = {
# "kernelizer__gamma": [1],
# "kernelizer__coef0": [0, 1, 10, 100],
# "kernelizer__degree" : [2]
# }
# rational_form = {
# "kernelizer__gamma": [1],
# "kernelizer__coef0": [0, 1, 10, 100],
# "kernelizer__degree" : [1]
# }
# multiquadric_form = {
# "kernelizer__coef0": [0, 1, 10, 100],
# "kernelizer__degree" : [2],
# "kernelizer__gamma": [1],
# }

# kernelizers = [
# exp_form,
# exp_neg_form,
# poly_form,
# quadratic_form,
# rational_form,
# multiquadric_form,
# ]
# kernelizer_grid = list(ParameterGrid(kernelizers))

# transformer = StringDistanceTransformer(metric="gzip", n_jobs=-1)
# kernelizer = DistanceMatrixKernelizer(form="exp", gamma=1, degree=2)

# svc_list = []
# knn_list = []
# logistic_list = []
# lists_in_order = [logistic_list, knn_list, svc_list]
# i = 0
# for model_params in [logistic_params, knn_params, svc_params]:
# model_list = lists_in_order[i]
# for kernelizer_params in kernelizer_grid:
# new_dict = {**model_params, **kernelizer_params, }
# # Ensure that all values are lists
# for key in new_dict:
# if not isinstance(new_dict[key], list):
# new_dict[key] = [new_dict[key]]
# model_list.append(new_dict)
# i += 1


# pipeline1 = Pipeline([
# ("transformer", transformer),
# ("kernelizer", kernelizer),
# ("model", model1)
# ])
# pipeline2 = Pipeline([
# ("transformer", transformer),
# ("kernelizer", kernelizer),
# ("model", model2)
# ])
# pipeline3 = Pipeline([
# ("transformer", transformer),
# ("kernelizer", kernelizer),
# ("model", model3)
# ])


if __name__ == "__main__":

Expand All @@ -625,7 +457,7 @@ def load_data(dataset, **kwargs):
name: sklearn.preprocessing.LabelBinarizer
y: True
transformer:
name: tmp.StringDistanceTransformer
name: classifier_refactor.StringDistanceTransformer
metric : gzip
algorithm: sort
n_jobs: -1
Expand All @@ -641,7 +473,7 @@ def load_data(dataset, **kwargs):
probability: True
sklearn_pipeline:
kernelizer:
name: tmp.DistanceMatrixKernelizer
name: classifier_refactor.DistanceMatrixKernelizer
coef0: 0
degree: 2
gamma: 1
Expand All @@ -658,19 +490,24 @@ def load_data(dataset, **kwargs):
init:
model: ${model}
_target_: deckard.base.attack.AttackInitializer
name: art.attacks.evasion.ProjectedGradientDescent
eps: .01
# eps_step : ${eval:'(.1)*${.eps}'}
name: art.attacks.evasion.HopSkipJump
batch_size : ${data.sample.test_size}
targeted : false
max_iter : 100
max_eval : 100
init_eval : 10
attack_size : ${data.sample.test_size}
method : evasion
files:
data_file: tmp
data_type: pkl
data_type: .pkl
reports : tmp
model_dir : models
model_file : tmp
model_type : pkl
model_type : .pkl
directory: tmp
reports: reports
score_dict_file: score_dict.json
scorers:
accuracy:
name : sklearn.metrics.accuracy_score
Expand Down Expand Up @@ -698,6 +535,15 @@ def load_data(dataset, **kwargs):
- precision
- recall
- f1
- adv_success
- adv_precision
- adv_recall
- adv_f1
- adv_accuracy
- adv_fit_time
optimisers:
- accuracy
- adv_accuracy
_target_: deckard.Experiment
"""

Expand Down

0 comments on commit 97a851b

Please sign in to comment.