prompt_classify.py
"""
This is a script to test our malicious prompt categorization methodology.
For preparation, an experiment folder should be prepared, contain a `config.yaml` for all the experiment configs.
Step1: Load the dataset
the dataset is a mixed sample of normal and malicious prompts.
"""
import argparse
import os
import time

import pandas as pd
import yaml
from sklearn.metrics import confusion_matrix

from src.classification.dataset_construction import sample_data
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Sample data based on configuration in experiment folder.')
parser.add_argument('experiment_folder', type=str, help='Path to the experiment folder.')
args = parser.parse_args()
experiment_folder = args.experiment_folder
# Construct the path to the config file
config_path = os.path.join(experiment_folder, 'config.yaml')
# Load the config file
with open(config_path, 'r') as f:
config = yaml.safe_load(f)
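    # Expected shape of config.yaml -- a sketch inferred from the keys this script
    # reads; the fields inside 'data' and 'model' are assumptions:
    #
    #   data:                # kwargs forwarded to sample_data
    #     ...
    #   model:               # kwargs forwarded to HookedLLM
    #     ...
    #   classifier:          # config for ActivationUsage.from_config
    #     NNCfg:
    #       training:
    #         epochs: ...
    #   export_features: false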
    # Extract the data config and sample the dataset
    data_config = config['data']
    sample_data(
        experiment_folder=experiment_folder,
        **data_config
    )
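    # sample_data is expected to write the train/test splits to
    # <experiment_folder>/data/{train_data.csv,test_data.csv}, which are read back below.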
    # Load the model. The import is deferred because it pulls in heavy model
    # dependencies and is slow.
    from src.hookedLLM import HookedLLM
    hooked_llm = HookedLLM(**config['model'])
    # Load the train data and split it into labeled example sets
    train_data = pd.read_csv(f'{experiment_folder}/data/train_data.csv')
    positive_samples = train_data[train_data['label'] == 'malicious']['prompt'].tolist()
    negative_samples = train_data[train_data['label'] == 'benign']['prompt'].tolist()
    config['classifier']['positive_samples'] = positive_samples
    config['classifier']['negative_samples'] = negative_samples
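    # These labeled prompts serve as the contrastive examples from which the
    # classifying direction is constructed below (malicious = positive class).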
    # Configure training feature export
    if config['export_features']:
        config['classifier']['export_path'] = f'{experiment_folder}/features/train'

    # Construct the classifying direction
    from src.classification.classifier import ActivationUsage
    start_time = time.time()
    classifier = ActivationUsage.from_config(config['classifier'], hooked_llm)
    train_time = time.time() - start_time
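    # train_time measures direction construction end to end; presumably from_config
    # extracts activations for the labeled prompts and trains the network configured
    # under classifier.NNCfg in this single call.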
    # Score and classify every prompt in the training data
    start_time = time.time()
    train_data['score'], train_data['pred'] = classifier.batch_score_and_classify(train_data['prompt'].tolist())
    classify_train_time = time.time() - start_time
    print(f"Scoring and classifying training data took {classify_train_time:.2f} seconds")
    print(train_data)
    output_folder = f'{experiment_folder}/stats'
    os.makedirs(output_folder, exist_ok=True)

    # Save the scored training data
    start_time = time.time()
    train_data.to_csv(f'{output_folder}/train_data.csv', index=False)
    print(f"Saving training data took {time.time() - start_time:.2f} seconds")
    # Score, classify and save the test data
    start_time = time.time()
    test_data = pd.read_csv(f'{experiment_folder}/data/test_data.csv')
    print(f"Loading test data took {time.time() - start_time:.2f} seconds")
    start_time = time.time()
    test_data['score'], test_data['pred'] = classifier.batch_score_and_classify(test_data['prompt'].tolist())
    classify_test_time = time.time() - start_time
    print(f"Scoring and classifying test data took {classify_test_time:.2f} seconds")
    start_time = time.time()
    test_data.to_csv(f'{output_folder}/test_data.csv', index=False)
    print(f"Saving test data took {time.time() - start_time:.2f} seconds")
    classification_threshold = classifier.classify_threshold
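    # classify_threshold is the score cutoff separating predicted-malicious from
    # predicted-benign prompts; it is marked on the score histograms below.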
    # If feature export is enabled, build a second classifier with the test set as
    # its training set, so that the test-data features get exported as well.
    if config['export_features']:
        config['classifier']['export_path'] = f'{experiment_folder}/features/test'
        positive_samples = test_data[test_data['label'] == 'malicious']['prompt'].tolist()
        negative_samples = test_data[test_data['label'] == 'benign']['prompt'].tolist()
        config['classifier']['positive_samples'] = positive_samples
        config['classifier']['negative_samples'] = negative_samples
        ActivationUsage.from_config(config['classifier'], hooked_llm)
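        # The returned classifier is discarded: from_config is invoked here only
        # for its feature-export side effect.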
    def plot_and_save(data: pd.DataFrame, savepath: str, statspath: str):
        y_true = data['label']
        y_pred = data['pred']
        # Confusion matrix with 'malicious' as the positive class
        tp, fn, fp, tn = confusion_matrix(y_true, y_pred, labels=['malicious', 'benign']).ravel()
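        # With labels=['malicious', 'benign'], row 0 of the confusion matrix is the
        # true-malicious row, so ravel() yields [TP, FN, FP, TN] (the opposite of
        # sklearn's commonly quoted tn, fp, fn, tp unpacking).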
        # Recall, false positive rate, precision and F1 score
        fpr = fp / (fp + tn)
        recall = tp / (tp + fn)
        precision = tp / (tp + fp)
        f1 = 2 * (precision * recall) / (precision + recall)
        # Save statistics to CSV
        stats = pd.DataFrame({
            'Metric': ['Recall', 'False Positive Rate', 'Precision', 'F1 Score'],
            'Value': [recall, fpr, precision, f1]
        })
        stats.to_csv(statspath, index=False)
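        # Plot per-label score histograms and mark the decision threshold, so that
        # `savepath` actually gets written -- a minimal sketch; the use of matplotlib
        # and the exact plot style are assumptions.
        import matplotlib
        matplotlib.use('Agg')  # headless backend so the script runs without a display
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        for label in ('malicious', 'benign'):
            ax.hist(data[data['label'] == label]['score'], bins=50, alpha=0.5, label=label)
        ax.axvline(classification_threshold, color='red', linestyle='--', label='threshold')
        ax.set_xlabel('score')
        ax.set_ylabel('count')
        ax.legend()
        fig.savefig(savepath)
        plt.close(fig)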
    start_time = time.time()
    plot_and_save(train_data, f'{output_folder}/score_train_histogram.png', f'{output_folder}/train_stats.csv')
    plot_and_save(test_data, f'{output_folder}/score_test_histogram.png', f'{output_folder}/test_stats.csv')
    print(f"Saving stats and plots took {time.time() - start_time:.2f} seconds")
    # Save train and classify timing stats as a CSV
    pd.DataFrame({
        'Train Set Size': [len(train_data)],
        'Epochs': [config['classifier']['NNCfg']['training']['epochs']],
        'Train Time': [train_time],
        'Average Classify Time': [(classify_train_time + classify_test_time) / (len(train_data) + len(test_data))],
    }).to_csv(f'{output_folder}/time.csv', index=False)