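"""tri_model.py

Majority-vote ensemble of three fine-tuned transformers (BERT, XLNet,
RoBERTa) that labels each line of a scraped text file with a dark-pattern
category and writes the tally to result.json.
"""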
import json

import torch
from torch.nn.functional import softmax
from transformers import (BertTokenizer, BertForSequenceClassification,
                          XLNetTokenizer, XLNetForSequenceClassification,
                          RobertaTokenizer, RobertaForSequenceClassification)
# Paths to the fine-tuned models
bert_model_path = r"/home/shashank/Desktop/dpbh/Titans_dpbh23/models/Titans_DPBH_BERT_Fine_Tuned_Model"
xlnet_model_path = r"/home/shashank/Desktop/dpbh/Titans_dpbh23/models/Titans_DPBH_XLNet_Fine_Tuned_Model"
roberta_model_path = r"/home/shashank/Desktop/dpbh/Titans_dpbh23/models/Titans_DPBH_ROBERT_Fine_Tuned_Model"
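# Note: the absolute paths above are machine-specific; repoint them at the
# directory holding your own fine-tuned checkpoints.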
# Load the fine-tuned models. The BERT tokenizer was saved alongside its
# checkpoint; the XLNet and RoBERTa tokenizers fall back to the base
# vocabularies, which fine-tuning for sequence classification does not change.
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_path)
bert_model = BertForSequenceClassification.from_pretrained(bert_model_path)
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
xlnet_model = XLNetForSequenceClassification.from_pretrained(xlnet_model_path)
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaForSequenceClassification.from_pretrained(roberta_model_path)
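# Inference only: put every model in eval mode so dropout layers are
# disabled and predictions are deterministic.
bert_model.eval()
xlnet_model.eval()
roberta_model.eval()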
max_seq_length = 512
def preprocess_text(tokenizer, text):
    # Truncate the text to the model's maximum sequence length. Special tokens
    # are left out here; encode() in predict_dark_patterns adds them, so the
    # original tokenize/decode round-trip no longer duplicates [CLS]/[SEP].
    input_ids = tokenizer.encode(text, add_special_tokens=False,
                                 max_length=max_seq_length, truncation=True)
    return tokenizer.decode(input_ids)
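# Illustrative behaviour (hypothetical input): a scraped line longer than
# max_seq_length tokens comes back with everything past the limit dropped;
# shorter lines come back essentially unchanged, modulo tokenizer
# normalization such as lower-casing for an uncased vocabulary.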
def predict_dark_patterns(models, tokenizers, input_text):
    # Collect one category vote per (model, tokenizer) pair
    votes = []
    for model, tokenizer in zip(models, tokenizers):
        input_ids = tokenizer.encode(preprocess_text(tokenizer, input_text),
                                     return_tensors='pt',
                                     max_length=max_seq_length, truncation=True)
        with torch.no_grad():
            outputs = model(input_ids)
        probs = softmax(outputs.logits, dim=1).squeeze()
        predicted_category = torch.argmax(probs).item()
        votes.append(predicted_category)
    return votes
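# Illustrative values: votes of [0, 1, 0] would mean BERT and RoBERTa chose
# category 0 ("Urgency") while XLNet chose 1 ("Not Dark Pattern"); the
# majority vote downstream settles on "Urgency".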
def count_dark_patterns_with_text(text_file):
    with open(text_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    # Map category names to the numeric labels the models were trained on
    category_mapping = {"Urgency": 0, "Not Dark Pattern": 1, "Scarcity": 2, "Misdirection": 3,
                        "Social Proof": 4, "Obstruction": 5, "Sneaking": 6, "Forced Action": 7}
    dark_patterns = {category: {"count": 0, "text_strings": []} for category in category_mapping}
    for line in lines:
        if not line.strip():
            continue
        individual_predictions = predict_dark_patterns([bert_model, xlnet_model, roberta_model],
                                                       [bert_tokenizer, xlnet_tokenizer, roberta_tokenizer],
                                                       line)
        # Majority-voted prediction; note that max() over a set breaks
        # three-way ties arbitrarily.
        majority_category = max(set(individual_predictions), key=individual_predictions.count)
        category_name = next(key for key, value in category_mapping.items()
                             if value == majority_category)
        if category_name == "Not Dark Pattern":
            continue  # Skip the "Not Dark Pattern" category
        dark_patterns[category_name]["count"] += 1
        dark_patterns[category_name]["text_strings"].append(line.strip())
    return dark_patterns
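# Illustrative return shape (hypothetical counts):
# {"Urgency": {"count": 2, "text_strings": ["Only 3 left in stock!", "Sale ends soon"]},
#  "Scarcity": {"count": 0, "text_strings": []}, ...}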
# 'scraped.txt' is read relative to the current working directory
result_with_text = count_dark_patterns_with_text('scraped.txt')
for category, info in result_with_text.items():
    print(f"{category}: {info['count']} occurrences")
    print(f"Text strings: {info['text_strings']}")
# Save the result to a JSON file
with open('result.json', 'w') as json_file:
    json.dump(result_with_text, json_file)