eda.py
# Import Modules
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoConfig, AutoTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
# Custom Code
from dataset import download_articles_by_publisher, get_dpgnews_df
# Constants
MAX_LEN = 512     # Maximum input length (in tokens) for the Transformer models; used as the threshold below
CACHE_DIR = './'  # Directory where the DpgNews files are downloaded and cached
# Set Model Type
# Set to one of the following:
# 1. 'bert-base-multilingual-cased' for Multilingual BERT model
# 2. 'xlm-roberta-base' for Multi-lingual XLM-RoBERTa model
model_type = 'xlm-roberta-base'
print(f'Model Type: {model_type}')
# Set Config
config = AutoConfig.from_pretrained(model_type, num_labels = 2) # Binary classification so set num_labels = 2
print(config)
# Set Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type, add_prefix_space = False, do_lower_case = False)
print(tokenizer)
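# Illustrative sanity check (not part of the original EDA): a quick look at how the
# tokenizer splits a short sample sentence and adds special tokens, which is why the
# token counts computed below exceed the raw whitespace word counts.
# The sample sentence is a made-up example.
sample_text = 'Dit is een korte voorbeeldzin over nieuws.'
sample_encoded = tokenizer.encode_plus(sample_text, add_special_tokens = True)
print(f'Sample tokens: {tokenizer.convert_ids_to_tokens(sample_encoded["input_ids"])}')
print(f'Sample token count (incl. special tokens): {len(sample_encoded["input_ids"])}')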
# Download DpgNews Files
download_articles_by_publisher(CACHE_DIR)
# Get DpgNews Dataframe
dpgnews_df = get_dpgnews_df(CACHE_DIR)
# Set Seaborn Style
sns.set_theme(style = "whitegrid")
# Get Counts for Title, Text, Sentences and Partisan Labels
dpgnews_df['title_wordcount'] = dpgnews_df.title.str.split(' ').str.len()      # approximate word count (whitespace split)
dpgnews_df['text_wordcount'] = dpgnews_df.text.str.split(' ').str.len()        # approximate word count (whitespace split)
dpgnews_df['text_sentence_count'] = dpgnews_df.text.str.split('.').str.len()   # approximate sentence count (split on '.')
print('\n===== Partisan - Label Count')
print(dpgnews_df.partisan.value_counts().sort_index())
print('\n===== Title - Word Count')
print(dpgnews_df.title_wordcount.value_counts().sort_index())
print('\n===== Text - Word Count')
print(dpgnews_df.text_wordcount.value_counts().sort_index())
print('\n===== Text - Sentence Count')
print(dpgnews_df.text_sentence_count.value_counts().sort_index())
# Plot Word Count per Article
g = sns.displot(dpgnews_df, kind = 'kde', rug = True, x = 'text_wordcount', hue = 'partisan')
g.set_axis_labels('Words per Article', 'Density', labelpad = 10)
g.fig.set_size_inches(10, 6)
g.savefig(f'{model_type}_plot_text_words_count.png', dpi = 200)
plt.close()
# Plot Sentence Count per Article
g = sns.displot(dpgnews_df, kind = 'kde', rug = True, x = 'text_sentence_count', hue = 'partisan')
g.set_axis_labels('Sentences per Article', 'Density', labelpad = 10)
g.fig.set_size_inches(10, 6)
g.savefig(f'{model_type}_plot_text_sentences_count.png', dpi = 200)
plt.close()
# Get Token Count / Plot Token Count per Article
dpgnews_df['text_token_count'] = 0
for index, row in tqdm(dpgnews_df.iterrows(), total = dpgnews_df.shape[0]):
    # Get the article text as a string
    text = row['text']
    # Tokenize the full text (no max length, no truncation) to get the true token count
    input_encoded = tokenizer.encode_plus(text, add_special_tokens = True)
    dpgnews_df.loc[index, 'text_token_count'] = len(input_encoded['input_ids'])
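# Optional faster alternative (a sketch, not used above): tokenize all texts in one
# batched call instead of iterating row by row; with the default fast tokenizer this
# is typically much quicker for large dataframes.
# batch_encoded = tokenizer(dpgnews_df['text'].tolist(), add_special_tokens = True)
# dpgnews_df['text_token_count'] = [len(ids) for ids in batch_encoded['input_ids']]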
g = sns.displot(dpgnews_df, kind = 'kde', rug = True, x = 'text_token_count', hue = 'partisan')
g.set_axis_labels(f'Tokens per Article (Model Type: {model_type})', 'Density', labelpad = 10)
g.fig.set_size_inches(10, 6)
g.savefig(f'{model_type}_plot_text_token_count.png', dpi = 200)
plt.close()
# Count articles with at most / more than MAX_LEN (512) tokens, the maximum input length for these Transformer models
print(f'\nArticles with {MAX_LEN} or fewer tokens: {dpgnews_df[dpgnews_df["text_token_count"] <= MAX_LEN].shape[0]}')
print(f'Articles with more than {MAX_LEN} tokens: {dpgnews_df[dpgnews_df["text_token_count"] > MAX_LEN].shape[0]}')
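# Downstream note (illustrative sketch only, not part of this EDA): when the articles are
# later encoded for the model, texts longer than MAX_LEN would typically be truncated and
# padded, e.g.:
# encoded = tokenizer.encode_plus(text, add_special_tokens = True, max_length = MAX_LEN,
#                                 truncation = True, padding = 'max_length')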