-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_preprocessing.py
102 lines (82 loc) ยท 3.18 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#-*- coding: utf-8 -*-
import pandas as pd
import datetime
import numpy as np
import re
import multiprocessing
import os
from hanspell import spell_checker
from konlpy.tag import Hannanum
import sys
# Python 2 only: site.py removes setdefaultencoding from sys at startup;
# reload(sys) restores it so the process-wide default codec can be switched
# to UTF-8, letting implicit str <-> unicode conversions of the Korean text
# below succeed. This idiom has no Python 3 equivalent.
reload(sys)
sys.setdefaultencoding('utf-8')
# Delete the previous run's output file.
def remove_file(folder_name,
                base_dir="/home/maria_dev/PresidentMoon-analysis/data/preprocessing"):
    """Remove a prior clean_data.csv for *folder_name* so a run starts clean.

    Parameters
    ----------
    folder_name : str
        Output sub-folder (e.g. "moon") whose clean_data.csv is removed.
    base_dir : str, optional
        Root of the preprocessing output tree. Defaults to the original
        hard-coded cluster path, so existing callers are unaffected.
    """
    path = os.path.join(base_dir, folder_name, "clean_data.csv")
    # Guard against the first run, when no previous output exists.
    if os.path.isfile(path):
        os.remove(path)
# Read a scraped tweets CSV into a DataFrame.
def read_data(filepath):
    """Load a tweets CSV, merging the 'date' and 'time' columns into a
    single parsed 'date_time' column and forcing 'username' to str.

    NOTE(review): error_bad_lines/warn_bad_lines were removed in pandas 2.0
    (replaced by on_bad_lines), and list-of-lists parse_dates is likewise
    gone in modern pandas — this code assumes the older pandas used by this
    Python 2 project; confirm before upgrading.
    """
    df = pd.read_csv(filepath,
                     # combine 'date' + 'time' into one datetime column
                     parse_dates=[['date', 'time']],
                     dtype={
                         'username': str
                     },
                     # skip malformed scraped rows instead of aborting
                     error_bad_lines=False,
                     warn_bad_lines=True,
                     )
    return df
# Extract nouns and predicates (verbs/adjectives) from Korean text.
def extract_pos(text):
    """Return a "|"-joined string of "word,tag" pairs for content words.

    Keeps only Hannanum tags NC/NQ/NN (nouns) and PV/PA (verb/adjective
    stems), then drops words listed in the module-level ``stopwords``
    DataFrame (loaded in __main__; visible in workers via fork on Linux).
    Building a dict de-duplicates words — a repeated word keeps its last
    tag, matching the original behavior.
    """
    # NOTE(review): constructing a Hannanum per call is slow, but keeps
    # each multiprocessing worker self-contained — confirm before hoisting.
    tagger = Hannanum()
    tagged = tagger.pos(text, ntags=22, flatten=True)
    keep_tags = ('NC', 'NQ', 'NN', 'PV', 'PA')
    dct = dict(item for item in tagged if item[1] in keep_tags)
    # pop() replaces the original get-then-del double lookup; tags are
    # non-empty strings, so truthiness never differed from presence.
    for stopword in stopwords.itertuples():
        dct.pop(stopword._1, None)
    return "|".join("%s,%s" % pair for pair in dct.items())
# Full preprocessing pipeline for one scraped folder.
def preprocess(folder_name):
    """Clean scraped tweets for *folder_name* and write per-day word lists
    to data/preprocessing/<folder_name>/clean_data.csv.

    Steps: drop duplicate rows, coerce unparseable date/id fields,
    shift timestamps by -16h, lowercase the tweet text, strip everything
    but Hangul syllables, spell-check via hanspell, then extract content
    words with extract_pos() and concatenate them per calendar date.
    """
    df = read_data('/home/maria_dev/PresidentMoon-analysis/data/scraping/' + folder_name + "/tweets.csv")
    # Drop exact duplicate rows.
    df = df.drop_duplicates()
    # Coerce rows whose date_time/id fields don't parse (scraper glitches
    # become NaT/NaN instead of raising).
    df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce')
    df['id'] = pd.to_numeric(df['id'], errors='coerce')
    df['conversation_id'] = pd.to_numeric(df['conversation_id'], errors='coerce')
    # Shift the UTC timestamps by -16h, then derive the grouping date.
    df['date_time'] = df['date_time'] - datetime.timedelta(hours=16)
    df['date'] = df['date_time'].dt.date
    # Normalize tweet text: force str, then lowercase.
    df['tweet'] = df.tweet.astype(str)
    df['tweet'] = df['tweet'].apply(lambda x: x.lower())
    # New column that will receive the extracted word string per tweet.
    df['word'] = ''
    # Matches any run of characters that is NOT a Hangul syllable
    # (U+AC00..U+D7AF); used to replace non-Korean text with spaces.
    hangul = re.compile("[^"
                        u"\U0000AC00-\U0000D7AF"
                        "]+", flags=re.UNICODE)
    processed = 0  # renamed from 'sum', which shadowed the builtin
    for row in df.itertuples():
        processed += 1
        print(processed)  # progress indicator for long runs
        # Python 2: decode bytes to unicode; replace undecodable bytes.
        tweet = str(row.tweet).decode('utf-8', errors='replace')
        hangul_text = re.sub(hangul, ' ', tweet)
        if not hangul_text.isspace():
            try:
                spell_ok = spell_checker.check(str(hangul_text))
            except ValueError:
                # Spell checker rejected the text; fall back to raw text.
                word_str = extract_pos(str(hangul_text))
            else:
                word_str = extract_pos(str(spell_ok.checked))
            df.at[row.Index, 'word'] = word_str
    # groupby().sum() on the str column concatenates each day's word
    # strings ('' default keeps empty tweets harmless).
    # NOTE(review): adjacent tweets concatenate with no separator between
    # the last word of one and the first of the next — confirm intended.
    df = df[['date', 'word']].groupby('date').sum()
    df.to_csv("/home/maria_dev/PresidentMoon-analysis/data/preprocessing/" + folder_name + "/clean_data.csv", mode="w")
if __name__ == '__main__':
    # Remove any previous output so each run starts clean. A plain loop
    # replaces the original list comprehension used only for side effects.
    for folder_name in ["moon", "unification", "dprk"]:
        remove_file(folder_name)
    # Loaded before the pool is created so forked workers inherit it
    # (extract_pos reads this module-level global).
    stopwords = pd.read_json('/home/maria_dev/PresidentMoon-analysis/data/stopwords/stopwords_ko.json')
    pool = multiprocessing.Pool(processes=4)
    try:
        pool.map(preprocess, ["moon", "unification", "dprk"])
    finally:
        # The original leaked the pool; close and join so worker
        # processes exit cleanly even if preprocess raises.
        pool.close()
        pool.join()