-
Notifications
You must be signed in to change notification settings - Fork 0
/
traitement2.py
110 lines (93 loc) · 3.18 KB
/
traitement2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# ©2018 Jean-Hugues Roy. GNU GPL v3.
# coding: utf-8
import csv, os, glob
import pymysql.cursors
from motsvides import rien
import nltk
from nltk.tokenize import word_tokenize
import treetaggerwrapper
# Output CSV file for each accepted answer to the prompt.
OUTPUT_FILES = {
    "1": "facebook-mots-medias.csv",
    "2": "facebook-bigrams-medias.csv",
    "3": "facebook-trigrams-medias.csv",
}

# URL-scheme tokens left behind by the tokenizer; never counted as words.
URL_SCHEMES = ("http", "https")

# A bigram is rejected when its second token contains any of these signs.
PUNCTUATION = (".", ",", "-", "’", "!", ":")


def weighted_words(tokens, stopwords, lemmatize):
    """Yield ``(word, lemma)`` for every countable token.

    A token is kept when its lowercased form is not a stop word, is purely
    alphabetic and is not a URL scheme ("http"/"https").

    Args:
        tokens: iterable of token strings.
        stopwords: container of lowercased words to ignore.
        lemmatize: callable mapping a lowercased word to its lemma; it is
            only called for tokens that pass every filter.
    """
    for token in tokens:
        word = token.lower()
        if word in stopwords or not word.isalpha():
            continue
        # The original test was `mot != "http" or mot != "https"`, which is
        # always true; a membership test actually drops both tokens.
        if word in URL_SCHEMES:
            continue
        yield word, lemmatize(word)


def bigrams(tokens, stopwords):
    """Yield lowercased "first second" strings for adjacent token pairs.

    A pair is kept when the first token is alphabetic and longer than one
    character, both tokens differ, at least one of them is not a stop word,
    and the second token contains no sign listed in PUNCTUATION.

    Args:
        tokens: sequence of token strings.
        stopwords: container of lowercased words to ignore.
    """
    lowered = [token.lower() for token in tokens]
    for first, second in zip(lowered, lowered[1:]):
        if not first.isalpha() or len(first) <= 1:
            continue
        if first == second:
            continue
        if first in stopwords and second in stopwords:
            continue
        # `first` is alphabetic, so only `second` can still carry punctuation.
        if any(sign in second for sign in PUNCTUATION):
            continue
        # Both halves are lowercased (as the trigram output already was) so
        # that identical bigrams are not split across different casings.
        yield "{} {}".format(first, second)


def trigrams(tokens, stopwords):
    """Yield lowercased "first second third" strings for adjacent triples.

    A triple is kept when the first token is alphabetic and longer than one
    character, the three tokens are pairwise different, and neither the
    (first, second) pair nor the (second, third) pair is made only of stop
    words.

    Args:
        tokens: sequence of token strings.
        stopwords: container of lowercased words to ignore.
    """
    lowered = [token.lower() for token in tokens]
    for first, second, third in zip(lowered, lowered[1:], lowered[2:]):
        if not first.isalpha() or len(first) <= 1:
            continue
        if first == second or second == third or first == third:
            continue
        if first in stopwords and second in stopwords:
            continue
        if second in stopwords and third in stopwords:
            continue
        yield "{} {} {}".format(first, second, third)


def main():
    """Export engagement-weighted words, bigrams or trigrams to a CSV file.

    Asks which unit to count, reads every row of the `posts` table, and for
    each post with non-zero engagement appends one `[unit, engagement]` row
    per retained word / bigram / trigram found in its message, name and
    description fields. Progress is echoed on stdout.
    """
    rep = input("On veut mots seuls (1), 2-grams (2) ou 3-grams (3)?")

    # Validate once, before touching the tagger or the database, instead of
    # repeating the message for every text field of every post.
    if rep not in OUTPUT_FILES:
        print("Mauvaise réponse")
        return
    fichier_out = OUTPUT_FILES[rep]

    # The tagger is only needed to lemmatize single words.
    tag = treetaggerwrapper.TreeTagger(TAGLANG='fr') if rep == "1" else None

    def lemmatize(word):
        # tag_text() returns tab-separated lines; the lemma is read from the
        # third field of the first line.
        return tag.tag_text(word)[0].split("\t")[2]

    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password="",
                                 db='facebook',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT * FROM posts")
            posts = cursor.fetchall()
    finally:
        # Every row is already in memory; release the connection right away.
        connection.close()

    nb = 0  # posts seen
    t = 0   # text fields processed
    m = 0   # words written (single-word mode only)

    # Open the output once: newline="" is what the csv module requires, and
    # the explicit encoding keeps non-ASCII tokens writable on any platform.
    with open(fichier_out, "a", newline="", encoding="utf-8") as sortie:
        writer = csv.writer(sortie)
        for post in posts:
            nb += 1
            engagement = (post["partages"] + post["reactions"]
                          + post["commentaires"] + post["likes_commentaires"]
                          + post["commentaires_commentaires"])
            if engagement == 0:
                continue

            for item in (post["message"], post["nom"], post["description"]):
                # Skip NULL columns, the "?" placeholder, and any text that
                # contains "Timeline" or "cover".
                if (item is None or item == "?"
                        or "Timeline" in item or "cover" in item):
                    continue
                t += 1
                mots = word_tokenize(item)

                if rep == "1":
                    # Weighted single words, stored as lemmas.
                    for mot, lemme in weighted_words(mots, rien, lemmatize):
                        m += 1
                        print(mot, lemme, engagement, m, t, nb)
                        writer.writerow([lemme, engagement])
                else:
                    # Weighted 2-grams or 3-grams.
                    if rep == "2":
                        ngrams = bigrams(mots, rien)
                    else:
                        ngrams = trigrams(mots, rien)
                    for ngram in ngrams:
                        ajout = [ngram, engagement]
                        print(ajout, nb)
                        writer.writerow(ajout)


if __name__ == "__main__":
    main()