-
Notifications
You must be signed in to change notification settings - Fork 0
/
dir_claster.py
43 lines (36 loc) · 1.44 KB
/
dir_claster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import random as rd
'''
Clusters the collected data by folder and removes the empty folders.
It is important to know how many kinds (clusters) to split into!
Work on a backup of the data!
'''
# Root folder whose sub-folders are clustered by their names.
SAMPLE_DIR = "truck-link/sample"
# Number of groups the folder names are split into; has to be chosen by hand.
N_CLUSTERS = 2
# Files smaller than this many bytes are deleted instead of being moved.
MIN_FILE_SIZE = 25000


def _unique_destination(target_dir):
    """Return a path inside *target_dir* with a random numeric name that is free.

    ``os.rename`` silently replaces an existing file on Unix (and raises on
    Windows), so a colliding random name would destroy an already-moved file.
    """
    upper = 100000
    while True:
        candidate = os.path.join(target_dir, str(rd.randint(10, upper)))
        if not os.path.exists(candidate):
            return candidate
        # Widen the range after a collision so the search always terminates,
        # even when the target folder is densely populated.
        upper *= 2


def _merge_into(source_dir, target_dir):
    """Move the large files of *source_dir* into *target_dir*, then drop it.

    Files below ``MIN_FILE_SIZE`` bytes are deleted. Every other entry is
    renamed into *target_dir* under a fresh random numeric name (the original
    file name is not kept). Finally the emptied *source_dir* is removed.
    """
    for item in os.listdir(source_dir):
        source = os.path.join(source_dir, item)
        if os.stat(source).st_size < MIN_FILE_SIZE:
            os.remove(source)
        else:
            os.rename(source, _unique_destination(target_dir))
    os.rmdir(source_dir)


# NOTE(review): everything below runs as soon as the module is executed or
# imported and modifies the data on disk - work on a backup.

# Only sub-folders take part in clustering; a stray file in SAMPLE_DIR would
# otherwise crash the merge step half-way through. Sorting makes the choice of
# the target folder within a cluster reproducible for a given labelling.
li_dir = sorted(
    name for name in os.listdir(path=SAMPLE_DIR)
    if os.path.isdir(os.path.join(SAMPLE_DIR, name))
)
if len(li_dir) < N_CLUSTERS:
    raise SystemExit(
        "need at least {0} folders in {1}, found {2}".format(
            N_CLUSTERS, SAMPLE_DIR, len(li_dir)
        )
    )

# Cluster the folder names by the TF-IDF vectors of their text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(li_dir)
kmeans = KMeans(n_clusters=N_CLUSTERS).fit(X)
print(kmeans.labels_)

# Map each cluster label to the folder names assigned to it.
cluster_dirs = {}
for label, name in zip(kmeans.labels_, li_dir):
    cluster_dirs.setdefault(label, []).append(name)
print(cluster_dirs)

# The first folder of every cluster becomes the target; the remaining folders
# of that cluster are merged into it and removed.
for names in cluster_dirs.values():
    temp_path = '{0}/{1}/'.format(SAMPLE_DIR, names[0])
    print('into', temp_path)
    for name in names[1:]:
        _merge_into(os.path.join(SAMPLE_DIR, name), temp_path)