-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser-nolink-nowater.py
91 lines (78 loc) · 2.88 KB
/
parser-nolink-nowater.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os, shutil, zipfile, numpy as np
from PIL import Image, ImageChops
# ----- finding the only Word file in folder
# Word's "~$name.docx" lock files (present while a document is open in Word)
# are not ZIP archives, so they are never picked.
docx_files = [
    name for name in os.listdir()
    if name.lower().endswith('.docx') and not name.startswith('~$')
]
if not docx_files:
    # Stop with a clear message instead of a NameError further down.
    raise SystemExit('No .docx file found in ' + os.getcwd())
# If several documents are present, the last one listed by os.listdir() is used.
wordfile = docx_files[-1]
# ----- unpacking the Word file (opened as a ZIP archive) into a temporary folder
with zipfile.ZipFile(wordfile) as docx_archive:
    docx_archive.extractall(path="targetdir")
# ----- copying all images to folder with watermarks
# The extracted pictures are in targetdir/word/media; the "html" folder sits
# next to targetdir, i.e. three levels up from the media folder.  Paths are
# built with os.path.join so no backslash escapes are needed and the
# separators match the host platform.
os.chdir(os.path.join('targetdir', 'word', 'media'))
html_dir = os.path.join(os.pardir, os.pardir, os.pardir, 'html')
for file in os.listdir():
    shutil.copyfile(file, os.path.join(html_dir, file))
# ----- changing folder to where watermarks are
# (three levels up from targetdir/word/media, then into "html"; joined with
# os.path so the path needs no backslash escapes)
os.chdir(os.path.join(os.pardir, os.pardir, os.pardir, 'html'))
# ----- removing links to Dr. Explain
def strip_drexplain_marks(lines):
    """Return the page text with the Dr. Explain link markup filtered out.

    ``lines`` is an iterable of page lines, line endings included.

    * A line containing ``h6`` is dropped and toggles a "skipping" state.
    * A line read while skipping, or one containing ``Unregistered version``,
      is replaced by a bare ``</div>`` (no line ending).
    * Every other line is kept verbatim.
    """
    # NOTE(review): assumes the final ``else`` belongs to the if/elif chain
    # (not a for-else) -- confirm against the indented original.
    kept = []
    copying = True
    for line in lines:
        if 'h6' in line:
            copying = not copying
        elif copying and 'Unregistered version' not in line:
            kept.append(line)
        else:
            kept.append('</div>')
    return ''.join(kept)


pages = [name for name in os.listdir() if name.endswith('htm')]
for page in pages:
    # Read the whole page before reopening it for writing; the with-blocks
    # close both handles even if decoding or writing fails.
    with open(page, 'r', encoding='utf-8') as inp:
        output = strip_drexplain_marks(inp)
    with open(page, 'w', encoding='utf-8') as out:
        # print() terminates the rewritten page with a single newline.
        print(output, file=out)
# ----- removing watermarks
# Each watermarked image ("drex*.png", header/index files excluded) is
# compared with every picture copied from the Word file ("image*.png").  The
# candidate with the smallest mean per-pixel difference is saved as
# "<name>__0.png"; the later steps delete the watermarked file and rename
# the "__0" copy to the original name.
#
# Images under 120 px in both width and height are never matched:
# watermarked ones are kept unchanged and clean ones are not considered.
# They are stored under the same "__0" marker as every other result, because
# the cleanup step deletes any drex*.png whose name lacks "__0".
#
# Every image is opened in a with-block so its file handle is released before
# the files are deleted or renamed (an open handle blocks deletion on Windows).
min_side = 120
water = [i for i in os.listdir() if i.startswith('drex') and i.endswith('png') and 'header' not in i and 'index' not in i]
nowater = [i for i in os.listdir() if i.startswith('image') and i.endswith('png')]
for cnt, watered_img in enumerate(water, start=1):
    ans = None  # reset per image so a previous match is never reused
    with Image.open(watered_img) as img1:
        width, height = img1.size
        if width < min_side and height < min_side:
            ans = watered_img
        else:
            target = img1.convert('RGB')  # converted once, reused for every candidate
            maxdif = 10 ** 9
            for candidate in nowater:
                with Image.open(candidate) as img:
                    if img.width < min_side and img.height < min_side:
                        continue
                    clean = img.resize((width, height)).convert('RGB')
                res = ImageChops.difference(target, clean)
                mean = np.mean(np.array(res))
                if mean < maxdif:
                    maxdif = mean
                    ans = candidate
    if ans is None:
        # No usable candidate: keep the watermarked image rather than crash.
        print('No clean image found for ' + watered_img + ', keeping it as is')
        ans = watered_img
    shutil.copyfile(ans, watered_img[:-4] + '__0.png')
    print('%.2f' % (cnt / len(water) * 100), '% done...', sep='')
# ----- removing watermarks files
# Every drex*.png is deleted unless its name contains '__0', 'header' or 'index'.
protected_tags = ('__0', 'header', 'index')
stale_pngs = [
    name for name in os.listdir()
    if name.startswith('drex') and name.endswith('png')
    and not any(tag in name for tag in protected_tags)
]
for name in stale_pngs:
    os.remove(name)
# ----- restoring images from what's found
# Dropping the '__0' marker from each drex*.png gives the file its final name.
drex_pngs = [
    name for name in os.listdir()
    if name.startswith('drex') and name.endswith('png')
]
for marked_name in drex_pngs:
    final_name = marked_name.replace('__0', '')
    os.rename(marked_name, final_name)
# ----- deleting images from Word
# ----- (some are not removable for some reason)
def delete_word_images(folder='.'):
    """Delete the ``image*.png`` files in *folder* on a best-effort basis.

    A file that cannot be removed is reported on stdout and skipped.  Only
    ``OSError`` is handled, so Ctrl+C and programming errors still propagate.

    Returns the list of file names that could not be deleted.
    """
    failed = []
    for name in os.listdir(folder):
        if name.startswith('image') and name.endswith('png'):
            try:
                os.remove(os.path.join(folder, name))
            except OSError:
                print('Could not delete ' + name)
                failed.append(name)
    return failed


delete_word_images()
# ----- deleting temporary folder with Word file content
# Step up one level to the folder that holds the extracted archive, remove
# it, then wait for Enter before the script ends.
temp_dir = 'targetdir'
os.chdir('..')
shutil.rmtree(temp_dir)
a = input('Press Enter to finish...')