forked from ypspy/dart-scraping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
(D-7-1) extractGovernance
133 lines (105 loc) · 4.56 KB
/
(D-7-1) extractGovernance
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 15:19:08 2020
@author: user
References
1. 이미지 인식 참고 https://blog.daum.net/geoscience/1266 (Tesseract at UB Mannheim 설치 C:\Program Files\Tesseract-OCR)
2. CV2 설치 https://pypi.org/project/opencv-python/
3. tesseract 환경변수에 추가 https://joyhong.tistory.com/79
4. tesseract config 사용법 https://m.blog.naver.com/hn03049/221957851802
6. skimage 설치 https://scikit-image.org/docs/stable/install.html
7. 웹페이지 이미지를 다운로드 없이 이미지 링크로 array 직접 변환 https://stackoverflow.com/questions/21061814/how-can-i-read-an-image-from-an-internet-url-in-python-cv2-scikit-image-and-mah
8. 리스트를 뒤에서부터 읽기 https://stackoverflow.com/questions/529424/traverse-a-list-in-reverse-order-in-python
"""
from bs4 import BeautifulSoup
import os
import glob
import pandas as pd
import numpy as np
from pytesseract import image_to_string
from skimage import io
import time
import random
def governanceClassifier(textData):
content = ''.join(textData.split())
matchesPos = ['감사위원회', '監査委員會']
matchesNeg = ['감사의선임', '감사는', '監事는']
returnText = ''
if any(x in content for x in matchesPos) and all(x not in content for x in matchesNeg):
returnText = "감사위원회"
if any(x in content for x in matchesNeg):
returnText = "감사"
return returnText
def imgParser(soup):
config = ('-l kor --oem 3 --psm 4')
imglist = soup.find_all("img")
result = ''
for i in imglist[::-1]: # 감사위원회/감사 등은 뒤에서부터 찾는게 더 빠르다. https://stackoverflow.com/questions/529424/traverse-a-list-in-reverse-order-in-python
link = r"http://dart.fss.or.kr/" + str(i["src"])
try:
image = io.imread(link)
except ValueError: # "이 파일은 서비스하지 않습니다!"라는 메시지가 뜨는 경우
continue
strings = image_to_string(image, config=config)
result = governanceClassifier(strings)
if result:
break
time.sleep(random.uniform(0.8, 1)) # 이렇게 하면 블락 안당하나
return result
# 1. 작업 폴더로 변경
os.chdir("C:\data\\") # 작업 폴더로 변경
# 2. 타겟 폴더에 있는 필요 문서 경로 리스트업
pathList = []
for path in [".\A001_2017\\", ".\A001_2018\\",
".\A001_2019\\", ".\A001_2020\\"
]:
path = path + "*정관_정관*.*" # 필요한 Keyword 입력
pathList = pathList + glob.glob(path)
# 3. 입수 과정에서 중복입수되어 표시된 duplicated 표시 파일 제거
pathList = [x for x in pathList if "duplicated" not in x]
# 4. 분리
pathList = [x for x in pathList if "(2017." in x] + [x for x in pathList if "(2018." in x] + [x for x in pathList if "(2019." in x]
# 5. Preprocess
PathListDf = pd.DataFrame(pathList)
df = pd.DataFrame([x.split("_") for x in pathList])
df["path"] = PathListDf[0]
df["con"] = df[6].str.contains("연결")
df['con'] = np.where(df['con']==True, "C", "S")
df['amend'] = df[6].str.contains("정정")
df['amend'] = np.where(df['amend']==True, "A", "B")
df["key"] = df[2] + df[6].str.slice(stop=10) + df["con"] + df["amend"] + df[5] + df[8] + df[10]
# key = 접수일 + 보고서 제출일(DRT 인증일) + 연결(C/S) + 정정(A/B) + 보고기간종료월 + 종목코드 + 법인등록번호
df["duplc"] = df.duplicated(subset=["key"], keep=False)
isTrue = df[df["duplc"] == True]
df = df.drop_duplicates(subset=["key"])
pathListOut = df["path"].tolist()
result = []
count = 0
for file in pathListOut:
html = open(file, "r", encoding="utf-8")
soup = BeautifulSoup(html, "lxml")
html.close()
returnText = ""
content = ''.join(soup.text.split())
img = "text" # 정관의 입력 형태
returnText = img + "_" + governanceClassifier(content)
if soup.img:
img = "image"
returnText = img + "_" + imgParser(soup)
result.append(returnText)
count += 1
print(count, returnText)
df["AC2"] = result
df = df.drop([0, 1, 14, "path", "duplc"], axis=1)
df = df.sort_values(by=[10, 5, "con", 2, 6, "amend"],
ascending=[True, True, True, False, False, True])
df["toDrop"] = 1
for i in range(1, len(df)):
if df.iloc[i,3] == df.iloc[i-1,3] and df.iloc[i,8] == df.iloc[i-1,8]:
df.iloc[i, 16] = df.iloc[i-1, 16] + 1
else:
df.iloc[i, 16] = 1
df = df[df["toDrop"] == 1]
df = df.drop("toDrop", axis=1)
os.chdir(r"C:\\output\\")
df.to_csv("wp01.data20.output.csv", sep="\t")