-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path--python转换pdf
27 lines (25 loc) · 1.18 KB
/
--python转换pdf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
方法1:
pdf转换html再转txt
cd C:\Users\hasee\AppData\Local\Programs\Python\Python35-32\pdf2htmlEX-win32-0.13.6 先把要转换的1.pdf放到左边的文件夹
pdf2htmlEX (--zoom 1.3) 1.pdf #zoom是缩放倍率,1.pdf是要转换的pdf文件,将生成1.html在这个文件夹
再用beautifulsoup(或者lxml)提取文本
=============================================================
from pdfminer.pdfinterp import PDFResourceManager, process_pdf #方法2,保存一份在wode:pdf2txt.py
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
def convert_pdf(path, page=1):
    """Extract the text of a PDF file and return it as a single string.

    The PDF is run through pdfminer's ``process_pdf`` with a
    ``TextConverter`` that writes into an in-memory ``StringIO`` buffer.

    Args:
        path: Filesystem path of the PDF to read; opened in binary mode.
        page: Forwarded unchanged to ``TextConverter`` as ``pageno``.
            NOTE(review): this looks like the converter's starting page
            counter rather than a "convert only this page" selector --
            confirm against the installed pdfminer version.

    Returns:
        Everything the converter wrote into the buffer, as ``str``.

    Raises:
        OSError: If ``path`` cannot be opened for reading.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer, the converter and the input file are all released even if
    # parsing fails part-way through; previously an exception leaked them.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, pageno=page, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Read the text out before the ``with`` block closes the buffer.
        return retstr.getvalue()
# Convert one hard-coded PDF and save the extracted text next to it as UTF-8.
file = r'C:\Users\hasee\AppData\Local\Programs\Python\Python35-32\111\1.pdf'
txt_path = r'C:\Users\hasee\AppData\Local\Programs\Python\Python35-32\111\1.txt'
# The output file is opened (and truncated) before the conversion runs.
with open(txt_path, mode='w', encoding='utf-8') as out:
    extracted = convert_pdf(file)
    out.write(extracted)