-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDFminerTextScript.py
68 lines (59 loc) · 2.49 KB
/
PDFminerTextScript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import sys
import os.path
import time
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
def makeOutfileName(prefix="C:\\Users\\srobbins\\Desktop\\textScriptOutput",
                    suffix=".txt"):
    """Build a timestamped output file name.

    The result is ``prefix`` + ``str(time.time())`` + ``suffix``.  Called
    with no arguments it yields the same name this script has always used;
    the parameters exist so the script can be pointed at another location
    without editing the function body.

    Args:
        prefix: Everything that goes before the timestamp (directory plus
            the leading part of the file name).
        suffix: Everything that goes after the timestamp (the extension).

    Returns:
        The assembled file name as a string.
    """
    timeID = str(time.time())
    return prefix + timeID + suffix
def getPDFDir(directory, outfile):
    """Run getPDFInfo on every PDF file listed directly in a directory.

    Only the entries returned by ``os.listdir(directory)`` are examined;
    sub-directories are not descended into.

    Args:
        directory: Path of the folder to scan.
        outfile: Path of the text file handed to getPDFInfo for each PDF.

    Returns:
        None.
    """
    for filename in os.listdir(directory):
        # Compare case-insensitively so names such as 'x.Pdf' are not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        # os.path.join supplies the separator instead of a hard-coded '\\'.
        filepath = os.path.join(directory, filename)
        # Progress output: the file name without its 4-character extension.
        # The parenthesised form behaves the same as a Python 2 print
        # statement and is also valid as a Python 3 function call.
        print(filename[:-4])
        getPDFInfo(filepath, outfile)
    return
def getPDFInfo(filename, outfile):
    """Extract the text of one PDF and append it to a text file.

    The PDF is parsed with pdfminer and every page is run through a
    TextConverter that writes UTF-8 text straight into ``outfile``.

    Args:
        filename: Path of the PDF to read.
        outfile: Path of the text file to append to (opened in mode 'a').

    Returns:
        None.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    codec = 'utf-8'
    laparams = LAParams()
    # Both handles are closed when the blocks exit, even on error, so a
    # long directory run does not accumulate open files.
    with open(filename, 'rb') as fp:
        with open(outfile, 'a') as outfp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            # Create a PDF document object that stores the document structure.
            doc = PDFDocument()
            # Connect the parser and document objects.
            parser.set_document(doc)
            doc.set_parser(parser)
            # Supply the password for initialization.
            # (If no password is set, give an empty string.)
            doc.initialize('')
            # Check if the document allows text extraction. If not, abort.
            if not doc.is_extractable:
                # The exception class is not among the file-level imports,
                # so it is brought into scope here.
                # NOTE(review): assumes the installed pdfminer defines it in
                # pdfminer.pdfinterp -- confirm against the installed version.
                from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
                raise PDFTextExtractionNotAllowed
            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()
            # SR: a TextConverter is used instead of a bare PDFDevice so the
            # page text ends up in outfp.
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            try:
                # Process each page contained in the document.
                for i, page in enumerate(doc.get_pages()):
                    # Just before the second page (index 1), write the 7
                    # characters that precede the 4-character extension of
                    # the path -- presumably the document ID. A one-page
                    # PDF therefore gets no such line.
                    # NOTE(review): the original comment says this was
                    # "added as test"; confirm whether i == 0 was intended.
                    if i == 1:
                        outfp.write(filename[-11:-4] + '\n')
                    interpreter.process_page(page)
            finally:
                # Let the device finish before the output file is closed.
                device.close()
    return
# Script body: one freshly named output file collects the text of every
# PDF found in the source directory below.
outfile = makeOutfileName()
getPDFDir(
    r"\\libgrsurya\IDEALS_ETDS\ProQuestDigitization\Illinois_Retro1\Illinois_1_2",
    outfile,
)