-
Notifications
You must be signed in to change notification settings - Fork 0
/
PDFminerScriptTagged.py
104 lines (88 loc) · 3.72 KB
/
PDFminerScriptTagged.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import sys
import os.path
import time
from pdfminer.processETDs import *
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor, TagExtractor2Memory
from pdfminer.converter import TextConverter, XMLConverter
from pdfminer.layout import LAParams
def makeOutfileName(prefix="C:\\Users\\srobbins\\Desktop\\taggedScriptOutput"):
    """Build a timestamped output file name.

    prefix -- text placed in front of the timestamp (directory plus base
              name).  The default reproduces the original hard-coded
              location; pass another prefix to write elsewhere.

    Returns prefix + str(time.time()) + ".txt".  Nothing is created on
    disk; only the name is built.
    """
    # The current epoch time is used as an ID so that successive runs
    # produce different names.
    timeID = str(time.time())
    return prefix + timeID + ".txt"
def getPDFDir(directory, outfile, thisTrainingData):
    """Run getPDFInfo() on every PDF found directly inside a directory.

    directory        -- path of the folder to scan (not recursive; only
                        the names returned by os.listdir are examined).
    outfile          -- passed through unchanged to getPDFInfo().
    thisTrainingData -- passed through unchanged to getPDFInfo().

    Only names ending in '.pdf' or '.PDF' are processed.  Returns None.
    """
    for filename in os.listdir(directory):
        if filename.endswith(('.pdf', '.PDF')):
            # os.path.join picks the right separator for the platform
            # instead of hard-coding a backslash.
            filepath = os.path.join(directory, filename)
            getPDFInfo(filepath, outfile, thisTrainingData)
    return
def getPDFInfo(filename, outfile, thisTrainingData):
    """Extract tagged text from the first 11 pages of a PDF and record it.

    filename         -- path of the PDF to read.
    outfile          -- unused here; only the commented-out file-writing
                        variants of this function referenced it.  Kept so
                        existing callers do not have to change.
    thisTrainingData -- object whose processETDStrings() is applied to
                        the accumulated page text.

    Side effect: stores the processETDStrings() result in the
    module-level dict fileDict under the key filename[-11:-4] (the seven
    characters in front of the last four -- presumably an ID preceding
    the ".pdf" extension; verify against the real file names).

    Returns None.  Raises PDFTextExtractionNotAllowed when the document
    reports that it is not extractable.

    NOTE(review): the result is stored only once the page with index 10
    has been read, so a document with fewer than 11 pages leaves no entry
    in fileDict.  Confirm whether that is intended.
    """
    codec = 'utf-8'
    # The context manager closes the file on every exit path, including
    # the early return below and any exception raised while parsing.
    with open(filename, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument()
        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        doc.initialize('')
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            # NOTE(review): this name is not imported explicitly in this
            # file; presumably the star import provides it -- verify.
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # SR: the in-memory tag extractor is used instead of the stock
        # PDFDevice / TextConverter / XMLConverter so the page text comes
        # back to the caller rather than being written to a file.
        device = TagExtractor2Memory(rsrcmgr, codec=codec)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the pages in order, collecting each page's text.
        pageTexts = []
        for i, page in enumerate(doc.get_pages()):
            pageTexts.append(interpreter.process_page_to_mem(page))
            if i == 10:
                # Pages 0..10 have been read: analyse them and stop.
                deptInfo = thisTrainingData.processETDStrings(''.join(pageTexts))
                fileDict[filename[-11:-4]] = deptInfo
                return
    return
# ---------------------------------------------------------------------------
# Script body (runs when the module is executed).
# ---------------------------------------------------------------------------

# getPDFInfo() stores its per-file results in this module-level dict, so it
# has to exist before getPDFDir() is called.
fileDict = {}
outfile = makeOutfileName()
thisTrainingData = TrainingData()

# Folder holding the PDFs; used by both passes below.
etdDirectory = r"\\libgrsurya\IDEALS_ETDS\ProQuestDigitization\Illinois_Retro5\Illinois_5_2"

# First pass: fill fileDict from every PDF in the folder.
# getPDFDir() returns None, so its result is not kept.
# Single-argument print(...) yields the same output under Python 2's print
# statement as the original unparenthesised form.
getPDFDir(etdDirectory, outfile, thisTrainingData)
print(fileDict)
print(thisTrainingData.trainingDataDict)
fileDict = thisTrainingData.cleanTrainingData(fileDict)
print(fileDict)

# Second pass: metadataFinder is run with two alternate-string checks and a
# text-between-strings extraction before testString() yields the new mapping.
IterationTwo = metadataFinder(thisTrainingData, etdDirectory, fileDict)
IterationTwo.checkForAlternateString('doctor of education', 'education')
IterationTwo.checkForAlternateString('doctor of musical arts', 'music')
IterationTwo.getTextBetweenTwoStrings('submitted', '</page>')
fileDict = IterationTwo.testString()
print(IterationTwo.fileDict)

# Fraction of entries whose value is exactly "no match".
# Counters stay floats, as before, so the division is not truncated.
# presumably testString() returns a dict -- verify.
count = float(len(fileDict))
missCount = float(sum(1 for value in fileDict.values() if value == "no match"))
# With no entries at all the ratio is undefined; print 0.0 instead of
# dying with ZeroDivisionError before the final dump below.
print(missCount / count if count else 0.0)
print(IterationTwo.trainingDataDict)