# imageExtractor.py
# -*- coding: utf-8 -*-
"""
Reads the alto XML page and extracts the Illustration objects from the TIFF files.
"""
__authors__ = 'User:Seb35, User:Jean-Frédéric, User:Plyd for BNF partnership with Wikimedia France'
import xml.dom.minidom, gzip, shutil, os, sys,codecs,locale,re
from subprocess import check_call
from os.path import basename
# Keep references to the real standard streams: main() points sys.stdout and
# sys.stderr at log files while it runs and puts these back afterwards.
stdoutBak, stderrBak = sys.stdout, sys.stderr

# Folder holding one sub-folder per book
# (an alternative value, u'./Bouquins', was left commented out here).
bookFolder = u'/home/projetbnf/bouquins'

# Directory the script was started from; the scratch and result folders
# are both placed directly under it.
exeFolder = os.getcwd()
workingFolder = exeFolder + '/work/'
outputFolder = exeFolder + "/output/"
def bookProcessor(bookId):
    """Processes one book based on its id.

    Splits the book's TIFF into page images (manageTiff), then reads every
    gzipped ALTO XML file found in <bookFolder>/<bookId>/X and crops each
    Illustration element out of the matching page image (extractImage).
    Results are written to <outputFolder><bookId>; that folder is wiped and
    recreated the first time an illustration is found for the book.

    Relies on the module-level names metadata, bookFolder, workingFolder,
    outputFolder and letters.

    Returns the number of illustrations extracted, or 0 when the book has no
    metadata entry or no ALTO folder.
    """
    if bookId not in metadata:
        # Report the id we were given (the message used to reference an
        # undefined name, which raised NameError instead of printing).
        print(u"Unable to do the book %s! no metadata" % bookId)
        return 0

    metadatabook = metadata[bookId]
    # Second metadata field (titredjvu) without its last five characters --
    # presumably the ".djvu" extension, TODO confirm -- with spaces replaced
    # by underscores.
    bookName = re.sub(r' ', '_', metadatabook[1][:-5])
    print("Processing book %s - %s" % (bookId, bookName))

    manageTiff(bookFolder, bookId, workingFolder)
    generationFolder = outputFolder + bookId

    # Iteration over the pages
    xmlPagesPath = '%s/%s/X' % (bookFolder, bookId)
    if not os.path.exists(xmlPagesPath):
        print("No XML ALTO file")
        return 0

    folderCreated = False
    imageCounterOfBook = 0
    for xmlPage in sorted(os.listdir(xmlPagesPath)):
        # Page number = file name minus its two extensions and its first
        # character, read as an integer.
        pageNumber = int(os.path.splitext(os.path.splitext(basename(xmlPage))[0])[0][1:])

        # Reading the XML; close the gzip handle as soon as it is parsed.
        xmlFile = gzip.open(xmlPagesPath + '/' + xmlPage)
        try:
            doc = xml.dom.minidom.parse(xmlFile)
        finally:
            xmlFile.close()

        illustrations = doc.getElementsByTagName('Illustration')
        illustrationCount = illustrations.length
        if illustrationCount == 0:
            continue

        if not folderCreated:
            # Start from an empty result folder for this book.
            if os.path.exists(generationFolder):
                shutil.rmtree(generationFolder)
            # makedirs also creates the parent output folder when missing.
            os.makedirs(generationFolder)
            folderCreated = True

        imageBasename = u'%s_-_%s' % (bookName, pageNumber)
        severalOnPage = illustrationCount > 1
        for position in range(1, illustrationCount + 1):
            illustration = illustrations.item(position - 1)
            if severalOnPage:
                # With more than one image on the page, the name becomes
                # "<bookname>_-_<page><letter>.tif". Past 'z' a numeric
                # suffix is used instead of failing with KeyError.
                suffix = letters.get(position, '_%d' % position)
                imageName = imageBasename + str(suffix) + '.tif'
            else:
                imageName = imageBasename + '.tif'
            extractImage(workingFolder, pageNumber,
                         illustration.getAttribute('HPOS'),
                         illustration.getAttribute('VPOS'),
                         illustration.getAttribute('HEIGHT'),
                         illustration.getAttribute('WIDTH'),
                         generationFolder, imageName)
        imageCounterOfBook += illustrationCount
    return imageCounterOfBook
def manageTiff(inputFolder, bookId, outputFolder):
    """Processes the TIFF file of one book.

    Copies <inputFolder>/<bookId>/D<bookId>.tif (or .TIF) into outputFolder,
    splits the copy with the external ``tiffsplit`` tool using the prefix
    "img", deletes the copy, and renames the resulting files to 1.tif,
    2.tif, ... in the sorted order of their names (presumably the page
    order -- confirm against tiffsplit's naming scheme).

    Prints a notice and returns without touching outputFolder when the book
    has no TIFF file. Raises subprocess.CalledProcessError when tiffsplit
    exits with a non-zero status.
    """
    tifPath = None
    for extension in ('tif', 'TIF'):
        candidate = "%s/%s/D%s.%s" % (inputFolder, bookId, bookId, extension)
        if os.path.exists(candidate):
            tifPath = candidate
            break
    if tifPath is None:
        print(u'* no tif file')
        return

    # Work in the folder the caller asked for (the parameter used to be
    # ignored in favour of the module-level workingFolder).
    newTiff = "%s.tif" % (bookId)
    copiedTiff = os.path.join(outputFolder, newTiff)
    shutil.copyfile(tifPath, copiedTiff)

    # Run tiffsplit inside outputFolder through cwd= rather than os.chdir(),
    # so this process's working directory is unchanged even on failure.
    check_call(['tiffsplit', newTiff, "img"], cwd=outputFolder,
               stdout=sys.stdout, stderr=sys.stderr)
    os.remove(copiedTiff)

    # Rename tif images with integer instead of letters; only files carrying
    # the "img" prefix given to tiffsplit are touched.
    splitNames = sorted(name for name in os.listdir(outputFolder)
                        if name.startswith('img'))
    for imagecounter, imagetif in enumerate(splitNames, 1):
        os.rename(os.path.join(outputFolder, imagetif),
                  os.path.join(outputFolder, "%d.tif" % imagecounter))
def extractImage(tiffsFolder, pageNumber, HPOS, VPOS, HEIGHT, WIDTH, outputFolder, imageName):
    """Extracts an image from a TIFF file, using the given coordinates, and puts it in the given folder under the given name.

    tiffsFolder  -- folder containing the page images named <pageNumber>.tif
    pageNumber   -- number of the page to crop
    HPOS, VPOS   -- horizontal and vertical offset of the region to keep
    HEIGHT, WIDTH -- size of the region to keep
    outputFolder -- folder receiving the cropped image
    imageName    -- file name of the cropped image

    The crop is done by the external ImageMagick ``convert`` command;
    subprocess.CalledProcessError is raised when it exits with a non-zero
    status.
    """
    print("Extracting image %s from page %s in %s into %s" % (imageName, pageNumber, tiffsFolder, outputFolder))
    # ImageMagick geometry WIDTHxHEIGHT+X+Y.
    # NOTE(review): the values are passed through as read from the ALTO file,
    # which assumes they are expressed in pixels of the page image -- confirm.
    cropCmd = '%sx%s+%s+%s' % (WIDTH, HEIGHT, HPOS, VPOS)
    # os.path.join works whether or not tiffsFolder ends with a separator.
    sourceImage = os.path.join(tiffsFolder, '%s.tif' % pageNumber)
    targetImage = '%s/%s' % (outputFolder, imageName)
    # Input image first, then the operators applied to it, then the output:
    # this is the argument order ImageMagick documents for convert.
    check_call(['convert', sourceImage, '-crop', cropCmd, '+repage', targetImage],
               stdout=sys.stdout, stderr=sys.stderr)
def _resetWorkingFolder():
    """Deletes the working folder if it exists and creates it again, empty."""
    if os.path.exists(workingFolder):
        shutil.rmtree(workingFolder)
    os.mkdir(workingFolder)


def main():
    """Extracts the illustrations of one book or of every known book.

    Loads the metadata into the module-level ``metadata`` name, redirects
    standard output and error to log.out.txt / log.err.txt, then processes
    the book whose id is given as first command-line argument or, without
    argument, every book listed in the metadata. The standard streams are
    put back and the log files closed even when processing fails.
    """
    _resetWorkingFolder()
    global metadata
    metadata = getMetadata()
    imageCounter = 0

    # Sets output in log files
    logoutfile = codecs.open(u'log.out.txt', 'w', 'utf-8')
    logerrfile = codecs.open(u'log.err.txt', 'w', 'utf-8')
    sys.stdout = logoutfile
    sys.stderr = logerrfile
    try:
        if len(sys.argv) > 1:
            # Count the images of the requested book too (the result used to
            # be dropped, so the summary always reported 0).
            imageCounter = bookProcessor(sys.argv[1]) or 0
        else:
            for bookId in metadata:
                # Each book starts from an empty working folder.
                _resetWorkingFolder()
                # "or 0" tolerates a None result from a skipped book.
                imageCounter += bookProcessor(bookId) or 0
        print("Done. %s images extracted" % (imageCounter))
    finally:
        # Restore the real streams before closing the files they pointed to.
        sys.stdout = stdoutBak
        sys.stderr = stderrBak
        logoutfile.close()
        logerrfile.close()
# Maps a 1-based position to a lowercase letter: 1 -> 'a', 2 -> 'b', ...,
# 26 -> 'z'.
letters = dict(zip(range(1, 27), 'abcdefghijklmnopqrstuvwxyz'))
# Format of each line of the metadata file (fields separated by '|'):
# ordre|titrecourt|titredjvu|ark|auteur|titre|resolution|pagelist|bonsauteurs
def getMetadata(metadataPath='metadatabooks.txt'):
    """Retrieves the metadata from the text file.

    metadataPath -- path of the UTF-8 text file to read; defaults to
                    'metadatabooks.txt' in the current directory.

    Each non-blank line is split on '|'. Returns a dict mapping the first
    field of a line to the list of its remaining fields; when two lines
    share the same first field, the later one wins.
    """
    metadata = {}
    with codecs.open(metadataPath, 'r', 'utf-8') as metadatafile:
        for line in metadatafile:
            # Strip only the line terminator: blindly dropping the last
            # character truncated the final field of a file that does not
            # end with a newline.
            line = line.rstrip(u'\r\n')
            if not line.strip():
                # A blank line would otherwise register a book with an
                # empty id and no fields.
                continue
            book = line.split(u'|')
            metadata[book[0]] = book[1:]
    return metadata
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()