Skip to content

Commit

Permalink
Merge pull request pdfminer#141 from timb07/speedup_layout
Browse files: browse the repository at this point in the history
Speed up layout of text boxes
  • Loading branch information
tataganesh authored Nov 8, 2018
2 parents 1ea9446 + 1cbeaeb commit e03ecab
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 23 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ install:
- pip install six
- pip install pycryptodome
- pip install chardet
- pip install sortedcontainers
script:
nosetests --nologcapture
29 changes: 14 additions & 15 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from sortedcontainers import SortedListWithKey

from .utils import INF
from .utils import Plane
from .utils import get_bound
from .utils import uniq
from .utils import csort
from .utils import fsplit
from .utils import bbox2str
from .utils import matrix2str
Expand Down Expand Up @@ -441,7 +441,7 @@ class LTTextBoxHorizontal(LTTextBox):

def analyze(self, laparams):
LTTextBox.analyze(self, laparams)
self._objs = csort(self._objs, key=lambda obj: -obj.y1)
self._objs.sort(key=lambda obj: -obj.y1)
return

def get_writing_mode(self):
Expand All @@ -452,7 +452,7 @@ class LTTextBoxVertical(LTTextBox):

def analyze(self, laparams):
LTTextBox.analyze(self, laparams)
self._objs = csort(self._objs, key=lambda obj: -obj.x1)
self._objs.sort(key=lambda obj: -obj.x1)
return

def get_writing_mode(self):
Expand All @@ -474,7 +474,7 @@ class LTTextGroupLRTB(LTTextGroup):
def analyze(self, laparams):
LTTextGroup.analyze(self, laparams)
# reorder the objects from top-left to bottom-right.
self._objs = csort(self._objs, key=lambda obj:
self._objs.sort(key=lambda obj:
(1-laparams.boxes_flow)*(obj.x0) -
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
return
Expand All @@ -485,7 +485,7 @@ class LTTextGroupTBRL(LTTextGroup):
def analyze(self, laparams):
LTTextGroup.analyze(self, laparams)
# reorder the objects from top-right to bottom-left.
self._objs = csort(self._objs, key=lambda obj:
self._objs.sort(key=lambda obj:
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
- (1-laparams.boxes_flow)*(obj.y1))
return
Expand Down Expand Up @@ -639,21 +639,18 @@ def key_obj(t):
(c,d,_,_) = t
return (c,d)

# XXX this still takes O(n^2) :(
dists = []
dists = SortedListWithKey(key=key_obj)
for i in range(len(boxes)):
obj1 = boxes[i]
for j in range(i+1, len(boxes)):
obj2 = boxes[j]
dists.append((0, dist(obj1, obj2), obj1, obj2))
# We could use dists.sort(), but it would randomize the test result.
dists = csort(dists, key=key_obj)
dists.add((0, dist(obj1, obj2), obj1, obj2))
plane = Plane(self.bbox)
plane.extend(boxes)
while dists:
(c, d, obj1, obj2) = dists.pop(0)
if c == 0 and isany(obj1, obj2):
dists.append((1, d, obj1, obj2))
dists.add((1, d, obj1, obj2))
continue
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
Expand All @@ -662,11 +659,13 @@ def key_obj(t):
group = LTTextGroupLRTB([obj1, obj2])
plane.remove(obj1)
plane.remove(obj2)
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
if (obj1 in plane and obj2 in plane) ]
removed = [obj1, obj2]
to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
if (obj1 in removed or obj2 in removed) ]
for r in to_remove:
dists.remove(r)
for other in plane:
dists.append((0, dist(group, other), group, other))
dists = csort(dists, key=key_obj)
dists.add((0, dist(group, other), group, other))
plane.add(group)
assert len(plane) == 1, str(len(plane))
return list(plane)
Expand Down
7 changes: 0 additions & 7 deletions pdfminer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,6 @@ def uniq(objs):
return


# csort
def csort(objs, key):
"""Order-preserving sorting function."""
idxs = dict((obj, i) for (i, obj) in enumerate(objs))
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))


# fsplit
def fsplit(pred, objs):
"""Split a list into two classes according to the predicate."""
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pdfminer as package

requires = ['six', 'pycryptodome']
requires = ['six', 'pycryptodome', 'sortedcontainers']
if sys.version_info >= (3, 0):
requires.append('chardet')

Expand Down
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ deps =
pycryptodome
chardet
nose
sortedcontainers

0 comments on commit e03ecab

Please sign in to comment.