-
Notifications
You must be signed in to change notification settings - Fork 0
/
linebreak.py
467 lines (389 loc) · 17.2 KB
/
linebreak.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
"""texlib.wrap
Implements TeX's algorithm for breaking paragraphs into lines.
This module provides a straightforward implementation of the algorithm
used by TeX to format paragraphs. The algorithm uses dynamic
programming to find a globally optimal division into lines, enabling
it to produce more attractive results than a first-fit or best-fit
algorithm can. For a full description, see the reference.
The module provides the NodeList class, which is a list of Box,
Glue, and Penalty instances. The elements making up a paragraph of
text should be assembled into a single NodeList. Boxes represent
characters of type, and their only attribute is width. Glue
represents a space of variable size; in addition to a preferred width,
glue can also stretch and shrink, to an amount that's specified by the
user. Penalties are used to encourage or discourage breaking a line
at a given point. Positive values discourage line breaks at a given
point, and a value of INFINITY forbids breaking the line at the
penalty. Negative penalty values encourage line breaks at a given
point, and a value of -INFINITY forces a line break at a particular
point.
The compute_breakpoints() method of NodeList returns a list of
integers containing the indexes at which the paragraph should be
broken. If you're setting the text to be ragged-right (or
ragged-left, I suppose), then simply loop over the text and insert
breaks at the appropriate points. For full justification, you'll have
to loop over each line's contents, calculate its adjustment ratio by
calling compute_adjustment_ratio(), and for each bit of glue, call its
compute_width() method to figure out how long this dab of glue should
be.
Reference:
"Breaking Paragraphs into Lines", D.E. Knuth and M.F. Plass,
chapter 3 of _Digital Typography_, CSLI Lecture Notes #78.
"""
from __future__ import print_function
import sys
__version__ = "1.01"
INFINITY = 1000
# Three classes defining the three different types of nides that
# can go into an NodeList.
class Item:
def __init__(self, width=0, stretch=0, shrink=0, penalty=0, flagged=0):
self.width, self.stretch, self.shrink = width, stretch, shrink
self.penalty, self.flagged = penalty, flagged
self.ratio = None
self.is_box = self.is_glue = self.is_penalty = False
self._forced_break = None
def compute_width(self):
r = self.ratio
if r is None:
return self.width
if r < 0:
return self.width + r * self.shrink
else:
return self.width + r * self.stretch
@property
def is_forced_break(self):
if self._forced_break is None:
self._forced_break = self.is_penalty and self.penalty == -INFINITY
return self._forced_break
class Box(Item):
def __init__(self, **args):
super().__init__(**args)
self.is_box = True
class Glue(Item):
def __init__(self, **args):
super().__init__(**args)
self.is_glue = True
class Penalty(Item):
def __init__(self, **args):
super().__init__(**args)
self.is_penalty = True
class _BreakNode:
"Internal class representing an active breakpoint."
def __init__(
self,
position,
line,
fitness_class,
totalwidth,
totalstretch,
totalshrink,
demerits,
previous=None,
):
self.position, self.line = position, line
self.fitness_class = fitness_class
self.totalwidth, self.totalstretch = totalwidth, totalstretch
self.totalshrink, self.demerits = totalshrink, demerits
self.previous = previous
def __repr__(self):
return "<_BreakNode at %i>" % self.position
class NodeList(list):
"""Class representing a list of Box, Glue, and Penalty nodes.
Supports the same methods as regular Python lists.
"""
# Set this to True to trace the execution of the algorithm.
debug = False
def add_closing_penalty(self):
"Add the standard glue and penalty for the end of a paragraph"
self.append(Penalty(width=0, penalty=INFINITY, flagged=0))
self.append(Glue(width=0, stretch=INFINITY, shrink=0))
self.append(Penalty(width=0, penalty=-INFINITY, flagged=1))
def is_feasible_breakpoint(self, i):
"Return true if position 'i' is a feasible breakpoint."
node = self[i]
if node.is_penalty and node.penalty < INFINITY:
return True
elif i > 0 and node.is_glue and self[i - 1].is_box:
return True
else:
return False
def measure_width(self, pos1, pos2):
"Add up the widths between positions 1 and 2"
return self.sum_width[pos2] - self.sum_width[pos1]
def measure_stretch(self, pos1, pos2):
"Add up the stretch between positions 1 and 2"
return self.sum_stretch[pos2] - self.sum_stretch[pos1]
def measure_shrink(self, pos1, pos2):
"Add up the shrink between positions 1 and 2"
return self.sum_shrink[pos2] - self.sum_shrink[pos1]
def compute_adjustment_ratio(self, pos1, pos2, line, line_lengths):
"Compute adjustment ratio for the line between pos1 and pos2"
length = self.measure_width(pos1, pos2)
if self[pos2].is_penalty:
length = length + self[pos2].width
if self.debug:
print("\tline length=", length)
# Get the length of the current line; if the line_lengths list
# is too short, the last value is always used for subsequent
# lines.
if line < len(line_lengths):
available_length = line_lengths[line]
else:
available_length = line_lengths[-1]
# Compute how much the contents of the line would have to be
# stretched or shrunk to fit into the available space.
if length < available_length:
y = self.measure_stretch(pos1, pos2)
if self.debug:
print(
"\tLine too short: shortfall = %i, stretch = %i"
% (available_length - length, y)
)
if y > 0:
r = (available_length - length) / float(y)
else:
r = INFINITY
elif length > available_length:
z = self.measure_shrink(pos1, pos2)
if self.debug:
print(
"\tLine too long: extra = %s, shrink = %s"
% (available_length - length, z)
)
if z > 0:
r = (available_length - length) / float(z)
else:
r = INFINITY
else:
# Exactly the right length!
r = 0
return r
def add_active_nodes(self, active_nodes, nodes):
"""Add nodes to the active node list.
The nodes are added so that the list of active nodes is always
sorted by line number, and so that the set of (position, line,
fitness_class) tuples has no repeated values.
"""
for node in nodes:
index = 0
# Find the first index at which the active node's line number
# is equal to or greater than the line for 'node'. This gives
# us the insertion point.
while index < len(active_nodes) and active_nodes[index].line < node.line:
index = index + 1
insert_index = index
# Check if there's a node with the same line number and
# position and fitness. This lets us ensure that the list of
# active nodes always has unique (line, position, fitness)
# values.
while index < len(active_nodes) and active_nodes[index].line == node.line:
if (
active_nodes[index].fitness_class == node.fitness_class
and active_nodes[index].position == node.position
):
# A match, so just return without adding the node
return
index = index + 1
active_nodes.insert(insert_index, node)
def compute_breakpoints(
self,
line_lengths,
looseness=0, # q in the paper
tolerance=1, # rho in the paper
fitness_demerit=100, # gamma (XXX?) in the paper
flagged_demerit=100, # alpha in the paper
):
"""Compute a list of optimal breakpoints for the paragraph
represented by this NodeList, returning them as a list of
integers, each one the index of a breakpoint.
line_lengths : a list of integers giving the lengths of each
line. The last element of the list is reused
for subsequent lines.
looseness : An integer value. If it's positive, the paragraph
will be set to take that many lines more than the
optimum value. If it's negative, the paragraph is
set as tightly as possible. Defaults to zero,
meaning the optimal length for the paragraph.
tolerance : the maximum adjustment ratio allowed for a line.
Defaults to 1.
fitness_demerit : additional value added to the demerit score
when two consecutive lines are in different
fitness classes.
flagged_demerit : additional value added to the demerit score
when breaking at the second of two flagged
penalties.
"""
m = len(self)
if m == 0:
return [] # No text, so no breaks
# Precompute lists containing the numeric values for each node.
# The variable names follow those in Knuth's description.
w = [0] * m
y = [0] * m
z = [0] * m
p = [0] * m
f = [0] * m
for i in range(m):
node = self[i]
w[i] = node.width
y[i] = node.stretch
z[i] = node.shrink
p[i] = node.penalty
f[i] = node.flagged
# Precompute the running sums of width, stretch, and shrink
# (W,Y,Z in the original paper). These make it easy to measure the
# width/stretch/shrink between two indexes; just compute
# sum_*[pos2] - sum_*[pos1]. Note that sum_*[i] is the total
# up to but not including the box at position i.
self.sum_width = {}
self.sum_stretch = {}
self.sum_shrink = {}
width_sum = stretch_sum = shrink_sum = 0
for i in range(m):
self.sum_width[i] = width_sum
self.sum_stretch[i] = stretch_sum
self.sum_shrink[i] = shrink_sum
node = self[i]
width_sum += node.width
stretch_sum += node.stretch
shrink_sum += node.shrink
# Initialize list of active nodes to a single break at the
# beginning of the text.
A = _BreakNode(
position=0,
line=0,
fitness_class=1,
totalwidth=0,
totalstretch=0,
totalshrink=0,
demerits=0,
)
active_nodes = [A]
if self.debug:
print("Looping over %i nodes" % m)
for i in range(m):
B = self[i]
# Determine if this box is a feasible breakpoint and
# perform the main loop if it is.
if self.is_feasible_breakpoint(i):
if self.debug:
print("Feasible breakpoint at %i:" % i)
print("\tCurrent active node list:", active_nodes)
# Print the list of active nodes, sorting them
# so they can be visually checked for uniqueness.
def key_f(n):
return (n.line, n.position, n.fitness_class)
active_nodes.sort(key=key_f)
for A in active_nodes:
print(A.position, A.line, A.fitness_class)
print
print
# Loop over the list of active nodes, and compute the fitness
# of the line formed by breaking at A and B. The resulting
breaks = [] # List of feasible breaks
for A in active_nodes[:]:
r = self.compute_adjustment_ratio(
A.position, i, A.line, line_lengths
)
if self.debug:
print("\tr=", r)
print("\tline=", A.line)
# XXX is 'or' really correct here? This seems to
# remove all active nodes on encountering a forced break!
if r < -1 or B.is_forced_break:
# Deactivate node A
if len(active_nodes) == 1:
if self.debug:
print("Can't remove last node!")
# XXX how should this be handled?
# Raise an exception?
else:
if self.debug:
print("\tRemoving node", A)
active_nodes.remove(A)
if -1 <= r <= tolerance:
# Compute demerits and fitness class
if p[i] >= 0:
demerits = (1 + 100 * abs(r) ** 3 + p[i]) ** 3
elif B.is_forced_break:
demerits = (1 + 100 * abs(r) ** 3) ** 2 - p[i] ** 2
else:
demerits = (1 + 100 * abs(r) ** 3) ** 2
demerits = demerits + (flagged_demerit * f[i] * f[A.position])
# Figure out the fitness class of this line (tight, loose,
# very tight or very loose).
if r < -0.5:
fitness_class = 0
elif r <= 0.5:
fitness_class = 1
elif r <= 1:
fitness_class = 2
else:
fitness_class = 3
# If two consecutive lines are in very
# different fitness classes, add to the
# demerit score for this break.
if abs(fitness_class - A.fitness_class) > 1:
demerits = demerits + fitness_demerit
if self.debug:
print("\tDemerits=", demerits)
print("\tFitness class=", fitness_class)
# Record a feasible break from A to B
brk = _BreakNode(
position=i,
line=A.line + 1,
fitness_class=fitness_class,
totalwidth=self.sum_width[i],
totalstretch=self.sum_stretch[i],
totalshrink=self.sum_shrink[i],
demerits=demerits,
previous=A,
)
breaks.append(brk)
if self.debug:
print("\tRecording feasible break", B)
print("\t\tDemerits=", demerits)
print("\t\tFitness class=", fitness_class)
# end for A in active_nodes
if breaks:
if self.debug:
print("List of breaks at ", i, ":", breaks)
self.add_active_nodes(active_nodes, breaks)
# end if self.feasible_breakpoint()
# end for i in range(m)
if self.debug:
print("Main loop completed")
print("Active nodes=", active_nodes)
# Find the active node with the lowest number of demerits.
A = min(active_nodes, key=lambda A: A.demerits)
if looseness != 0:
# The search for the appropriate active node is a bit more
# complicated; we look for a node with a paragraph length
# that's as close as possible to (A.line+looseness), and
# with the minimum number of demerits.
best = 0
d = INFINITY
for br in active_nodes:
delta = br.line - A.line
# The two branches of this 'if' statement
# are for handling values of looseness that are
# either positive or negative.
if (looseness <= delta < best) or (best < delta < looseness):
s = delta
d = br.demerits
b = br
elif delta == best and br.demerits < d:
# This break is of the same length, but has fewer
# demerits and hence is a more attractive one.
d = br.demerits
b = br
A = b
# Use the chosen node A to determine the optimum breakpoints,
# and return the resulting list of breakpoints.
breaks = []
while A is not None:
breaks.append(A.position)
A = A.previous
breaks.reverse()
return breaks