Skip to content

Commit

Permalink
Add strikeout/underline detection to stext device.
Browse files Browse the repository at this point in the history
  • Loading branch information
robinwatts committed Sep 23, 2024
1 parent 0d6f6f2 commit 6a41622
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 8 deletions.
9 changes: 8 additions & 1 deletion include/mupdf/fitz/structured-text.h
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,8 @@ struct fz_stext_line
struct fz_stext_char
{
int c; /* unicode character value */
int bidi; /* even for LTR, odd for RTL */
uint16_t bidi; /* even for LTR, odd for RTL - probably only needs 8 bits? */
uint16_t flags;
int color; /* sRGB hex color */
fz_point origin;
fz_quad quad;
Expand All @@ -319,6 +320,12 @@ struct fz_stext_char
fz_stext_char *next;
};

enum
{
FZ_STEXT_STRIKEOUT = 1,
FZ_STEXT_UNDERLINE = 2
};

/**
When we are collecting the structure information from
PDF structure trees/tags, we end up with a tree of
Expand Down
291 changes: 286 additions & 5 deletions source/fitz/stext-device.c
Original file line number Diff line number Diff line change
Expand Up @@ -1410,21 +1410,302 @@ fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *stri
return opts;
}

typedef struct
{
int fail;
int count;
fz_point corners[4];
} is_rect_data;

static void
fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
stash_point(is_rect_data *rd, float x, float y)
{
fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev);
if (rd->count > 3)
{
rd->fail = 1;
return;
}

rd->corners[rd->count].x = x;
rd->corners[rd->count].y = y;
rd->count++;
}

static void
is_rect_moveto(fz_context *ctx, void *arg, float x, float y)
{
is_rect_data *rd = arg;
if (rd->fail)
return;

if (rd->count != 0)
{
rd->fail = 1;
return;
}
stash_point(rd, x, y);
}

static void
is_rect_lineto(fz_context *ctx, void *arg, float x, float y)
{
is_rect_data *rd = arg;
if (rd->fail)
return;

if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y)
return;

stash_point(rd, x, y);
}

static void
is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
{
is_rect_data *rd = arg;
rd->fail = 1;
}

static void
is_rect_closepath(fz_context *ctx, void *arg)
{
is_rect_data *rd = arg;
if (rd->fail)
return;
if (rd->count == 3)
stash_point(rd, rd->corners[0].x, rd->corners[0].y);
if (rd->count != 4)
rd->fail = 1;
}

static int feq(float a,float b)
{
#define EPSILON 0.00001
a -= b;
if (a < 0)
a = -a;
return a < EPSILON;
}

static int
is_path_rect(fz_context *ctx, fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm)
{
float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y;
is_rect_data rd = { 0 };
static const fz_path_walker walker =
{
is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath
};
int i;

fz_walk_path(ctx, path, &walker, &rd);

if (rd.fail)
return 0;

if (rd.count == 2)
{
stash_point(&rd, rd.corners[1].x, rd.corners[1].y);
stash_point(&rd, rd.corners[0].x, rd.corners[0].y);
}

for (i = 0 ; i < 4; i++)
{
fz_point p = fz_transform_point(rd.corners[i], ctm);

rd.corners[i].x = p.x;
rd.corners[i].y = p.y;
}

/* So we have a 4 cornered path. Hopefully something like:
* 0---------1
* | |
* 3---------2
* but it might be:
* 0---------3
* | |
* 1---------2
*/
while (1)
{
d01x = rd.corners[1].x - rd.corners[0].x;
d01y = rd.corners[1].y - rd.corners[0].y;
d01 = d01x * d01x + d01y * d01y;
d03x = rd.corners[3].x - rd.corners[0].x;
d03y = rd.corners[3].y - rd.corners[0].y;
d03 = d03x * d03x + d03y * d03y;
if(d01 < d03)
{
/* We are the latter case. Transpose it. */
fz_point p = rd.corners[1];
rd.corners[1] = rd.corners[3];
rd.corners[3] = p;
}
else
break;
}
d32x = rd.corners[3].x - rd.corners[2].x;
d32y = rd.corners[3].y - rd.corners[2].y;

/* So d32x and d01x need to be the same for this to be a strikeout. */
if (!feq(d32x, d01x) || !feq(d32y, d01y))
return 0;

/* We are plausibly a rectangle. */
*thickness = sqrtf(d32x * d32x + d32y * d32y);

from->x = (rd.corners[0].x + rd.corners[3].x)/2;
from->y = (rd.corners[0].y + rd.corners[3].y)/2;
to->x = (rd.corners[1].x + rd.corners[2].x)/2;
to->y = (rd.corners[1].y + rd.corners[2].y)/2;

return 1;
}

static void
advance_x(fz_point *a, fz_point b, float d)
{
a->y += (b.y - a->y) * d / (b.x - a->x);
a->x += d;
}

if (bounds == NULL)
static void
advance_y(fz_point *a, fz_point b, float d)
{
a->x += (b.x - a->x) * d / (b.y - a->y);
a->y += d;
}

static int
line_crosses_rect(fz_point a, fz_point b, fz_rect r)
{
/* Cope with trivial exclusions */
if (a.x < r.x0 && b.x < r.x0)
return 0;
if (a.x > r.x1 && b.x > r.x1)
return 0;
if (a.y < r.y0 && b.y < r.y0)
return 0;
if (a.y > r.y1 && b.y > r.y1)
return 0;

if (a.x < r.x0)
advance_x(&a, b, r.x0 - a.x);
if (a.x > r.x1)
advance_x(&a, b, r.x1 - a.x);
if (a.y < r.y0)
advance_y(&a, b, r.y0 - a.y);
if (a.y > r.y1)
advance_y(&a, b, r.y1 - a.y);

return fz_is_point_inside_rect(a, r);
}

static void
check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm)
{
fz_stext_block *block = page->last_block;
int is_rect;
float thickness;
fz_point from, to, dir;
union {
fz_path *p;
const fz_path *cp;
} u;

u.cp = path;

/* Is this path a thin rectangle (possibly rotated)? If so, then we need to
* consider it as being a strikeout or underline. */
is_rect = is_path_rect(ctx, u.p, &from, &to, &thickness, ctm);
if (!is_rect)
return;

*bounds = fz_union_rect(*bounds, fz_bound_path(ctx, path, ss, ctm));
dir.x = to.x - from.x;
dir.y = to.y - from.y;
dir = fz_normalize_vector(dir);

/* Does this line nicely cover a recent span? */
while (block)
{
fz_stext_line *line;
if (block->type != FZ_STEXT_BLOCK_TEXT)
{
block = block->prev;
continue;
}
line = block->u.t.last_line;
while(line)
{
if ((feq(line->dir.x, dir.x) && feq(line->dir.y, dir.y)) ||
(feq(line->dir.x, -dir.x) && feq(line->dir.y, -dir.y)))
{
/* Matching directions... */

/* Unfortunately, we don't have a valid line->bbox at this point, so we need to check
* chars. */
fz_stext_char *ch;
for (ch = line->first_char; ch; ch = ch->next)
{
fz_rect ch_box = fz_rect_from_quad(ch->quad);

if (line_crosses_rect(from, to, ch_box))
{
float dx, dy, dot;
/* Is this a strikeout or an underline? */

/* The baseline moves from ch->origin in the direction line->dir */
fz_point up;
up.x = line->dir.y;
up.y = -line->dir.x;

/* How far is our line displaced from the line through the origin? */
dx = from.x - ch->origin.x;
dy = from.y - ch->origin.y;
/* Dot product with up. up is normalised */
dot = dx * up.x + dy * up.y;

if (dot > 0)
ch->flags |= FZ_STEXT_STRIKEOUT;
else
ch->flags |= FZ_STEXT_UNDERLINE;
}
}
}
line = line->prev;
}

block = block->prev;
}
}

static void
fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
{
fz_stext_stroke_path(ctx, dev, path, NULL, ctm, cs, color, alpha, cp);
fz_stext_device *tdev = (fz_stext_device*)dev;
fz_stext_page *page = tdev->page;
fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm);
fz_rect *bounds = actualtext_bounds(tdev);

/* If we're in an actualttext, then update the bounds to include this content. */
if (bounds != NULL)
*bounds = fz_union_rect(*bounds, path_bounds);

check_for_strikeout(ctx, tdev, page, path, ctm);
}

static void
fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
{
fz_stext_device *tdev = (fz_stext_device*)dev;
fz_stext_page *page = tdev->page;
fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev);

/* If we're in an actualttext, then update the bounds to include this content. */
if (bounds != NULL)
*bounds = fz_union_rect(*bounds, fz_bound_path(ctx, path, ss, ctm));

check_for_strikeout(ctx, tdev, page, path, ctm);
}

static void
Expand Down
5 changes: 3 additions & 2 deletions source/fitz/stext-output.c
Original file line number Diff line number Diff line change
Expand Up @@ -630,14 +630,15 @@ as_xml(fz_context *ctx, fz_stext_block *block, fz_output *out)
name = font_full_name(ctx, font);
fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", name, size);
}
fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" c=\"",
fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" flags=\"%d\" c=\"",
ch->quad.ul.x, ch->quad.ul.y,
ch->quad.ur.x, ch->quad.ur.y,
ch->quad.ll.x, ch->quad.ll.y,
ch->quad.lr.x, ch->quad.lr.y,
ch->origin.x, ch->origin.y,
ch->bidi,
ch->color);
ch->color,
ch->flags);
switch (ch->c)
{
case '<': fz_write_string(ctx, out, "&lt;"); break;
Expand Down

0 comments on commit 6a41622

Please sign in to comment.