Skip to content

Commit

Permalink
FSL-style expressions can use unidecode data to smash case and diacritics (#197)
Browse files Browse the repository at this point in the history

* Read unidecode data, do some plumbing of it

* More unidecode plumbing

* Do the unidecode smashing, but it doesn't seem to be working

* Ah, that's better!

* Add missing header

* And reorder the includes too

* Shortcut when there is no unidecode data to work with

* Update version and changelog

* Avoid repeated unidecode smashing of the same constant string
  • Loading branch information
e-n-f authored Feb 13, 2024
1 parent 4e52cbd commit 96f126d
Show file tree
Hide file tree
Showing 15 changed files with 183 additions and 44 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 2.44.0

* Add --unidecode-data option to allow case-insensitive filter comparisons of transliterated strings

# 2.43.0

* Change -fraction-as-needed feature dropping to be consistent across tiles and zoom levels, and to follow the same pattern as point dropping by zoom level
Expand Down
10 changes: 5 additions & 5 deletions clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,7 @@ static std::vector<std::pair<double, double>> clip_poly1(std::vector<std::pair<d
std::string overzoom(const std::string &s, int oz, int ox, int oy, int nz, int nx, int ny,
int detail, int buffer, std::set<std::string> const &keep, bool do_compress,
std::vector<std::pair<unsigned, unsigned>> *next_overzoomed_tiles,
bool demultiply, json_object *filter, bool preserve_input_order, std::unordered_map<std::string, attribute_op> const &attribute_accum) {
bool demultiply, json_object *filter, bool preserve_input_order, std::unordered_map<std::string, attribute_op> const &attribute_accum, std::vector<std::string> const &unidecode_data) {
mvt_tile tile;

try {
Expand All @@ -772,7 +772,7 @@ std::string overzoom(const std::string &s, int oz, int ox, int oy, int nz, int n
exit(EXIT_PROTOBUF);
}

return overzoom(tile, oz, ox, oy, nz, nx, ny, detail, buffer, keep, do_compress, next_overzoomed_tiles, demultiply, filter, preserve_input_order, attribute_accum);
return overzoom(tile, oz, ox, oy, nz, nx, ny, detail, buffer, keep, do_compress, next_overzoomed_tiles, demultiply, filter, preserve_input_order, attribute_accum, unidecode_data);
}

struct tile_feature {
Expand Down Expand Up @@ -873,7 +873,7 @@ static struct preservecmp {
std::string overzoom(const mvt_tile &tile, int oz, int ox, int oy, int nz, int nx, int ny,
int detail, int buffer, std::set<std::string> const &keep, bool do_compress,
std::vector<std::pair<unsigned, unsigned>> *next_overzoomed_tiles,
bool demultiply, json_object *filter, bool preserve_input_order, std::unordered_map<std::string, attribute_op> const &attribute_accum) {
bool demultiply, json_object *filter, bool preserve_input_order, std::unordered_map<std::string, attribute_op> const &attribute_accum, std::vector<std::string> const &unidecode_data) {
mvt_tile outtile;
std::shared_ptr<std::string> tile_stringpool = std::make_shared<std::string>();

Expand Down Expand Up @@ -924,7 +924,7 @@ std::string overzoom(const mvt_tile &tile, int oz, int ox, int oy, int nz, int n
}

std::set<std::string> exclude_attributes;
if (filter != NULL && !evaluate(feature, layer, filter, exclude_attributes, nz)) {
if (filter != NULL && !evaluate(feature, layer, filter, exclude_attributes, nz, unidecode_data)) {
continue;
}

Expand Down Expand Up @@ -1048,7 +1048,7 @@ std::string overzoom(const mvt_tile &tile, int oz, int ox, int oy, int nz, int n
std::string child = overzoom(outtile, nz, nx, ny,
nz + 1, nx * 2 + x, ny * 2 + y,
detail, buffer, keep, false, NULL,
demultiply, filter, preserve_input_order, attribute_accum);
demultiply, filter, preserve_input_order, attribute_accum, unidecode_data);
if (child.size() > 0) {
next_overzoomed_tiles->emplace_back(nx * 2 + x, ny * 2 + y);
}
Expand Down
80 changes: 59 additions & 21 deletions evaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@
#include "evaluator.hpp"
#include "errors.hpp"
#include "milo/dtoa_milo.h"
#include "text.hpp"

static std::string mvt_value_to_string(mvt_value const &one, bool &fail) {
static std::string mvt_value_to_string(mvt_value const &one, bool &fail, std::vector<std::string> const &unidecode_data) {
switch (one.type) {
case mvt_string:
return one.get_string_value();
if (unidecode_data.size() > 0) {
return unidecode_smash(unidecode_data, one.c_str());
} else {
return one.get_string_value();
}
case mvt_float:
return milo::dtoa_milo(one.numeric_value.float_value);
case mvt_double:
Expand All @@ -34,7 +39,27 @@ static std::string mvt_value_to_string(mvt_value const &one, bool &fail) {
}
}

int compare_fsl(mvt_value const &one, json_object *two, bool &fail) {
// Replace the string held by the JSON node `j` with the result of passing it
// through unidecode_smash(), modifying the parse tree in place.
//
// Once a node has been processed it is tagged by storing the address of
// `unidecode_data` in its refcon; a later call that finds that same tag
// returns without smashing the string a second time.
//
// If the buffer cannot be resized, an error is reported with perror() and
// the process exits with EXIT_MEMORY.
static void smash(std::vector<std::string> const &unidecode_data, json_object *j) {
	void *const marker = (void *) &unidecode_data;

	if (j->value.string.refcon != marker) {
		std::string const smashed = unidecode_smash(unidecode_data, j->value.string.string);

		// Resize the node's own buffer so it can hold the smashed text plus the terminator.
		char *resized = (char *) realloc(j->value.string.string, smashed.size() + 1);
		j->value.string.string = resized;
		if (resized == NULL) {
			perror("realloc for unidecode_smash");
			exit(EXIT_MEMORY);
		}

		strcpy(resized, smashed.c_str());

		// Tag the node so the same constant is not smashed again.
		j->value.string.refcon = marker;
	}
}

int compare_fsl(mvt_value const &one, json_object *two, bool &fail, std::vector<std::string> const &unidecode_data) {
// In FSL expressions, the attribute value is coerced to the type
// of the JSON literal value it is being compared to.
//
Expand Down Expand Up @@ -96,7 +121,11 @@ int compare_fsl(mvt_value const &one, json_object *two, bool &fail) {
}

if (two->type == JSON_STRING) {
std::string lhs = mvt_value_to_string(one, fail);
std::string lhs = mvt_value_to_string(one, fail, unidecode_data);

if (unidecode_data.size() > 0) {
smash(unidecode_data, two);
}

return strcmp(lhs.c_str(), two->value.string.string);
}
Expand Down Expand Up @@ -226,7 +255,7 @@ int compare(mvt_value const &one, json_object *two, bool &fail) {
// 0: false
// 1: true
// -1: incomparable (sql null), treated as false in final output
static int eval(std::function<mvt_value(std::string const &)> feature, json_object *f, std::set<std::string> &exclude_attributes) {
static int eval(std::function<mvt_value(std::string const &)> feature, json_object *f, std::set<std::string> &exclude_attributes, std::vector<std::string> const &unidecode_data) {
if (f != NULL) {
if (f->type == JSON_TRUE) {
return 1;
Expand Down Expand Up @@ -283,10 +312,10 @@ static int eval(std::function<mvt_value(std::string const &)> feature, json_obje
lhs = -1; // not found: null
}
} else {
lhs = eval(feature, f->value.array.array[0], exclude_attributes);
lhs = eval(feature, f->value.array.array[0], exclude_attributes, unidecode_data);
}

int rhs = eval(feature, f->value.array.array[2], exclude_attributes);
int rhs = eval(feature, f->value.array.array[2], exclude_attributes, unidecode_data);
if (lhs < 0 && rhs < 0) {
return -1; // null op null => null
}
Expand Down Expand Up @@ -337,12 +366,17 @@ static int eval(std::function<mvt_value(std::string const &)> feature, json_obje
if (f->value.array.array[2]->type == JSON_STRING &&
(strcmp(f->value.array.array[1]->value.string.string, "cn") == 0 ||
strcmp(f->value.array.array[1]->value.string.string, "nc") == 0)) {
std::string s = mvt_value_to_string(lhs, fail);
std::string s = mvt_value_to_string(lhs, fail, unidecode_data);
if (fail) {
return -1; // null cn anything => false
}

bool contains = strstr(s.c_str(), f->value.array.array[2]->value.string.string);
bool contains;
if (unidecode_data.size() > 0) {
smash(unidecode_data, f->value.array.array[2]);
}
contains = strstr(s.c_str(), f->value.array.array[2]->value.string.string);

if (strcmp(f->value.array.array[1]->value.string.string, "cn") == 0) {
return contains;
} else {
Expand All @@ -353,7 +387,7 @@ static int eval(std::function<mvt_value(std::string const &)> feature, json_obje
if (f->value.array.array[2]->type == JSON_ARRAY &&
(strcmp(f->value.array.array[1]->value.string.string, "in") == 0 ||
strcmp(f->value.array.array[1]->value.string.string, "ni") == 0)) {
std::string s = mvt_value_to_string(lhs, fail);
std::string s = mvt_value_to_string(lhs, fail, unidecode_data);
if (fail) {
return -1; // null in anything => false
}
Expand All @@ -364,7 +398,11 @@ static int eval(std::function<mvt_value(std::string const &)> feature, json_obje
return -1; // anything in [not-a-string] => null
}

if (s == f->value.array.array[2]->value.array.array[i]->value.string.string) {
if (unidecode_data.size() > 0) {
smash(unidecode_data, f->value.array.array[2]->value.array.array[i]);
}

if (strcmp(s.c_str(), f->value.array.array[2]->value.array.array[i]->value.string.string) == 0) {
contains = true;
break;
}
Expand All @@ -377,7 +415,7 @@ static int eval(std::function<mvt_value(std::string const &)> feature, json_obje
}
}

int cmp = compare_fsl(ff, f->value.array.array[2], fail);
int cmp = compare_fsl(ff, f->value.array.array[2], fail, unidecode_data);
if (fail) {
return -1; // null
}
Expand Down Expand Up @@ -516,7 +554,7 @@ static int eval(std::function<mvt_value(std::string const &)> feature, json_obje
}

for (size_t i = 1; i < f->value.array.length; i++) {
int out = eval(feature, f->value.array.array[i], exclude_attributes);
int out = eval(feature, f->value.array.array[i], exclude_attributes, unidecode_data);

if (out >= 0) { // nulls are ignored in boolean and/or expressions
if (strcmp(f->value.array.array[0]->value.string.string, "all") == 0) {
Expand Down Expand Up @@ -607,7 +645,7 @@ static int eval(std::function<mvt_value(std::string const &)> feature, json_obje
exit(EXIT_FILTER);
}

bool ok = eval(feature, f->value.array.array[2], exclude_attributes) > 0;
bool ok = eval(feature, f->value.array.array[2], exclude_attributes, unidecode_data) > 0;
if (!ok) {
exclude_attributes.insert(f->value.array.array[1]->value.string.string);
}
Expand All @@ -619,7 +657,7 @@ static int eval(std::function<mvt_value(std::string const &)> feature, json_obje
exit(EXIT_FILTER);
}

bool evaluate(std::function<mvt_value(std::string const &)> feature, std::string const &layer, json_object *filter, std::set<std::string> &exclude_attributes) {
bool evaluate(std::function<mvt_value(std::string const &)> feature, std::string const &layer, json_object *filter, std::set<std::string> &exclude_attributes, std::vector<std::string> const &unidecode_data) {
if (filter == NULL || filter->type != JSON_HASH) {
fprintf(stderr, "Error: filter is not a hash: %s\n", json_stringify(filter));
exit(EXIT_JSON);
Expand All @@ -630,12 +668,12 @@ bool evaluate(std::function<mvt_value(std::string const &)> feature, std::string

f = json_hash_get(filter, layer.c_str());
if (ok && f != NULL) {
ok = eval(feature, f, exclude_attributes) > 0;
ok = eval(feature, f, exclude_attributes, unidecode_data) > 0;
}

f = json_hash_get(filter, "*");
if (ok && f != NULL) {
ok = eval(feature, f, exclude_attributes) > 0;
ok = eval(feature, f, exclude_attributes, unidecode_data) > 0;
}

return ok;
Expand Down Expand Up @@ -673,7 +711,7 @@ json_object *parse_filter(const char *s) {
return filter;
}

bool evaluate(std::unordered_map<std::string, mvt_value> const &feature, std::string const &layer, json_object *filter, std::set<std::string> &exclude_attributes) {
bool evaluate(std::unordered_map<std::string, mvt_value> const &feature, std::string const &layer, json_object *filter, std::set<std::string> &exclude_attributes, std::vector<std::string> const &unidecode_data) {
std::function<mvt_value(std::string const &)> getter = [&](std::string const &key) {
auto f = feature.find(key);
if (f != feature.end()) {
Expand All @@ -686,10 +724,10 @@ bool evaluate(std::unordered_map<std::string, mvt_value> const &feature, std::st
}
};

return evaluate(getter, layer, filter, exclude_attributes);
return evaluate(getter, layer, filter, exclude_attributes, unidecode_data);
}

bool evaluate(mvt_feature const &feat, mvt_layer const &layer, json_object *filter, std::set<std::string> &exclude_attributes, int z) {
bool evaluate(mvt_feature const &feat, mvt_layer const &layer, json_object *filter, std::set<std::string> &exclude_attributes, int z, std::vector<std::string> const &unidecode_data) {
std::function<mvt_value(std::string const &)> getter = [&](std::string const &key) {
const static std::string dollar_id = "$id";
if (key == dollar_id && feat.has_id) {
Expand Down Expand Up @@ -737,5 +775,5 @@ bool evaluate(mvt_feature const &feat, mvt_layer const &layer, json_object *filt
return v;
};

return evaluate(getter, layer.name, filter, exclude_attributes);
return evaluate(getter, layer.name, filter, exclude_attributes, unidecode_data);
}
4 changes: 2 additions & 2 deletions evaluator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
#include "jsonpull/jsonpull.h"
#include "mvt.hpp"

bool evaluate(std::unordered_map<std::string, mvt_value> const &feature, std::string const &layer, json_object *filter, std::set<std::string> &exclude_attributes);
bool evaluate(std::unordered_map<std::string, mvt_value> const &feature, std::string const &layer, json_object *filter, std::set<std::string> &exclude_attributes, std::vector<std::string> const &unidecode_data);
json_object *parse_filter(const char *s);
json_object *read_filter(const char *fname);

bool evaluate(mvt_feature const &feat, mvt_layer const &layer, json_object *filter, std::set<std::string> &exclude_attributes, int z);
bool evaluate(mvt_feature const &feat, mvt_layer const &layer, json_object *filter, std::set<std::string> &exclude_attributes, int z, std::vector<std::string> const &unidecode_data);

#endif
6 changes: 4 additions & 2 deletions geometry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,14 @@ std::string overzoom(const mvt_tile &tile, int oz, int ox, int oy, int nz, int n
int detail, int buffer, std::set<std::string> const &keep, bool do_compress,
std::vector<std::pair<unsigned, unsigned>> *next_overzoomed_tiles,
bool demultiply, json_object *filter, bool preserve_input_order,
std::unordered_map<std::string, attribute_op> const &attribute_accum);
std::unordered_map<std::string, attribute_op> const &attribute_accum,
std::vector<std::string> const &unidecode_data);

std::string overzoom(const std::string &s, int oz, int ox, int oy, int nz, int nx, int ny,
int detail, int buffer, std::set<std::string> const &keep, bool do_compress,
std::vector<std::pair<unsigned, unsigned>> *next_overzoomed_tiles,
bool demultiply, json_object *filter, bool preserve_input_order,
std::unordered_map<std::string, attribute_op> const &attribute_accum);
std::unordered_map<std::string, attribute_op> const &attribute_accum,
std::vector<std::string> const &unidecode_data);

#endif
1 change: 1 addition & 0 deletions jsonpull/jsonpull.c
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,7 @@ json_object *json_read_separators(json_pull *j, json_separator_callback cb, void
json_object *s = add_object(j, JSON_STRING);
if (s != NULL) {
s->value.string.string = val.buf;
s->value.string.refcon = NULL;
} else {
string_free(&val);
}
Expand Down
1 change: 1 addition & 0 deletions jsonpull/jsonpull.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ typedef struct json_object {

struct {
char *string;
void *refcon; // reference constant for caller's use
} string;

struct {
Expand Down
6 changes: 5 additions & 1 deletion main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ std::map<std::string, serial_val> set_attributes;
unsigned long long preserve_point_density_threshold = 0;
long long extend_zooms_max = 0;
int retain_points_multiplier = 1;
std::vector<std::string> unidecode_data;

std::vector<order_field> order_by;
bool order_reverse;
Expand Down Expand Up @@ -2759,7 +2760,7 @@ std::pair<int, metadata> read_input(std::vector<source> &sources, char *fname, i
std::atomic<unsigned> midx(0);
std::atomic<unsigned> midy(0);
std::vector<strategy> strategies;
int written = traverse_zooms(fd, size, stringpool, &midx, &midy, maxzoom, minzoom, outdb, outdir, buffer, fname, tmpdir, gamma, full_detail, low_detail, min_detail, pool_off, initial_x, initial_y, simplification, maxzoom_simplification, layermaps, prefilter, postfilter, attribute_accum, filter, strategies, iz, shared_nodes_map, nodepos, basezoom, droprate);
int written = traverse_zooms(fd, size, stringpool, &midx, &midy, maxzoom, minzoom, outdb, outdir, buffer, fname, tmpdir, gamma, full_detail, low_detail, min_detail, pool_off, initial_x, initial_y, simplification, maxzoom_simplification, layermaps, prefilter, postfilter, attribute_accum, filter, strategies, iz, shared_nodes_map, nodepos, basezoom, droprate, unidecode_data);

if (maxzoom != written) {
if (written > minzoom) {
Expand Down Expand Up @@ -3074,6 +3075,7 @@ int main(int argc, char **argv) {
{"Filtering features by attributes", 0, 0, 0},
{"feature-filter-file", required_argument, 0, 'J'},
{"feature-filter", required_argument, 0, 'j'},
{"unidecode-data", required_argument, 0, '~'},

{"Dropping a fixed fraction of features by zoom level", 0, 0, 0},
{"drop-rate", required_argument, 0, 'r'},
Expand Down Expand Up @@ -3300,6 +3302,8 @@ int main(int argc, char **argv) {
extend_zooms_max = atoll_require(optarg, "Maximum number by which to extend zooms");
} else if (strcmp(opt, "retain-points-multiplier") == 0) {
retain_points_multiplier = atoll_require(optarg, "Multiply the fraction of points retained by zoom level");
} else if (strcmp(opt, "unidecode-data") == 0) {
unidecode_data = read_unidecode(optarg);
} else {
fprintf(stderr, "%s: Unrecognized option --%s\n", argv[0], opt);
exit(EXIT_ARGS);
Expand Down
9 changes: 8 additions & 1 deletion overzoom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "geometry.hpp"
#include "evaluator.hpp"
#include "attribute.hpp"
#include "text.hpp"

extern char *optarg;
extern int optind;
Expand All @@ -18,6 +19,7 @@ bool demultiply = false;
std::string filter;
bool preserve_input_order = false;
std::unordered_map<std::string, attribute_op> attribute_accum;
std::vector<std::string> unidecode_data;

std::set<std::string> keep;

Expand All @@ -40,6 +42,7 @@ int main(int argc, char **argv) {
{"feature-filter", required_argument, 0, 'j'},
{"preserve-input-order", no_argument, 0, 'o' & 0x1F},
{"accumulate-attribute", required_argument, 0, 'E'},
{"unidecode-data", required_argument, 0, 'u' & 0x1F},

{0, 0, 0, 0},
};
Expand Down Expand Up @@ -90,6 +93,10 @@ int main(int argc, char **argv) {
set_attribute_accum(attribute_accum, optarg, argv);
break;

case 'u' & 0x1F:
unidecode_data = read_unidecode(optarg);
break;

default:
fprintf(stderr, "Unrecognized flag -%c\n", i);
usage(argv);
Expand Down Expand Up @@ -144,7 +151,7 @@ int main(int argc, char **argv) {
json_filter = parse_filter(filter.c_str());
}

std::string out = overzoom(tile, oz, ox, oy, nz, nx, ny, detail, buffer, keep, true, NULL, demultiply, json_filter, preserve_input_order, attribute_accum);
std::string out = overzoom(tile, oz, ox, oy, nz, nx, ny, detail, buffer, keep, true, NULL, demultiply, json_filter, preserve_input_order, attribute_accum, unidecode_data);
fwrite(out.c_str(), sizeof(char), out.size(), f);
fclose(f);

Expand Down
Loading

0 comments on commit 96f126d

Please sign in to comment.