-
-
Notifications
You must be signed in to change notification settings - Fork 163
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
unicode: ensure 140 char text limit is unicode aware #1576
base: master
Are you sure you want to change the base?
Conversation
d0e3d11
to
9274422
Compare
I'm moving this to DRAFT for now pending further discussion. The current pushed state only implements this functionality for the The issue I'm now having (after pulling in #1577) is writing a testcase for this, since it was easy enough to use emoji to make it fail, but now they are being correctly stripped. Other multi-byte characters such as Kanji have the correct length in JS and the Combining Marks are already being handled, so I can't think of any unicode range (other than using diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js
index 89f547d4..7b419b52 100644
--- a/sanitizer/_text_pelias_parser.js
+++ b/sanitizer/_text_pelias_parser.js
@@ -1,10 +1,10 @@
+const _ = require('lodash');
const logger = require('pelias-logger').get('api');
const unicode = require('../helper/unicode');
const Tokenizer = require('pelias-parser/tokenization/Tokenizer');
const Solution = require('pelias-parser/solver/Solution');
const AddressParser = require('pelias-parser/parser/AddressParser');
const parser = new AddressParser();
-const _ = require('lodash');
const MAX_TEXT_LENGTH = 140;
/**
@@ -38,9 +38,9 @@ function _sanitize (raw, clean) {
else {
// truncate text to $MAX_TEXT_LENGTH chars
- if (text.length > MAX_TEXT_LENGTH) {
+ if (unicode.length(text) > MAX_TEXT_LENGTH) {
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
- text = text.substring(0, MAX_TEXT_LENGTH);
+ text = unicode.substring(text, 0, MAX_TEXT_LENGTH);
}
// parse text with pelias/parser
@@ -53,7 +53,7 @@ function _sanitize (raw, clean) {
}
function parse (clean) {
-
+
// parse text
let start = new Date();
const t = new Tokenizer(clean.text);
@@ -225,7 +225,7 @@ function parse (clean) {
}
}
}
-
+
// unknown query type
else {
parsed_text.subject = t.span.body;
diff --git a/test/unit/sanitizer/_text.js b/test/unit/sanitizer/_text.js
index cb55e120..e3492722 100644
--- a/test/unit/sanitizer/_text.js
+++ b/test/unit/sanitizer/_text.js
@@ -143,9 +143,9 @@ module.exports.tests.text_parser = function(test, common) {
test('should truncate very long text inputs', (t) => {
const raw = { text: `
Sometimes we make the process more complicated than we need to.
-We will never make a journey of a thousand miles by fretting about
+We will never make a journey of a thousand miles by fretting about
how long it will take or how hard it will be.
-We make the journey by taking each day step by step and then repeating
+We make the journey by taking each day step by step and then repeating
it again and again until we reach our destination.` };
const clean = {};
const messages = sanitizer.sanitize(raw, clean);
@@ -158,12 +158,12 @@ it again and again until we reach our destination.` };
// https://github.com/pelias/api/issues/1574
test('truncate should be unicode aware', (t) => {
- const raw = { text: 'a' + '👩❤️👩'.repeat(200) };
+ const raw = { text: 'a' + '慶'.repeat(200) };
const clean = {};
const messages = sanitizer.sanitize(raw, clean);
t.equals(unicode.length(clean.text), 140);
- t.equals(clean.text, 'a' + '👩❤️👩'.repeat(139));
+ t.equals(clean.text, 'a' + '慶'.repeat(139));
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
t.end();
diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js
index a169d5d9..848aded7 100644
--- a/test/unit/sanitizer/_text_pelias_parser.js
+++ b/test/unit/sanitizer/_text_pelias_parser.js
@@ -1,5 +1,5 @@
-var sanitizer = require('../../../sanitizer/_text_pelias_parser')();
-var type_mapping = require('../../../helper/type_mapping');
+const sanitizer = require('../../../sanitizer/_text_pelias_parser')();
+const unicode = require('../../../helper/unicode');
module.exports.tests = {};
@@ -20,7 +20,7 @@ module.exports.tests.text_parser = function (test, common) {
});
let cases = [];
-
+
// USA queries
cases.push(['soho, new york, NY', {
subject: 'soho',
@@ -322,7 +322,7 @@ module.exports.tests.text_parser = function (test, common) {
cases.push(['New York', { subject: 'New York' }, true]);
cases.push(['New York N', { subject: 'New York' }, true]);
cases.push(['New York NY', { subject: 'New York' }, true]);
-
+
cases.push(['B', { subject: 'B' }, true]);
cases.push(['Be', { subject: 'Be' }, true]);
cases.push(['Ber', { subject: 'Ber' }, true]);
@@ -421,9 +421,9 @@ module.exports.tests.text_parser = function (test, common) {
const raw = {
text: `
Sometimes we make the process more complicated than we need to.
-We will never make a journey of a thousand miles by fretting about
+We will never make a journey of a thousand miles by fretting about
how long it will take or how hard it will be.
-We make the journey by taking each day step by step and then repeating
+We make the journey by taking each day step by step and then repeating
it again and again until we reach our destination.` };
const clean = {};
const messages = sanitizer.sanitize(raw, clean);
@@ -433,6 +433,19 @@ it again and again until we reach our destination.` };
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
t.end();
});
+
+ // https://github.com/pelias/api/issues/1574
+ test('truncate should be unicode aware', (t) => {
+ const raw = { text: 'a' + '慶'.repeat(200) };
+ const clean = {};
+ const messages = sanitizer.sanitize(raw, clean);
+
+ t.equals(unicode.length(clean.text), 140);
+ t.equals(clean.text, 'a' + '慶'.repeat(139));
+ t.deepEquals(messages.errors, [], 'no errors');
+ t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
+ t.end();
+ });
};
module.exports.all = function (tape, common) { |
Pelias enforces a 140 character limit for the
?text=
param for securty/performance reasons.Prior to this PR the two methods used to implement this behaviour (namely
String.prototype.length
andString.prototype.substring
) were not unicode aware.The linked issue noted that this can cause problems when the
substring
function truncates the string in a place where a 'unicode code point' is represented by multiple 'unicode code units'.In this case we end up slicing it such a way that the encoding becomes invalid (ie. cutting a two-byte utf8 char after the first byte).
I originally used the package stringz, which is a fairly thin wrapper around the char-regex package.
..but after having a look through the code I discovered that
char-regex
was ported from lodash and I managed to get a reference to the 'internal' function, so we don't need to bring in any new dependencies, plus lodash is well maintained.note: I had a look at trying to detect the charset and handling
utf16->utf8
conversion but it's very complicated, at some point I'd be happy with simply detecting and erroring on any non-utf8 charset but even that is not trivial.