Skip to content

Commit

Permalink
feat(unicode): ensure 140 char text limit is unicode aware
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink committed Nov 2, 2021
1 parent 130da32 commit 9274422
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 2 deletions.
29 changes: 29 additions & 0 deletions helper/unicode.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const _ = require('lodash');
const regenerate = require('regenerate');
const unicodeToArray = require('lodash/_unicodeToArray');

// non-printable control characters
// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
Expand Down Expand Up @@ -94,3 +95,31 @@ function normalize(str) {
}

module.exports.normalize = normalize;

// unicode aware replacement for the native string 'length' property
// note: ported from 'npm stringz' using 'lodash' internals in place of 'char-regex'
//
// str:     the string to measure
// returns: the number of elements which lodash's 'unicodeToArray' splits 'str' into
// throws:  Error('invalid string') when 'str' fails the lodash 'isString' check
module.exports.length = (str) => {
  // refuse to measure anything which is not a string
  if (_.isString(str) === false) {
    throw new Error('invalid string');
  }

  // split into unicode symbols, the answer is how many symbols we got
  const symbols = unicodeToArray(str);
  return symbols.length;
};

// unicode aware replacement for the native string 'substring' method
// note: ported from 'npm stringz' using 'lodash' internals in place of 'char-regex'
//
// str:     the string to cut
// begin:   index of the first symbol to keep; non-finite or negative values are treated as 0
// end:     index to stop before; finite negative values are treated as 0, any other
//          value is handed to Array 'slice' as-is (so omitting it keeps everything after 'begin')
// returns: the selected symbols joined back into a string
// throws:  Error('invalid string') when 'str' fails the lodash 'isString' check
//
// indexes refer to the elements returned by lodash's 'unicodeToArray',
// not to UTF-16 code units.
module.exports.substring = (str, begin, end) => {
  // refuse to cut anything which is not a string
  if (_.isString(str) === false) {
    throw new Error('invalid string');
  }

  // even though negative numbers work with 'slice', they're not in the spec,
  // so both bounds are clamped to zero instead
  const from = (_.isFinite(begin) && begin >= 0) ? begin : 0;
  const to = (_.isFinite(end) && end < 0) ? 0 : end;

  const symbols = unicodeToArray(str);

  // nothing to slice
  if (symbols.length === 0) {
    return '';
  }

  const selected = symbols.slice(from, to);
  return selected.join('');
};
4 changes: 2 additions & 2 deletions sanitizer/_text.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ function _sanitize( raw, clean ){
if( !_.isString(text) || _.isEmpty(text) ){
messages.errors.push(`invalid param 'text': text length, must be >0`);
} else {
if( text.length > MAX_TEXT_LENGTH ){
if( unicode.length(text) > MAX_TEXT_LENGTH ){
messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
text = text.substring(0, MAX_TEXT_LENGTH);
text = unicode.substring(text, 0, MAX_TEXT_LENGTH);
}
clean.text = text;
}
Expand Down
14 changes: 14 additions & 0 deletions test/unit/sanitizer/_text.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
const sanitizer = require('../../../sanitizer/_text')();
const unicode = require('../../../helper/unicode');

module.exports.tests = {};

Expand Down Expand Up @@ -154,6 +155,19 @@ it again and again until we reach our destination.` };
t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
t.end();
});

// regression test for https://github.com/pelias/api/issues/1574
test('truncate should be unicode aware', (t) => {
  // one ASCII letter followed by 200 copies of an emoji built from several code points
  const emoji = '👩‍❤️‍👩';
  const input = { text: 'a' + emoji.repeat(200) };
  const output = {};
  const result = sanitizer.sanitize(input, output);

  // the emoji must be counted (and cut) as a single character each
  const expectedText = 'a' + emoji.repeat(139);
  t.equals(unicode.length(output.text), 140);
  t.equals(output.text, expectedText);

  // truncation is reported as a warning, never as an error
  t.deepEquals(result.errors, [], 'no errors');
  t.deepEquals(result.warnings, [`param 'text' truncated to 140 characters`]);
  t.end();
});
};

module.exports.all = (tape, common) => {
Expand Down

0 comments on commit 9274422

Please sign in to comment.