Skip to content

Commit

Permalink
Removed unicode.chars regexes
Browse files Browse the repository at this point in the history
  • Loading branch information
viktor-podzigun committed Feb 9, 2024
1 parent ff8227f commit 8124473
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 236 deletions.
60 changes: 55 additions & 5 deletions lib/helpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,62 @@ helpers.cleanTags = function(text) {
return helpers.stripTags(text).trim();
};

/**
 * Replaces the non-plain characters of `text` (surrogate pairs and characters
 * whose cell width is not 1) by delegating to `helpers.replaceUnicode` with
 * `defaultUnicodeReplacer`.
 *
 * @param {string} text input text; any falsy value yields an empty string
 * @returns {string}
 */
helpers.dropUnicode = function(text) {
  if (!text) return '';
  // Single code path: an earlier regex-based `return` placed in front of this
  // call made it unreachable and kept a dependency on `unicode.chars.*`.
  return helpers.replaceUnicode(text, defaultUnicodeReplacer);
};

/**
 * Picks the placeholder text for one non-plain character:
 * `"?"` for a surrogate pair, `"??"` for any other character wider than one
 * cell, and `""` (drop the character) for everything else.
 *
 * @param {boolean} isSurrogate whether the character is a surrogate pair
 * @param {number} charWidth cell width reported for the character
 * @returns {string}
 */
function defaultUnicodeReplacer(isSurrogate, charWidth) {
  // The surrogate check wins over the width check.
  return isSurrogate
    ? "?"
    : (charWidth > 1 ? "??" : "");
}

/**
 * Walks `str` and passes every "special" character - a surrogate pair, or a
 * character whose `unicode.charWidth` is not 1 - to `replacer`. Plain
 * characters are copied through unchanged.
 *
 * @param {string} str input text; any falsy value yields an empty string
 * @param {(isSurrogate: boolean, charWidth: number, ch: string) => string} replacer
 *   called once per special character; a truthy return value is emitted in
 *   place of the character, a falsy one drops the character
 * @returns {string} `str` itself when it contains no special characters,
 *   otherwise the rebuilt string
 */
helpers.replaceUnicode = function(str, replacer) {
  if (!str) return '';

  const result = [];
  // Index just past the last special character handled so far; stays
  // undefined until the first special character is found.
  let next;
  for (let i = 0; i < str.length; i++) {
    const isSurrogate = unicode.isSurrogate(str, i);
    const cw = unicode.charWidth(str, i);
    if (cw !== 1 || isSurrogate) {
      // Flush the run of plain characters preceding this one.
      const start = next ?? 0;
      if (start < i) {
        result.push(str.substring(start, i));
      }
      // A surrogate pair occupies two UTF-16 code units.
      next = i + (isSurrogate ? 2 : 1);
      const rc = replacer(isSurrogate, cw, str.substring(i, next));
      if (rc) {
        result.push(rc);
      }
    }
    // Skip the low surrogate of the pair that was just examined.
    if (isSurrogate) i++;
  }

  // No special character was seen: hand back the input untouched. This must
  // not be decided from `result` being empty, because a replacer that drops
  // every character (e.g. a string of only combining marks) also leaves
  // `result` empty and the correct answer there is ''.
  if (next === undefined) return str;

  // Flush the plain tail after the last special character.
  if (next < str.length) {
    result.push(str.substring(next));
  }

  return result.join('');
};

helpers.__defineGetter__('Screen', function() {
Expand Down
248 changes: 19 additions & 229 deletions lib/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ var floor = Math.floor;
* Wide, Surrogates, and Combining
*/

/**
* @param {string | number} str string or code point
* @param {number} [i] char index within str string
* @returns {number}
*/
exports.charWidth = function(str, i) {
var point = typeof str !== 'number'
? exports.codePointAt(str, i || 0)
Expand Down Expand Up @@ -401,6 +406,10 @@ exports.charWidth = function(str, i) {
return 1;
};

/**
* @param {string} str
* @returns {number}
*/
exports.strWidth = function(str) {
var width = 0;
for (var i = 0; i < str.length; i++) {
Expand All @@ -410,6 +419,11 @@ exports.strWidth = function(str) {
return width;
};

/**
* @param {string | number} str string or code point
* @param {number} [i] char index within str string
* @returns {boolean}
*/
exports.isSurrogate = function(str, i) {
var point = typeof str !== 'number'
? exports.codePointAt(str, i || 0)
Expand Down Expand Up @@ -475,6 +489,11 @@ const combining = combiningTable.reduce(function(out, row) {
return out;
}, {});

/**
* @param {string | number} str string or code point
* @param {number} [i] char index within str string
* @returns {boolean}
*/
exports.isCombining = function(str, i) {
var point = typeof str !== 'number'
? exports.codePointAt(str, i || 0)
Expand Down Expand Up @@ -577,232 +596,3 @@ exports.fromCodePoint = function() {
}
return result;
};

/**
* Regexes
*/

exports.chars = {};

// Double width characters that are _not_ surrogate pairs.
// NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this
// regex anyway. This regex is used to put a blank char after wide chars to
// be eaten, however, if this is a surrogate pair, parseContent already adds
// the extra one char because its length equals 2 instead of 1.
exports.chars.wide = new RegExp('(['
+ '\\u1100-\\u115f' // Hangul Jamo init. consonants
+ '\\u2329\\u232a'
+ '\\u2e80-\\u303e\\u3040-\\ua4cf' // CJK ... Yi
+ '\\uac00-\\ud7a3' // Hangul Syllables
+ '\\uf900-\\ufaff' // CJK Compatibility Ideographs
+ '\\ufe10-\\ufe19' // Vertical forms
+ '\\ufe30-\\ufe6f' // CJK Compatibility Forms
+ '\\uff00-\\uff60' // Fullwidth Forms
+ '\\uffe0-\\uffe6'
+ '])', 'g');

// All surrogate pair wide chars.
exports.chars.swide = new RegExp('('
// 0x20000 - 0x2fffd:
// + '[\\ud840-\\ud87f][\\udc00-\\udffd]'
// treat all surrogate pairs as wide chars (2 cells)
+ '[\\ud800-\\udbff][\\udc00-\\udfff]'
+ '|'
// 0x30000 - 0x3fffd:
+ '[\\ud880-\\ud8bf][\\udc00-\\udffd]'
+ ')', 'g');

// All wide chars including surrogate pairs.
exports.chars.all = new RegExp('('
+ exports.chars.swide.source.slice(1, -1)
+ '|'
+ exports.chars.wide.source.slice(1, -1)
+ ')', 'g');

// Regex to detect a surrogate pair.
exports.chars.surrogate = /[\ud800-\udbff][\udc00-\udfff]/g;

// Regex to find combining characters.
// Built from `combiningTable`, whose rows are read as [first, last] code
// points. Rows up to 0xffff are appended to a leading character class as
// `\uXXXX-\uXXXX`; rows above 0xffff become `|[hi-hi][lo-lo]` alternatives
// over their UTF-16 surrogate halves.
exports.chars.combining = combiningTable.reduce((source, row) => {
  const first = row[0];
  const last = row[1];

  if (!(first > 0x00ffff)) {
    return `${source}\\u${hexify(first)}-\\u${hexify(last)}`;
  }

  const firstPair = exports.fromCodePoint(first);
  const firstUnits = [
    hexify(firstPair.charCodeAt(0)),
    hexify(firstPair.charCodeAt(1))
  ];
  const lastPair = exports.fromCodePoint(last);
  const lastUnits = [
    hexify(lastPair.charCodeAt(0)),
    hexify(lastPair.charCodeAt(1))
  ];

  // The first alternative closes the leading character class.
  const head = source.includes('|') ? source : `${source}]`;
  return `${head}|[\\u${firstUnits[0]}-\\u${lastUnits[0]}]`
    + `[\\u${firstUnits[1]}-\\u${lastUnits[1]}]`;
}, '[');

// Swap the pattern source assembled above for the compiled global regex.
exports.chars.combining = new RegExp(exports.chars.combining, 'g');

/**
 * Renders `n` in base 16, left-padded with zeros to at least four characters.
 *
 * @param {number} n
 * @returns {string}
 */
function hexify(n) {
  return n.toString(16).padStart(4, '0');
}

/*
exports.chars.combining = new RegExp(
'['
+ '\\u0300-\\u036f'
+ '\\u0483-\\u0486'
+ '\\u0488-\\u0489'
+ '\\u0591-\\u05bd'
+ '\\u05bf-\\u05bf'
+ '\\u05c1-\\u05c2'
+ '\\u05c4-\\u05c5'
+ '\\u05c7-\\u05c7'
+ '\\u0600-\\u0603'
+ '\\u0610-\\u0615'
+ '\\u064b-\\u065e'
+ '\\u0670-\\u0670'
+ '\\u06d6-\\u06e4'
+ '\\u06e7-\\u06e8'
+ '\\u06ea-\\u06ed'
+ '\\u070f-\\u070f'
+ '\\u0711-\\u0711'
+ '\\u0730-\\u074a'
+ '\\u07a6-\\u07b0'
+ '\\u07eb-\\u07f3'
+ '\\u0901-\\u0902'
+ '\\u093c-\\u093c'
+ '\\u0941-\\u0948'
+ '\\u094d-\\u094d'
+ '\\u0951-\\u0954'
+ '\\u0962-\\u0963'
+ '\\u0981-\\u0981'
+ '\\u09bc-\\u09bc'
+ '\\u09c1-\\u09c4'
+ '\\u09cd-\\u09cd'
+ '\\u09e2-\\u09e3'
+ '\\u0a01-\\u0a02'
+ '\\u0a3c-\\u0a3c'
+ '\\u0a41-\\u0a42'
+ '\\u0a47-\\u0a48'
+ '\\u0a4b-\\u0a4d'
+ '\\u0a70-\\u0a71'
+ '\\u0a81-\\u0a82'
+ '\\u0abc-\\u0abc'
+ '\\u0ac1-\\u0ac5'
+ '\\u0ac7-\\u0ac8'
+ '\\u0acd-\\u0acd'
+ '\\u0ae2-\\u0ae3'
+ '\\u0b01-\\u0b01'
+ '\\u0b3c-\\u0b3c'
+ '\\u0b3f-\\u0b3f'
+ '\\u0b41-\\u0b43'
+ '\\u0b4d-\\u0b4d'
+ '\\u0b56-\\u0b56'
+ '\\u0b82-\\u0b82'
+ '\\u0bc0-\\u0bc0'
+ '\\u0bcd-\\u0bcd'
+ '\\u0c3e-\\u0c40'
+ '\\u0c46-\\u0c48'
+ '\\u0c4a-\\u0c4d'
+ '\\u0c55-\\u0c56'
+ '\\u0cbc-\\u0cbc'
+ '\\u0cbf-\\u0cbf'
+ '\\u0cc6-\\u0cc6'
+ '\\u0ccc-\\u0ccd'
+ '\\u0ce2-\\u0ce3'
+ '\\u0d41-\\u0d43'
+ '\\u0d4d-\\u0d4d'
+ '\\u0dca-\\u0dca'
+ '\\u0dd2-\\u0dd4'
+ '\\u0dd6-\\u0dd6'
+ '\\u0e31-\\u0e31'
+ '\\u0e34-\\u0e3a'
+ '\\u0e47-\\u0e4e'
+ '\\u0eb1-\\u0eb1'
+ '\\u0eb4-\\u0eb9'
+ '\\u0ebb-\\u0ebc'
+ '\\u0ec8-\\u0ecd'
+ '\\u0f18-\\u0f19'
+ '\\u0f35-\\u0f35'
+ '\\u0f37-\\u0f37'
+ '\\u0f39-\\u0f39'
+ '\\u0f71-\\u0f7e'
+ '\\u0f80-\\u0f84'
+ '\\u0f86-\\u0f87'
+ '\\u0f90-\\u0f97'
+ '\\u0f99-\\u0fbc'
+ '\\u0fc6-\\u0fc6'
+ '\\u102d-\\u1030'
+ '\\u1032-\\u1032'
+ '\\u1036-\\u1037'
+ '\\u1039-\\u1039'
+ '\\u1058-\\u1059'
+ '\\u1160-\\u11ff'
+ '\\u135f-\\u135f'
+ '\\u1712-\\u1714'
+ '\\u1732-\\u1734'
+ '\\u1752-\\u1753'
+ '\\u1772-\\u1773'
+ '\\u17b4-\\u17b5'
+ '\\u17b7-\\u17bd'
+ '\\u17c6-\\u17c6'
+ '\\u17c9-\\u17d3'
+ '\\u17dd-\\u17dd'
+ '\\u180b-\\u180d'
+ '\\u18a9-\\u18a9'
+ '\\u1920-\\u1922'
+ '\\u1927-\\u1928'
+ '\\u1932-\\u1932'
+ '\\u1939-\\u193b'
+ '\\u1a17-\\u1a18'
+ '\\u1b00-\\u1b03'
+ '\\u1b34-\\u1b34'
+ '\\u1b36-\\u1b3a'
+ '\\u1b3c-\\u1b3c'
+ '\\u1b42-\\u1b42'
+ '\\u1b6b-\\u1b73'
+ '\\u1dc0-\\u1dca'
+ '\\u1dfe-\\u1dff'
+ '\\u200b-\\u200f'
+ '\\u202a-\\u202e'
+ '\\u2060-\\u2063'
+ '\\u206a-\\u206f'
+ '\\u20d0-\\u20ef'
+ '\\u302a-\\u302f'
+ '\\u3099-\\u309a'
+ '\\ua806-\\ua806'
+ '\\ua80b-\\ua80b'
+ '\\ua825-\\ua826'
+ '\\ufb1e-\\ufb1e'
+ '\\ufe00-\\ufe0f'
+ '\\ufe20-\\ufe23'
+ '\\ufeff-\\ufeff'
+ '\\ufff9-\\ufffb'
+ ']'
+ '|[\\ud802-\\ud802][\\ude01-\\ude03]'
+ '|[\\ud802-\\ud802][\\ude05-\\ude06]'
+ '|[\\ud802-\\ud802][\\ude0c-\\ude0f]'
+ '|[\\ud802-\\ud802][\\ude38-\\ude3a]'
+ '|[\\ud802-\\ud802][\\ude3f-\\ude3f]'
+ '|[\\ud834-\\ud834][\\udd67-\\udd69]'
+ '|[\\ud834-\\ud834][\\udd73-\\udd82]'
+ '|[\\ud834-\\ud834][\\udd85-\\udd8b]'
+ '|[\\ud834-\\ud834][\\uddaa-\\uddad]'
+ '|[\\ud834-\\ud834][\\ude42-\\ude44]'
+ '|[\\udb40-\\udb40][\\udc01-\\udc01]'
+ '|[\\udb40-\\udb40][\\udc20-\\udc7f]'
+ '|[\\udb40-\\udb40][\\udd00-\\uddef]'
, 'g');
*/
5 changes: 4 additions & 1 deletion lib/widgets/element.js
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,10 @@ Element.prototype.parseContent = function(noTags) {
if (this.screen.fullUnicode) {
// double-width chars will eat the next char after render. create a
// blank character after it so it doesn't eat the real next char.
content = content.replace(unicode.chars.all, '$1\x03');
// content = content.replace(unicode.chars.all, '$1\x03');
content = helpers.replaceUnicode(content, (isSurrogate, charWidth, ch) => {
return charWidth > 1 ? `${ch}\x03` : ch;
});
// iTerm2 cannot render combining characters properly. - not the case anymore!
// see: https://gitlab.com/gnachman/iterm2/-/issues/2639
// if (this.screen.program.isiTerm2) {
Expand Down
1 change: 1 addition & 0 deletions test/all.mjs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
await import("./lib/helpers.test.mjs");
await import("./lib/unicode.test.mjs");
Loading

0 comments on commit 8124473

Please sign in to comment.