diff --git a/lib/helpers.js b/lib/helpers.js index fbf3cdcf..ce1277e8 100644 --- a/lib/helpers.js +++ b/lib/helpers.js @@ -142,12 +142,62 @@ helpers.cleanTags = function(text) { return helpers.stripTags(text).trim(); }; +/** + * @param {string} text + * @returns {string} + */ helpers.dropUnicode = function(text) { - if (!text) return ''; - return text - .replace(unicode.chars.all, '??') - .replace(unicode.chars.combining, '') - .replace(unicode.chars.surrogate, '?'); + return helpers.replaceUnicode(text, defaultUnicodeReplacer); +}; + +/** + * @param {boolean} isSurrogate + * @param {number} charWidth + * @returns {string} + */ +function defaultUnicodeReplacer(isSurrogate, charWidth) { + if (isSurrogate) { + return "?"; + } + if (charWidth > 1) { + return "??"; + } + return ""; +} + +/** + * @param {string} str + * @param {(isSurrogate: boolean, charWidth: number, ch: string) => string} replacer + * @returns {string} + */ +helpers.replaceUnicode = function(str, replacer) { + if (!str) return ''; + + const result = []; + var next = undefined; + for (var i = 0; i < str.length; i++) { + const isSurrogate = unicode.isSurrogate(str, i); + const cw = unicode.charWidth(str, i); + if (cw != 1 || isSurrogate) { + const start = next ?? 0; + if (start < i) { + result.push(str.substring(start, i)); + } + next = i + (isSurrogate ? 2 : 1); + const ch = str.substring(i, next); + const rc = replacer(isSurrogate, cw, ch); + if (rc) { + result.push(rc); + } + } + if (isSurrogate) i++; + } + + if (next && next < i) { + result.push(str.substring(next, str.length)); + } + + return result.length === 0 ? str : result.join(""); }; helpers.__defineGetter__('Screen', function() { diff --git a/lib/unicode.js b/lib/unicode.js index 1d61f80b..861d805b 100644 --- a/lib/unicode.js +++ b/lib/unicode.js @@ -110,6 +110,11 @@ var floor = Math.floor; * Wide, Surrogates, and Combining */ +/** + * @param {string | number} str string or code point + * @param {number} [i] char index within str string + * @returns {number} + */ exports.charWidth = function(str, i) { var point = typeof str !== 'number' ? exports.codePointAt(str, i || 0) @@ -401,6 +406,10 @@ exports.charWidth = function(str, i) { return 1; }; +/** + * @param {string} str + * @returns {number} + */ exports.strWidth = function(str) { var width = 0; for (var i = 0; i < str.length; i++) { @@ -410,6 +419,11 @@ exports.strWidth = function(str) { return width; }; +/** + * @param {string | number} str string or code point + * @param {number} [i] char index within str string + * @returns {boolean} + */ exports.isSurrogate = function(str, i) { var point = typeof str !== 'number' ? exports.codePointAt(str, i || 0) @@ -475,6 +489,11 @@ const combining = combiningTable.reduce(function(out, row) { return out; }, {}); +/** + * @param {string | number} str string or code point + * @param {number} [i] char index within str string + * @returns {boolean} + */ exports.isCombining = function(str, i) { var point = typeof str !== 'number' ? exports.codePointAt(str, i || 0) @@ -577,232 +596,3 @@ exports.fromCodePoint = function() { } return result; }; - -/** - * Regexes - */ - -exports.chars = {}; - -// Double width characters that are _not_ surrogate pairs. -// NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this -// regex anyway. This regex is used to put a blank char after wide chars to -// be eaten, however, if this is a surrogate pair, parseContent already adds -// the extra one char because its length equals 2 instead of 1. -exports.chars.wide = new RegExp('([' - + '\\u1100-\\u115f' // Hangul Jamo init. consonants - + '\\u2329\\u232a' - + '\\u2e80-\\u303e\\u3040-\\ua4cf' // CJK ... Yi - + '\\uac00-\\ud7a3' // Hangul Syllables - + '\\uf900-\\ufaff' // CJK Compatibility Ideographs - + '\\ufe10-\\ufe19' // Vertical forms - + '\\ufe30-\\ufe6f' // CJK Compatibility Forms - + '\\uff00-\\uff60' // Fullwidth Forms - + '\\uffe0-\\uffe6' - + '])', 'g'); - -// All surrogate pair wide chars. -exports.chars.swide = new RegExp('(' - // 0x20000 - 0x2fffd: -// + '[\\ud840-\\ud87f][\\udc00-\\udffd]' - // treat all surrogate pairs as wide chars (2 cells) - + '[\\ud800-\\udbff][\\udc00-\\udfff]' - + '|' - // 0x30000 - 0x3fffd: - + '[\\ud880-\\ud8bf][\\udc00-\\udffd]' - + ')', 'g'); - -// All wide chars including surrogate pairs. -exports.chars.all = new RegExp('(' - + exports.chars.swide.source.slice(1, -1) - + '|' - + exports.chars.wide.source.slice(1, -1) - + ')', 'g'); - -// Regex to detect a surrogate pair. -exports.chars.surrogate = /[\ud800-\udbff][\udc00-\udfff]/g; - -// Regex to find combining characters. -exports.chars.combining = combiningTable.reduce(function(out, row) { - var low, high, range; - if (row[0] > 0x00ffff) { - low = exports.fromCodePoint(row[0]); - low = [ - hexify(low.charCodeAt(0)), - hexify(low.charCodeAt(1)) - ]; - high = exports.fromCodePoint(row[1]); - high = [ - hexify(high.charCodeAt(0)), - hexify(high.charCodeAt(1)) - ]; - range = '[\\u' + low[0] + '-' + '\\u' + high[0] + ']' - + '[\\u' + low[1] + '-' + '\\u' + high[1] + ']'; - if (!~out.indexOf('|')) out += ']'; - out += '|' + range; - } else { - low = hexify(row[0]); - high = hexify(row[1]); - low = '\\u' + low; - high = '\\u' + high; - out += low + '-' + high; - } - return out; -}, '['); - -exports.chars.combining = new RegExp(exports.chars.combining, 'g'); - -function hexify(n) { - n = n.toString(16); - while (n.length < 4) n = '0' + n; - return n; -} - -/* -exports.chars.combining = new RegExp( - '[' - + '\\u0300-\\u036f' - + '\\u0483-\\u0486' - + '\\u0488-\\u0489' - + '\\u0591-\\u05bd' - + '\\u05bf-\\u05bf' - + '\\u05c1-\\u05c2' - + '\\u05c4-\\u05c5' - + '\\u05c7-\\u05c7' - + '\\u0600-\\u0603' - + '\\u0610-\\u0615' - + '\\u064b-\\u065e' - + '\\u0670-\\u0670' - + '\\u06d6-\\u06e4' - + '\\u06e7-\\u06e8' - + '\\u06ea-\\u06ed' - + '\\u070f-\\u070f' - + '\\u0711-\\u0711' - + '\\u0730-\\u074a' - + '\\u07a6-\\u07b0' - + '\\u07eb-\\u07f3' - + '\\u0901-\\u0902' - + '\\u093c-\\u093c' - + '\\u0941-\\u0948' - + '\\u094d-\\u094d' - + '\\u0951-\\u0954' - + '\\u0962-\\u0963' - + '\\u0981-\\u0981' - + '\\u09bc-\\u09bc' - + '\\u09c1-\\u09c4' - + '\\u09cd-\\u09cd' - + '\\u09e2-\\u09e3' - + '\\u0a01-\\u0a02' - + '\\u0a3c-\\u0a3c' - + '\\u0a41-\\u0a42' - + '\\u0a47-\\u0a48' - + '\\u0a4b-\\u0a4d' - + '\\u0a70-\\u0a71' - + '\\u0a81-\\u0a82' - + '\\u0abc-\\u0abc' - + '\\u0ac1-\\u0ac5' - + '\\u0ac7-\\u0ac8' - + '\\u0acd-\\u0acd' - + '\\u0ae2-\\u0ae3' - + '\\u0b01-\\u0b01' - + '\\u0b3c-\\u0b3c' - + '\\u0b3f-\\u0b3f' - + '\\u0b41-\\u0b43' - + '\\u0b4d-\\u0b4d' - + '\\u0b56-\\u0b56' - + '\\u0b82-\\u0b82' - + '\\u0bc0-\\u0bc0' - + '\\u0bcd-\\u0bcd' - + '\\u0c3e-\\u0c40' - + '\\u0c46-\\u0c48' - + '\\u0c4a-\\u0c4d' - + '\\u0c55-\\u0c56' - + '\\u0cbc-\\u0cbc' - + '\\u0cbf-\\u0cbf' - + '\\u0cc6-\\u0cc6' - + '\\u0ccc-\\u0ccd' - + '\\u0ce2-\\u0ce3' - + '\\u0d41-\\u0d43' - + '\\u0d4d-\\u0d4d' - + '\\u0dca-\\u0dca' - + '\\u0dd2-\\u0dd4' - + '\\u0dd6-\\u0dd6' - + '\\u0e31-\\u0e31' - + '\\u0e34-\\u0e3a' - + '\\u0e47-\\u0e4e' - + '\\u0eb1-\\u0eb1' - + '\\u0eb4-\\u0eb9' - + '\\u0ebb-\\u0ebc' - + '\\u0ec8-\\u0ecd' - + '\\u0f18-\\u0f19' - + '\\u0f35-\\u0f35' - + '\\u0f37-\\u0f37' - + '\\u0f39-\\u0f39' - + '\\u0f71-\\u0f7e' - + '\\u0f80-\\u0f84' - + '\\u0f86-\\u0f87' - + '\\u0f90-\\u0f97' - + '\\u0f99-\\u0fbc' - + '\\u0fc6-\\u0fc6' - + '\\u102d-\\u1030' - + '\\u1032-\\u1032' - + '\\u1036-\\u1037' - + '\\u1039-\\u1039' - + '\\u1058-\\u1059' - + '\\u1160-\\u11ff' - + '\\u135f-\\u135f' - + '\\u1712-\\u1714' - + '\\u1732-\\u1734' - + '\\u1752-\\u1753' - + '\\u1772-\\u1773' - + '\\u17b4-\\u17b5' - + '\\u17b7-\\u17bd' - + '\\u17c6-\\u17c6' - + '\\u17c9-\\u17d3' - + '\\u17dd-\\u17dd' - + '\\u180b-\\u180d' - + '\\u18a9-\\u18a9' - + '\\u1920-\\u1922' - + '\\u1927-\\u1928' - + '\\u1932-\\u1932' - + '\\u1939-\\u193b' - + '\\u1a17-\\u1a18' - + '\\u1b00-\\u1b03' - + '\\u1b34-\\u1b34' - + '\\u1b36-\\u1b3a' - + '\\u1b3c-\\u1b3c' - + '\\u1b42-\\u1b42' - + '\\u1b6b-\\u1b73' - + '\\u1dc0-\\u1dca' - + '\\u1dfe-\\u1dff' - + '\\u200b-\\u200f' - + '\\u202a-\\u202e' - + '\\u2060-\\u2063' - + '\\u206a-\\u206f' - + '\\u20d0-\\u20ef' - + '\\u302a-\\u302f' - + '\\u3099-\\u309a' - + '\\ua806-\\ua806' - + '\\ua80b-\\ua80b' - + '\\ua825-\\ua826' - + '\\ufb1e-\\ufb1e' - + '\\ufe00-\\ufe0f' - + '\\ufe20-\\ufe23' - + '\\ufeff-\\ufeff' - + '\\ufff9-\\ufffb' - + ']' - + '|[\\ud802-\\ud802][\\ude01-\\ude03]' - + '|[\\ud802-\\ud802][\\ude05-\\ude06]' - + '|[\\ud802-\\ud802][\\ude0c-\\ude0f]' - + '|[\\ud802-\\ud802][\\ude38-\\ude3a]' - + '|[\\ud802-\\ud802][\\ude3f-\\ude3f]' - + '|[\\ud834-\\ud834][\\udd67-\\udd69]' - + '|[\\ud834-\\ud834][\\udd73-\\udd82]' - + '|[\\ud834-\\ud834][\\udd85-\\udd8b]' - + '|[\\ud834-\\ud834][\\uddaa-\\uddad]' - + '|[\\ud834-\\ud834][\\ude42-\\ude44]' - + '|[\\udb40-\\udb40][\\udc01-\\udc01]' - + '|[\\udb40-\\udb40][\\udc20-\\udc7f]' - + '|[\\udb40-\\udb40][\\udd00-\\uddef]' -, 'g'); -*/ diff --git a/lib/widgets/element.js b/lib/widgets/element.js index 39bf1188..b4f38991 100644 --- a/lib/widgets/element.js +++ b/lib/widgets/element.js @@ -369,7 +369,10 @@ Element.prototype.parseContent = function(noTags) { if (this.screen.fullUnicode) { // double-width chars will eat the next char after render. create a // blank character after it so it doesn't eat the real next char. - content = content.replace(unicode.chars.all, '$1\x03'); +// content = content.replace(unicode.chars.all, '$1\x03'); + content = helpers.replaceUnicode(content, (isSurrogate, charWidth, ch) => { + return charWidth > 1 ? `${ch}\x03` : ch; + }); // iTerm2 cannot render combining characters properly. - not the case anymore! // see: https://gitlab.com/gnachman/iterm2/-/issues/2639 // if (this.screen.program.isiTerm2) { diff --git a/test/all.mjs b/test/all.mjs index 3e58817a..50536ee1 100644 --- a/test/all.mjs +++ b/test/all.mjs @@ -1 +1,2 @@ +await import("./lib/helpers.test.mjs"); await import("./lib/unicode.test.mjs"); diff --git a/test/lib/helpers.test.mjs b/test/lib/helpers.test.mjs new file mode 100644 index 00000000..40e3005b --- /dev/null +++ b/test/lib/helpers.test.mjs @@ -0,0 +1,61 @@ +import assert from "node:assert/strict"; +import helpers from "../../lib/helpers.js"; + +const { dropUnicode, replaceUnicode } = helpers; + +const { describe, it } = await (async () => { + // @ts-ignore + const module = process.isBun ? "bun:test" : "node:test"; + // @ts-ignore + return process.isBun // @ts-ignore + ? Promise.resolve({ describe: (_, fn) => fn(), it: test }) + : import(module); +})(); + +describe("helpers.test.mjs", () => { + it("should replace unicode chars when dropUnicode", () => { + //when & then + assert.deepEqual(dropUnicode(""), ""); + assert.deepEqual(dropUnicode("abc"), "abc"); + assert.deepEqual(dropUnicode("๐Ÿ‰"), "?"); + assert.deepEqual(dropUnicode("a๐Ÿ‰"), "a?"); + assert.deepEqual(dropUnicode("๐Ÿ‰b"), "?b"); + assert.deepEqual(dropUnicode("a๐Ÿ‰b"), "a?b"); + assert.deepEqual(dropUnicode("ๆœ"), "??"); + assert.deepEqual(dropUnicode("aๆœ"), "a??"); + assert.deepEqual(dropUnicode("ๆœb"), "??b"); + assert.deepEqual(dropUnicode("aๆœb"), "a??b"); + assert.deepEqual(dropUnicode("sฬ€"), "s"); + assert.deepEqual(dropUnicode("asฬ€"), "as"); + assert.deepEqual(dropUnicode("sฬ€b"), "sb"); + assert.deepEqual(dropUnicode("asฬ€b"), "asb"); + assert.deepEqual(dropUnicode("as๐จb"), "as?b"); //TODO: should be asb + assert.deepEqual(dropUnicode("a๐Ÿ‰sฬ€ๆœb"), "a?s??b"); + }); + + it("should replace unicode chars when replaceUnicode", () => { + //given + const replacer = (isSurrogate, charWidth, ch) => { + return charWidth > 1 ? `${ch}\x03` : ch; + }; + const replace = (str) => replaceUnicode(str, replacer); + + //when & then + assert.deepEqual(replace(""), ""); + assert.deepEqual(replace("abc"), "abc"); + assert.deepEqual(replace("๐Ÿ‰"), "๐Ÿ‰\x03"); + assert.deepEqual(replace("a๐Ÿ‰"), "a๐Ÿ‰\x03"); + assert.deepEqual(replace("๐Ÿ‰b"), "๐Ÿ‰\x03b"); + assert.deepEqual(replace("a๐Ÿ‰b"), "a๐Ÿ‰\x03b"); + assert.deepEqual(replace("ๆœ"), "ๆœ\x03"); + assert.deepEqual(replace("aๆœ"), "aๆœ\x03"); + assert.deepEqual(replace("ๆœb"), "ๆœ\x03b"); + assert.deepEqual(replace("aๆœb"), "aๆœ\x03b"); + assert.deepEqual(replace("sฬ€"), "sฬ€"); + assert.deepEqual(replace("asฬ€"), "asฬ€"); + assert.deepEqual(replace("sฬ€b"), "sฬ€b"); + assert.deepEqual(replace("asฬ€b"), "asฬ€b"); + assert.deepEqual(replace("as๐จb"), "as๐จb"); //TODO: should be asb + assert.deepEqual(replace("a๐Ÿ‰sฬ€ๆœb"), "a๐Ÿ‰\x03sฬ€ๆœ\x03b"); + }); +}); diff --git a/test/lib/unicode.test.mjs b/test/lib/unicode.test.mjs index 3be5bef9..4f8125ff 100644 --- a/test/lib/unicode.test.mjs +++ b/test/lib/unicode.test.mjs @@ -1,5 +1,10 @@ import assert from "node:assert/strict"; -import { charWidth, isCombining, isSurrogate } from "../../lib/unicode.js"; +import { + charWidth, + isCombining, + isSurrogate, + strWidth, +} from "../../lib/unicode.js"; const { describe, it } = await (async () => { // @ts-ignore @@ -18,12 +23,16 @@ describe("unicode.test.mjs", () => { const surrogateSingle = "๐Œ†"; const double = "ๆœ"; const star = "โญ"; + const complex = "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ง"; + const complex2 = "๐Ÿคฆ๐Ÿผโ€โ™‚๏ธ"; assert.deepEqual(combiningNonsurrogate.length, 2); assert.deepEqual(combiningSurrogate.length, 3); assert.deepEqual(surrogateDouble.length, 2); assert.deepEqual(surrogateSingle.length, 2); assert.deepEqual(double.length, 1); assert.deepEqual(star.length, 1); + assert.deepEqual(complex.length, 11); + assert.deepEqual(complex2.length, 7); it("should check for combining", () => { //when & then @@ -81,4 +90,12 @@ describe("unicode.test.mjs", () => { assert.deepEqual(charWidth(double, 0), 2); assert.deepEqual(charWidth(star, 0), 1); //TODO: should be 2 !!! }); + + it("should return string width", () => { + //when & then + assert.deepEqual(strWidth(""), 0); + assert.deepEqual(strWidth("0"), 1); + assert.deepEqual(strWidth(complex), 8); + assert.deepEqual(strWidth(complex2), 5); + }); });