From d101ec82687671d8804f84ec33a957c533db08a1 Mon Sep 17 00:00:00 2001 From: Erek Speed Date: Tue, 2 Jan 2024 13:03:56 +0900 Subject: [PATCH] fix(dict): Make update-db script resilient to multiple spaces separating kanji entry fields (#1917) Fixes #1913 --- extension/data/kanji.dat | 2 +- utils/update-db.ts | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/extension/data/kanji.dat b/extension/data/kanji.dat index a1ab40229..d037310da 100644 --- a/extension/data/kanji.dat +++ b/extension/data/kanji.dat @@ -9187,7 +9187,7 @@ 觚|B148 S13 N4303 V5544 P1-7-6 I2n11.1 Ygu1|コ さかずき|||cup 觜|B148 S13 N4304 V5546 P2-6-7 I2n11.2 Yzui3 Yzi1|シ スイ くちばし はし|||beak, bill 觝|B148 S12 V5545 H1498 P1-7-5 I2n10.2 Ydi3|テイ ふ.れる|||touch, feel, collide with, conflict with -解|B148 G5 S13 F176 N4306 V5548 H1517 DK1017 DL1375 L1814:unravel DN1955:unravel E632 IN474 P1-7-6 I4g9.1 Yjie3 Yjie4 Yxie4|カイ ゲ と.く と.かす と.ける ほど.く ほど.ける わか.る さと.る|さとる とけ||unravel, notes, key, explanation, understanding, untie, undo, solve, answer, cancel, absolve, explain, minute +解|B148 G5 S13 F176 N4306 V5548 H1517 DK1017 DL1375 L1814:unravel DN1955:unravel E632 IN474 P1-7-6 I4g9.1 Yjie3 Yjie4 Yxie4|カイ ゲ と.く と.かす と.ける ほど.く ほぐ.す わか.る さと.る|さとる とけ||unravel, notes, key, explanation, understanding, untie, undo, solve, answer, cancel, absolve, explain, minute 觥|B148 S13 P1-7-6 Ygong1|コウ つのさかずき|||cup made of horn, obstinate 触|B148 G8 S13 F904 N4305 V5547 H1518 DK1018 DL1376 L1813:contact DN1954:contact E1428 IN874 P1-7-6 I6d7.10 Ychu4|ショク ふ.れる さわ.る さわ|||contact, touch, feel, hit, proclaim, announce, conflict 觧|B148 S13 V5549 P1-7-6 I4g9.1 Yjie3 Yjie4 Yxie4|カイ ゲ と.く と.かす と.ける ほど.く ほど.ける わか.る さと.る|||notes, key, explanation, understanding diff --git a/utils/update-db.ts b/utils/update-db.ts index bb29b493f..43597aa71 100644 --- a/utils/update-db.ts +++ b/utils/update-db.ts @@ -271,6 +271,8 @@ class KanjiDictParser extends Writable { // confuse the output. // (Also a comma could confuse it too. Would mean we should // switch to ; as a separator in that case.) + // In general, a single space is used as a separator, but sometimes more spaces appear and the spec is unclear so it's best to assume + // one or more spaces separate fields. // // e.g. 士|3B4E U58eb B33 G4 S3 F526 J1 N1160 V1117 H3405 DP4213 DK2129 DL2877 L319 DN341 K301 O41 DO59 MN5638 MP3.0279 E494 IN572 DA581 DS410 DF1173 DH521 DT441 DC386 DJ755 DG393 DM325 P4-3-2 I3p0.1 Q4010.0 DR1472 Yshi4 Wsa シ さむらい T1 お ま T2 さむらい {gentleman} {scholar} {samurai} {samurai radical (no. 33)} // @@ -295,7 +297,7 @@ class KanjiDictParser extends Writable { // - Meanings, command separated // (All | delimited) const matches = line.match( - /^(\S+) (?:.=.=== )?((?:[\x21-\x7a]+ )+)((?:[\x80-\uffff.-]+ )+)?(?:T1 ((?:[\x80-\uffff.-]+ )+))?(?:T2 ((?:[\x80-\uffff.-]+ )+))?((?:\{[^}]+\} ?)*)?$/ + /^(\S+) +(?:.=.=== )?((?:[\x21-\x7a]+ +)+)((?:[\x80-\uffff.-]+ +)+)?(?:T1 ((?:[\x80-\uffff.-]+ +)+))?(?:T2 ((?:[\x80-\uffff.-]+ +)+))?((?:\{[^}]+\} *)*)?$/ ); if (matches === null) { console.log(`Failed to parse line: ${line}`); @@ -304,7 +306,7 @@ class KanjiDictParser extends Writable { } // Trim references - const refs = matches[2].trim().split(' '); + const refs = matches[2].trim().split(/ +/); const refsToKeep = []; let hasB = false; for (const ref of refs) { @@ -341,7 +343,7 @@ class KanjiDictParser extends Writable { // Prepare meanings if (matches[6]) { - const meanings = matches[6].trim().split('} {'); + const meanings = matches[6].trim().split(/} *{/); if (meanings.length) { meanings[0] = meanings[0].slice(1); const end = meanings.length - 1; @@ -362,7 +364,8 @@ class KanjiDictParser extends Writable { this.#index[matches[1]] = matches .slice(2) - .map((part) => (part ? part.trim() : '')) + // Replace any instances of 2 or more spaces with one space. + .map((part) => (part ? part.trim().replace(/ {2,}/, ' ') : '')) .join('|'); callback();