From 6a07ac5aafe8960b00e00ff772034acdb10d5422 Mon Sep 17 00:00:00 2001 From: Jason Naylor Date: Fri, 21 Jun 2024 13:02:46 -0700 Subject: [PATCH 01/10] Add aggregate operation for merging analysis glosses - Only merges if a word has exactly two senses, where one sense has one English gloss and the other has one Tok Pisin gloss, and the English sense either has no semantic domains or has only semantic domains that are also in the Tok Pisin sense --- database/util/twoAnalysisSenseCleanup.js | 190 +++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 database/util/twoAnalysisSenseCleanup.js diff --git a/database/util/twoAnalysisSenseCleanup.js b/database/util/twoAnalysisSenseCleanup.js new file mode 100644 index 0000000000..6c715167fc --- /dev/null +++ b/database/util/twoAnalysisSenseCleanup.js @@ -0,0 +1,190 @@ +[ + { + // Match documents in the specified project that have senses with "en" or "tpi" gloss languages + $match: { + projectId: "654a790a79341a70ce2e6627", + "senses.Glosses.Language": { + $in: ["en", "tpi"], + }, + }, + }, + { + // Preserve the original document for later stages to use + $addFields: { + originalDocument: "$$ROOT", + }, + }, + { + // Add a field to count the number of senses in each document + $addFields: { + senseCount: { + $size: "$senses", + }, + }, + }, + { + // Filter documents to only those with exactly two senses + $match: { + senseCount: 2, + }, + }, + { + // Project necessary fields, and calculate size of glosses and extract first gloss language per sense + $project: { + senses: 1, + originalDocument: 1, + glossesPerSense: { + $map: { + input: "$senses", + as: "sense", + in: { + $size: "$$sense.Glosses", + }, + }, + }, + glossLangPerSense: { + $map: { + input: "$senses", + as: "sense", + in: { + $arrayElemAt: [ + "$$sense.Glosses.Language", + 0, + ], + }, + }, + }, + }, + }, + { + // Match to ensure each sense has exactly one gloss and the required languages are present + $match: { + 
glossesPerSense: [1, 1], + glossLangPerSense: { + $all: ["en", "tpi"], + }, + }, + }, + { + // Identify and extract 'to' and 'from' senses for merging based on gloss languages + $addFields: { + toSense: { + $arrayElemAt: [ + { + $filter: { + input: "$senses", + as: "sense", + cond: { + $eq: [ + { + $arrayElemAt: [ + "$$sense.Glosses.Language", + 0, + ], + }, + "en", + ], + }, + }, + }, + 0, + ], + }, + fromSense: { + $arrayElemAt: [ + { + $filter: { + input: "$senses", + as: "sense", + cond: { + $eq: [ + { + $arrayElemAt: [ + "$$sense.Glosses.Language", + 0, + ], + }, + "tpi", + ], + }, + }, + }, + 0, + ], + }, + }, + }, + { + // Add fields to extract semantic domain GUIDs from both 'to' and 'from' senses + $addFields: { + toSemDomGuids: + "$toSense.SemanticDomains.guid", + fromSemDomGuids: + "$fromSense.SemanticDomains.guid", + }, + }, + { + // Match documents where the semantic domains of 'to' sense are a subset of 'from' sense or 'to' sense has no semantic domains + $match: { + $expr: { + $or: [ + { + $setIsSubset: [ + "$toSemDomGuids", + "$fromSemDomGuids", + ], + }, + { + $eq: [ + { + $size: "$toSemDomGuids", + }, + 0, + ], + }, + ], + }, + }, + }, + { + // Merge the senses into one by combining glosses + $addFields: { + mergedSenses: { + $mergeObjects: [ + "$toSense", + { + Glosses: { + $concatArrays: [ + "$toSense.Glosses", + "$fromSense.Glosses", + ], + }, + }, + ], + }, + }, + }, + { + // Update the original document's senses with the merged senses + $addFields: { + "originalDocument.senses": [ + "$mergedSenses", + ], + }, + }, + { + // Replace the root of the document with the updated original document + $replaceRoot: { + newRoot: "$originalDocument", + }, + }, + { + // Merge the updated document back into the 'words' collection, updating where matched and discarding unmatched + $merge: { + into: "WordsCollection", + on: "_id", + whenMatched: "merge", + whenNotMatched: "discard", + }, + }, +] \ No newline at end of file From 
90e84d58d83f4dc9a639df92aa1d1490f831a76a Mon Sep 17 00:00:00 2001 From: Danny Rorabaugh Date: Fri, 21 Jun 2024 18:08:39 -0400 Subject: [PATCH 02/10] Condense pipeline --- database/util/twoAnalysisSenseCleanup.js | 128 ++++++----------------- 1 file changed, 34 insertions(+), 94 deletions(-) diff --git a/database/util/twoAnalysisSenseCleanup.js b/database/util/twoAnalysisSenseCleanup.js index 6c715167fc..1a90b0811f 100644 --- a/database/util/twoAnalysisSenseCleanup.js +++ b/database/util/twoAnalysisSenseCleanup.js @@ -1,73 +1,22 @@ [ { - // Match documents in the specified project that have senses with "en" or "tpi" gloss languages + // Match documents in the specified project that have 2 senses, one with a lone "en"-language gloss and the other with a lone "tpi"-language gloss $match: { projectId: "654a790a79341a70ce2e6627", + senses: { $size: 2 }, + "senses.0.Glosses": { $size: 1 }, + "senses.1.Glosses": { $size: 1 }, "senses.Glosses.Language": { - $in: ["en", "tpi"], - }, - }, - }, - { - // Preserve the original document for later stages to use - $addFields: { - originalDocument: "$$ROOT", - }, - }, - { - // Add a field to count the number of senses in each document - $addFields: { - senseCount: { - $size: "$senses", - }, - }, - }, - { - // Filter documents to only those with exactly two senses - $match: { - senseCount: 2, - }, - }, - { - // Project necessary fields, and calculate size of glosses and extract first gloss language per sense - $project: { - senses: 1, - originalDocument: 1, - glossesPerSense: { - $map: { - input: "$senses", - as: "sense", - in: { - $size: "$$sense.Glosses", - }, - }, - }, - glossLangPerSense: { - $map: { - input: "$senses", - as: "sense", - in: { - $arrayElemAt: [ - "$$sense.Glosses.Language", - 0, - ], - }, - }, - }, - }, - }, - { - // Match to ensure each sense has exactly one gloss and the required languages are present - $match: { - glossesPerSense: [1, 1], - glossLangPerSense: { $all: ["en", "tpi"], }, }, }, { - // Identify and 
extract 'to' and 'from' senses for merging based on gloss languages $addFields: { + // Preserve the original document for later stages to use + originalDocument: "$$ROOT", + + // Identify and extract 'to' and 'from' senses for merging based on gloss languages toSense: { $arrayElemAt: [ { @@ -77,10 +26,7 @@ cond: { $eq: [ { - $arrayElemAt: [ - "$$sense.Glosses.Language", - 0, - ], + $arrayElemAt: ["$$sense.Glosses.Language", 0], }, "en", ], @@ -99,10 +45,7 @@ cond: { $eq: [ { - $arrayElemAt: [ - "$$sense.Glosses.Language", - 0, - ], + $arrayElemAt: ["$$sense.Glosses.Language", 0], }, "tpi", ], @@ -112,15 +55,10 @@ 0, ], }, - }, - }, - { - // Add fields to extract semantic domain GUIDs from both 'to' and 'from' senses - $addFields: { - toSemDomGuids: - "$toSense.SemanticDomains.guid", - fromSemDomGuids: - "$fromSense.SemanticDomains.guid", + + // Add fields to extract semantic domain GUIDs from both 'to' and 'from' senses + toSemDomGuids: "$toSense.SemanticDomains.guid", + fromSemDomGuids: "$fromSense.SemanticDomains.guid", }, }, { @@ -129,23 +67,29 @@ $expr: { $or: [ { - $setIsSubset: [ - "$toSemDomGuids", - "$fromSemDomGuids", - ], + $setIsSubset: ["$toSemDomGuids", "$fromSemDomGuids"], }, { - $eq: [ - { - $size: "$toSemDomGuids", - }, - 0, - ], + $setIsSubset: ["$fromSemDomGuids", "$toSemDomGuids"], }, ], }, }, }, + { + // Add fields to extract semantic domain GUIDs from both 'to' and 'from' senses + $addFields: { + semDoms: { + $cond: { + if: { + $gte: [{ $size: "$toSemDomGuids" }, { $size: "$fromSemDomGuids" }], + }, + then: "$toSense.SemanticDomains", + else: "$fromSense.SemanticDomains", + }, + }, + }, + }, { // Merge the senses into one by combining glosses $addFields: { @@ -154,11 +98,9 @@ "$toSense", { Glosses: { - $concatArrays: [ - "$toSense.Glosses", - "$fromSense.Glosses", - ], + $concatArrays: ["$toSense.Glosses", "$fromSense.Glosses"], }, + SemanticDomains: "$semDoms", }, ], }, @@ -167,9 +109,7 @@ { // Update the original document's senses with the 
merged senses $addFields: { - "originalDocument.senses": [ - "$mergedSenses", - ], + "originalDocument.senses": ["$mergedSenses"], }, }, { @@ -187,4 +127,4 @@ whenNotMatched: "discard", }, }, -] \ No newline at end of file +]; From e1d44be88273153466b160c859270b91e2311eab Mon Sep 17 00:00:00 2001 From: Danny Rorabaugh Date: Mon, 24 Jun 2024 09:09:50 -0400 Subject: [PATCH 03/10] split and merge steps --- database/util/twoAnalysisSenseCleanup.js | 55 +++++++++++------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/database/util/twoAnalysisSenseCleanup.js b/database/util/twoAnalysisSenseCleanup.js index 1a90b0811f..bb05f0d399 100644 --- a/database/util/twoAnalysisSenseCleanup.js +++ b/database/util/twoAnalysisSenseCleanup.js @@ -12,11 +12,14 @@ }, }, { + // Preserve the original document for later stages to use $addFields: { - // Preserve the original document for later stages to use originalDocument: "$$ROOT", - - // Identify and extract 'to' and 'from' senses for merging based on gloss languages + }, + }, + { + // Identify and extract 'to' and 'from' senses for merging based on gloss languages + $addFields: { toSense: { $arrayElemAt: [ { @@ -55,14 +58,17 @@ 0, ], }, - - // Add fields to extract semantic domain GUIDs from both 'to' and 'from' senses + }, + }, + { + // Add fields to extract semantic domain GUIDs from both 'to' and 'from' senses + $addFields: { toSemDomGuids: "$toSense.SemanticDomains.guid", fromSemDomGuids: "$fromSense.SemanticDomains.guid", }, }, { - // Match documents where the semantic domains of 'to' sense are a subset of 'from' sense or 'to' sense has no semantic domains + // Match documents where the semantic domains of 'to' sense are a subset of 'from' sense or vice-versa $match: { $expr: { $or: [ @@ -77,41 +83,32 @@ }, }, { - // Add fields to extract semantic domain GUIDs from both 'to' and 'from' senses - $addFields: { - semDoms: { - $cond: { - if: { - $gte: [{ $size: "$toSemDomGuids" }, { $size: "$fromSemDomGuids" }], 
- }, - then: "$toSense.SemanticDomains", - else: "$fromSense.SemanticDomains", - }, - }, - }, - }, - { - // Merge the senses into one by combining glosses + // Update the original document's senses with a merge of the "to" and "from" senses $addFields: { - mergedSenses: { + "originalDocument.senses": { $mergeObjects: [ "$toSense", { Glosses: { $concatArrays: ["$toSense.Glosses", "$fromSense.Glosses"], }, - SemanticDomains: "$semDoms", + SemanticDomains: { + $cond: { + if: { + $gte: [ + { $size: "$toSemDomGuids" }, + { $size: "$fromSemDomGuids" }, + ], + }, + then: "$toSense.SemanticDomains", + else: "$fromSense.SemanticDomains", + }, + }, }, ], }, }, }, - { - // Update the original document's senses with the merged senses - $addFields: { - "originalDocument.senses": ["$mergedSenses"], - }, - }, { // Replace the root of the document with the updated original document $replaceRoot: { From 46d283fbb7d711a0a8b2e69115cf0a14b495d5c0 Mon Sep 17 00:00:00 2001 From: Danny Rorabaugh Date: Mon, 24 Jun 2024 09:13:38 -0400 Subject: [PATCH 04/10] Consider lost data --- database/util/twoAnalysisSenseCleanup.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/database/util/twoAnalysisSenseCleanup.js b/database/util/twoAnalysisSenseCleanup.js index bb05f0d399..c0b3cfa51c 100644 --- a/database/util/twoAnalysisSenseCleanup.js +++ b/database/util/twoAnalysisSenseCleanup.js @@ -84,11 +84,15 @@ }, { // Update the original document's senses with a merge of the "to" and "from" senses + // Note: The part of speech / grammatical category of the "from" sense is lost $addFields: { "originalDocument.senses": { $mergeObjects: [ "$toSense", { + Definitions: { + $concatArrays: ["$toSense.Definitions", "$fromSense.Definitions"], + }, Glosses: { $concatArrays: ["$toSense.Glosses", "$fromSense.Glosses"], }, From 536e712de4fde5aff7d67501902866c9bd7f5b62 Mon Sep 17 00:00:00 2001 From: Danny Rorabaugh Date: Mon, 24 Jun 2024 10:48:47 -0400 Subject: [PATCH 05/10] Add parameterized bash script --- 
database/util/twoAnalysisSenseCleanup.js | 43 +++++---- database/util/twoAnalysisSenseCleanup.sh | 114 +++++++++++++++++++++++ 2 files changed, 139 insertions(+), 18 deletions(-) create mode 100644 database/util/twoAnalysisSenseCleanup.sh diff --git a/database/util/twoAnalysisSenseCleanup.js b/database/util/twoAnalysisSenseCleanup.js index c0b3cfa51c..4ef39e1744 100644 --- a/database/util/twoAnalysisSenseCleanup.js +++ b/database/util/twoAnalysisSenseCleanup.js @@ -18,9 +18,9 @@ }, }, { - // Identify and extract 'to' and 'from' senses for merging based on gloss languages + // Identify and extract two senses (A and B) for merging based on gloss languages $addFields: { - toSense: { + senseA: { $arrayElemAt: [ { $filter: { @@ -39,7 +39,7 @@ 0, ], }, - fromSense: { + senseB: { $arrayElemAt: [ { $filter: { @@ -61,51 +61,57 @@ }, }, { - // Add fields to extract semantic domain GUIDs from both 'to' and 'from' senses + // Add fields to extract semantic domain GUIDs from both senses $addFields: { - toSemDomGuids: "$toSense.SemanticDomains.guid", - fromSemDomGuids: "$fromSense.SemanticDomains.guid", + semDomGuidsA: "$senseA.SemanticDomains.guid", + semDomGuidsB: "$senseB.SemanticDomains.guid", }, }, { - // Match documents where the semantic domains of 'to' sense are a subset of 'from' sense or vice-versa + // Match documents where the semantic domains of one sense are a subset of the other $match: { $expr: { $or: [ { - $setIsSubset: ["$toSemDomGuids", "$fromSemDomGuids"], + $setIsSubset: ["$semDomGuidsA", "$semDomGuidsB"], }, { - $setIsSubset: ["$fromSemDomGuids", "$toSemDomGuids"], + $setIsSubset: ["$semDomGuidsB", "$semDomGuidsA"], }, ], }, }, }, { - // Update the original document's senses with a merge of the "to" and "from" senses - // Note: The part of speech / grammatical category of the "from" sense is lost + // Update the original document's senses with a merge of the two senses + // Note: The part of speech / grammatical category of the B sense is lost $addFields: { 
"originalDocument.senses": { $mergeObjects: [ - "$toSense", + "$senseA", { Definitions: { - $concatArrays: ["$toSense.Definitions", "$fromSense.Definitions"], + $concatArrays: ["$senseA.Definitions", "$senseB.Definitions"], }, Glosses: { - $concatArrays: ["$toSense.Glosses", "$fromSense.Glosses"], + $concatArrays: ["$senseA.Glosses", "$senseB.Glosses"], + }, + protectReasons: { + $concatArrays: [ + "$senseA.protectReasons", + "$senseB.protectReasons", + ], }, SemanticDomains: { $cond: { if: { $gte: [ - { $size: "$toSemDomGuids" }, - { $size: "$fromSemDomGuids" }, + { $size: "$semDomGuidsA" }, + { $size: "$semDomGuidsB" }, ], }, - then: "$toSense.SemanticDomains", - else: "$fromSense.SemanticDomains", + then: "$senseA.SemanticDomains", + else: "$senseB.SemanticDomains", }, }, }, @@ -121,6 +127,7 @@ }, { // Merge the updated document back into the 'words' collection, updating where matched and discarding unmatched + // Note: Need to run this again into the "FrontierCollection". $merge: { into: "WordsCollection", on: "_id", diff --git a/database/util/twoAnalysisSenseCleanup.sh b/database/util/twoAnalysisSenseCleanup.sh new file mode 100644 index 0000000000..74128c6a6d --- /dev/null +++ b/database/util/twoAnalysisSenseCleanup.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +############################################################### +# Script to merge senses from two different analysis languages +############################################################### + +set -e + +usage() { + cat < Date: Mon, 24 Jun 2024 10:51:31 -0400 Subject: [PATCH 06/10] Fix braces mismatch --- database/util/twoAnalysisSenseCleanup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/database/util/twoAnalysisSenseCleanup.sh b/database/util/twoAnalysisSenseCleanup.sh index 74128c6a6d..096fc936bc 100644 --- a/database/util/twoAnalysisSenseCleanup.sh +++ b/database/util/twoAnalysisSenseCleanup.sh @@ -92,8 +92,8 @@ 
match2="{\$match:{\$expr:{\$or:[{\$setIsSubset:[\"\$semDomGuidsA\",\"\$semDomGui fieldD="Definitions:{\$concatArrays:[\"\$senseA.Definitions\",\"\$senseB.Definitions\"],}," fieldG="Glosses:{\$concatArrays:[\"\$senseA.Glosses\",\"\$senseB.Glosses\"],}," fieldP="protectReasons:{\$concatArrays:[\"\$senseA.protectReasons\",\"\$senseB.protectReasons\",],}," -fieldS="SemanticDomains:{\$cond:{if:{\$gte:[{\$size:\"\$semDomGuidsA\"},{\$size:\"\$semDomGuidsB\"},],},then:\"\$senseA.SemanticDomains\",else:\"\$senseB.SemanticDomains\",}," -addFields4="{\$addFields:{\"originalDocument.senses\":{\$mergeObjects:[\"\$senseA\",{$fieldD$fieldG$fieldP$fieldS},},],},},}," +fieldS="SemanticDomains:{\$cond:{if:{\$gte:[{\$size:\"\$semDomGuidsA\"},{\$size:\"\$semDomGuidsB\"},],},then:\"\$senseA.SemanticDomains\",else:\"\$senseB.SemanticDomains\",},}," +addFields4="{\$addFields:{\"originalDocument.senses\":{\$mergeObjects:[\"\$senseA\",{$fieldD$fieldG$fieldP$fieldS},],},},}," replaceRoot="{\$replaceRoot:{newRoot:\"\$originalDocument\",},}," pipeline=$match1$addFields1$addFields2$addFields3$match2$addFields4$replaceRoot From afa96a32cf73e4ff763d8d5310aa984444f0687f Mon Sep 17 00:00:00 2001 From: Danny Rorabaugh Date: Mon, 24 Jun 2024 10:56:40 -0400 Subject: [PATCH 07/10] Fix help --- database/util/twoAnalysisSenseCleanup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/database/util/twoAnalysisSenseCleanup.sh b/database/util/twoAnalysisSenseCleanup.sh index 096fc936bc..5f99e0d52b 100644 --- a/database/util/twoAnalysisSenseCleanup.sh +++ b/database/util/twoAnalysisSenseCleanup.sh @@ -48,7 +48,7 @@ while [[ $# -gt 0 ]] ; do LANGA=$1 shift if [[ "${LANGA}" =~ [^A-Za-z\-] ]]; then - echo "The -l/--lang argument must be alphabetic (dashes allowed)" + echo "The -A/--langA argument must be alphabetic (dashes allowed)" exit 1 fi ;; @@ -56,7 +56,7 @@ while [[ $# -gt 0 ]] ; do LANGB=$1 shift if [[ "${LANGB}" =~ [^A-Za-z\-] ]]; then - echo "The -l/--lang argument must be 
alphabetic (dashes allowed)" + echo "The -B/--langB argument must be alphabetic (dashes allowed)" exit 1 fi ;; From 7e76a4f5b857b2cc430f32a8560d0f8a57a0f4b0 Mon Sep 17 00:00:00 2001 From: Danny Rorabaugh Date: Mon, 24 Jun 2024 11:09:19 -0400 Subject: [PATCH 08/10] Fix help and required arguments --- database/util/twoAnalysisSenseCleanup.sh | 28 +++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/database/util/twoAnalysisSenseCleanup.sh b/database/util/twoAnalysisSenseCleanup.sh index 5f99e0d52b..50b32485af 100644 --- a/database/util/twoAnalysisSenseCleanup.sh +++ b/database/util/twoAnalysisSenseCleanup.sh @@ -9,7 +9,9 @@ set -e usage() { cat < Date: Mon, 24 Jun 2024 17:39:18 -0400 Subject: [PATCH 09/10] Restore the to/from sense naming --- database/util/twoAnalysisSenseCleanup.js | 34 ++++++++++++------------ database/util/twoAnalysisSenseCleanup.sh | 20 +++++++------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/database/util/twoAnalysisSenseCleanup.js b/database/util/twoAnalysisSenseCleanup.js index 4ef39e1744..30b196790c 100644 --- a/database/util/twoAnalysisSenseCleanup.js +++ b/database/util/twoAnalysisSenseCleanup.js @@ -18,9 +18,9 @@ }, }, { - // Identify and extract two senses (A and B) for merging based on gloss languages + // Identify and extract two senses ("to" and "from") for merging based on gloss languages $addFields: { - senseA: { + toSense: { $arrayElemAt: [ { $filter: { @@ -39,7 +39,7 @@ 0, ], }, - senseB: { + fromSense: { $arrayElemAt: [ { $filter: { @@ -63,8 +63,8 @@ { // Add fields to extract semantic domain GUIDs from both senses $addFields: { - semDomGuidsA: "$senseA.SemanticDomains.guid", - semDomGuidsB: "$senseB.SemanticDomains.guid", + toSemDomGuids: "$toSense.SemanticDomains.guid", + fromSemDomGuids: "$fromSense.SemanticDomains.guid", }, }, { @@ -73,10 +73,10 @@ $expr: { $or: [ { - $setIsSubset: ["$semDomGuidsA", "$semDomGuidsB"], + $setIsSubset: ["$toSemDomGuids", "$fromSemDomGuids"], 
}, { - $setIsSubset: ["$semDomGuidsB", "$semDomGuidsA"], + $setIsSubset: ["$fromSemDomGuids", "$toSemDomGuids"], }, ], }, @@ -84,34 +84,34 @@ }, { // Update the original document's senses with a merge of the two senses - // Note: The part of speech / grammatical category of the B sense is lost + // Note: The part of speech / grammatical category of the "from" sense is lost $addFields: { "originalDocument.senses": { $mergeObjects: [ - "$senseA", + "$toSense", { Definitions: { - $concatArrays: ["$senseA.Definitions", "$senseB.Definitions"], + $concatArrays: ["$toSense.Definitions", "$fromSense.Definitions"], }, Glosses: { - $concatArrays: ["$senseA.Glosses", "$senseB.Glosses"], + $concatArrays: ["$toSense.Glosses", "$fromSense.Glosses"], }, protectReasons: { $concatArrays: [ - "$senseA.protectReasons", - "$senseB.protectReasons", + "$toSense.protectReasons", + "$fromSense.protectReasons", ], }, SemanticDomains: { $cond: { if: { $gte: [ - { $size: "$semDomGuidsA" }, - { $size: "$semDomGuidsB" }, + { $size: "$toSemDomGuids" }, + { $size: "$fromSemDomGuids" }, ], }, - then: "$senseA.SemanticDomains", - else: "$senseB.SemanticDomains", + then: "$toSense.SemanticDomains", + else: "$fromSense.SemanticDomains", }, }, }, diff --git a/database/util/twoAnalysisSenseCleanup.sh b/database/util/twoAnalysisSenseCleanup.sh index 50b32485af..23ebfae1fc 100644 --- a/database/util/twoAnalysisSenseCleanup.sh +++ b/database/util/twoAnalysisSenseCleanup.sh @@ -110,16 +110,16 @@ fi match1="{\$match:{projectId:\"$PROJ\",senses:{\$size:2},\"senses.0.Glosses\":{\$size:1},\"senses.1.Glosses\":{\$size:1},\"senses.Glosses.Language\":{\$all:[\"$LANGA\",\"$LANGB\"],},},}," addFields1="{\$addFields:{originalDocument:\"\$\$ROOT\",},}," -fieldA="senseA:{\$arrayElemAt:[{\$filter:{input:\"\$senses\",as:\"sense\",cond:{\$eq:[{\$arrayElemAt:[\"\$\$sense.Glosses.Language\",0],},\"$LANGA\",],},},},0,],}," 
-fieldB="senseB:{\$arrayElemAt:[{\$filter:{input:\"\$senses\",as:\"sense\",cond:{\$eq:[{\$arrayElemAt:[\"\$\$sense.Glosses.Language\",0],},\"$LANGB\",],},},},0,],}," -addFields2="{\$addFields:{$fieldA$fieldB},}," -addFields3="{\$addFields:{semDomGuidsA:\"\$senseA.SemanticDomains.guid\",semDomGuidsB:\"\$senseB.SemanticDomains.guid\",},}," -match2="{\$match:{\$expr:{\$or:[{\$setIsSubset:[\"\$semDomGuidsA\",\"\$semDomGuidsB\"],},{\$setIsSubset:[\"\$semDomGuidsB\",\"\$semDomGuidsA\"],},],},},}," -fieldD="Definitions:{\$concatArrays:[\"\$senseA.Definitions\",\"\$senseB.Definitions\"],}," -fieldG="Glosses:{\$concatArrays:[\"\$senseA.Glosses\",\"\$senseB.Glosses\"],}," -fieldP="protectReasons:{\$concatArrays:[\"\$senseA.protectReasons\",\"\$senseB.protectReasons\",],}," -fieldS="SemanticDomains:{\$cond:{if:{\$gte:[{\$size:\"\$semDomGuidsA\"},{\$size:\"\$semDomGuidsB\"},],},then:\"\$senseA.SemanticDomains\",else:\"\$senseB.SemanticDomains\",},}," -addFields4="{\$addFields:{\"originalDocument.senses\":{\$mergeObjects:[\"\$senseA\",{$fieldD$fieldG$fieldP$fieldS},],},},}," +fieldTo="toSense:{\$arrayElemAt:[{\$filter:{input:\"\$senses\",as:\"sense\",cond:{\$eq:[{\$arrayElemAt:[\"\$\$sense.Glosses.Language\",0],},\"$LANGA\",],},},},0,],}," +fieldFrom="fromSense:{\$arrayElemAt:[{\$filter:{input:\"\$senses\",as:\"sense\",cond:{\$eq:[{\$arrayElemAt:[\"\$\$sense.Glosses.Language\",0],},\"$LANGB\",],},},},0,],}," +addFields2="{\$addFields:{$fieldTo$fieldFrom},}," +addFields3="{\$addFields:{toSemDomGuids:\"\$toSense.SemanticDomains.guid\",fromSemDomGuids:\"\$fromSense.SemanticDomains.guid\",},}," +match2="{\$match:{\$expr:{\$or:[{\$setIsSubset:[\"\$toSemDomGuids\",\"\$fromSemDomGuids\"],},{\$setIsSubset:[\"\$fromSemDomGuids\",\"\$toSemDomGuids\"],},],},},}," +fieldD="Definitions:{\$concatArrays:[\"\$toSense.Definitions\",\"\$fromSense.Definitions\"],}," +fieldG="Glosses:{\$concatArrays:[\"\$toSense.Glosses\",\"\$fromSense.Glosses\"],}," 
+fieldP="protectReasons:{\$concatArrays:[\"\$toSense.protectReasons\",\"\$fromSense.protectReasons\",],}," +fieldS="SemanticDomains:{\$cond:{if:{\$gte:[{\$size:\"\$toSemDomGuids\"},{\$size:\"\$fromSemDomGuids\"},],},then:\"\$toSense.SemanticDomains\",else:\"\$fromSense.SemanticDomains\",},}," +addFields4="{\$addFields:{\"originalDocument.senses\":{\$mergeObjects:[\"\$toSense\",{$fieldD$fieldG$fieldP$fieldS},],},},}," replaceRoot="{\$replaceRoot:{newRoot:\"\$originalDocument\",},}," pipeline=$match1$addFields1$addFields2$addFields3$match2$addFields4$replaceRoot From 6fb6888ba46f2253416bcc095ff73a441a94e3be Mon Sep 17 00:00:00 2001 From: Danny Rorabaugh Date: Wed, 26 Jun 2024 09:12:04 -0400 Subject: [PATCH 10/10] Incorporate review --- database/util/twoAnalysisSenseCleanup.sh | 26 +++++++++++------------- maintenance/scripts/combine-clean-aws.sh | 4 ++-- scripts/add_user_guide.sh | 4 ++-- scripts/fetch_wordlist.sh | 4 ++-- 4 files changed, 18 insertions(+), 20 deletions(-) mode change 100644 => 100755 database/util/twoAnalysisSenseCleanup.sh diff --git a/database/util/twoAnalysisSenseCleanup.sh b/database/util/twoAnalysisSenseCleanup.sh old mode 100644 new mode 100755 index 23ebfae1fc..af06a52710 --- a/database/util/twoAnalysisSenseCleanup.sh +++ b/database/util/twoAnalysisSenseCleanup.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash ############################################################### # Script to merge senses from two different analysis languages @@ -6,8 +6,9 @@ set -e -usage() { +usage () { cat <