diff --git a/database/util/twoAnalysisSenseCleanup.js b/database/util/twoAnalysisSenseCleanup.js
new file mode 100644
index 0000000000..30b196790c
--- /dev/null
+++ b/database/util/twoAnalysisSenseCleanup.js
@@ -0,0 +1,147 @@
+[
+  {
+    // Match documents in the specified project that have 2 senses, one with a lone "en"-language gloss and the other with a lone "tpi"-language gloss
+    $match: {
+      projectId: "654a790a79341a70ce2e6627",
+      senses: { $size: 2 },
+      "senses.0.Glosses": { $size: 1 },
+      "senses.1.Glosses": { $size: 1 },
+      "senses.Glosses.Language": {
+        $all: ["en", "tpi"],
+      },
+    },
+  },
+  {
+    // Preserve the original document for later stages to use
+    $addFields: {
+      originalDocument: "$$ROOT",
+    },
+  },
+  {
+    // Identify and extract two senses ("to" and "from") for merging based on gloss languages
+    $addFields: {
+      toSense: {
+        $arrayElemAt: [
+          {
+            $filter: {
+              input: "$senses",
+              as: "sense",
+              cond: {
+                $eq: [
+                  {
+                    $arrayElemAt: ["$$sense.Glosses.Language", 0],
+                  },
+                  "en",
+                ],
+              },
+            },
+          },
+          0,
+        ],
+      },
+      fromSense: {
+        $arrayElemAt: [
+          {
+            $filter: {
+              input: "$senses",
+              as: "sense",
+              cond: {
+                $eq: [
+                  {
+                    $arrayElemAt: ["$$sense.Glosses.Language", 0],
+                  },
+                  "tpi",
+                ],
+              },
+            },
+          },
+          0,
+        ],
+      },
+    },
+  },
+  {
+    // Add fields to extract semantic domain GUIDs from both senses
+    // Default to an empty list when a sense has no SemanticDomains,
+    // since $setIsSubset and $size below fail on a non-array operand
+    $addFields: {
+      toSemDomGuids: { $ifNull: ["$toSense.SemanticDomains.guid", []] },
+      fromSemDomGuids: { $ifNull: ["$fromSense.SemanticDomains.guid", []] },
+    },
+  },
+  {
+    // Match documents where the semantic domains of one sense are a subset of the other
+    $match: {
+      $expr: {
+        $or: [
+          {
+            $setIsSubset: ["$toSemDomGuids", "$fromSemDomGuids"],
+          },
+          {
+            $setIsSubset: ["$fromSemDomGuids", "$toSemDomGuids"],
+          },
+        ],
+      },
+    },
+  },
+  {
+    // Update the original document's senses with a merge of the two senses
+    // Note: The part of speech / grammatical category of the "from" sense is lost
+    // Note: The merged sense is wrapped in a one-element array so that "senses" stays an array
+    // Note: $concatArrays yields null if any operand is null or missing, hence the $ifNull defaults
+    $addFields: {
+      "originalDocument.senses": [
+        {
+          $mergeObjects: [
+            "$toSense",
+            {
+              Definitions: {
+                $concatArrays: [
+                  { $ifNull: ["$toSense.Definitions", []] },
+                  { $ifNull: ["$fromSense.Definitions", []] },
+                ],
+              },
+              Glosses: {
+                $concatArrays: ["$toSense.Glosses", "$fromSense.Glosses"],
+              },
+              protectReasons: {
+                $concatArrays: [
+                  { $ifNull: ["$toSense.protectReasons", []] },
+                  { $ifNull: ["$fromSense.protectReasons", []] },
+                ],
+              },
+              SemanticDomains: {
+                $cond: {
+                  if: {
+                    $gte: [
+                      { $size: "$toSemDomGuids" },
+                      { $size: "$fromSemDomGuids" },
+                    ],
+                  },
+                  then: "$toSense.SemanticDomains",
+                  else: "$fromSense.SemanticDomains",
+                },
+              },
+            },
+          ],
+        },
+      ],
+    },
+  },
+  {
+    // Replace the root of the document with the updated original document
+    $replaceRoot: {
+      newRoot: "$originalDocument",
+    },
+  },
+  {
+    // Merge the updated document back into the 'words' collection, updating where matched and discarding unmatched
+    // Note: Need to run this again into the "FrontierCollection".
+    $merge: {
+      into: "WordsCollection",
+      on: "_id",
+      whenMatched: "merge",
+      whenNotMatched: "discard",
+    },
+  },
+];
diff --git a/database/util/twoAnalysisSenseCleanup.sh b/database/util/twoAnalysisSenseCleanup.sh
new file mode 100755
index 0000000000..af06a52710
--- /dev/null
+++ b/database/util/twoAnalysisSenseCleanup.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+
+###############################################################
+# Script to merge senses from two different analysis languages
+###############################################################
+
+set -e
+
+usage () {
+  cat <