Merge pull request #22 from cceh/duplicate-check-input

Duplicate check input
cceh · Apr 6, 2022 · 352d149 · 352d149
2 parents 25cf260 + fe53f8c
commit 352d149
Show file tree

Hide file tree

Showing 6 changed files with 225 additions and 7 deletions.
diff --git a/staging/conversion.xpl b/staging/conversion.xpl
@@ -77,26 +77,63 @@
         </p:otherwise>
     </p:choose>
     <p:sink/>
-
+
+    <p:documentation>
+        <h2>Duplicate check in input</h2>
+        <p>Looks for identical lemmata within the same category among the input files in `staging/input` (subfolders are not searched). The ingest pipeline does not handle such cases; if they really belong in the dataset, they have to be added manually.</p>
+    </p:documentation>
+    <p:xslt name="duplicate-check-input">
+        <!-- The stylesheet reads its data through collection(); the stylesheet file itself is bound as source document. -->
+        <p:input port="stylesheet">
+            <p:document href="library/duplicates/1-check-input-data-for-duplicates.xsl"/>
+        </p:input>
+        <p:input port="source">
+            <p:document href="library/duplicates/1-check-input-data-for-duplicates.xsl"/>
+        </p:input>
+        <p:with-param name="input-folder" select="'staging/input'"/>
+    </p:xslt>
+    <p:choose>
+        <!-- duplicates reported for 'input': render them as Markdown and raise an error -->
+        <p:when test="*:input/*:orth">
+            <p:xslt name="duplicates-input">
+                <p:input port="stylesheet">
+                    <p:document href="library/duplicates/1-check-input-data-for-duplicates-md.xsl"/>
+                </p:input>
+            </p:xslt>
+            <p:error code="duplicates-input" name="error-duplicates-input">
+                <p:input port="source">
+                    <p:pipe step="duplicates-input" port="result"/>
+                </p:input>
+            </p:error>
+        </p:when>
+        <!-- nothing reported for 'input' -->
+        <p:otherwise>
+            <p:documentation>
+                <p>The following step is a placeholder that prevents an error when no duplicates are found.</p>
+            </p:documentation>
+            <pwl:identity/>
+        </p:otherwise>
+    </p:choose>
+    <p:sink/>
+
     <p:documentation>
-        <h2>Duplicate check</h2>
+        <h2>Duplicate check in current</h2>
         <p>This step checks for identical lemmata that occur within the same category. Such cases are not handled by the ingest pipeline and if they indeed should be part of the dataset, they need to be added manually.</p>
     </p:documentation>
-    <p:xslt name="duplicate-check">
+    <p:xslt name="duplicate-check-current">
         <p:with-param name="comparisonBase" select="$comparisonBase"/>
         <p:input port="source">
-            <p:document href="library/transformation/0-check-current-data-for-duplicates.xsl"/>
+            <p:document href="library/duplicates/2-check-current-data-for-duplicates.xsl"/>
         </p:input>
         <p:input port="stylesheet">
-            <p:document href="library/transformation/0-check-current-data-for-duplicates.xsl"/>
+            <p:document href="library/duplicates/2-check-current-data-for-duplicates.xsl"/>
         </p:input>
     </p:xslt>
     <p:choose>
         <!-- case: duplicates in 'current' -->
         <p:when test="*:current/*:orth">
             <p:xslt name="duplicates">
                 <p:input port="stylesheet">
-                    <p:document href="library/transformation/0-check-current-data-for-duplicates-md.xsl"/>
+                    <p:document href="library/duplicates/2-check-current-data-for-duplicates-md.xsl"/>
                 </p:input>
             </p:xslt>
             <p:error name="error-duplicates" code="duplicates">

diff --git a/staging/library/duplicates/1-check-input-data-for-duplicates-md.xsl b/staging/library/duplicates/1-check-input-data-for-duplicates-md.xsl
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" version="2.0">
+    <!-- Renders the duplicate report (input/orth elements of the source document) as Markdown
+         text inside an md-wrapper element: a dated heading, a notice, and one table row per
+         duplicate, sorted by lemma. Also writes a CAUTION message via xsl:message. -->
+    <xsl:template match="/">
+        <xsl:message>CAUTION: duplicates present in input.</xsl:message>
+        <md-wrapper>
+            <xsl:text>
+# WL Import Report </xsl:text><xsl:value-of select="format-date(current-date(), '[Y0001]-[M01]-[D01]')"/><xsl:text>
+
+**The directory 'staging/input' contains duplicates!**
+
+Please make sure that each lemma is unique within its category.
+
+## Duplicate lemmata
+
+
+| Lemma   | language     | WL ID | FileMaker RecordId | category | references |
+| -----------|-----------|-------------|-------------|-------------|-------------|
+</xsl:text>
+            <!-- one table row per duplicate orth; attributes map to the columns declared above -->
+            <xsl:for-each select="*:input/*:orth">
+                <xsl:sort select="text()"/>
+                <xsl:text>| </xsl:text>
+                <xsl:value-of select="text()"/>
+                <xsl:text>| </xsl:text>
+                <xsl:value-of select="@xml:lang"/>
+                <xsl:text>| </xsl:text>
+                <xsl:value-of select="@xml:id"/>
+                <xsl:text>| </xsl:text>
+                <xsl:value-of select="@filemaker_id"/>
+                <xsl:text>| </xsl:text>
+                <xsl:value-of select="@category"/>
+                <xsl:text>| </xsl:text>
+                <xsl:value-of select="@references"/>
+                <xsl:text>|
+</xsl:text>
+
+            </xsl:for-each>
+            <!-- The leading newline yields an empty line after the last row. Without it a
+                 Markdown renderer treats the closing sentence as one more table row. -->
+            <xsl:text>
+Please remove these duplicates in the input files and relaunch the conversion.</xsl:text>
+        </md-wrapper>
+    </xsl:template>
+</xsl:stylesheet>
diff --git a/staging/library/duplicates/1-check-input-data-for-duplicates.xsl b/staging/library/duplicates/1-check-input-data-for-duplicates.xsl
@@ -0,0 +1,127 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+    xmlns="http://www.tei-c.org/ns/1.0"
+    xmlns:xs="http://www.w3.org/2001/XMLSchema"
+    exclude-result-prefixes="xs" version="2.0">
+    <xsl:strip-space elements="*"/>
+    <xsl:output indent="yes"/>
+
+    <!-- 
+        <p:documentation>
+            <h2>Duplicate check</h2>
+            <p>This step checks for identical lemmata that occur within the same category. Such cases are not handled 
+        by the ingest pipeline and if they indeed should be part of the dataset, they need to be added manually.</p>
+        </p:documentation>
+    -->
+
+    <!-- Path of the folder holding the input files. It is appended to '../../../' when the
+         collection URI is built. The default equals the value conversion.xpl passes, so the
+         stylesheet also works when run without parameters. (A second, redundant
+         xsl:strip-space declaration that used to follow here was removed; the one at the top
+         of the stylesheet remains.) -->
+    <xsl:param name="input-folder" select="'staging/input'"/>
+
+    <!-- language codes written to @xml:lang -->
+    <xsl:variable name="latin-short" as="xs:string" select="'la'"/>
+    <xsl:variable name="greece-short" as="xs:string" select="'grc'"/>
+
+    <!-- category labels written to @category -->
+    <xsl:variable name="monthDays" as="xs:string" select="'monthsDays'"/>
+    <xsl:variable name="persons" as="xs:string" select="'persons'"/>
+    <xsl:variable name="geography" as="xs:string" select="'geography'"/>
+    <xsl:variable name="religion" as="xs:string" select="'religion'"/>
+    <xsl:variable name="general" as="xs:string" select="'general'"/>
+    <xsl:variable name="ohne_Kategorie" as="xs:string" select="'ohne Kategorie'"/>
+
+    <!-- positions of the COL children of a ROW, used as positional predicates;
+         anmerkung-position and stellen-position are not referenced in this stylesheet -->
+    <xsl:variable name="lemma-position" as="xs:integer" select="1"/>
+    <xsl:variable name="anmerkung-position" as="xs:integer" select="2"/>
+    <xsl:variable name="stellen-position" as="xs:integer" select="3"/>
+    <xsl:variable name="sortierhilfe-position" as="xs:integer" select="4"/>
+    <xsl:variable name="wl_id-position" as="xs:integer" select="5"/>
+    <xsl:variable name="wl_verweise-position" as="xs:integer" select="6"/>
+
+
+    <!-- Temporary tree with one orth element per ROW found in the *.xml files of the input
+         folder (no recursion into subfolders). Each orth carries the language, WL id,
+         FileMaker record id, category and references as attributes and the lemma as text. -->
+    <xsl:variable name="input-lemmata">
+        <input>
+            <xsl:for-each select="collection(concat('../../../',$input-folder,'/?recurse=no;select=*.xml'))//*:ROW">
+                <!-- DATABASE elements preceding the enclosing RESULTSET; their @NAME is tested
+                     for the language and, for Greek, for the category -->
+                <xsl:variable name="databases" select="parent::*:RESULTSET/preceding-sibling::*:DATABASE"/>
+                <xsl:variable name="sort-key" select="*:COL[$sortierhilfe-position]/*:DATA"/>
+                <xsl:variable name="lemma" select="*:COL[$lemma-position]/*:DATA[text()]"/>
+                <!-- everything that is not marked as Latin is treated as Greek -->
+                <xsl:variable name="language"
+                    select="if ($databases[contains(@NAME,'Lateinisch')]) then $latin-short else $greece-short"/>
+                <xsl:variable name="wl_id" select="*:COL[$wl_id-position]/*:DATA[text()]"/>
+                <!-- every reference wrapped in backticks and followed by a blank -->
+                <xsl:variable name="references"
+                    select="string-join(for $ref in *:COL[$wl_verweise-position]/*:DATA[text()] return concat('`', $ref, '` '), '')"/>
+                <xsl:variable name="category">
+                    <xsl:choose>
+                        <!-- the sort key is only used for Latin because there is only one document for all 5 categories (unlike Greek) -->
+                        <xsl:when test="$language = $latin-short">
+                            <xsl:choose>
+                                <xsl:when test="$sort-key = 'a'"><xsl:value-of select="$monthDays"/></xsl:when>
+                                <xsl:when test="$sort-key = 'b'"><xsl:value-of select="$persons"/></xsl:when>
+                                <xsl:when test="$sort-key = 'c'"><xsl:value-of select="$geography"/></xsl:when>
+                                <xsl:when test="$sort-key = 'd'"><xsl:value-of select="$religion"/></xsl:when>
+                                <xsl:when test="$sort-key = 'e'"><xsl:value-of select="$general"/></xsl:when>
+                                <!-- uncategorized/spurious -->
+                                <xsl:otherwise><xsl:value-of select="$ohne_Kategorie"/></xsl:otherwise>
+                            </xsl:choose>
+                        </xsl:when>
+                        <xsl:otherwise>
+                            <!-- NOTE(review): these tests are independent; a database name matching none
+                                 of them yields an empty category (no 'ohne Kategorie' fallback as for
+                                 Latin). Confirm that this is intended. -->
+                            <xsl:if test="$databases[contains(@NAME,'Monate')]"><xsl:value-of select="$monthDays"/></xsl:if>
+                            <xsl:if test="$databases[contains(@NAME,'Namen')]"><xsl:value-of select="$persons"/></xsl:if>
+                            <xsl:if test="$databases[contains(@NAME,'Geographie')]"><xsl:value-of select="$geography"/></xsl:if>
+                            <xsl:if test="$databases[contains(@NAME,'Religion')]"><xsl:value-of select="$religion"/></xsl:if>
+                            <xsl:if test="$databases[contains(@NAME,'allgemein')]"><xsl:value-of select="$general"/></xsl:if>
+                        </xsl:otherwise>
+                    </xsl:choose>
+                </xsl:variable>
+                <orth filemaker_id="{@RECORDID}" category="{$category}" references="{$references}">
+                    <!-- the xml:* attributes are created by instruction so that the stylesheet
+                         itself never contains a template expression in an xml:id value -->
+                    <xsl:attribute name="xml:lang" select="$language"/>
+                    <xsl:attribute name="xml:id" select="$wl_id"/>
+                    <xsl:value-of select="$lemma"/>
+                </orth>
+            </xsl:for-each>
+        </input>
+    </xsl:variable>
+
+    <!-- Builds the report: the collected orth elements are grouped by category and, within a
+         category, by their text; every member of a group with more than one member is
+         reported as a duplicate. The report is returned as the result of the transformation. -->
+    <xsl:template match="/">
+        <xsl:variable name="duplicates">
+            <xsl:for-each-group select="$input-lemmata//*:orth" group-by="@category">
+                <xsl:for-each-group select="current-group()" group-by="text()">
+                    <!-- identical text within one category, occurring at least twice -->
+                    <xsl:if test="count(current-group()) gt 1">
+                        <xsl:sequence select="current-group()"/>
+                    </xsl:if>
+                </xsl:for-each-group>
+            </xsl:for-each-group>
+        </xsl:variable>
+        <input>
+            <h2>Mehrfach vorkommende Lemmata:</h2>
+            <xsl:choose>
+                <xsl:when test="exists($duplicates/*)">
+                    <xsl:copy-of select="$duplicates"/>
+                    <p>Empfohlenes Vorgehen: Duplikate beheben (zumindest für die Transformation); dann diese Datei entfernen (ggf. nach erneutem Testen).</p>
+                </xsl:when>
+                <xsl:otherwise>
+                    <p>Alle Lemmata sind innerhalb der Kategorien einmalig. Diese Datei kann entfernt werden.</p>
+                </xsl:otherwise>
+            </xsl:choose>
+        </input>
+    </xsl:template>
+
+</xsl:stylesheet>
diff --git a/...-check-current-data-for-duplicates-md.xsl → ...-check-current-data-for-duplicates-md.xsl b/...-check-current-data-for-duplicates-md.xsl → ...-check-current-data-for-duplicates-md.xsl
diff --git a/...n/0-check-current-data-for-duplicates.xsl → ...s/2-check-current-data-for-duplicates.xsl b/...n/0-check-current-data-for-duplicates.xsl → ...s/2-check-current-data-for-duplicates.xsl
diff --git a/staging/library/pwl-library.xpl b/staging/library/pwl-library.xpl
@@ -633,5 +633,19 @@
         </p:group>
 
     </p:declare-step>
-
+
+    <p:declare-step name="identity" type="pwl:identity">
+
+        <p:documentation>
+            <h2>Placeholder</h2>
+            <p>Pass-through step: its only sub-step is p:identity. Use it as a placeholder, e.g. in the otherwise branch of a choose statement whose real work happens only in the when branch, so that no error occurs when the otherwise branch is selected.</p>
+        </p:documentation>
+
+        <p:input port="source" sequence="true"/>
+        <p:output port="result" sequence="true"/>
+
+        <p:identity/>
+
+    </p:declare-step>
+
 </p:library>