Skip to content

Commit

Permalink
Merge pull request #22 from cceh/duplicate-check-input
Browse files Browse the repository at this point in the history
Duplicate check input
  • Loading branch information
creativeDev6 authored Apr 6, 2022
2 parents 25cf260 + fe53f8c commit 352d149
Show file tree
Hide file tree
Showing 6 changed files with 225 additions and 7 deletions.
49 changes: 43 additions & 6 deletions staging/conversion.xpl
Original file line number Diff line number Diff line change
Expand Up @@ -77,26 +77,63 @@
</p:otherwise>
</p:choose>
<p:sink/>


<p:documentation>
  <h2>Duplicate check in input</h2>
  <p>Looks for identical lemmata within the same category in the input files located directly in `staging/input` (subfolders are not searched). The ingest pipeline does not handle such cases; if the duplicates really belong in the dataset, they have to be added manually.</p>
</p:documentation>
<!-- The checking stylesheet is passed both as stylesheet and as source document;
     the folder to inspect is handed over through the 'input-folder' parameter. -->
<p:xslt name="duplicate-check-input">
  <p:with-param name="input-folder" select="'staging/input'"/>
  <p:input port="stylesheet">
    <p:document href="library/duplicates/1-check-input-data-for-duplicates.xsl"/>
  </p:input>
  <p:input port="source">
    <p:document href="library/duplicates/1-check-input-data-for-duplicates.xsl"/>
  </p:input>
</p:xslt>
<p:choose>
  <!-- At least one duplicate was reported for 'input':
       render the Markdown report and abort the pipeline with it. -->
  <p:when test="*:input/*:orth">
    <p:xslt name="duplicates-input">
      <p:input port="stylesheet">
        <p:document href="library/duplicates/1-check-input-data-for-duplicates-md.xsl"/>
      </p:input>
    </p:xslt>
    <p:error code="duplicates-input" name="error-duplicates-input">
      <p:input port="source">
        <p:pipe step="duplicates-input" port="result"/>
      </p:input>
    </p:error>
  </p:when>
  <!-- Nothing was reported for 'input': pass the result through unchanged. -->
  <p:otherwise>
    <p:documentation>
      <p>The identity step below is only a placeholder: it keeps this branch from raising an error when no duplicates are found.</p>
    </p:documentation>
    <pwl:identity/>
  </p:otherwise>
</p:choose>
<!-- The outcome of the check is not needed by the steps that follow. -->
<p:sink/>

<p:documentation>
<h2>Duplicate check</h2>
<h2>Duplicate check in current</h2>
<p>This step checks for identical lemmata that occur within the same category. Such cases are not handled by the ingest pipeline and if they indeed should be part of the dataset, they need to be added manually.</p>
</p:documentation>
<p:xslt name="duplicate-check">
<p:xslt name="duplicate-check-current">
<p:with-param name="comparisonBase" select="$comparisonBase"/>
<p:input port="source">
<p:document href="library/transformation/0-check-current-data-for-duplicates.xsl"/>
<p:document href="library/duplicates/2-check-current-data-for-duplicates.xsl"/>
</p:input>
<p:input port="stylesheet">
<p:document href="library/transformation/0-check-current-data-for-duplicates.xsl"/>
<p:document href="library/duplicates/2-check-current-data-for-duplicates.xsl"/>
</p:input>
</p:xslt>
<p:choose>
<!-- case: duplicates in 'current' -->
<p:when test="*:current/*:orth">
<p:xslt name="duplicates">
<p:input port="stylesheet">
<p:document href="library/transformation/0-check-current-data-for-duplicates-md.xsl"/>
<p:document href="library/duplicates/2-check-current-data-for-duplicates-md.xsl"/>
</p:input>
</p:xslt>
<p:error name="error-duplicates" code="duplicates">
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Renders the result of the input duplicate check as a Markdown report.

  Input:  a document whose root element has the local name 'input' and contains
          one 'orth' element per duplicate lemma. The lemma is the element text;
          the attributes xml:lang, xml:id, filemaker_id, category and references
          supply the remaining table columns.
  Output: a single 'md-wrapper' element whose text content is the Markdown report.
          A warning is also written through xsl:message.

  The content of every xsl:text below is emitted verbatim, so those lines must
  stay unindented.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" version="2.0">
  <xsl:template match="/">
    <xsl:message>CAUTION: duplicates present in input.</xsl:message>
    <md-wrapper>
      <!-- Report heading followed by the current date (YYYY-MM-DD). -->
      <xsl:text>
# WL Import Report </xsl:text>
      <xsl:value-of select="format-date(current-date(), '[Y0001]-[M01]-[D01]')"/>
      <!-- Introductory text and the header rows of the table. -->
      <xsl:text>

**The directory 'staging/input' contains duplicates!**

Please make sure that each lemma is unique within its category.

## Duplicate lemmata


| Lemma | language | WL ID | FileMaker RecordId | category | references |
| -----------|-----------|-------------|-------------|-------------|-------------|
</xsl:text>
      <!-- One table row per duplicate, ordered by lemma text. -->
      <xsl:for-each select="*:input/*:orth">
        <xsl:sort select="text()"/>
        <xsl:text>| </xsl:text>
        <xsl:value-of select="text()"/>
        <xsl:text>| </xsl:text>
        <xsl:value-of select="@xml:lang"/>
        <xsl:text>| </xsl:text>
        <xsl:value-of select="@xml:id"/>
        <xsl:text>| </xsl:text>
        <xsl:value-of select="@filemaker_id"/>
        <xsl:text>| </xsl:text>
        <xsl:value-of select="@category"/>
        <xsl:text>| </xsl:text>
        <xsl:value-of select="@references"/>
        <!-- Close the row and end the line. -->
        <xsl:text>|&#10;</xsl:text>
      </xsl:for-each>
      <!-- The leading line feed produces a blank line after the last table row.
           Without it, Markdown renderers treat the closing sentence as one more
           table row instead of a separate paragraph. -->
      <xsl:text>&#10;Please remove these duplicates in the input files and relaunch the conversion.</xsl:text>
    </md-wrapper>
  </xsl:template>
</xsl:stylesheet>
127 changes: 127 additions & 0 deletions staging/library/duplicates/1-check-input-data-for-duplicates.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Duplicate check for the input data.

  Collects every ROW element from the XML files found directly in the folder
  named by the 'input-folder' parameter (no recursion into subfolders), derives
  language and category for each row, and reports lemmata that occur more than
  once within the same category.

  The source document of the transformation is not read; all data comes from
  collection().

  Result: an 'input' element (TEI namespace) holding a heading and
    * one 'orth' element per member of a duplicate group, followed by a
      recommendation paragraph, or
    * a single paragraph stating that all lemmata are unique.
  The calling pipeline tests for the presence of 'orth' children.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns="http://www.tei-c.org/ns/1.0"
                xmlns:xs="http://www.w3.org/2001/XMLSchema"
                exclude-result-prefixes="xs" version="2.0">

  <xsl:strip-space elements="*"/>
  <xsl:output indent="yes"/>

  <!-- Folder to inspect. It is appended to '../../../' and resolved by
       collection(), so it is given relative to three directory levels above
       this stylesheet. The default is the value the conversion pipeline passes,
       so the stylesheet can also be run on its own. -->
  <xsl:param name="input-folder" select="'staging/input'"/>

  <!-- languages -->
  <xsl:variable name="latin-short" select="'la'"/>
  <xsl:variable name="greece-short" select="'grc'"/>

  <!-- categories -->
  <xsl:variable name="monthDays" select="'monthsDays'"/>
  <xsl:variable name="persons" select="'persons'"/>
  <xsl:variable name="geography" select="'geography'"/>
  <xsl:variable name="religion" select="'religion'"/>
  <xsl:variable name="general" select="'general'"/>
  <xsl:variable name="ohne_Kategorie" select="'ohne Kategorie'"/>

  <!-- Positions of the COL children inside a ROW. The 'anmerkung' and 'stellen'
       positions are not read by this stylesheet; they are kept so the column
       layout stays documented. -->
  <xsl:variable name="lemma-position" select="number(1)"/>
  <xsl:variable name="anmerkung-position" select="number(2)"/>
  <xsl:variable name="stellen-position" select="number(3)"/>
  <xsl:variable name="sortierhilfe-position" select="number(4)"/>
  <xsl:variable name="wl_id-position" select="number(5)"/>
  <xsl:variable name="wl_verweise-position" select="number(6)"/>

  <!-- Every input row turned into an 'orth' element: the lemma becomes the text,
       everything else an attribute. -->
  <xsl:variable name="input-lemmata">
    <input>
      <xsl:for-each select="collection(concat('../../../',$input-folder,'/?recurse=no;select=*.xml'))//*:ROW">
        <xsl:variable name="lemma" select="*:COL[$lemma-position]/*:DATA[text()]"/>
        <!-- Latin if the DATABASE name contains 'Lateinisch', otherwise Greek. -->
        <xsl:variable name="language">
          <xsl:choose>
            <xsl:when test="parent::*:RESULTSET/preceding-sibling::*:DATABASE[contains(@NAME,'Lateinisch')]">
              <xsl:value-of select="$latin-short"/>
            </xsl:when>
            <xsl:otherwise>
              <xsl:value-of select="$greece-short"/>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:variable>
        <xsl:variable name="wl_id" select="*:COL[$wl_id-position]/*:DATA[text()]"/>
        <xsl:variable name="filemaker_id" select="@RECORDID"/>
        <!-- Each reference is wrapped in backticks and followed by a space. -->
        <xsl:variable name="references">
          <xsl:for-each select="*:COL[$wl_verweise-position]/*:DATA[text()]">
            <xsl:text>`</xsl:text>
            <xsl:value-of select="."/>
            <xsl:text>` </xsl:text>
          </xsl:for-each>
        </xsl:variable>
        <xsl:variable name="category">
          <xsl:choose>
            <!-- sortierhilfe is only used for Latin because there is only one
                 document for all 5 categories (unlike Greek) -->
            <xsl:when test="$language = $latin-short">
              <xsl:choose>
                <xsl:when test="*:COL[$sortierhilfe-position]/*:DATA='a'"><xsl:value-of select="$monthDays"/></xsl:when>
                <xsl:when test="*:COL[$sortierhilfe-position]/*:DATA='b'"><xsl:value-of select="$persons"/></xsl:when>
                <xsl:when test="*:COL[$sortierhilfe-position]/*:DATA='c'"><xsl:value-of select="$geography"/></xsl:when>
                <xsl:when test="*:COL[$sortierhilfe-position]/*:DATA='d'"><xsl:value-of select="$religion"/></xsl:when>
                <xsl:when test="*:COL[$sortierhilfe-position]/*:DATA='e'"><xsl:value-of select="$general"/></xsl:when>
                <!-- uncategorized/spurious -->
                <xsl:otherwise><xsl:value-of select="$ohne_Kategorie"/></xsl:otherwise>
              </xsl:choose>
            </xsl:when>
            <!-- Greek: the category is taken from the DATABASE name. -->
            <xsl:otherwise>
              <xsl:if test="parent::*:RESULTSET/preceding-sibling::*:DATABASE[contains(@NAME,'Monate')]"><xsl:value-of select="$monthDays"/></xsl:if>
              <xsl:if test="parent::*:RESULTSET/preceding-sibling::*:DATABASE[contains(@NAME,'Namen')]"><xsl:value-of select="$persons"/></xsl:if>
              <xsl:if test="parent::*:RESULTSET/preceding-sibling::*:DATABASE[contains(@NAME,'Geographie')]"><xsl:value-of select="$geography"/></xsl:if>
              <xsl:if test="parent::*:RESULTSET/preceding-sibling::*:DATABASE[contains(@NAME,'Religion')]"><xsl:value-of select="$religion"/></xsl:if>
              <xsl:if test="parent::*:RESULTSET/preceding-sibling::*:DATABASE[contains(@NAME,'allgemein')]"><xsl:value-of select="$general"/></xsl:if>
            </xsl:otherwise>
          </xsl:choose>
        </xsl:variable>
        <orth>
          <xsl:attribute name="xml:lang" select="$language"/>
          <xsl:attribute name="xml:id" select="$wl_id"/>
          <xsl:attribute name="filemaker_id" select="$filemaker_id"/>
          <xsl:attribute name="category" select="$category"/>
          <xsl:attribute name="references" select="$references"/>
          <xsl:value-of select="$lemma"/>
        </orth>
      </xsl:for-each>
    </input>
  </xsl:variable>

  <xsl:template match="/">
    <xsl:variable name="duplicates">
      <!-- group all lemma elements by category -->
      <xsl:for-each-group select="$input-lemmata//*:orth" group-by="@category">
        <!-- within a category, group by lemma text; every group with two or
             more members is a set of duplicates and is reported in full -->
        <xsl:for-each-group select="current-group()" group-by="text()">
          <xsl:if test="count(current-group()) gt 1">
            <xsl:sequence select="current-group()"/>
          </xsl:if>
        </xsl:for-each-group>
      </xsl:for-each-group>
    </xsl:variable>
    <input>
      <h2>Mehrfach vorkommende Lemmata:</h2>
      <xsl:choose>
        <xsl:when test="$duplicates/*">
          <xsl:copy-of select="$duplicates"/>
          <p>Empfohlenes Vorgehen: Duplikate beheben (zumindest für die Transformation); dann diese Datei entfernen (ggf. nach erneutem Testen).</p>
        </xsl:when>
        <xsl:otherwise>
          <p>Alle Lemmata sind innerhalb der Kategorien einmalig. Diese Datei kann entfernt werden.</p>
        </xsl:otherwise>
      </xsl:choose>
    </input>
  </xsl:template>

</xsl:stylesheet>
16 changes: 15 additions & 1 deletion staging/library/pwl-library.xpl
Original file line number Diff line number Diff line change
Expand Up @@ -633,5 +633,19 @@
</p:group>

</p:declare-step>


<p:declare-step name="identity" type="pwl:identity">

  <p:documentation>
    <h2>Placeholder</h2>
    <p>A do-nothing step that hands its input on unchanged. Use it where a step is required but nothing should happen, e.g. in the otherwise branch of a choose statement whose real work is done in the when branch, so that selecting this branch does not raise an error.</p>
  </p:documentation>

  <!-- Both ports accept a sequence of documents. -->
  <p:input sequence="true" port="source"/>
  <p:output sequence="true" port="result"/>

  <p:identity/>

</p:declare-step>

</p:library>

0 comments on commit 352d149

Please sign in to comment.