From 211cb6f769d100ffe5e35ac5323845073116d43b Mon Sep 17 00:00:00 2001 From: berndmoos Date: Thu, 21 Nov 2024 18:48:09 +0100 Subject: [PATCH] #501 --- .../common/corpusbuild/EXBBuilder.java | 55 ++++++++++++++++++- .../common/corpusbuild/TestEXBBuilder.java | 8 ++- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/org/exmaralda/common/corpusbuild/EXBBuilder.java b/src/org/exmaralda/common/corpusbuild/EXBBuilder.java index df40ea08..3757f8b2 100644 --- a/src/org/exmaralda/common/corpusbuild/EXBBuilder.java +++ b/src/org/exmaralda/common/corpusbuild/EXBBuilder.java @@ -11,8 +11,11 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.Vector; import java.util.stream.Stream; import org.exmaralda.common.jdomutilities.IOUtilities; import org.exmaralda.partitureditor.jexmaralda.BasicTranscription; @@ -24,6 +27,7 @@ import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; +import org.jdom.Namespace; import org.jdom.xpath.XPath; import org.xml.sax.SAXException; @@ -38,6 +42,8 @@ public class EXBBuilder { String uniqueSpeakerDistinction = "descendant::abbreviation"; String segmentation = "default"; + Set deleteMetaKeys = new HashSet<>(); + public EXBBuilder(String corpusName, File topDirectory, String uniqueSpeakerDistinction, String segmentation){ this.corpusName = corpusName; @@ -46,6 +52,10 @@ public EXBBuilder(String corpusName, File topDirectory, String uniqueSpeakerDist this.segmentation = segmentation; } + public void setDeleteMetaKeys(Set deleteMetaKeys){ + this.deleteMetaKeys = deleteMetaKeys; + } + public void build() throws IOException, SAXException, JexmaraldaException, JDOMException{ List exbFiles = collectEXBFiles(); segmentEXBFiles(exbFiles); @@ -80,19 +90,26 @@ private void constructComa(List exbFiles) throws SAXException, JexmaraldaE comaDocument.getRootElement().setAttribute("uniqueSpeakerDistinction", "//speaker/" + this.uniqueSpeakerDistinction); comaDocument.getRootElement().setAttribute("Name", corpusName); comaDocument.getRootElement().setAttribute("Id", corpusName); + comaDocument.getRootElement().setAttribute("noNamespaceSchemaLocation", "http://www.exmaralda.org/xml/comacorpus.xsd", Namespace.getNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")); + Namespace xsiNamespace = Namespace.getNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance"); + comaDocument.getRootElement().addNamespaceDeclaration(xsiNamespace); + + Element corpusDataElement = new Element("CorpusData"); + comaDocument.getRootElement().addContent(corpusDataElement); for (File exbFile : exbFiles){ BasicTranscription exb = new BasicTranscription(exbFile.getAbsolutePath()); Element communicationElement = new Element("Communication") .setAttribute("Name", exb.getHead().getMetaInformation().getTranscriptionName()) .setAttribute("Id", exb.getHead().getMetaInformation().getTranscriptionName()); - comaDocument.getRootElement().addContent(communicationElement); + corpusDataElement.addContent(communicationElement); // Metadata Element communicationDescriptionElement = new Element("Description"); communicationElement.addContent(communicationDescriptionElement); UDInformationHashtable udMetaInformation = exb.getHead().getMetaInformation().getUDMetaInformation(); for (String attribute : udMetaInformation.getAllAttributes()){ + if (deleteMetaKeys.contains(attribute)) continue; String value = udMetaInformation.getValueOfAttribute(attribute); Element keyElement = new Element("Key") .setAttribute("Name", attribute) @@ -162,12 +179,45 @@ private void constructComa(List exbFiles) throws SAXException, JexmaraldaE transcriptionElement2.addContent(transcriptionDescriptionElement2); transcriptionDescriptionElement2.addContent(new Element("Key").setAttribute("Name", "segmented").setText("true")); + // Recordings + /* + + AnneWill + 360003 + + + digital + + AnneWill/AnneWill.mp4 + + + */ + Vector referencedFiles = exb.getHead().getMetaInformation().getReferencedFiles(); + int count = 0; + for (String referencedFile : referencedFiles){ + count++; + Element recordingElement = new Element("Recording") + .setAttribute("Id", exb.getHead().getMetaInformation().getTranscriptionName()+ "_REC_" + Integer.toString(count)); + recordingElement.addContent(new Element("Name").setText(new File(referencedFile).getName())); + communicationElement.addContent(recordingElement); + Element mediaElement = new Element("Media") + .setAttribute("Id", exb.getHead().getMetaInformation().getTranscriptionName()+ "_MED_" + Integer.toString(count)); + recordingElement.addContent(mediaElement); + Element mediaDescriptionElement = new Element("Description"); + mediaElement.addContent(mediaDescriptionElement); + mediaDescriptionElement.addContent(new Element("Key").setAttribute("Name", "type").setText("digital")); + + relativePath = topDirectory.toPath().relativize(new File(referencedFile).toPath()); + mediaElement.addContent(new Element("NSLink").setText(relativePath.toString().replace(File.separatorChar, '/'))); + + + } } for (String id : speakerElements.keySet()){ - comaDocument.getRootElement().addContent(speakerElements.get(id)); + corpusDataElement.addContent(speakerElements.get(id)); } File comaOutFile = new File(topDirectory, corpusName + ".coma"); @@ -203,6 +253,7 @@ private Element makeSpeakerElement(String sID, Speaker s) { speakerElement.addContent(speakerDescriptionElement); UDInformationHashtable udMetaInformation = s.getUDSpeakerInformation(); for (String attribute : udMetaInformation.getAllAttributes()){ + if (deleteMetaKeys.contains(attribute)) continue; String value = udMetaInformation.getValueOfAttribute(attribute); Element keyElement = new Element("Key") .setAttribute("Name", attribute) diff --git a/src/org/exmaralda/common/corpusbuild/TestEXBBuilder.java b/src/org/exmaralda/common/corpusbuild/TestEXBBuilder.java index 104a89ae..95c5716c 100644 --- a/src/org/exmaralda/common/corpusbuild/TestEXBBuilder.java +++ b/src/org/exmaralda/common/corpusbuild/TestEXBBuilder.java @@ -6,6 +6,8 @@ import java.io.File; import java.io.IOException; +import java.util.HashSet; +import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException; @@ -26,8 +28,12 @@ public static void main(String[] args) { } private void doit() { - EXBBuilder exbBuilder = new EXBBuilder("MANV", new File("C:\\UDE\\PILOT_MANV\\ZUMULT-CORPUS\\MANV"), "descendant::ud-information[@attribute-name='uniqueID']", "default"); try { + EXBBuilder exbBuilder = new EXBBuilder("MANV", new File("C:\\UDE\\PILOT_MANV\\ZUMULT-CORPUS\\MANV"), "descendant::ud-information[@attribute-name='uniqueID']", "default"); + Set dmk = new HashSet<>(); + dmk.add("ELAN-Media-File"); + dmk.add("ELAN-Mime-Type"); + exbBuilder.setDeleteMetaKeys(dmk); exbBuilder.build(); } catch (IOException | SAXException | JexmaraldaException | JDOMException ex) { Logger.getLogger(TestEXBBuilder.class.getName()).log(Level.SEVERE, null, ex);