From 14c4e9f9b1c6775d6399f9cdf43e51c715c36a6c Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Wed, 28 Apr 2021 16:53:58 +0200 Subject: [PATCH] Remove getRawText methods --- Classes/Common/Document.php | 124 ++++----------------------- Classes/Common/FulltextInterface.php | 1 + Classes/Common/IiifManifest.php | 13 +-- Classes/Common/MetsDocument.php | 19 ---- 4 files changed, 26 insertions(+), 131 deletions(-) diff --git a/Classes/Common/Document.php b/Classes/Common/Document.php index e20b7d0f7d..8c3c4fbe6e 100644 --- a/Classes/Common/Document.php +++ b/Classes/Common/Document.php @@ -676,20 +676,9 @@ protected function getFullTextFromXml($id) while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { // Get full text file. - $file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); - if ($file !== false) { - // Turn off libxml's error logging. - $libxmlErrors = libxml_use_internal_errors(true); - // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. - $previousValueOfEntityLoader = libxml_disable_entity_loader(true); - // Load XML from file. - $rawTextXml = simplexml_load_string($file); - // Reset entity loader setting. - libxml_disable_entity_loader($previousValueOfEntityLoader); - // Reset libxml's error logging. - libxml_use_internal_errors($libxmlErrors); - // Get the root element's name as text format. - $textFormat = strtoupper($rawTextXml->getName()); + $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); + if ($fileContent !== false) { + $textFormat = $this->getTextFormat($fileContent); } else { $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"'); return $fullText; @@ -703,105 +692,28 @@ protected function getFullTextFromXml($id) } // Is this text format supported? // This part actually differs from previous version of indexed OCR - if (!empty($file) && !empty($this->formats[$textFormat])) { - if (!empty($this->formats[$textFormat]['class'])) { - $fullText = $file; - } + if (!empty($fileContent) && !empty($this->formats[$textFormat])) { + $fullText = $fileContent; } else { $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"'); } return $fullText; } - /** - * This extracts the raw text for a physical structure node / IIIF Manifest / Canvas. Text might be - * given as ALTO for METS or as annotations or ALTO for IIIF resources. If IIIF plain text annotations - * with the motivation "painting" should be treated as full text representations, the extension has to be - * configured accordingly. - * - * @access public - * - * @abstract - * - * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property - * of the Manifest / Range (IIIF) - * - * @return string The physical structure node's / IIIF resource's raw text - */ - //TODO: check if this method is still needed somewhere, if not simply replace with getFullText - public abstract function getRawText($id); - - /** - * This extracts the raw text for a physical structure node / IIIF Manifest / Canvas from an - * XML fulltext representation (currently only ALTO). For IIIF manifests, ALTO documents have - * to be given in the Canvas' / Manifest's "seeAlso" property. - * - * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property - * of the Manifest / Range (IIIF) - * - * @return string The physical structure node's / IIIF resource's raw text from XML - */ - protected function getRawTextFromXml($id) + private function getTextFormat($fileContent) { - $rawText = ''; - // Load available text formats, ... - $this->loadFormats(); - // ... physical structure ... - $this->_getPhysicalStructure(); - // ... and extension configuration. - $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][self::$extKey]); - $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']); - if (!empty($this->physicalStructureInfo[$id])) { - while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { - if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { - // Get fulltext file. - $file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); - if ($file !== false) { - // Turn off libxml's error logging. - $libxmlErrors = libxml_use_internal_errors(true); - // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. - $previousValueOfEntityLoader = libxml_disable_entity_loader(true); - // Load XML from file. - $rawTextXml = simplexml_load_string($file); - // Reset entity loader setting. - libxml_disable_entity_loader($previousValueOfEntityLoader); - // Reset libxml's error logging. - libxml_use_internal_errors($libxmlErrors); - // Get the root element's name as text format. - $textFormat = strtoupper($rawTextXml->getName()); - } else { - $this->logger->warning('Couldn\'t load fulltext file for structure node @ID "' . $id . '"'); - return $rawText; - } - break; - } - } - } else { - $this->logger->warning('Invalid structure node @ID "' . $id . '"'); - return $rawText; - } - // Is this text format supported? - if ( - !empty($rawTextXml) - && !empty($this->formats[$textFormat]) - ) { - if (!empty($this->formats[$textFormat]['class'])) { - $class = $this->formats[$textFormat]['class']; - // Get the raw text from class. - if ( - class_exists($class) - && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface - ) { - $rawText = $obj->getRawText($rawTextXml); - $this->rawTextArray[$id] = $rawText; - } else { - $this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"'); - } - } - } else { - $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"'); - } - return $rawText; + // Turn off libxml's error logging. + $libxmlErrors = libxml_use_internal_errors(true); + // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. + $previousValueOfEntityLoader = libxml_disable_entity_loader(true); + // Load XML from file. + $rawTextXml = simplexml_load_string($fileContent); + // Reset entity loader setting. + libxml_disable_entity_loader($previousValueOfEntityLoader); + // Reset libxml's error logging. + libxml_use_internal_errors($libxmlErrors); + // Get the root element's name as text format. + return strtoupper($rawTextXml->getName()); } /** diff --git a/Classes/Common/FulltextInterface.php b/Classes/Common/FulltextInterface.php index 68d10f17ad..755b7ae531 100644 --- a/Classes/Common/FulltextInterface.php +++ b/Classes/Common/FulltextInterface.php @@ -21,6 +21,7 @@ * @access public * @abstract */ +//TODO: check if this is still needed when actually full text xml is indexed interface FulltextInterface { /** diff --git a/Classes/Common/IiifManifest.php b/Classes/Common/IiifManifest.php index b86ecbb8f7..249cf64877 100644 --- a/Classes/Common/IiifManifest.php +++ b/Classes/Common/IiifManifest.php @@ -785,9 +785,10 @@ protected function getParentDocumentUidForSaving($pid, $core) /** * {@inheritDoc} - * @see Document::getRawText() + * @see Document::getFullText() */ - public function getRawText($id) + //TODO: rewrite it to get full OCR + public function getFullText($id) { $rawText = ''; // Get text from raw text array if available. @@ -804,7 +805,7 @@ public function getRawText($id) if (!empty($this->physicalStructureInfo[$id])) { while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { - $rawText = parent::getRawTextFromXml($id); + $rawText = parent::getFullTextFromXml($id); break; } } @@ -830,7 +831,7 @@ public function getRawText($id) } } } else { - Helper::devLog('Invalid structure resource @id "' . $id . '"', DEVLOG_SEVERITY_WARNING); + $this->logger->warning('Invalid structure resource @id "' . $id . '"'); return $rawText; } $this->rawTextArray[$id] = $rawText; @@ -879,7 +880,7 @@ protected function loadLocation($location) } } } - Helper::devLog('Could not load IIIF manifest from "' . $location . '"', DEVLOG_SEVERITY_ERROR); + $this->logger->error('Could not load IIIF manifest from "' . $location . '"'); return false; } @@ -996,7 +997,7 @@ public function __wakeup() $this->iiif = $resource; $this->init(); } else { - Helper::devLog('Could not load IIIF after deserialization', DEVLOG_SEVERITY_ERROR); + $this->logger->error('Could not load IIIF after deserialization'); } } diff --git a/Classes/Common/MetsDocument.php b/Classes/Common/MetsDocument.php index 88a9580d25..c9b854eefe 100644 --- a/Classes/Common/MetsDocument.php +++ b/Classes/Common/MetsDocument.php @@ -646,25 +646,6 @@ class_exists($class) } } - /** - * {@inheritDoc} - * @see \Kitodo\Dlf\Common\Document::getRawText() - */ - public function getRawText($id) - { - $rawText = ''; - // Get text from raw text array if available. - if (!empty($this->rawTextArray[$id])) { - return $this->rawTextArray[$id]; - } - // Load fileGrps and check for fulltext files. - $this->_getFileGrps(); - if ($this->hasFulltext) { - $rawText = $this->getRawTextFromXml($id); - } - return $rawText; - } - /** * {@inheritDoc} * @see \Kitodo\Dlf\Common\Document::getFullText()