Skip to content

Commit

Permalink
Remove getRawText methods
Browse files Browse the repository at this point in the history
  • Loading branch information
beatrycze-volk committed Apr 28, 2021
1 parent e80b9a6 commit 14c4e9f
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 131 deletions.
124 changes: 18 additions & 106 deletions Classes/Common/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -676,20 +676,9 @@ protected function getFullTextFromXml($id)
while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
// Get full text file.
$file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
if ($file !== false) {
// Turn off libxml's error logging.
$libxmlErrors = libxml_use_internal_errors(true);
// Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept.
$previousValueOfEntityLoader = libxml_disable_entity_loader(true);
// Load XML from file.
$rawTextXml = simplexml_load_string($file);
// Reset entity loader setting.
libxml_disable_entity_loader($previousValueOfEntityLoader);
// Reset libxml's error logging.
libxml_use_internal_errors($libxmlErrors);
// Get the root element's name as text format.
$textFormat = strtoupper($rawTextXml->getName());
$fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
if ($fileContent !== false) {
$textFormat = $this->getTextFormat($fileContent);
} else {
$this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
return $fullText;
Expand All @@ -703,105 +692,28 @@ protected function getFullTextFromXml($id)
}
// Is this text format supported?
// This part actually differs from previous version of indexed OCR
if (!empty($file) && !empty($this->formats[$textFormat])) {
if (!empty($this->formats[$textFormat]['class'])) {
$fullText = $file;
}
if (!empty($fileContent) && !empty($this->formats[$textFormat])) {
$fullText = $fileContent;
} else {
$this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
}
return $fullText;
}

/**
* This extracts the raw text for a physical structure node / IIIF Manifest / Canvas. Text might be
* given as ALTO for METS or as annotations or ALTO for IIIF resources. If IIIF plain text annotations
* with the motivation "painting" should be treated as full text representations, the extension has to be
* configured accordingly.
*
* @access public
*
* @abstract
*
* @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
* of the Manifest / Range (IIIF)
*
* @return string The physical structure node's / IIIF resource's raw text
*/
//TODO: check if this method is still needed somewhere, if not simply replace with getFullText
public abstract function getRawText($id);

/**
* This extracts the raw text for a physical structure node / IIIF Manifest / Canvas from an
* XML fulltext representation (currently only ALTO). For IIIF manifests, ALTO documents have
* to be given in the Canvas' / Manifest's "seeAlso" property.
*
* @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
* of the Manifest / Range (IIIF)
*
* @return string The physical structure node's / IIIF resource's raw text from XML
*/
protected function getRawTextFromXml($id)
private function getTextFormat($fileContent)
{
$rawText = '';
// Load available text formats, ...
$this->loadFormats();
// ... physical structure ...
$this->_getPhysicalStructure();
// ... and extension configuration.
$extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][self::$extKey]);
$fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']);
if (!empty($this->physicalStructureInfo[$id])) {
while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
// Get fulltext file.
$file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
if ($file !== false) {
// Turn off libxml's error logging.
$libxmlErrors = libxml_use_internal_errors(true);
// Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept.
$previousValueOfEntityLoader = libxml_disable_entity_loader(true);
// Load XML from file.
$rawTextXml = simplexml_load_string($file);
// Reset entity loader setting.
libxml_disable_entity_loader($previousValueOfEntityLoader);
// Reset libxml's error logging.
libxml_use_internal_errors($libxmlErrors);
// Get the root element's name as text format.
$textFormat = strtoupper($rawTextXml->getName());
} else {
$this->logger->warning('Couldn\'t load fulltext file for structure node @ID "' . $id . '"');
return $rawText;
}
break;
}
}
} else {
$this->logger->warning('Invalid structure node @ID "' . $id . '"');
return $rawText;
}
// Is this text format supported?
if (
!empty($rawTextXml)
&& !empty($this->formats[$textFormat])
) {
if (!empty($this->formats[$textFormat]['class'])) {
$class = $this->formats[$textFormat]['class'];
// Get the raw text from class.
if (
class_exists($class)
&& ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
) {
$rawText = $obj->getRawText($rawTextXml);
$this->rawTextArray[$id] = $rawText;
} else {
$this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"');
}
}
} else {
$this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
}
return $rawText;
// Determine the text format of the given XML string from its root element name.
// Turn off libxml's error logging.
$libxmlErrors = libxml_use_internal_errors(true);
// Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept.
$previousValueOfEntityLoader = libxml_disable_entity_loader(true);
// Load XML from file.
$rawTextXml = simplexml_load_string($fileContent);
// Reset entity loader setting.
libxml_disable_entity_loader($previousValueOfEntityLoader);
// Reset libxml's error logging.
libxml_use_internal_errors($libxmlErrors);
// simplexml_load_string() returns false when the content is not well-formed XML;
// calling getName() on that result would abort with a fatal error.
if ($rawTextXml === false) {
    // Return an empty format name instead of crashing.
    // NOTE(review): presumably '' matches no configured format, so the caller logs it as unsupported - confirm.
    return '';
}
// Get the root element's name as text format.
return strtoupper($rawTextXml->getName());
}

/**
Expand Down
1 change: 1 addition & 0 deletions Classes/Common/FulltextInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
* @access public
* @abstract
*/
// TODO: check whether this interface is still needed once the full text XML itself is indexed
interface FulltextInterface
{
/**
Expand Down
13 changes: 7 additions & 6 deletions Classes/Common/IiifManifest.php
Original file line number Diff line number Diff line change
Expand Up @@ -785,9 +785,10 @@ protected function getParentDocumentUidForSaving($pid, $core)

/**
* {@inheritDoc}
* @see Document::getRawText()
* @see Document::getFullText()
*/
public function getRawText($id)
//TODO: rewrite it to get full OCR
public function getFullText($id)
{
$rawText = '';
// Get text from raw text array if available.
Expand All @@ -804,7 +805,7 @@ public function getRawText($id)
if (!empty($this->physicalStructureInfo[$id])) {
while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
$rawText = parent::getRawTextFromXml($id);
$rawText = parent::getFullTextFromXml($id);
break;
}
}
Expand All @@ -830,7 +831,7 @@ public function getRawText($id)
}
}
} else {
Helper::devLog('Invalid structure resource @id "' . $id . '"', DEVLOG_SEVERITY_WARNING);
$this->logger->warning('Invalid structure resource @id "' . $id . '"');
return $rawText;
}
$this->rawTextArray[$id] = $rawText;
Expand Down Expand Up @@ -879,7 +880,7 @@ protected function loadLocation($location)
}
}
}
Helper::devLog('Could not load IIIF manifest from "' . $location . '"', DEVLOG_SEVERITY_ERROR);
$this->logger->error('Could not load IIIF manifest from "' . $location . '"');
return false;
}

Expand Down Expand Up @@ -996,7 +997,7 @@ public function __wakeup()
$this->iiif = $resource;
$this->init();
} else {
Helper::devLog('Could not load IIIF after deserialization', DEVLOG_SEVERITY_ERROR);
$this->logger->error('Could not load IIIF after deserialization');
}
}

Expand Down
19 changes: 0 additions & 19 deletions Classes/Common/MetsDocument.php
Original file line number Diff line number Diff line change
Expand Up @@ -646,25 +646,6 @@ class_exists($class)
}
}

/**
* {@inheritDoc}
* @see \Kitodo\Dlf\Common\Document::getRawText()
*/
public function getRawText($id)
{
$rawText = '';
// Get text from raw text array if available.
if (!empty($this->rawTextArray[$id])) {
return $this->rawTextArray[$id];
}
// Load fileGrps and check for fulltext files.
$this->_getFileGrps();
if ($this->hasFulltext) {
$rawText = $this->getRawTextFromXml($id);
}
return $rawText;
}

/**
* {@inheritDoc}
* @see \Kitodo\Dlf\Common\Document::getFullText()
Expand Down

0 comments on commit 14c4e9f

Please sign in to comment.