Skip to content

Commit

Permalink
Merge pull request #587 from beatrycze-volk/use-solr-highlighting
Browse files Browse the repository at this point in the history
Use Solr OCR Highlighting Plugin in Search in Document Plugin
  • Loading branch information
Alexander Bigga authored Sep 21, 2021
2 parents 8a96e40 + c2d6517 commit 9432644
Show file tree
Hide file tree
Showing 16 changed files with 561 additions and 198 deletions.
134 changes: 85 additions & 49 deletions Classes/Common/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -570,8 +570,8 @@ public static function &getInstance($uid, $pid = 0, $forceReload = false)
if (!empty($extConf['caching'])) {
Helper::saveToSession(self::$registry, get_class($instance));
}
$instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance));
}
$instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance));
// Return new instance.
return $instance;
}
Expand Down Expand Up @@ -638,10 +638,8 @@ public function getPhysicalPage($logicalPage)
}

/**
* This extracts the raw text for a physical structure node / IIIF Manifest / Canvas. Text might be
* given as ALTO for METS or as annotations or ALTO for IIIF resources. If IIIF plain text annotations
* with the motivation "painting" should be treated as full text representations, the extension has to be
* configured accordingly.
* This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. Text might be
* given as ALTO for METS or as annotations or ALTO for IIIF resources.
*
* @access public
*
Expand All @@ -650,23 +648,23 @@ public function getPhysicalPage($logicalPage)
* @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
* of the Manifest / Range (IIIF)
*
* @return string The physical structure node's / IIIF resource's raw text
* @return string The OCR full text
*/
public abstract function getRawText($id);
public abstract function getFullText($id);

/**
* This extracts the raw text for a physical structure node / IIIF Manifest / Canvas from an
* XML fulltext representation (currently only ALTO). For IIIF manifests, ALTO documents have
* This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
* XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
* to be given in the Canvas' / Manifest's "seeAlso" property.
*
* @param string $id: The @ID attribute of the physical structure node (METS) or the @id property
* of the Manifest / Range (IIIF)
*
* @return string The physical structure node's / IIIF resource's raw text from XML
* @return string The OCR full text
*/
protected function getRawTextFromXml($id)
protected function getFullTextFromXml($id)
{
$rawText = '';
$fullText = '';
// Load available text formats, ...
$this->loadFormats();
// ... physical structure ...
Expand All @@ -677,54 +675,87 @@ protected function getRawTextFromXml($id)
if (!empty($this->physicalStructureInfo[$id])) {
while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) {
if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) {
// Get fulltext file.
$file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
if ($file !== false) {
// Turn off libxml's error logging.
$libxmlErrors = libxml_use_internal_errors(true);
// Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept.
$previousValueOfEntityLoader = libxml_disable_entity_loader(true);
// Load XML from file.
$rawTextXml = simplexml_load_string($file);
// Reset entity loader setting.
libxml_disable_entity_loader($previousValueOfEntityLoader);
// Reset libxml's error logging.
libxml_use_internal_errors($libxmlErrors);
// Get the root element's name as text format.
$textFormat = strtoupper($rawTextXml->getName());
// Get full text file.
$fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext]));
if ($fileContent !== false) {
$textFormat = $this->getTextFormat($fileContent);
} else {
$this->logger->warning('Couldn\'t load fulltext file for structure node @ID "' . $id . '"');
return $rawText;
$this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"');
return $fullText;
}
break;
}
}
} else {
$this->logger->warning('Invalid structure node @ID "' . $id . '"');
return $rawText;
return $fullText;
}
// Is this text format supported?
if (
!empty($rawTextXml)
&& !empty($this->formats[$textFormat])
) {
if (!empty($this->formats[$textFormat]['class'])) {
$class = $this->formats[$textFormat]['class'];
// Get the raw text from class.
if (
class_exists($class)
&& ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface
) {
$rawText = $obj->getRawText($rawTextXml);
$this->rawTextArray[$id] = $rawText;
} else {
$this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"');
}
}
// This part actually differs from previous version of indexed OCR
if (!empty($fileContent) && !empty($this->formats[$textFormat])) {
$fullText = $this->getFullTextWithoutImages($fileContent);
} else {
$this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"');
}
return $rawText;
return $fullText;
}

/**
 * Get content of the OCR full text file without images
 *
 * Parses the given XML, drops the "Illustration" elements addressed by
 * Layout/Page/PrintSpace and serializes the remaining document again.
 *
 * @access private
 *
 * @param string $fileContent: content of the XML file
 *
 * @return string The content of the OCR full text file without images,
 *                or an empty string if the content could not be parsed or serialized
 */
private function getFullTextWithoutImages($fileContent)
{
    $objectXml = $this->getFullTextAsObjectXML($fileContent);
    // Parsing yields false for malformed XML; calling asXML() on it would be a fatal error.
    if (!($objectXml instanceof \SimpleXMLElement)) {
        return '';
    }
    if (isset($objectXml->Layout->Page->PrintSpace->Illustration)) {
        unset($objectXml->Layout->Page->PrintSpace->Illustration);
    }
    // asXML() may return false on failure; keep the documented string return type.
    $serialized = $objectXml->asXML();
    return $serialized === false ? '' : $serialized;
}

/**
 * Get format of the OCR full text
 *
 * The format is the upper-cased name of the XML root element.
 *
 * @access private
 *
 * @param string $fileContent: content of the XML file
 *
 * @return string The format of the OCR full text, or an empty string
 *                if the content could not be parsed as XML
 */
private function getTextFormat($fileContent)
{
    $objectXml = $this->getFullTextAsObjectXML($fileContent);
    // Parsing yields false for malformed XML; report "no format" instead of
    // triggering a fatal error by calling getName() on a non-object.
    if (!($objectXml instanceof \SimpleXMLElement)) {
        return '';
    }
    // Get the root element's name as text format.
    return strtoupper($objectXml->getName());
}

/**
 * Get content of the OCR full text file
 *
 * Parses the given string with libxml's error output suppressed and
 * restores the previous libxml settings afterwards.
 *
 * @access private
 *
 * @param string $fileContent: content of the XML file
 *
 * @return \SimpleXMLElement|false content of the XML file as object, or false if parsing failed
 */
private function getFullTextAsObjectXML($fileContent)
{
    // Turn off libxml's error logging.
    $libxmlErrors = libxml_use_internal_errors(true);
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0 and raises a
    // deprecation notice on every call, so it is only toggled on older runtimes.
    $toggleEntityLoader = \PHP_VERSION_ID < 80000;
    if ($toggleEntityLoader) {
        // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept.
        $previousValueOfEntityLoader = libxml_disable_entity_loader(true);
    }
    // Load XML from the given string.
    $objectXml = simplexml_load_string($fileContent);
    if ($toggleEntityLoader) {
        // Reset entity loader setting.
        libxml_disable_entity_loader($previousValueOfEntityLoader);
    }
    // Reset libxml's error logging.
    libxml_use_internal_errors($libxmlErrors);
    // Hand back the parsed root element (false if the XML was malformed).
    return $objectXml;
}

/**
Expand Down Expand Up @@ -1306,9 +1337,14 @@ public function save($pid = 0, $core = 0, $owner = null)
}
// Add document to index.
if ($core) {
Indexer::add($this, $core);
//TODO: change return of this method to true on success and false on failure
$hasErrors = Indexer::add($this, $core);
if ($hasErrors) {
return false;
}
} else {
$this->logger->notice('Invalid UID "' . $core . '" for Solr core');
return false;
}
return true;
}
Expand Down
Loading

0 comments on commit 9432644

Please sign in to comment.