diff --git a/Classes/Common/Document.php b/Classes/Common/Document.php index dec043cf3..bdfd6a969 100644 --- a/Classes/Common/Document.php +++ b/Classes/Common/Document.php @@ -570,8 +570,8 @@ public static function &getInstance($uid, $pid = 0, $forceReload = false) if (!empty($extConf['caching'])) { Helper::saveToSession(self::$registry, get_class($instance)); } + $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance)); } - $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance)); // Return new instance. return $instance; } @@ -638,10 +638,8 @@ public function getPhysicalPage($logicalPage) } /** - * This extracts the raw text for a physical structure node / IIIF Manifest / Canvas. Text might be - * given as ALTO for METS or as annotations or ALTO for IIIF resources. If IIIF plain text annotations - * with the motivation "painting" should be treated as full text representations, the extension has to be - * configured accordingly. + * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. Text might be + * given as ALTO for METS or as annotations or ALTO for IIIF resources. * * @access public * @@ -650,23 +648,23 @@ public function getPhysicalPage($logicalPage) * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property * of the Manifest / Range (IIIF) * - * @return string The physical structure node's / IIIF resource's raw text + * @return string The OCR full text */ - public abstract function getRawText($id); + public abstract function getFullText($id); /** - * This extracts the raw text for a physical structure node / IIIF Manifest / Canvas from an - * XML fulltext representation (currently only ALTO). For IIIF manifests, ALTO documents have + * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an + * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have * to be given in the Canvas' / Manifest's "seeAlso" property. * * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property * of the Manifest / Range (IIIF) * - * @return string The physical structure node's / IIIF resource's raw text from XML + * @return string The OCR full text */ - protected function getRawTextFromXml($id) + protected function getFullTextFromXml($id) { - $rawText = ''; + $fullText = ''; // Load available text formats, ... $this->loadFormats(); // ... physical structure ... @@ -677,54 +675,87 @@ protected function getRawTextFromXml($id) if (!empty($this->physicalStructureInfo[$id])) { while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { - // Get fulltext file. - $file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); - if ($file !== false) { - // Turn off libxml's error logging. - $libxmlErrors = libxml_use_internal_errors(true); - // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. - $previousValueOfEntityLoader = libxml_disable_entity_loader(true); - // Load XML from file. - $rawTextXml = simplexml_load_string($file); - // Reset entity loader setting. - libxml_disable_entity_loader($previousValueOfEntityLoader); - // Reset libxml's error logging. - libxml_use_internal_errors($libxmlErrors); - // Get the root element's name as text format. - $textFormat = strtoupper($rawTextXml->getName()); + // Get full text file. + $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); + if ($fileContent !== false) { + $textFormat = $this->getTextFormat($fileContent); } else { - $this->logger->warning('Couldn\'t load fulltext file for structure node @ID "' . $id . '"'); - return $rawText; + $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"'); + return $fullText; } break; } } } else { $this->logger->warning('Invalid structure node @ID "' . $id . '"'); - return $rawText; + return $fullText; } // Is this text format supported? - if ( - !empty($rawTextXml) - && !empty($this->formats[$textFormat]) - ) { - if (!empty($this->formats[$textFormat]['class'])) { - $class = $this->formats[$textFormat]['class']; - // Get the raw text from class. - if ( - class_exists($class) - && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface - ) { - $rawText = $obj->getRawText($rawTextXml); - $this->rawTextArray[$id] = $rawText; - } else { - $this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"'); - } - } + // This part actually differs from previous version of indexed OCR + if (!empty($fileContent) && !empty($this->formats[$textFormat])) { + $fullText = $this->getFullTextWithoutImages($fileContent); } else { $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"'); } - return $rawText; + return $fullText; + } + + /** + * Get content of the OCR full text file without images + * + * @access private + * + * @param string $fileContent: content of the XML file + * + * @return string The content of the OCR full text file without images + */ + private function getFullTextWithoutImages($fileContent) + { + $objectXml = $this->getFullTextAsObjectXML($fileContent); + if (isset($objectXml->Layout->Page->PrintSpace->Illustration)) { + unset($objectXml->Layout->Page->PrintSpace->Illustration); + } + return $objectXml->asXML(); + } + + /** + * Get format of the OCR full text + * + * @access private + * + * @param string $fileContent: content of the XML file + * + * @return string The format of the OCR full text + */ + private function getTextFormat($fileContent) + { + // Get the root element's name as text format. + return strtoupper($this->getFullTextAsObjectXML($fileContent)->getName()); + } + + /** + * Get content of the OCR full text file + * + * @access private + * + * @param string $fileContent: content of the XML file + * + * @return \SimpleXMLElement content of the XML file as object + */ + private function getFullTextAsObjectXML($fileContent) + { + // Turn off libxml's error logging. + $libxmlErrors = libxml_use_internal_errors(true); + // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. + $previousValueOfEntityLoader = libxml_disable_entity_loader(true); + // Load XML from file. + $objectXml = simplexml_load_string($fileContent); + // Reset entity loader setting. + libxml_disable_entity_loader($previousValueOfEntityLoader); + // Reset libxml's error logging. + libxml_use_internal_errors($libxmlErrors); + // Get the root element + return $objectXml; } /** @@ -1306,9 +1337,14 @@ public function save($pid = 0, $core = 0, $owner = null) } // Add document to index. if ($core) { - Indexer::add($this, $core); + //TODO: change return of this method to true on success and false on failure + $hasErrors = Indexer::add($this, $core); + if ($hasErrors) { + return false; + } } else { $this->logger->notice('Invalid UID "' . $core . '" for Solr core'); + return false; } return true; } diff --git a/Classes/Common/DocumentList.php b/Classes/Common/DocumentList.php index 246c4547b..acaa7c1b9 100644 --- a/Classes/Common/DocumentList.php +++ b/Classes/Common/DocumentList.php @@ -12,8 +12,10 @@ namespace Kitodo\Dlf\Common; +use Kitodo\Dlf\Common\SolrSearchResult\ResultDocument; use Psr\Log\LoggerAwareInterface; use Psr\Log\LoggerAwareTrait; +use Solarium\QueryType\Select\Result\Result; use TYPO3\CMS\Core\SingletonInterface; use TYPO3\CMS\Core\Database\ConnectionPool; use TYPO3\CMS\Core\Utility\GeneralUtility; @@ -237,74 +239,8 @@ protected function getRecord($element) && $this->metadata['options']['source'] == 'search' ) { if ($this->solrConnect()) { - $fields = Solr::getFields(); - $params = []; - // Restrict the fields to the required ones - $params['fields'] = $fields['uid'] . ',' . $fields['id'] . ',' . $fields['toplevel'] . ',' . $fields['thumbnail'] . ',' . $fields['page']; - foreach ($this->solrConfig as $solr_name) { - $params['fields'] .= ',' . $solr_name; - } - // If it is a fulltext search, enable highlighting. - if ($this->metadata['fulltextSearch']) { - $params['component'] = [ - 'highlighting' => [ - 'query' => Solr::escapeQuery($this->metadata['searchString']), - 'field' => $fields['fulltext'], - 'usefastvectorhighlighter' => true - ] - ]; - } - // Set additional query parameters. - $params['start'] = 0; - // Set reasonable limit for safety reasons. - // We don't expect to get more than 10.000 hits per UID. - $params['rows'] = 10000; - // Take over existing filter queries. - $params['filterquery'] = isset($this->metadata['options']['params']['filterquery']) ? $this->metadata['options']['params']['filterquery'] : []; - // Extend filter query to get all documents with the same UID. - foreach ($params['filterquery'] as $key => $value) { - if (isset($value['query'])) { - $params['filterquery'][$key]['query'] = $value['query'] . ' OR ' . $fields['toplevel'] . ':true'; - } - } - // Add filter query to get all documents with the required uid. - $params['filterquery'][] = ['query' => $fields['uid'] . ':' . Solr::escapeQuery($record['uid'])]; - // Add sorting. - $params['sort'] = $this->metadata['options']['params']['sort']; - // Set query. - $params['query'] = $this->metadata['options']['select'] . ' OR ' . $fields['toplevel'] . ':true'; - // Perform search for all documents with the same uid that either fit to the search or marked as toplevel. - $selectQuery = $this->solr->service->createSelect($params); - $result = $this->solr->service->select($selectQuery); - // If it is a fulltext search, fetch the highlighting results. - if ($this->metadata['fulltextSearch']) { - $highlighting = $result->getHighlighting(); - } - // Process results. - foreach ($result as $resArray) { - // Prepare document's metadata. - $metadata = []; - foreach ($this->solrConfig as $index_name => $solr_name) { - if (!empty($resArray->$solr_name)) { - $metadata[$index_name] = (is_array($resArray->$solr_name) ? $resArray->$solr_name : [$resArray->$solr_name]); - } - } - // Add metadata to list elements. - if ($resArray->toplevel) { - $record['thumbnail'] = $resArray->thumbnail; - $record['metadata'] = $metadata; - } else { - $highlightedDoc = !empty($highlighting) ? $highlighting->getResult($resArray->id) : null; - $highlight = !empty($highlightedDoc) ? $highlightedDoc->getField($fields['fulltext'])[0] : ''; - $record['subparts'][$resArray->id] = [ - 'uid' => $resArray->uid, - 'page' => $resArray->page, - 'preview' => $highlight, - 'thumbnail' => $resArray->thumbnail, - 'metadata' => $metadata - ]; - } - } + $result = $this->getSolrResult($record); + $record = $this->getSolrRecord($record, $result); } } // Save record for later usage. @@ -316,6 +252,120 @@ protected function getRecord($element) return $record; } + /** + * It gets SOLR result + * + * @access private + * + * @param array $record: for searched document + * + * @return Result + */ + private function getSolrResult($record) { + $fields = Solr::getFields(); + + $query = $this->solr->service->createSelect(); + // Restrict the fields to the required ones + $query->setFields($fields['uid'] .',' . $fields['id'] .',' . $fields['toplevel'] .',' . $fields['thumbnail'] .',' . $fields['page']); + foreach ($this->solrConfig as $solr_name) { + $query->addField($solr_name); + } + // Set additional query parameters. + // Set reasonable limit for safety reasons. + // We don't expect to get more than 10.000 hits per UID. + $query->setStart(0)->setRows(10000); + // Take over existing filter queries. + $filterQueries = isset($this->metadata['options']['params']['filterquery']) ? $this->metadata['options']['params']['filterquery'] : []; + // Extend filter query to get all documents with the same UID. + foreach ($filterQueries as $key => $value) { + if (isset($value['query'])) { + $filterQuery[$key] = $value['query'] . ' OR ' . $fields['toplevel'] . ':true'; + $filterQuery = [ + 'key' => $key, + 'query' => $value['query'] . ' OR ' . $fields['toplevel'] . ':true' + ]; + $query->addFilterQuery($filterQuery); + } + } + // Add filter query to get all documents with the required uid. + $query->createFilterQuery('uid')->setQuery($fields['uid'] . ':' . Solr::escapeQuery($record['uid'])); + // Add sorting. + $query->addSort('score', $this->metadata['options']['params']['sort']['score']); + // Set query. + $query->setQuery($this->metadata['options']['select'] . ' OR ' . $fields['toplevel'] . ':true'); + + // If it is a fulltext search, enable highlighting. + if ($this->metadata['fulltextSearch']) { + $query->getHighlighting(); + }; + + $solrRequest = $this->solr->service->createRequest($query); + + // If it is a fulltext search, enable highlighting. + if ($this->metadata['fulltextSearch']) { + // field for which highlighting is going to be performed, + // is required if you want to have OCR highlighting + $solrRequest->addParam('hl.ocr.fl', $fields['fulltext']); + // return the coordinates of highlighted search as absolute coordinates + $solrRequest->addParam('hl.ocr.absoluteHighlights', 'on'); + // max amount of snippets for a single page + $solrRequest->addParam('hl.snippets', 20); + } + // Perform search for all documents with the same uid that either fit to the search or marked as toplevel. + $response = $this->solr->service->executeRequest($solrRequest); + return $this->solr->service->createResult($query, $response); + } + + /** + * It processes SOLR result into record, which is + * going to be displayed in the frontend list. + * + * @access private + * + * @param array $record: for searched document + * @param Result $result: found in the SOLR index + * + * @return array + */ + private function getSolrRecord($record, $result) { + // If it is a fulltext search, fetch the highlighting results. + if ($this->metadata['fulltextSearch']) { + $data = $result->getData(); + $highlighting = $data['ocrHighlighting']; + } + + // Process results. + foreach ($result as $resArray) { + // Prepare document's metadata. + $metadata = []; + foreach ($this->solrConfig as $index_name => $solr_name) { + if (!empty($resArray->$solr_name)) { + $metadata[$index_name] = (is_array($resArray->$solr_name) ? $resArray->$solr_name : [$resArray->$solr_name]); + } + } + // Add metadata to list elements. + if ($resArray->toplevel) { + $record['thumbnail'] = $resArray->thumbnail; + $record['metadata'] = $metadata; + } else { + $highlight = ''; + if (!empty($highlighting)) { + $resultDocument = new ResultDocument($resArray, $highlighting, Solr::getFields()); + $highlight = $resultDocument->getSnippets(); + } + + $record['subparts'][$resArray->id] = [ + 'uid' => $resArray->uid, + 'page' => $resArray->page, + 'preview' => $highlight, + 'thumbnail' => $resArray->thumbnail, + 'metadata' => $metadata + ]; + } + } + return $record; + } + /** * This returns the current position * @see \Iterator::key() diff --git a/Classes/Common/FulltextInterface.php b/Classes/Common/FulltextInterface.php index 68d10f17a..755b7ae53 100644 --- a/Classes/Common/FulltextInterface.php +++ b/Classes/Common/FulltextInterface.php @@ -21,6 +21,7 @@ * @access public * @abstract */ +//TODO: check if this is still needed when actually full text xml is indexed interface FulltextInterface { /** diff --git a/Classes/Common/IiifManifest.php b/Classes/Common/IiifManifest.php index 2955cdff7..8f7f50448 100644 --- a/Classes/Common/IiifManifest.php +++ b/Classes/Common/IiifManifest.php @@ -786,9 +786,10 @@ protected function getParentDocumentUidForSaving($pid, $core, $owner) /** * {@inheritDoc} - * @see Document::getRawText() + * @see Document::getFullText() */ - public function getRawText($id) + //TODO: rewrite it to get full OCR + public function getFullText($id) { $rawText = ''; // Get text from raw text array if available. @@ -805,7 +806,7 @@ public function getRawText($id) if (!empty($this->physicalStructureInfo[$id])) { while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { - $rawText = parent::getRawTextFromXml($id); + $rawText = parent::getFullTextFromXml($id); break; } } diff --git a/Classes/Common/Indexer.php b/Classes/Common/Indexer.php index 01ffe73b7..c2cc786ef 100644 --- a/Classes/Common/Indexer.php +++ b/Classes/Common/Indexer.php @@ -116,6 +116,7 @@ public static function add(Document &$doc, $core = 0) $updateQuery = self::$solr->service->createUpdate(); $updateQuery->addDeleteQuery('uid:' . $doc->uid); self::$solr->service->update($updateQuery); + // Index every logical unit as separate Solr document. foreach ($doc->tableOfContents as $logicalUnit) { if (!$errors) { @@ -124,7 +125,7 @@ public static function add(Document &$doc, $core = 0) break; } } - // Index fulltext files if available. + // Index full text files if available. if ($doc->hasFulltext) { foreach ($doc->physicalStructure as $pageNumber => $xmlId) { if (!$errors) { @@ -315,6 +316,8 @@ protected static function loadIndexConf($pid) */ protected static function processLogical(Document &$doc, array $logicalUnit) { + $logger = GeneralUtility::makeInstance('TYPO3\CMS\Core\Log\LogManager')->getLogger(__CLASS__); + $errors = 0; // Get metadata for logical unit. $metadata = $doc->metadataArray[$logicalUnit['id']]; @@ -364,6 +367,7 @@ protected static function processLogical(Document &$doc, array $logicalUnit) $solrDoc->setField('terms', $metadata['terms']); $solrDoc->setField('restrictions', $metadata['restrictions']); $solrDoc->setField('collection', $doc->metadataArray[$doc->toplevelId]['collection']); + $solrDoc->setField('fulltext', ''); $coordinates = json_decode($metadata['coordinates'][0]); if (is_object($coordinates)) { $solrDoc->setField('geom', json_encode($coordinates->features[0])); @@ -413,6 +417,7 @@ protected static function processLogical(Document &$doc, array $logicalUnit) 'core.template.flashMessages' ); } + $logger->error('Apache Solr threw exception: "' . $e->getMessage() . '"'); return 1; } } @@ -443,10 +448,9 @@ protected static function processLogical(Document &$doc, array $logicalUnit) */ protected static function processPhysical(Document &$doc, $page, array $physicalUnit) { - if ( - $doc->hasFulltext - && $fulltext = $doc->getRawText($physicalUnit['id']) - ) { + $logger = GeneralUtility::makeInstance('TYPO3\CMS\Core\Log\LogManager')->getLogger(__CLASS__); + + if ($doc->hasFulltext && $fullText = $doc->getFullText($physicalUnit['id'])) { // Read extension configuration. $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey); // Create new Solr document. @@ -470,7 +474,7 @@ protected static function processPhysical(Document &$doc, $page, array $physical $solrDoc->setField('toplevel', false); $solrDoc->setField('type', $physicalUnit['type'], self::$fields['fieldboost']['type']); $solrDoc->setField('collection', $doc->metadataArray[$doc->toplevelId]['collection']); - $solrDoc->setField('fulltext', htmlspecialchars($fulltext)); + $solrDoc->setField('fulltext', $fullText); // Add faceting information to physical sub-elements if applicable. foreach ($doc->metadataArray[$doc->toplevelId] as $index_name => $data) { if ( @@ -510,6 +514,7 @@ protected static function processPhysical(Document &$doc, $page, array $physical true, 'core.template.flashMessages' ); + $logger->error('Apache Solr threw exception: "' . $e->getMessage() . '"'); } return 1; } diff --git a/Classes/Common/MetsDocument.php b/Classes/Common/MetsDocument.php index 70db04396..06900fd81 100644 --- a/Classes/Common/MetsDocument.php +++ b/Classes/Common/MetsDocument.php @@ -667,21 +667,18 @@ class_exists($class) /** * {@inheritDoc} - * @see \Kitodo\Dlf\Common\Document::getRawText() + * @see \Kitodo\Dlf\Common\Document::getFullText() */ - public function getRawText($id) + public function getFullText($id) { - $rawText = ''; - // Get text from raw text array if available. - if (!empty($this->rawTextArray[$id])) { - return $this->rawTextArray[$id]; - } - // Load fileGrps and check for fulltext files. + $fullText = ''; + + // Load fileGrps and check for full text files. $this->_getFileGrps(); if ($this->hasFulltext) { - $rawText = $this->getRawTextFromXml($id); + $fullText = $this->getFullTextFromXml($id); } - return $rawText; + return $fullText; } /** diff --git a/Classes/Common/Solr.php b/Classes/Common/Solr.php index 1ba3edb0a..693df5937 100644 --- a/Classes/Common/Solr.php +++ b/Classes/Common/Solr.php @@ -257,7 +257,7 @@ public static function escapeQueryKeepField($query, $pid) */ public static function getFields() { - $conf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][self::$extKey]); + $conf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey); $fields = []; $fields['id'] = $conf['solrFieldId']; diff --git a/Classes/Common/SolrSearchResult/ResultDocument.php b/Classes/Common/SolrSearchResult/ResultDocument.php index 7a7d7cc7a..53cf4452a 100644 --- a/Classes/Common/SolrSearchResult/ResultDocument.php +++ b/Classes/Common/SolrSearchResult/ResultDocument.php @@ -184,6 +184,7 @@ public function getPages() * * @return array(Region) All result's regions which contain search phrase */ + public function getRegions() { return $this->regions; diff --git a/Classes/Plugin/Eid/SearchInDocument.php b/Classes/Plugin/Eid/SearchInDocument.php index 85e7f0425..920b9df65 100644 --- a/Classes/Plugin/Eid/SearchInDocument.php +++ b/Classes/Plugin/Eid/SearchInDocument.php @@ -14,6 +14,7 @@ use Kitodo\Dlf\Common\Helper; use Kitodo\Dlf\Common\Solr; +use Kitodo\Dlf\Common\SolrSearchResult\ResultDocument; use Psr\Http\Message\ResponseInterface; use Psr\Http\Message\ServerRequestInterface; use TYPO3\CMS\Core\Http\Response; @@ -58,21 +59,38 @@ public function main(ServerRequestInterface $request) if ($solr->ready) { $query = $solr->service->createSelect(); $query->setFields([$fields['id'], $fields['uid'], $fields['page']]); - $query->setQuery($fields['fulltext'] . ':(' . Solr::escapeQuery((string) $parameters['q']) . ') AND ' . $fields['uid'] . ':' . intval($parameters['uid'])); + $query->setQuery($this->getQuery($fields, $parameters)); $query->setStart($count)->setRows(20); - $hl = $query->getHighlighting(); - $hl->setFields([$fields['fulltext']]); - $hl->setUseFastVectorHighlighter(true); - $results = $solr->service->select($query); - $output['numFound'] = $results->getNumFound(); - $highlighting = $results->getHighlighting(); - foreach ($results as $result) { - $snippet = $highlighting->getResult($result->id)->getField($fields['fulltext']); + $query->getHighlighting(); + $solrRequest = $solr->service->createRequest($query); + + // it is necessary to add the custom parameters to the request + // because query object doesn't allow custom parameters + + // field for which highlighting is going to be performed, + // is required if you want to have OCR highlighting + $solrRequest->addParam('hl.ocr.fl', $fields['fulltext']); + // return the coordinates of highlighted search as absolute coordinates + $solrRequest->addParam('hl.ocr.absoluteHighlights', 'on'); + // max amount of snippets for a single page + $solrRequest->addParam('hl.snippets', 20); + + $response = $solr->service->executeRequest($solrRequest); + $result = $solr->service->createResult($query, $response); + /** @scrutinizer ignore-call */ + $output['numFound'] = $result->getNumFound(); + $data = $result->getData(); + $highlighting = $data['ocrHighlighting']; + + foreach ($result as $record) { + $resultDocument = new ResultDocument($record, $highlighting, $fields); + $document = [ - 'id' => $result->id, - 'uid' => $result->uid, - 'page' => $result->page, - 'snippet' => !empty($snippet) ? implode(' [...] ', $snippet) : '' + 'id' => $resultDocument->getId(), + 'uid' => !empty($resultDocument->getUid()) ? $resultDocument->getUid() : $parameters['uid'], + 'page' => $resultDocument->getPage(), + 'snippet' => $resultDocument->getSnippets(), + 'highlight' => $resultDocument->getHighlightsIds() ]; $output['documents'][$count] = $document; $count++; @@ -84,4 +102,12 @@ public function main(ServerRequestInterface $request) $response->getBody()->write(json_encode($output)); return $response; } + + private function getQuery($fields, $parameters) { + return $fields['fulltext'] . ':(' . Solr::escapeQuery((string) $parameters['q']) . ') AND ' . $fields['uid'] . ':' . $this->getUid($parameters['uid']); + } + + private function getUid($uid) { + return is_numeric($uid) > 0 ? intval($uid) : $uid; + } } diff --git a/Classes/Plugin/Tools/SearchInDocumentTool.php b/Classes/Plugin/Tools/SearchInDocumentTool.php index 90fef9ac2..7bd8eb5cc 100644 --- a/Classes/Plugin/Tools/SearchInDocumentTool.php +++ b/Classes/Plugin/Tools/SearchInDocumentTool.php @@ -144,7 +144,9 @@ protected function getActionUrl() } /** - * Get current document id + * Get current document id. As default the uid will be used. + * In case there is defined documentIdUrlSchema then the id will + * extracted from this URL. * * @access protected * @@ -154,6 +156,7 @@ protected function getCurrentDocumentId() { $id = $this->doc->uid; + // example: https://host.de/items/*id*/record if (!empty($this->conf['documentIdUrlSchema'])) { $arr = explode('*', $this->conf['documentIdUrlSchema']); diff --git a/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml b/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml index 517439d40..056a0c59c 100644 --- a/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml +++ b/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml @@ -39,8 +39,12 @@ limitations under the License. - + + + + + @@ -55,6 +59,9 @@ limitations under the License. + + + @@ -127,7 +134,7 @@ limitations under the License. - + diff --git a/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml b/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml index 3e2944740..f44bf0870 100644 --- a/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml +++ b/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml @@ -83,6 +83,8 @@ + + + + Instruct the request handlers you want to enable OCR highlighting for to include the + search component you defined above. This example uses the standard /select handler. + + CAUTION: Make sure that the OCR highlight component is listed **before** the standard + highlighting component, but **after** the query component. + --> + + + query + facet + ocrHighlight + highlight + + + + diff --git a/Documentation/Plugins/Index.rst b/Documentation/Plugins/Index.rst index e840f00c4..bbca8932c 100644 --- a/Documentation/Plugins/Index.rst +++ b/Documentation/Plugins/Index.rst @@ -1115,6 +1115,9 @@ This plugin adds an possibility to search all appearances of the phrase in curre :Data Type: :ref:`t3tsref:data-type-string` :Default: + empty + :Values: + https://host.de/items/*id*/record - example value - :Property: idInputName @@ -1157,3 +1160,4 @@ This plugin adds an possibility to search all appearances of the phrase in curre :ref:`t3tsref:data-type-string` :Default: tx_dlf[encrypted] + diff --git a/Resources/Public/Javascript/PageView/PageView.js b/Resources/Public/Javascript/PageView/PageView.js index 3457ff041..9c96875ab 100644 --- a/Resources/Public/Javascript/PageView/PageView.js +++ b/Resources/Public/Javascript/PageView/PageView.js @@ -81,6 +81,12 @@ var dlfViewer = function(settings){ * @private */ this.highlightKeys = 'tx_dlf[highlight_word]'; + + /** + * @type {string|undefined} + * @private + */ + this.highlightWords = null; /** * @type {Object|undefined} @@ -292,7 +298,10 @@ dlfViewer.prototype.createControls_ = function(controlNames, layers) { /** * Displays highlight words */ -dlfViewer.prototype.displayHighlightWord = function() { +dlfViewer.prototype.displayHighlightWord = function(highlightWords = null) { + if(highlightWords != null) { + this.highlightWords = highlightWords; + } if (!dlfUtils.exists(this.highlightLayer)) { @@ -349,11 +358,18 @@ dlfViewer.prototype.displayHighlightWord = function() { } if (hasOwnProperty && this.fulltexts[0] !== undefined && this.fulltexts[0].url !== '' && this.images.length > 0) { - var value = urlParams[param], - values = value.split(';'), + var value = undefined, fulltextData = dlfFullTextUtils.fetchFullTextDataFromServer(this.fulltexts[0].url, this.images[0]), fulltextDataImageTwo = undefined; + if(this.highlightWords != null) { + value = this.highlightWords; + } else { + value = urlParams[param]; + } + + var values = decodeURIComponent(value).split(';'); + // check if there is another image / fulltext to look for if (this.images.length === 2 & this.fulltexts[1] !== undefined && this.fulltexts[1].url !== '') { var image = $.extend({}, this.images[1]); @@ -364,7 +380,7 @@ dlfViewer.prototype.displayHighlightWord = function() { var stringFeatures = fulltextDataImageTwo === undefined ? fulltextData.getStringFeatures() : fulltextData.getStringFeatures().concat(fulltextDataImageTwo.getStringFeatures()); values.forEach($.proxy(function(value) { - var features = dlfUtils.searchFeatureCollectionForText(stringFeatures, value); + var features = dlfUtils.searchFeatureCollectionForCoordinates(stringFeatures, value); if (features !== undefined) { for (var i = 0; i < features.length; i++) { this.highlightLayer.getSource().addFeatures([features[i]]); diff --git a/Resources/Public/Javascript/PageView/SearchInDocument.js b/Resources/Public/Javascript/PageView/SearchInDocument.js index 5ea4a6b8c..1e456e777 100644 --- a/Resources/Public/Javascript/PageView/SearchInDocument.js +++ b/Resources/Public/Javascript/PageView/SearchInDocument.js @@ -8,12 +8,12 @@ * LICENSE.txt file that was distributed with this source code. */ - /** - * This function increases the start parameter of the search form and submits - * the form. - * - * @returns void - */ +/** + * This function increases the start parameter of the search form and submits + * the form. + * + * @returns void + */ function nextResultPage() { var currentStart = $("#tx-dlf-search-in-document-form input[id='tx-dlf-search-in-document-start']").val(); var newStart = parseInt(currentStart) + 20; @@ -43,10 +43,226 @@ function resetStart() { $("#tx-dlf-search-in-document-form input[id='tx-dlf-search-in-document-start']").val(0); } +/** + * Add highlight effect for found search phrase. + * @param {array} highlightIds + * + * @returns void + */ +function addHighlightEffect(highlightIds) { + if (highlightIds.length > 0) { + highlightIds.forEach(function (highlightId) { + var targetElement = $('#' + highlightId); + + if (targetElement.length > 0 && !targetElement.hasClass('highlight')) { + targetElement.addClass('highlight'); + } + }); + } +} + +/** + * Get base URL for snippet links. + * + * @param {string} id + * + * @returns {string} + */ +function getBaseUrl(id) { + // Take the workview baseUrl from the form action. + // The URL may be in the following form + // - http://example.com/index.php?id=14 + // - http://example.com/workview (using slug on page with uid=14) + var baseUrl = $("form#tx-dlf-search-in-document-form").attr('action'); + + // check if action URL contains id, if not, get URL from window + if(baseUrl === undefined || baseUrl.split('?')[0].indexOf(id) === -1) { + baseUrl = $(location).attr('href'); + } + + return baseUrl; +} + +function getHighlights(highlight) { + var highlights = ""; + + for(var i = 0; i < highlight.length; i++) { + if (highlights === "") { + highlights += highlight[i]; + } else { + if(highlights.indexOf(highlight[i]) === -1) { + highlights += ';' + highlight[i]; + } + } + } + + return highlights; +} + +/** + * Get current URL query parameters. + * It returns array of params in form 'param=value' if there are any params supplied in the given url. If there are none it returns empty array + * + * @param {string} baseUrl + * + * @returns {array} array with params or empty + */ +function getCurrentQueryParams(baseUrl) { + if(baseUrl.indexOf('?') > 0) { + return baseUrl.slice(baseUrl.indexOf('?') + 1).split('&'); + } + + return []; +} + +/** + * Get all URL query parameters for snippet links. + * All means that it includes together params which were already supplied in the page url and params which are returned as search results. + * + * @param {string} baseUrl + * @param {array} queryParams + * + * @returns {array} array with params in form 'param' => 'value' + */ +function getAllQueryParams(baseUrl, queryParams) { + var params = getCurrentQueryParams(baseUrl); + + var queryParam; + for(var i = 0; i < params.length; i++) { + queryParam = params[i].split('='); + if(queryParams.indexOf(decodeURIComponent(queryParam[0])) === -1) { + queryParams.push(decodeURIComponent(queryParam[0])); + queryParams[decodeURIComponent(queryParam[0])] = queryParam[1]; + } + } + return queryParams; +} + +/** + * Get needed URL query parameters. + * It returns array of params as objects 'param' => 'value'. It contains exactly 3 params which are taken out of search result. + * + * @param {array} element + * + * @returns {array} array with params in form 'param' => 'value' + */ +function getNeededQueryParams(element) { + var id = $("input[id='tx-dlf-search-in-document-id']").attr('name'); + var highlightWord = $("input[id='tx-dlf-search-in-document-highlight-word']").attr('name'); + var page = $("input[id='tx-dlf-search-in-document-page']").attr('name'); + + var queryParams = []; + + if(id && getBaseUrl(element['uid']).split('?')[0].indexOf(element['uid']) === -1) { + queryParams.push(id); + queryParams[id] = element['uid']; + } + + if(highlightWord) { + queryParams.push(highlightWord); + queryParams[highlightWord] = encodeURIComponent($("input[id='tx-dlf-search-in-document-query']").val()); + } + + if(page) { + queryParams.push(page); + queryParams[page] = element['page']; + } + + return queryParams; +} + +/** + * Get snippet link. + * + * @param {array} element + * + * @returns {string} + */ +function getLink(element) { + var baseUrl = getBaseUrl(element['uid']); + + var queryParams = getNeededQueryParams(element); + + if (baseUrl.indexOf('?') > 0) { + queryParams = getAllQueryParams(baseUrl, queryParams); + baseUrl = baseUrl.split('?')[0]; + } + + var link = baseUrl + '?'; + + // add query params to result link + for(var i = 0; i < queryParams.length; i++) { + link += queryParams[i] + '=' + queryParams[queryParams[i]] + '&'; + } + link = link.slice(0, -1); + return link; +} + +function getNavigationButtons(start, numFound) { + var buttons = ""; + + if (start > 0) { + buttons += ''; + } + + if (numFound > (start + 20)) { + buttons += ''; + } + return buttons; +} + +function getCurrentPage() { + var page = 1; + var queryParams = getCurrentQueryParams(getBaseUrl(" ")); + + for(var i = 0; i < queryParams.length; i++) { + var queryParam = queryParams[i].split('='); + + if(decodeURIComponent(queryParam[0]) === $("input[id='tx-dlf-search-in-document-page']").attr('name')) { + page = parseInt(queryParam[1], 10); + } + } + + return page; +} + +function addImageHighlight(data) { + var page = getCurrentPage(); + + data['documents'].forEach(function (element, i) { + if(element['page'] === page) { + if (element['highlight'].length > 0) { + if(tx_dlf_viewer.map != null) { + tx_dlf_viewer.displayHighlightWord(encodeURIComponent(getHighlights(element['highlight']))); + } else { + setTimeout(addImageHighlight, 500, data); + } + } + addHighlightEffect(element['highlight']); + } + }); +} + +function triggerSearchAfterHitLoad() { + var queryParams = getCurrentQueryParams(getBaseUrl(" ")); + var searchedQueryParam = $("input[id='tx-dlf-search-in-document-highlight-word']").attr('name'); + + for(var i = 0; i < queryParams.length; i++) { + var queryParam = queryParams[i].split('='); + + if(searchedQueryParam && decodeURIComponent(queryParam[0]).indexOf(searchedQueryParam) !== -1) { + $("input[id='tx-dlf-search-in-document-query']").val(decodeURIComponent(queryParam[1])); + $("#tx-dlf-search-in-document-form").submit(); + break; + } + } +} + $(document).ready(function() { $("#tx-dlf-search-in-document-form").submit(function(event) { // Stop form from submitting normally event.preventDefault(); + $('#tx-dlf-search-in-document-loading').show(); $('#tx-dlf-search-in-document-clearing').hide(); $('#tx-dlf-search-in-document-button-next').hide(); @@ -66,35 +282,16 @@ $(document).ready(function() { var resultList = '
    '; var start = -1; if (data['numFound'] > 0) { - // Take the workview baseUrl from the form action. - // The URL may be in the following form - // - http://example.com/index.php?id=14 - // - http://example.com/workview (using slug on page with uid=14) - var baseUrl = $("form#tx-dlf-search-in-document-form").attr('action'); - - if (baseUrl.indexOf('?') > 0) { - baseUrl += '&'; - } else { - baseUrl += '?'; - } data['documents'].forEach(function (element, i) { if (start < 0) { start = i; } - var searchWord = element['snippet']; - searchWord = searchWord.substring(searchWord.indexOf('') + 4, searchWord.indexOf('')); - - var link = baseUrl - + 'tx_dlf[id]=' + element['uid'] - + '&tx_dlf[highlight_word]=' + encodeURIComponent(searchWord) - + '&tx_dlf[page]=' + element['page']; - if (element['snippet'].length > 0) { resultItems[element['page']] = '' + $('#tx-dlf-search-in-document-label-page').text() + ' ' + element['page'] + '
    ' + '' - + '' + element['snippet'] + '' + + '' + element['snippet'] + '' + ''; } }); @@ -105,29 +302,28 @@ $(document).ready(function() { resultItems.forEach(function (item, index) { resultList += '
  • ' + item + '
  • '; }); + + addImageHighlight(data); } else { resultList += '
  • ' + $('#tx-dlf-search-in-document-label-noresult').text() + '
  • '; } resultList += '
'; - if (start > 0) { - resultList += ''; - } - if (data['numFound'] > (start + 20)) { - resultList += ''; - } + resultList += getNavigationButtons(start, data['numFound']); $('#tx-dlf-search-in-document-results').html(resultList); }, "json" - ) - .done(function( data ) { + ).done(function (data) { $('#tx-dfgviewer-sru-results-loading').hide(); $('#tx-dfgviewer-sru-results-clearing').show(); }); }); - // clearing button - $('#tx-dlf-search-in-document-clearing').click(function() { + + // clearing button + $('#tx-dlf-search-in-document-clearing').click(function() { $('#tx-dlf-search-in-document-results ul').remove(); $('.results-active-indicator').remove(); $('#tx-dlf-search-in-document-query').val(''); }); + + triggerSearchAfterHitLoad(); }); diff --git a/Resources/Public/Javascript/PageView/Utility.js b/Resources/Public/Javascript/PageView/Utility.js index d731be7cf..48cc51719 100644 --- a/Resources/Public/Javascript/PageView/Utility.js +++ b/Resources/Public/Javascript/PageView/Utility.js @@ -865,16 +865,18 @@ dlfUtils.scaleToImageSize = function (features, imageObj, width, height, opt_off }; /** - * Search a feature collcetion for a feature with the given text + * Search a feature collection for a feature with the given coordinates * @param {Array.} featureCollection - * @param {string} text + * @param {string} coordinates * @return {Array.|undefined} */ -dlfUtils.searchFeatureCollectionForText = function (featureCollection, text) { +dlfUtils.searchFeatureCollectionForCoordinates = function (featureCollection, coordinates) { var features = []; featureCollection.forEach(function (ft) { if (ft.get('fulltext') !== undefined) { - if (ft.get('fulltext').toLowerCase().indexOf(text.toLowerCase()) > -1) features.push(ft); + if ((ft.get('width') + '_' + ft.get('height') + '_' + ft.get('hpos') + '_' + ft.get('vpos')) === coordinates) { + features.push(ft); + } } }); return features.length > 0 ? features : undefined;