Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ISSUE-3: OCR specific Processor and new features/processing option #11

Merged
merged 24 commits into from
Dec 15, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d0d4a6d
proc_open and killing of processes taking more than X seconds
DiegoPino Nov 23, 2020
0010de2
Moves Processing from PreSave to PostSave
DiegoPino Nov 23, 2020
b332c59
Updates processor annotations (this may change, i feel its a bit comp…
DiegoPino Nov 23, 2020
618a56c
Updates Service to run on PostSave
DiegoPino Nov 23, 2020
e144f71
So many updates on our Queue Worker
DiegoPino Nov 23, 2020
5c6fa9f
First pass on OcrPostProcessor
DiegoPino Nov 23, 2020
391101b
OK. I got XPath is working
DiegoPino Nov 23, 2020
224c440
Route Fix for D9
DiegoPino Nov 23, 2020
617b64a
Address page id question from giancarlo
DiegoPino Nov 23, 2020
f1073bd
Add search_api_solr to composer dependency
DiegoPino Nov 24, 2020
0442676
Correctly process page ratio and parse things out for miniOCR
DiegoPino Nov 24, 2020
d201c75
Checks if Checksum + search_api_id are already in Solr
DiegoPino Nov 24, 2020
2726210
Remove leading 0s from miniOCR dimensions
DiegoPino Nov 24, 2020
eaf0a47
Chained processors working
DiegoPino Dec 1, 2020
5c0d688
Fixed generic key store key, now all pages are actually different
DiegoPino Dec 1, 2020
f6247d7
Basically a LOT: For now 2 events subs, share the same code so i may …
DiegoPino Dec 4, 2020
47aa349
Created an abstract class and the queue worker. Simpler
DiegoPino Dec 4, 2020
10e8631
Update StrawberryRunnersPostProcessorPluginBase.php
DiegoPino Dec 4, 2020
45ca514
This is the largest change
DiegoPino Dec 4, 2020
04e5f11
remove deprecated D9 for temp storage
DiegoPino Dec 4, 2020
122e778
Address @giancarlobi review (comparison operation) and does some gene…
DiegoPino Dec 6, 2020
02abe54
Address Code review from @giancarlobi
DiegoPino Dec 7, 2020
4c54c94
Drupal 9 in the .info
DiegoPino Dec 7, 2020
91ccc23
Update hook for missing entity
DiegoPino Dec 15, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions config/schema/strawberry_runners.schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,44 @@ strawberryfield_runners.strawberry_runners_postprocessor.*:
strawberryfield_runners.strawberry_runners_postprocessor.binary:
type: config_object
label: 'Strawberry Runners Post Processor Config Entity Binary specific config'
mapping:
source_type:
type: string
label: 'The type of Source Data this Processor works on'
ado_type:
type: string
label: 'DO type(s) to limit this Processor to'
jsonkey:
type: sequence
label: 'The JSON key(s) containing the desired Source File(s)'
sequence:
- type: string
mime_type:
type: string
label: 'Mimetypes(s) to limit this Processor to'
path:
type: string
label: 'The path for he binary to execute'
arguments:
type: string
label: 'Any additional argument your executable binary requires'
output_type:
type: string
label: 'The expected and desired output of this processor'
output_destination:
type: sequence
label: 'Where and how the output will be used'
sequence:
- type: string
timeout:
type: integer
label: 'Timeout in seconds for this process'
weight:
type: integer
label: 'Order or execution in the global chain'
strawberryfield_runners.strawberry_runners_postprocessor.ocr:
type: config_object
label: 'Strawberry Runners Post Processor Config Entity OCR specific config'
mapping:
source_type:
type: string
Expand All @@ -49,6 +87,15 @@ strawberryfield_runners.strawberry_runners_postprocessor.binary:
arguments:
type: string
label: 'Any additional argument your executable binary requires'
tesseract_arguments:
type: string
label: 'Any additional argument your executable binary requires'
path:
type: string
label: 'The path for he binary to execute'
tesseract_path:
type: string
label: 'The path for he binary to execute'
output_type:
type: string
label: 'The expected and desired output of this processor'
Expand Down
9 changes: 8 additions & 1 deletion src/Annotation/StrawberryRunnersPostProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@ class StrawberryRunnersPostProcessor extends Plugin {
*/
public $input_property;

/**
* The Object property that contains the additional data needed by the Processor ::run method
*
* @var string $input_arguments;
*
*/
public $input_arguments;

/**
* Processing stage: can be Entity PreSave or Index time search_api
Expand All @@ -64,4 +71,4 @@ class StrawberryRunnersPostProcessor extends Plugin {
*/
public $when = StrawberryRunnersPostProcessor::PRESAVE;

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Session\AccountInterface;
use Drupal\strawberryfield\Event\StrawberryfieldCrudEvent;
use Drupal\strawberryfield\EventSubscriber\StrawberryfieldEventPresaveSubscriber;
use Drupal\Core\StringTranslation\StringTranslationTrait;
use Drupal\Core\StringTranslation\TranslationInterface;
use Drupal\Core\Messenger\MessengerInterface;
Expand All @@ -14,14 +13,14 @@
use Drupal\Component\Utility\Unicode;
use Drupal\file\FileInterface;
use Drupal\Core\StreamWrapper\StreamWrapperManagerInterface;
use Drupal\Core\StreamWrapper\StreamWrapperInterface;
use Drupal\Core\File\FileSystemInterface;
use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginManager;
use Drupal\strawberryfield\EventSubscriber\StrawberryfieldEventSaveSubscriber;

/**
* Event subscriber for SBF bearing entity json process event.
*/
class StrawberryRunnersEventPreSavePostProcessingSubscriber extends StrawberryfieldEventPresaveSubscriber {
class StrawberryRunnersEventSavePostProcessingSubscriber extends StrawberryfieldEventSaveSubscriber {


use StringTranslationTrait;
Expand Down Expand Up @@ -150,7 +149,7 @@ public function __construct(
* @throws \Drupal\Component\Plugin\Exception\PluginException
* @throws \Drupal\Component\Plugin\Exception\PluginNotFoundException
*/
public function onEntityPresave(StrawberryfieldCrudEvent $event) {
public function onEntitySave(StrawberryfieldCrudEvent $event) {

/* @var $plugin_config_entities \Drupal\strawberry_runners\Entity\strawberryRunnerPostprocessorEntity[] */
$plugin_config_entities = $this->entityTypeManager->getListBuilder('strawberry_runners_postprocessor')->load();
Expand Down Expand Up @@ -232,9 +231,9 @@ public function onEntityPresave(StrawberryfieldCrudEvent $event) {
/** @var $itemfield \Drupal\strawberryfield\Plugin\Field\FieldType\StrawberryFieldItem */
$flatvalues = (array) $itemfield->provideFlatten();
// Run first on entity:files
$sbf_type = NULL;
$sbf_type = [];
if (isset($flatvalues['type'])) {
$sbf_type = $flatvalues['type'];
$sbf_type = (array) $flatvalues['type'];
}
foreach ($askeymap as $jsonkey => $activePlugins) {
if (isset($flatvalues[$jsonkey])) {
Expand All @@ -243,12 +242,16 @@ public function onEntityPresave(StrawberryfieldCrudEvent $event) {

foreach($activePlugins as $activePluginId => $config) {
$valid_mimes = [];
if (empty($config['ado_type']) || in_array($config['ado_type'] , $sbf_type)) {
//@TODO also split $config['ado_type'] so we can check
$valid_ado_type = [];
$valid_ado_type = explode(',', $config['ado_type']);
if (empty($config['ado_type']) || count(array_intersect($valid_ado_type , $sbf_type)) > 0) {
$valid_mimes = explode(',', $config['mime_type']);
if (empty($valid_mimes) || (isset($asstructure["dr:mimetype"]) && in_array($asstructure["dr:mimetype"], $valid_mimes))) {
$data = new \stdClass();
$data->fid = $asstructure['dr:fid'];
$data->nid = $entity->id();
$data->nuuid = $entity->uuid();
// We are passing also the full file metadata.
// This gives us an advantage so we can reuse
// Sequence IDs, PDF pages, etc and act on them
Expand All @@ -260,6 +263,16 @@ public function onEntityPresave(StrawberryfieldCrudEvent $event) {
// $activePluginId? That would allow us to skip reprocessing
// Easier?
$data->metadata = $asstructure;

// @TODO how to force?
// Can be a state key, valuekey, or a JSON passed property.
// The issue with a JSON passed property is that we can no longer
// modify it here (the Entity is already saved).
// So we should really have a non-Metadata method for this,
// or we can have a preSave Subscriber that reads the property,
// sets the state and then removes it before saving.

$data->force = FALSE;
$data->plugin_config_entity_id = $activePluginId;
// See https://github.com/esmero/strawberry_runners/issues/10
// Since the destination Queue can be a modal thing
Expand Down
66 changes: 42 additions & 24 deletions src/Plugin/QueueWorker/IndexPostProcessorQueueWorker.php
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,13 @@ public function processItem($data) {

$processor_instance = $this->getProcessorPlugin($data->plugin_config_entity_id);

if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL) {
if (!isset($data->fid) || $data->fid == NULL || !isset($data->nid) || $data->nid == NULL || !is_array($data->metadata)) {
return;
}
$file = $this->entityTypeManager->getStorage('file')->load($data->fid);

if ($file === NULL) {
if ($file === NULL || !isset($data->metadata['checksum'])) {
error_log('Sorry the file does not exist or has no checksum yet. We really need the checksum');
return;
}
//@TODO should we wrap this around a try catch?
Expand All @@ -188,37 +189,54 @@ public function processItem($data) {

// Skip file if element is found in key_value collection.
$processed_data = $this->keyValue->get($keyvalue_collection)->get($key);

if (empty($processed_data)) {
error_log('Is this already in our temp keyValue?');
error_log(empty($processed_data));
//@TODO allow a force in case of a corrupted key value? Partial output?
// Extraneous weird data?
if (true || empty($processed_data) ||
$data->force == TRUE ||
(!isset($processed_data->checksum) ||
empty($processed_data->checksum) ||
$processed_data->checksum != $data->metadata['checksum'])) {
// Extract file and save it in key_value collection.
$io = new \stdClass();
$input = new \stdClass();
$input->filepath = $filelocation;

$input->page_number = 1;
// The Node UUID
$input->nuuid = $data->nuuid;
// All the rest of the associated Metadata in an as:structure
$input->metadata = $data->metadata;
$io->input = $input;
$io->output = NULL;
//@TODO implement the TEST and BENCHMARK logic here
// RUN should return exit codes so we can know if something failed
// And totally discard indexing.
$extracted_data = $processor_instance->run($io, StrawberryRunnersPostProcessorPluginInterface::PROCESS);
error_log ('processing just run');
error_log($io->ouput);
error_log('writing to keyvalue');
error_log($key);
$this->keyValue->get($keyvalue_collection)->set($key, $io->output);
}

// Get which indexes have our StrawberryfieldFlavorDatasource enabled!
$indexes = StrawberryfieldFlavorDatasource::getValidIndexes();

$item_ids = [];
if (is_a($entity, TranslatableInterface::class)) {
$translations = $entity->getTranslationLanguages();
foreach ($translations as $translation_id => $translation) {
$item_ids[] = $entity->id() . ':'.'1' .':'.$translation_id.':'.$file->uuid().':'.$data->plugin_config_entity_id;
$toindex = new \stdClass();
$toindex->fulltext = $io->output;
$toindex->checksum = $data->metadata['checksum'];
error_log(var_export($toindex,true));
$this->keyValue->get($keyvalue_collection)->set($key, $toindex);

// Get which indexes have our StrawberryfieldFlavorDatasource enabled!
$indexes = StrawberryfieldFlavorDatasource::getValidIndexes();

$item_ids = [];
if (is_a($entity, TranslatableInterface::class)) {
$translations = $entity->getTranslationLanguages();
foreach ($translations as $translation_id => $translation) {
$item_ids[] = $entity->id() . ':'.'1' .':'.$translation_id.':'.$file->uuid().':'.$data->plugin_config_entity_id;
}
}
error_log(var_export($item_ids,true));
$datasource_id = 'strawberryfield_flavor_datasource';
foreach ($indexes as $index) {
$index->trackItemsInserted($datasource_id, $item_ids);
}
}
error_log(var_export($item_ids,true));
$datasource_id = 'strawberryfield_flavor_datasource';
foreach ($indexes as $index) {
$index->trackItemsUpdated($datasource_id, $item_ids);
}
}
catch (\Exception $exception) {
Expand Down Expand Up @@ -252,7 +270,7 @@ private function ensureFileAvailability(FileInterface $file) {
// Check first if the file is already around in temp?
// @TODO can we be sure it's the same one? Ideas?
if (is_readable(
$this->fileSystem->realpath(
$this->fileSystem->realpath(
'temporary://sbr_' . $cache_key . '_' . basename($uri)
)
)) {
Expand Down Expand Up @@ -306,4 +324,4 @@ public function getRealpath($uri) {
}
}

}
}
Loading