Skip to content

Commit

Permalink
TASK: Extract createCrawler into WebCrawlerFactory
Browse files Browse the repository at this point in the history
  • Loading branch information
mhsdesign authored and rolandschuetz committed Feb 6, 2023
1 parent dfae32e commit f7301a6
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 112 deletions.
121 changes: 14 additions & 107 deletions Classes/Command/CheckLinksCommandController.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,9 @@
use CodeQ\LinkChecker\Infrastructure\CrawlNonExcludedUrls;
use CodeQ\LinkChecker\Domain\Notification\NotificationServiceInterface;
use CodeQ\LinkChecker\Infrastructure\OriginUrlException;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use CodeQ\LinkChecker\Infrastructure\WebCrawlerFactory;
use GuzzleHttp\Psr7\ServerRequest;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\RequestOptions;
use Neos\ContentRepository\Domain\Service\ContextFactoryInterface;
use Neos\Flow\Annotations as Flow;
use Neos\Flow\Cli\CommandController;
Expand All @@ -30,9 +25,6 @@
use Neos\Neos\Domain\Service\ContentContext;
use Neos\Utility\ObjectAccess;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlObservers\CrawlObserver;
use Spatie\Crawler\CrawlProfiles\CrawlProfile;

/**
* @Flow\Scope("singleton")
Expand Down Expand Up @@ -69,34 +61,30 @@ class CheckLinksCommandController extends CommandController
*/
protected $uriFactory;

/**
* @var WebCrawlerFactory
* @Flow\Inject
*/
protected $webCrawlerFactory;

/**
* @Flow\Inject
* @var ResultItemRepositoryInterface
*/
protected $resultItemRepository;

/**
* @var string
* @Flow\InjectConfiguration(path="notifications.service")
* @Flow\InjectConfiguration(path="notifications")
* @var array
*/
protected $notificationServiceClass;
protected $notificationSettings;

/**
* @var BaseUriProvider
* @Flow\Inject(lazy=false)
*/
protected $baseUriProvider;

protected array $settings;

/**
 * Stores the given settings array on this controller.
 *
 * Other methods of this class read entries such as "clientOptions" and
 * "notifications" from the stored array.
 *
 * @param array $settings The settings to keep for later lookups
 * @return void
 */
public function injectSettings(array $settings): void
{
    $this->settings = $settings;
}

/**
* Clear all stored errors
*
Expand Down Expand Up @@ -200,7 +188,7 @@ private function crawlExternalCommandImplementation(array $domainsToCrawl, int &
$crawlProfile = new CrawlNonExcludedUrls();
$crawlObserver = new LogAndPersistResultCrawlObserver();

$crawler = self::createCrawler($this->settings['clientOptions'] ?? [], $crawlProfile, $crawlObserver);
$crawler = $this->webCrawlerFactory->createCrawler($crawlProfile, $crawlObserver);

foreach ($domainsToCrawl as $domainToCrawl) {

Expand Down Expand Up @@ -235,87 +223,6 @@ private function ensureDomainsNotEmpty(array $domains): void
}
}

/**
 * Creates a crawler whose HTTP client is configured from the given settings.
 *
 * Keys read from $settings: "cookies", "connectionTimeout", "timeout", "allowRedirects",
 * "auth", "retryAttempts", "concurrency" and "ignoreRobots". Each option is only applied when
 * it passes the check next to it; otherwise the defaults below stay in effect
 * (timeout 100, redirects disabled, concurrency 10, robots rules ignored).
 *
 * @param array $settings Client options; the caller passes the "clientOptions" settings or an empty array
 * @param CrawlProfile $crawlProfile Profile that is attached to the crawler
 * @param CrawlObserver $crawlObserver Observer that is attached to the crawler
 * @return Crawler The configured crawler
 */
private static function createCrawler(array $settings, CrawlProfile $crawlProfile, CrawlObserver $crawlObserver): Crawler
{
    // If no settings are configured we just set timeout and allow_redirect.
    $clientOptions = [
        RequestOptions::TIMEOUT => 100,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    if (isset($settings['cookies']) && is_bool($settings['cookies'])) {
        $clientOptions[RequestOptions::COOKIES] = $settings['cookies'];
    }

    if (isset($settings['connectionTimeout']) && is_numeric($settings['connectionTimeout'])) {
        $clientOptions[RequestOptions::CONNECT_TIMEOUT] = (int)$settings['connectionTimeout'];
    }

    if (isset($settings['timeout']) && is_numeric($settings['timeout'])) {
        $clientOptions[RequestOptions::TIMEOUT] = (int)$settings['timeout'];
    }

    if (isset($settings['allowRedirects']) && is_bool($settings['allowRedirects'])) {
        $clientOptions[RequestOptions::ALLOW_REDIRECTS] = $settings['allowRedirects'];
    }

    // Credentials are only forwarded when the array holds more than one entry.
    if (
        isset($settings['auth']) && is_array($settings['auth'])
        && count($settings['auth']) > 1
    ) {
        $clientOptions[RequestOptions::AUTH] = $settings['auth'];
    }

    $handler = HandlerStack::create();

    if (isset($settings['retryAttempts']) && is_numeric($settings['retryAttempts']) && $settings['retryAttempts'] >= 0) {
        $retryAttempts = (int)$settings['retryAttempts'];

        $handler->push(
            Middleware::retry(
                // Decider: retry only failed connections, and only until the configured limit.
                // The nullable types are explicit because implicit nullability via "= null"
                // is deprecated as of PHP 8.4; \Throwable accepts every kind of failure reason.
                static function (
                    $retries,
                    Request $request,
                    ?Response $response = null,
                    ?\Throwable $exception = null
                ) use ($retryAttempts) {
                    if ($retries >= $retryAttempts) {
                        return false;
                    }

                    return $exception instanceof ConnectException;
                },
                // Delay: grows linearly with the number of retries already made.
                static function ($numberOfRetries) {
                    return 1000 * $numberOfRetries;
                }
            )
        );
    }

    $clientOptions['handler'] = $handler;

    $crawler = Crawler::create($clientOptions)
        ->setCrawlObserver($crawlObserver)
        ->setCrawlProfile($crawlProfile);

    $concurrency = 10;
    if (isset($settings['concurrency']) && (int)$settings['concurrency'] >= 0) {
        $concurrency = (int)$settings['concurrency'];
    }
    $crawler->setConcurrency($concurrency);

    // Robots rules are ignored unless "ignoreRobots" is set to a falsy, non-null value.
    if (!isset($settings['ignoreRobots']) || $settings['ignoreRobots']) {
        $crawler->ignoreRobots();
    }

    return $crawler;
}

private function createLinkCheckerDashboardUriFromStuff(array $domains): UriInterface
{
$firstDomain = $domains[0];
Expand Down Expand Up @@ -353,11 +260,11 @@ private function sendNotificationIfNecessary(int $errorCount, UriInterface $link
return;
}

if (!$this->settings['notifications']['enabled']) {
if (!$this->notificationSettings['enabled']) {
return;
}

$notificationServiceClass = trim($this->notificationServiceClass);
$notificationServiceClass = trim($this->notificationSettings['service']);
if ($notificationServiceClass === '') {
$errorMessage = 'No notification service has been configured, but the notification handling is enabled';
throw new \InvalidArgumentException($errorMessage, 1540201992);
Expand All @@ -372,7 +279,7 @@ private function sendNotificationIfNecessary(int $errorCount, UriInterface $link
);
}
$notificationService->sendNotification(
$this->settings['notifications']['subject'] ?? '',
$this->notificationSettings['subject'] ?? '',
[
'errorCount' => $errorCount,
'linkCheckerDashboardUri' => $linkCheckerDashboardUri
Expand Down
5 changes: 0 additions & 5 deletions Classes/Infrastructure/EmailService.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,12 @@

namespace CodeQ\LinkChecker\Infrastructure;

use CodeQ\LinkChecker\Domain\Model\ResultItem;
use CodeQ\LinkChecker\Domain\Notification\NotificationServiceInterface;
use League\Csv\CannotInsertRecord;
use League\Csv\Exception;
use Neos\Flow\Annotations as Flow;
use Neos\Flow\Configuration\ConfigurationManager;
use Neos\FluidAdaptor\View\StandaloneView;
use Neos\SwiftMailer\Message;
use League\Csv\Writer;
use Psr\Log\LoggerInterface;
use Swift_Attachment;

/**
* @Flow\Scope("singleton")
Expand Down
110 changes: 110 additions & 0 deletions Classes/Infrastructure/WebCrawlerFactory.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
<?php

namespace CodeQ\LinkChecker\Infrastructure;

use GuzzleHttp\Client;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use GuzzleHttp\RequestOptions;
use Neos\Flow\Annotations as Flow;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlObservers\CrawlObserver;
use Spatie\Crawler\CrawlProfiles\CrawlProfile;

/**
 * Builds crawler instances whose HTTP client is configured from the injected
 * "clientOptions" configuration.
 *
 * @Flow\Scope("singleton")
 */
class WebCrawlerFactory
{
    /**
     * Options read from the "clientOptions" configuration path. The keys used by this class are
     * "cookies", "connectionTimeout", "timeout", "allowRedirects", "auth", "retryAttempts",
     * "concurrency" and "ignoreRobots".
     *
     * @Flow\InjectConfiguration(path="clientOptions")
     * @var array
     */
    protected $settings;

    /**
     * Creates a crawler with the given profile and observer attached.
     *
     * Concurrency defaults to 10 and robots rules are ignored by default; both can be changed
     * through the injected settings.
     *
     * @param CrawlProfile $crawlProfile Profile that is attached to the crawler
     * @param CrawlObserver $crawlObserver Observer that is attached to the crawler
     * @return Crawler The configured crawler
     */
    public function createCrawler(CrawlProfile $crawlProfile, CrawlObserver $crawlObserver): Crawler
    {
        $crawler = new Crawler(new Client($this->buildClientOptions()));

        $crawler->setCrawlObserver($crawlObserver);
        $crawler->setCrawlProfile($crawlProfile);

        $concurrency = 10;
        if (isset($this->settings['concurrency']) && (int)$this->settings['concurrency'] >= 0) {
            $concurrency = (int)$this->settings['concurrency'];
        }
        $crawler->setConcurrency($concurrency);

        // Robots rules are ignored unless "ignoreRobots" is set to a falsy, non-null value.
        if (!isset($this->settings['ignoreRobots']) || $this->settings['ignoreRobots']) {
            $crawler->ignoreRobots();
        }

        return $crawler;
    }

    /**
     * Assembles the option array for the HTTP client, including its handler stack.
     *
     * @return array Client options keyed by request option name, plus the "handler" entry
     */
    private function buildClientOptions(): array
    {
        // If no settings are configured we just set timeout and allow_redirects.
        $clientOptions = [
            RequestOptions::TIMEOUT => 100,
            RequestOptions::ALLOW_REDIRECTS => false,
        ];

        if (isset($this->settings['cookies']) && is_bool($this->settings['cookies'])) {
            $clientOptions[RequestOptions::COOKIES] = $this->settings['cookies'];
        }

        if (isset($this->settings['connectionTimeout']) && is_numeric($this->settings['connectionTimeout'])) {
            $clientOptions[RequestOptions::CONNECT_TIMEOUT] = (int)$this->settings['connectionTimeout'];
        }

        if (isset($this->settings['timeout']) && is_numeric($this->settings['timeout'])) {
            $clientOptions[RequestOptions::TIMEOUT] = (int)$this->settings['timeout'];
        }

        if (isset($this->settings['allowRedirects']) && is_bool($this->settings['allowRedirects'])) {
            $clientOptions[RequestOptions::ALLOW_REDIRECTS] = $this->settings['allowRedirects'];
        }

        // Credentials are only forwarded when the array holds more than one entry.
        if (
            isset($this->settings['auth']) && is_array($this->settings['auth'])
            && count($this->settings['auth']) > 1
        ) {
            $clientOptions[RequestOptions::AUTH] = $this->settings['auth'];
        }

        $handler = HandlerStack::create();

        if (
            isset($this->settings['retryAttempts'])
            && is_numeric($this->settings['retryAttempts'])
            && $this->settings['retryAttempts'] >= 0
        ) {
            $handler->push(
                self::createRetryOnConnectionTimedOutMiddleware((int)$this->settings['retryAttempts'])
            );
        }

        $clientOptions['handler'] = $handler;

        return $clientOptions;
    }

    /**
     * Creates a retry middleware that repeats a request only when it failed with a
     * ConnectException, up to the given number of attempts.
     *
     * @param int $retryAttempts Maximum number of retries before giving up
     * @return callable Middleware to push onto a handler stack
     */
    private static function createRetryOnConnectionTimedOutMiddleware(int $retryAttempts): callable
    {
        return Middleware::retry(
            // Decider. The nullable types are explicit because implicit nullability via
            // "= null" is deprecated as of PHP 8.4; \Throwable accepts every kind of failure reason.
            static function (
                $retries,
                Request $request,
                ?Response $response = null,
                ?\Throwable $exception = null
            ) use ($retryAttempts) {
                if ($retries >= $retryAttempts) {
                    return false;
                }

                return $exception instanceof ConnectException;
            },
            // Delay: grows linearly with the number of retries already made.
            static function ($numberOfRetries) {
                return 1000 * $numberOfRetries;
            }
        );
    }
}

0 comments on commit f7301a6

Please sign in to comment.