Skip to content

Commit

Permalink
add sharedClient option
Browse files Browse the repository at this point in the history
  • Loading branch information
marevol committed Dec 31, 2023
1 parent 09c8ab2 commit 0bcca9d
Showing 1 changed file with 58 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.filter.UrlFilter;
import org.codelibs.fess.crawler.helper.MimeTypeHelper;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
Expand Down Expand Up @@ -77,6 +78,12 @@ public class PlaywrightClient extends AbstractCrawlerClient {

private static final Logger logger = LoggerFactory.getLogger(PlaywrightClient.class);

private static final Object INITIALIZATION_LOCK = new Object();

protected static Tuple4<Playwright, Browser, BrowserContext, Page> SHARED_WORKER = null;

protected static final String SHARED_CLIENT = "sharedClient";

protected static final String RENDERED_STATE = "renderedState";

protected static final String IGNORE_HTTPS_ERRORS_PROPERTY = "ignoreHttpsErrors";
Expand All @@ -91,7 +98,7 @@ public class PlaywrightClient extends AbstractCrawlerClient {

protected LaunchOptions launchOptions;

protected NewContextOptions newContextOptions = new NewContextOptions();
protected NewContextOptions newContextOptions;

protected int downloadTimeout = 15; // 15s

Expand All @@ -105,23 +112,41 @@ public class PlaywrightClient extends AbstractCrawlerClient {
protected CrawlerContainer crawlerContainer;

@Override
public synchronized void init() {
if (worker != null) {
return;
}
public void init() {
synchronized (INITIALIZATION_LOCK) {
if (worker != null) {
return;
}

if (logger.isDebugEnabled()) {
logger.debug("Initiaizing Playwright...");
}
super.init();
if (logger.isDebugEnabled()) {
logger.debug("Initiaizing Playwright...");
}
super.init();

// initialize Playwright's browser context
this.initNewContextOptions();
final String renderedStateParam = getInitParameter(RENDERED_STATE, renderedState.name(), String.class);
if (renderedStateParam != null) {
renderedState = LoadState.valueOf(renderedStateParam);
}

final String renderedStateParam = getInitParameter(RENDERED_STATE, renderedState.name(), String.class);
if (renderedStateParam != null) {
renderedState = LoadState.valueOf(renderedStateParam);
final Boolean shared = getInitParameter(SHARED_CLIENT, Boolean.FALSE, Boolean.class);
if (shared) {
if (SHARED_WORKER == null) {
if (logger.isDebugEnabled()) {
logger.debug("Creating a shared Playwright worker...");
}
SHARED_WORKER = createPlaywrightWorker();
}
logger.info("Use a shared Playwright worker.");
worker = SHARED_WORKER;
} else {
worker = createPlaywrightWorker();
}
}
}

protected Tuple4<Playwright, Browser, BrowserContext, Page> createPlaywrightWorker() {
// initialize Playwright's browser context
final NewContextOptions newContextOptions = initNewContextOptions();

Playwright playwright = null;
Browser browser = null;
Expand All @@ -130,7 +155,7 @@ public synchronized void init() {
try {
playwright = Playwright.create(new Playwright.CreateOptions().setEnv(options));
browser = getBrowserType(playwright).launch(launchOptions);
browserContext = this.createAuthenticatedContext(browser);
browserContext = this.createAuthenticatedContext(browser, newContextOptions);
page = browserContext.newPage();
} catch (final Exception e) {
if (logger.isDebugEnabled()) {
Expand All @@ -140,7 +165,7 @@ public synchronized void init() {
throw new CrawlerSystemException("Failed to create PlaywrightClient.", e);
}

worker = new Tuple4<>(playwright, browser, browserContext, page);
return new Tuple4<>(playwright, browser, browserContext, page);
}

@Override
Expand Down Expand Up @@ -268,7 +293,13 @@ public ResponseData execute(final RequestData request) {
logger.debug("Waiting for downloaded file: {}", e.getMessage());
}
for (int i = 0; i < downloadTimeout * 10 && (downloadRef.get() == null || responseRef.get() == null); i++) {
page.waitForTimeout(100L);
try {
page.waitForTimeout(100L);
} catch (final Exception e1) {
if (logger.isDebugEnabled()) {
logger.debug("Failed to wait for page loading.", e1);
}
}
}
final Response response = responseRef.get();
final Download download = downloadRef.get();
Expand All @@ -278,7 +309,7 @@ public ResponseData execute(final RequestData request) {
}
return createResponseData(page, request, response, download);
}
throw new CrawlerSystemException("Failed to access " + request.getUrl(), e);
throw new CrawlingAccessException("Failed to access " + request.getUrl(), e);
} finally {
resetPage(page);
}
Expand Down Expand Up @@ -453,18 +484,16 @@ protected String getCharSet(final Response response) {
/**
* Reads the configuration from the Web UI and passes it to the Playwright context.
*/
protected void initNewContextOptions() {
if (this.newContextOptions == null) {
this.newContextOptions = new NewContextOptions();
}
protected NewContextOptions initNewContextOptions() {
final NewContextOptions options = newContextOptions != null ? newContextOptions : new NewContextOptions();

// Check whether to skip SSL certificate checking
// Also check ignoreSslCertificate for backward compatibility with HcHttpClient's config
final boolean ignoreHttpsErrors = getInitParameter(IGNORE_HTTPS_ERRORS_PROPERTY, false, Boolean.class);
final boolean ignoreSslCertificate = getInitParameter(HcHttpClient.IGNORE_SSL_CERTIFICATE_PROPERTY, false, Boolean.class);

if (ignoreHttpsErrors || ignoreSslCertificate) {
this.newContextOptions.ignoreHTTPSErrors = true;
options.ignoreHTTPSErrors = true;
}

// append existing proxy configuration
Expand All @@ -482,33 +511,34 @@ protected void initNewContextOptions() {
proxy.setPassword(proxyCredentials.getPassword());
}
proxy.setBypass(proxyBypass);
this.newContextOptions.setProxy(proxy);
options.setProxy(proxy);
}
return options;
}

/**
* Creates an authenticated Playwright context, by using Fess's built-in HcHttpClient to do authentication,
* then passes its cookies to Playwright.
*/
protected BrowserContext createAuthenticatedContext(final Browser browser) {
protected BrowserContext createAuthenticatedContext(final Browser browser, final NewContextOptions newContextOptions) {
final Authentication[] authentications =
getInitParameter(HcHttpClient.BASIC_AUTHENTICATIONS_PROPERTY, new Authentication[0], Authentication[].class);

if (authentications.length == 0) {
return browser.newContext(this.newContextOptions);
return browser.newContext(newContextOptions);
}

for (final Authentication authentication : authentications) {
if (!StringUtils.equals(authentication.getAuthScheme().getSchemeName(), "form")) {
// Use the first non-form auth credentials to fill the browser's credential prompt
final String username = authentication.getCredentials().getUserPrincipal().getName();
final String password = authentication.getCredentials().getPassword();
this.newContextOptions.setHttpCredentials(username, password);
newContextOptions.setHttpCredentials(username, password);
break;
}
}

final BrowserContext playwrightContext = browser.newContext(this.newContextOptions);
final BrowserContext playwrightContext = browser.newContext(newContextOptions);
try (final var fessHttpClient = new HcHttpClient()) {
fessHttpClient.setInitParameterMap(this.initParamMap);
fessHttpClient.init();
Expand Down

0 comments on commit 0bcca9d

Please sign in to comment.