From 644443c34c5aecac2850098aa9817f001a93cc9d Mon Sep 17 00:00:00 2001 From: Casper Kuijjer Date: Sun, 17 Dec 2023 19:22:46 +0100 Subject: [PATCH] testing cinerama using scrapeops --- cloud/.env.example | 3 +++ cloud/playground.ts | 53 ++++++++++++++++++++------------------ cloud/scrapers/cinerama.ts | 51 ++++++++++++++++++------------------ cloud/serverless.yml | 2 ++ 4 files changed, 59 insertions(+), 50 deletions(-) diff --git a/cloud/.env.example b/cloud/.env.example index 7363ccd1..6e4462c7 100644 --- a/cloud/.env.example +++ b/cloud/.env.example @@ -5,3 +5,6 @@ OMDB_API_KEY= GOOGLE_CUSTOM_SEARCH_API_KEY= GOOGLE_CUSTOM_SEARCH_ID= DYNAMODB_MOVIE_METADATA= +PRIVATE_BUCKET= +PUBLIC_BUCKET= +SCRAPEOPS_API_KEY= diff --git a/cloud/playground.ts b/cloud/playground.ts index 74ac56aa..488d25aa 100644 --- a/cloud/playground.ts +++ b/cloud/playground.ts @@ -132,30 +132,33 @@ const movieMetadataPlayground = async () => { } const getUsingGot = async () => { - const json = await got( - 'https://kinepolisweb-programmation.kinepolis.com/api/Programmation/NL/NL/WWW/Cinema/Cinerama', - { - headers: { - authority: 'kinepolisweb-programmation.kinepolis.com', - // accept: 'application/json, text/javascript, */*; q=0.01', - accept: - 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'accept-language': 'en-US,en;q=0.9', - 'if-modified-since': 'Wed, 04 Oct 2023 19:44:34 GMT', - 'if-none-match': '"14021a8ddd8adf9db8db447b7f94cc59:1696448674.360531"', - 'sec-ch-ua': '"Chromium";v="117", "Not;A=Brand";v="8"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"macOS"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'cross-site', - Referer: 'https://cineramabios.nl/', - 'Referrer-Policy': 'strict-origin-when-cross-origin', - 'user-agent': - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - }, + const url = + 'https://kinepolisweb-programmation.kinepolis.com/api/Programmation/NL/NL/WWW/Cinema/Cinerama' + const scrapeOpsProxyUrl = `https://proxy.scrapeops.io/v1/?api_key=${ + process.env.SCRAPEOPS_API_KEY + }&url=${encodeURIComponent(url)}` + + const json = await got(scrapeOpsProxyUrl, { + headers: { + authority: 'kinepolisweb-programmation.kinepolis.com', + // accept: 'application/json, text/javascript, */*; q=0.01', + accept: + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'accept-language': 'en-US,en;q=0.9', + 'if-modified-since': 'Wed, 04 Oct 2023 19:44:34 GMT', + 'if-none-match': '"14021a8ddd8adf9db8db447b7f94cc59:1696448674.360531"', + 'sec-ch-ua': '"Chromium";v="117", "Not;A=Brand";v="8"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"macOS"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'cross-site', + Referer: 'https://cineramabios.nl/', + 'Referrer-Policy': 'strict-origin-when-cross-origin', + 'user-agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', }, - ).json() + }).json() logger.info('result', { json }) return json @@ -185,8 +188,8 @@ const playground = async ({ event, context } = {}) => { // const results = await findMetadata('chungking express') // const results = await findMetadata('Caché') // await getUsingChromium() - // const result = await getUsingGot() - const result = await getLux() + const result = await getUsingGot() + // const result = await getLux() console.log(JSON.stringify(result, null, 2)) } diff --git a/cloud/scrapers/cinerama.ts b/cloud/scrapers/cinerama.ts index f48bd0f8..56c84dc6 100644 --- a/cloud/scrapers/cinerama.ts +++ b/cloud/scrapers/cinerama.ts @@ -42,31 +42,32 @@ const hasEnglishSubtitles = (movie: KinepolisMovie) => { const extractFromMainPage = async (): Promise => { try { - const programmation: KinepolisProgrammation = await got( - 'https://kinepolisweb-programmation.kinepolis.com/api/Programmation/NL/NL/WWW/Cinema/Cinerama', - { - headers: { - authority: 'kinepolisweb-programmation.kinepolis.com', - // accept: 'application/json, text/javascript, */*; q=0.01', - accept: - 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'accept-language': 'en-US,en;q=0.9', - 'if-modified-since': 'Wed, 04 Oct 2023 19:44:34 GMT', - 'if-none-match': - '"14021a8ddd8adf9db8db447b7f94cc59:1696448674.360531"', - 'sec-ch-ua': '"Chromium";v="117", "Not;A=Brand";v="8"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"macOS"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'cross-site', - Referer: 'https://cineramabios.nl/', - 'Referrer-Policy': 'strict-origin-when-cross-origin', - 'user-agent': - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', - }, - }, - ).json() + const url = + 'https://kinepolisweb-programmation.kinepolis.com/api/Programmation/NL/NL/WWW/Cinema/Cinerama' + const scrapeOpsProxyUrl = `https://proxy.scrapeops.io/v1/?api_key=${ + process.env.SCRAPEOPS_API_KEY + }&url=${encodeURIComponent(url)}` + + const programmation: KinepolisProgrammation = await got(scrapeOpsProxyUrl, { + // headers: { + // authority: 'kinepolisweb-programmation.kinepolis.com', + // accept: + // 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + // 'accept-language': 'en-US,en;q=0.9', + // 'if-modified-since': 'Wed, 04 Oct 2023 19:44:34 GMT', + // 'if-none-match': '"14021a8ddd8adf9db8db447b7f94cc59:1696448674.360531"', + // 'sec-ch-ua': '"Chromium";v="117", "Not;A=Brand";v="8"', + // 'sec-ch-ua-mobile': '?0', + // 'sec-ch-ua-platform': '"macOS"', + // 'sec-fetch-dest': 'empty', + // 'sec-fetch-mode': 'cors', + // 'sec-fetch-site': 'cross-site', + // Referer: 'https://cineramabios.nl/', + // 'Referrer-Policy': 'strict-origin-when-cross-origin', + // 'user-agent': + // 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36', + // }, + }).json() const moviesWithEnglishSubtitles = programmation.films.filter(hasEnglishSubtitles) diff --git a/cloud/serverless.yml b/cloud/serverless.yml index 5f0146d4..ec40b8e4 100644 --- a/cloud/serverless.yml +++ b/cloud/serverless.yml @@ -77,6 +77,7 @@ functions: GOOGLE_CUSTOM_SEARCH_ID: ${env:GOOGLE_CUSTOM_SEARCH_ID} GOOGLE_CUSTOM_SEARCH_API_KEY: ${env:GOOGLE_CUSTOM_SEARCH_API_KEY} SCRAPERS: ${env:SCRAPERS, ''} # '' as default value, as SCRAPERS is the only optional env var + SCRAPEOPS_API_KEY: ${env:SCRAPEOPS_API_KEY} layers: - arn:aws:lambda:eu-west-1:764866452798:layer:chrome-aws-lambda:38 # https://github.com/shelfio/chrome-aws-lambda-layer @@ -91,6 +92,7 @@ functions: OMDB_API_KEY: ${env:OMDB_API_KEY} GOOGLE_CUSTOM_SEARCH_ID: ${env:GOOGLE_CUSTOM_SEARCH_ID} GOOGLE_CUSTOM_SEARCH_API_KEY: ${env:GOOGLE_CUSTOM_SEARCH_API_KEY} + SCRAPEOPS_API_KEY: ${env:SCRAPEOPS_API_KEY} notifySlack: handler: handler.notifySlack