Skip to content

Commit

Permalink
use hpagent package
Browse files Browse the repository at this point in the history
  • Loading branch information
anish-work committed Nov 13, 2024
1 parent a70415c commit 5751e88
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 79 deletions.
48 changes: 10 additions & 38 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@
"author": "",
"license": "ISC",
"dependencies": {
"axios": "^1.7.7",
"cors": "^2.8.5",
"dotenv": "^8.6.0",
"express": "^4.19.2",
"got": "^11.8.0",
"hpagent": "^1.2.0",
"metascraper": "^5.14.18",
"metascraper-description": "^5.14.18",
"metascraper-image": "^5.14.18",
Expand Down
51 changes: 11 additions & 40 deletions src/proxyConfig.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// proxyConfig.js
const path = require("path");
const fs = require("fs");
const https = require("https");
const got = require("got");
const { HttpsProxyAgent, HttpProxyAgent } = require("hpagent");

// Fake user agents array - you can expand this list
const FAKE_USER_AGENTS = [
Expand Down Expand Up @@ -34,53 +32,26 @@ function getProxyUrl(scheme) {
return `${scheme}://${config.SCRAPING_PROXY_USERNAME}:${config.SCRAPING_PROXY_PASSWORD}@${config.SCRAPING_PROXY_HOST}`;
}

// Scheme-to-proxy-URL map for the scraping proxy.
// Stays empty when no SCRAPING_PROXY_HOST is configured.
const SCRAPING_PROXIES = {};
if (config.SCRAPING_PROXY_HOST) {
  SCRAPING_PROXIES.http = getProxyUrl("http");
  SCRAPING_PROXIES.https = getProxyUrl("https");
}

// Download the scraping proxy's CA certificate (once) and return its local
// filesystem path. Returns null when no SCRAPING_PROXY_CERT_URL is configured.
// The cert is cached on disk at BASE_DIR/proxy_ca_crt.pem and only fetched
// on first use.
async function getScrapingProxyCertPath() {
  if (!config.SCRAPING_PROXY_CERT_URL) {
    return null;
  }

  const certPath = path.join(config.BASE_DIR, "proxy_ca_crt.pem");

  if (!fs.existsSync(certPath)) {
    console.log(`Downloading proxy cert to ${certPath}`);
    // got exposes the payload as `response.body` — the previous code read
    // the axios-style `response.data`, which is always undefined with got,
    // so an empty cert file was written. got's binary response type is
    // "buffer" (axios's "arraybuffer" is not a valid got responseType).
    const response = await got(config.SCRAPING_PROXY_CERT_URL, {
      responseType: "buffer",
    });
    fs.writeFileSync(certPath, response.body);
  }

  return certPath;
}

// Build the got request options used for scraping: a randomized User-Agent
// plus, when a proxy host is configured, an hpagent HTTPS proxy agent that
// trusts the proxy's CA certificate (downloaded via getScrapingProxyCertPath).
async function getScrapingConfig() {
  const certPath = await getScrapingProxyCertPath();

  const options = {
    headers: {
      // Rotate through fake UAs so scraped sites see varied clients.
      "User-Agent":
        FAKE_USER_AGENTS[Math.floor(Math.random() * FAKE_USER_AGENTS.length)],
    },
  };

  // Only attach a proxy agent when a proxy host is actually configured;
  // otherwise getProxyUrl() would yield a bogus
  // "https://undefined:undefined@undefined" URL.
  if (config.SCRAPING_PROXY_HOST) {
    options.agent = {
      // hpagent takes the proxy URL under the `proxy` key — the previous
      // code passed `https:`, which hpagent does not recognize, so no
      // proxy was ever configured. hpagent agents also accept standard
      // https.Agent options, so the downloaded CA is wired in via `ca`
      // (it was previously attached to an axios-only httpsAgent and lost
      // in the got migration). The axios-specific `proxy`/`httpsAgent`
      // leftovers are removed: got ignores them.
      https: new HttpsProxyAgent({
        proxy: getProxyUrl("https"),
        ca: certPath ? fs.readFileSync(certPath) : undefined,
      }),
    };
  }

  return options;
}

// Public API of this module: getScrapingConfig builds per-request options;
// FAKE_USER_AGENTS and SCRAPING_PROXIES are exposed for reuse elsewhere.
module.exports = {
  getScrapingConfig,
  FAKE_USER_AGENTS,
  SCRAPING_PROXIES,
};

0 comments on commit 5751e88

Please sign in to comment.