Skip to content

Commit

Permalink
ok fixed shid
Browse files Browse the repository at this point in the history
  • Loading branch information
ch1pkav committed Apr 5, 2023
1 parent 53ea3a7 commit 116e917
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
8 changes: 5 additions & 3 deletions src/BSONPage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ void BSONPage::get_text(std::string& text) const {
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &BSONPage::write_data);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &text);
auto res = curl_easy_perform(curl);
curl_easy_cleanup(curl);
if (res != CURLE_OK) {
curl_easy_cleanup(curl);
throw(std::runtime_error("Error while parsing " + url_));
}
long http_code = 0;
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
curl_easy_cleanup(curl);
if (200 > http_code || http_code >= 300) {
throw(std::runtime_error("Error while parsing " + url_ + " HTTP code: " + std::to_string(http_code)));
}
Expand All @@ -43,14 +44,15 @@ void BSONPage::get_text(std::string& text) const {
void BSONPage::parse_page() {
std::string str;
get_text(str);
// std::cout<< str << std::endl;
parse_title(str);
parse_lang(str);
parse_links(str);
parse_headings(str);
}

/// Collects the targets of anchor tags found in @p str into links_.
///
/// Only double-quoted `href` values that begin with "http" are matched
/// (case-insensitively). The captured URL stops at the first `;`, `|` or
/// `"` character, so anything after such a character is cut off.
///
/// @param str Text to scan for `<a ... href="http...">` tags.
///
/// links_ is overwritten with a std::set built from the matches, so
/// duplicate URLs collapse into a single entry.
void BSONPage::parse_links(const std::string& str) {
    // Exactly one pattern definition: the hunk carried both the old and the
    // new declaration of url_re, which is a redeclaration in the same scope.
    // Built once (static) because constructing a std::regex is expensive and
    // the pattern never changes between calls.
    static const std::regex url_re{R"!!(<\s*a\s+[^>]*href\s*=\s*"(http[^;|"]*))!!", std::regex_constants::icase};

    // Sub-match 1 is the URL itself; a default-constructed iterator marks the end.
    links_ = std::set<std::string>{std::sregex_token_iterator(str.cbegin(), str.cend(), url_re, 1),
                                   std::sregex_token_iterator()};
}
Expand Down Expand Up @@ -83,7 +85,7 @@ void BSONPage::parse_headings(const std::string &str) {
auto curr_heading = match[1].str();
curr_heading = std::regex_replace(curr_heading, std::regex(R"!!((<.+?(?=>)>|<\/.+?>|\/doc))!!"), "");
headings_.emplace_back(curr_heading);
std::cout<< curr_heading << std::endl;
// std::cout<< curr_heading << std::endl;
search_start = match.suffix().first;
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ int main(int argc, char* argv[]) {
std::string curr_url = linkQueue.front();
visited.emplace(curr_url);
linkQueue.pop();
std::cout << "Parsing: " << curr_url << std::endl;
// std::cout << "Parsing: " << curr_url << std::endl;

try {
BSONPage page{curr_url};
Expand All @@ -72,6 +72,7 @@ int main(int argc, char* argv[]) {
return url.find(domain) != std::string::npos;
}) != allowed_domains.end());

// std::cout << "Found link: " << url << std::endl;
if (visited.find(url) == visited.end() && within_allowed_domain) {
linkQueue.emplace(url);
visited.emplace(url);
Expand Down

0 comments on commit 116e917

Please sign in to comment.