From 116e91725ee3698bd12fbcd1cb903c5bed8bdbae Mon Sep 17 00:00:00 2001 From: Teodor Muzychuk Date: Thu, 6 Apr 2023 01:54:57 +0300 Subject: [PATCH] ok fixed shid --- src/BSONPage.cpp | 8 +++++--- src/main.cpp | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/BSONPage.cpp b/src/BSONPage.cpp index 6445d28..128eff9 100644 --- a/src/BSONPage.cpp +++ b/src/BSONPage.cpp @@ -28,12 +28,13 @@ void BSONPage::get_text(std::string& text) const { curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &BSONPage::write_data); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &text); auto res = curl_easy_perform(curl); - curl_easy_cleanup(curl); if (res != CURLE_OK) { + curl_easy_cleanup(curl); throw(std::runtime_error("Error while parsing " + url_)); } long http_code = 0; curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + curl_easy_cleanup(curl); if (200 > http_code || http_code >= 300) { throw(std::runtime_error("Error while parsing " + url_ + " HTTP code: " + std::to_string(http_code))); } @@ -43,6 +44,7 @@ void BSONPage::get_text(std::string& text) const { void BSONPage::parse_page() { std::string str; get_text(str); +// std::cout<< str << std::endl; parse_title(str); parse_lang(str); parse_links(str); @@ -50,7 +52,7 @@ void BSONPage::parse_page() { } void BSONPage::parse_links(const std::string& str) { - const std::regex url_re{R"!!(<\s*A\s+[^>]*href\s*=\s*"(http[^"]*)")!!", std::regex_constants::icase}; + const std::regex url_re{R"!!(<\s*a\s+[^>]*href\s*=\s*"(http[^;|"]*))!!", std::regex_constants::icase}; links_ = std::set{std::sregex_token_iterator(str.cbegin(), str.cend(), url_re, 1), std::sregex_token_iterator()}; } @@ -83,7 +85,7 @@ void BSONPage::parse_headings(const std::string &str) { auto curr_heading = match[1].str(); curr_heading = std::regex_replace(curr_heading, std::regex(R"!!((<.+?(?=>)>|<\/.+?>|\/doc))!!"), ""); headings_.emplace_back(curr_heading); - std::cout<< curr_heading << std::endl; +// std::cout<< curr_heading << std::endl; search_start = match.suffix().first; } } diff --git a/src/main.cpp b/src/main.cpp index 65ce2dd..8453235 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -54,7 +54,7 @@ int main(int argc, char* argv[]) { std::string curr_url = linkQueue.front(); visited.emplace(curr_url); linkQueue.pop(); - std::cout << "Parsing: " << curr_url << std::endl; +// std::cout << "Parsing: " << curr_url << std::endl; try { BSONPage page{curr_url}; @@ -72,6 +72,7 @@ int main(int argc, char* argv[]) { return url.find(domain) != std::string::npos; }) != allowed_domains.end()); +// std::cout << "Found link: " << url << std::endl; if (visited.find(url) == visited.end() && within_allowed_domain) { linkQueue.emplace(url); visited.emplace(url);