From 92d7dd71fdc3b58da55221b813e9713a518a67e8 Mon Sep 17 00:00:00 2001
From: Christopher Guess
Date: Wed, 22 Nov 2023 13:54:09 -0500
Subject: [PATCH] Adding mass scrape and mass scrape site delete

---
 Gemfile                                      |   5 +
 Gemfile.lock                                 |   9 ++
 .../admin/web_scrapes_controller.rb          |  39 ++++++-
 .../admin/web_scrapes_controller.js          |  32 ++++++
 .../controllers/delete_account_controller.js |   1 -
 app/spiders/claim_review_mech.rb             |  18 +--
 app/views/admin/web_scrapes/index.html.erb   | 107 ++++++++++++------
 config/environments/production.rb            |  18 ++-
 config/routes.rb                             |   6 +-
 9 files changed, 183 insertions(+), 52 deletions(-)
 create mode 100644 app/javascript/controllers/admin/web_scrapes_controller.js

diff --git a/Gemfile b/Gemfile
index 00a9476e..8faf1197 100644
--- a/Gemfile
+++ b/Gemfile
@@ -223,4 +223,9 @@ gem "debug"
 
 # Scraping web sites
 gem "tanakai"
+# For regular scraping
 gem "mechanize"
+
+# Better log formatting and printing
+gem "amazing_print"
+gem "rails_semantic_logger"
diff --git a/Gemfile.lock b/Gemfile.lock
index a2642a26..634029ac 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -143,6 +143,7 @@ GEM
       tzinfo (~> 2.0)
     addressable (2.8.5)
       public_suffix (>= 2.0.2, < 6.0)
+    amazing_print (1.5.0)
     android_key_attestation (0.3.0)
     apparition (0.6.0)
       capybara (~> 3.13, < 4)
@@ -409,6 +410,10 @@ GEM
     rails-settings-cached (2.9.2)
       activerecord (>= 5.0.0)
       railties (>= 5.0.0)
+    rails_semantic_logger (4.14.0)
+      rack
+      railties (>= 5.1)
+      semantic_logger (~> 4.13)
     railties (7.0.8)
       actionpack (= 7.0.8)
       activesupport (= 7.0.8)
@@ -495,6 +500,8 @@ GEM
       rexml (~> 3.2, >= 3.2.5)
       rubyzip (>= 1.2.2, < 3.0)
       websocket (~> 1.0)
+    semantic_logger (4.15.0)
+      concurrent-ruby (~> 1.0)
     set (1.0.3)
     sidekiq (6.5.12)
       connection_pool (>= 2.2.5, < 3)
@@ -630,6 +637,7 @@ PLATFORMS
   x86_64-linux
 
 DEPENDENCIES
+  amazing_print
   aws-sdk-s3
   birdsong!
   blueprinter
@@ -669,6 +677,7 @@ DEPENDENCIES
   rack-mini-profiler (~> 2.0)
   rails (~> 7.0.8)
   rails-settings-cached (~> 2.9)
+  rails_semantic_logger
   rake
   redis (~> 4.0)
   rolify (~> 6.0)
diff --git a/app/controllers/admin/web_scrapes_controller.rb b/app/controllers/admin/web_scrapes_controller.rb
index bc502cf0..d4ec963a 100644
--- a/app/controllers/admin/web_scrapes_controller.rb
+++ b/app/controllers/admin/web_scrapes_controller.rb
@@ -17,7 +17,7 @@ def create
     end
   end
 
-  def delete
-    scrapable_site = ScrapableSite.find(params[:web_scrape_id])
+  def destroy
+    scrapable_site = ScrapableSite.find_by(id: params[:id]) # the RESTful :destroy route exposes params[:id]; find_by returns nil when missing
 
     if scrapable_site.nil?
@@ -32,6 +32,19 @@ def delete
     end
   end
 
+  def handle_form
+    site_ids = params[:sites_selected]
+
+    case params[:route_to].keys.first.to_sym # the clicked submit button's name picks the action
+    when :scrape_group
+      scrape_selected(site_ids)
+    when :delete_group
+      delete_selected(site_ids)
+    end
+
+    redirect_back_or_to admin_web_scrapes_path
+  end
+
   def scrape_now
     scrapable_site = ScrapableSite.find(params[:web_scrape_id])
     scrapable_site.scrape
@@ -40,6 +53,32 @@ def scrape_now
 
   private
 
+  # Runs the given block once per selected site, returning how many succeeded
+  def process_selected(site_ids, &block)
+    return 0 unless block_given? # nothing to run means nothing succeeded
+    success_count = 0
+    site_ids&.each do |site_id|
+      site = ScrapableSite.find_by(id: site_id) # find_by returns nil for stale ids instead of raising
+      unless site.nil?
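+        # Run the caller's action (scrape or destroy) and count the success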
+        block.call(site)
+        success_count += 1
+      end
+    end
+
+    success_count
+  end
+
+  def scrape_selected(site_ids)
+    success_count = process_selected(site_ids) { |site| site.scrape }
+    flash[:success] = "Successfully scheduled #{success_count} new #{"scrape".pluralize(success_count)} to run immediately"
+  end
+
+  def delete_selected(site_ids)
+    success_count = process_selected(site_ids) { |site| site.destroy }
+    flash[:success] = "Successfully deleted #{success_count} #{"scrape organization".pluralize(success_count)}"
+  end
+
   def scrapable_site_params
     params.require(:scrapable_site).permit(
       :name,
diff --git a/app/javascript/controllers/admin/web_scrapes_controller.js b/app/javascript/controllers/admin/web_scrapes_controller.js
new file mode 100644
index 00000000..97e6ae8d
--- /dev/null
+++ b/app/javascript/controllers/admin/web_scrapes_controller.js
@@ -0,0 +1,34 @@
+import { Controller } from '@hotwired/stimulus'
+import { get, post } from "@rails/request.js"
+
+export default class extends Controller {
+  static targets = [ "checkAll" ]
+
+  initialize() {
+    console.log("Connected")
+  }
+
+  clickAllScrapes(event) {
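+    // Flip every row checkbox to match the header "check all" box.
+    // The name here must match the check_box_tag name used in index.html.erb.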
+    const checkboxes = document.getElementsByName('sites_selected[]')
+    checkboxes.forEach(function (checkbox, _) {
+      checkbox.checked = event.currentTarget.checked;
+    })
+  }
+
+  // async startScrape(event) {
+  //   event.preventDefault()
+
+  //   const scrapeStartResponse = await post("/admin/web_scrapes/" + event.currentTarget.id + "/scrape.json", {
+  //     body: {},
+  //     contentType: "application/json",
+  //     responseKind: "json"
+  //   })
+
+  //   const scrapeStartResponseBody = await scrapeStartResponse.text
+  //   const scrapeStartResponseBodyJson = JSON.parse(scrapeStartResponseBody)
+
+  //   Turbo.visit('/admin/web_scrapes/', { action: "replace", frame: "web_scrapes_list" })
+  // }
+}
diff --git a/app/javascript/controllers/delete_account_controller.js b/app/javascript/controllers/delete_account_controller.js
index f6a39014..12006e14 100644
--- a/app/javascript/controllers/delete_account_controller.js
+++ b/app/javascript/controllers/delete_account_controller.js
@@ -9,7 +9,6 @@ export default class extends Controller {
   * Resets email input elements.
   * Waits a short period to ensure that inputs are not wiped before being sent to the backend */
   confirmDeletion(event) {
-    console.log('hi')
     if (!window.confirm('Are you sure you want to delete your account?')) {
       console.log('deleting...')
       event.stopPropagation()
diff --git a/app/spiders/claim_review_mech.rb b/app/spiders/claim_review_mech.rb
index 6f327b05..9bd115fe 100644
--- a/app/spiders/claim_review_mech.rb
+++ b/app/spiders/claim_review_mech.rb
@@ -17,7 +17,7 @@ def process(start_url: nil, scrapable_site: nil)
     link = link_stack.pop
     links_visited << link
 
-    puts "Navigating to new link #{link}"
+    logger.info "Navigating to new link #{link}"
 
     begin
       page = get(link)
@@ -28,7 +28,7 @@ def process(start_url: nil, scrapable_site: nil)
         # Move on
         next
       else
-        puts "Error not caught with response code #{e.response_code}"
+        logger.error "Error not caught with response code #{e.response_code}"
         next
       end
     rescue Mechanize::RedirectLimitReachedError,
@@ -39,7 +39,7 @@ def process(start_url: nil, scrapable_site: nil)
            SocketError
       next # Skip all the various things that can go wrong
     rescue StandardError => e
-      puts "Error not caught with error #{e.message}"
+      logger.error "Error not caught with error #{e.message}"
       next
     end
 
@@ -75,7 +75,7 @@ def process(start_url: nil, scrapable_site: nil)
 
         link_stack.push(url)
       rescue StandardError => e
-        puts "Error not caught with error #{e.message}"
+        logger.error "Error not caught with error #{e.message}"
       end
 
       # Check the page for ClaimReview
@@ -93,7 +93,7 @@ def process(start_url: nil, scrapable_site: nil)
           j
         end
       rescue StandardError => e
-        puts "Error not caught with error #{e.message}"
+        logger.error "Error not caught with error #{e.message}"
       end
 
       if json.count.positive? && json.first.is_a?(Array)
@@ -127,15 +127,15 @@ def process(start_url: nil, scrapable_site: nil)
         # ClaimReview.find_duplicates(json_element["claimReviewed"], json_element["link"]), json_element["author"]["name"]
         begin
           claim_review = ClaimReview.create_or_update_from_claim_review_hash(json_element, "#{link}::#{index}", false)
-          puts("Created a claim_review at #{link} with id #{claim_review.id}")
+          logger.info("Created a claim_review at #{link} with id #{claim_review.id}")
           found_claims_count += 1
           scrapable_site.update(number_of_claims_found: found_claims_count) unless scrapable_site.nil?
         rescue ClaimReview::DuplicateError
-          puts("Error filing a duplicate ClaimReview at #{link}")
+          logger.error("Error filing a duplicate ClaimReview at #{link}")
           # add_event(e.full_message) && return
         rescue StandardError => e
-          puts("Error filing a ClaimReview at #{link}")
-          puts(e.full_message) && next
+          logger.error("Error filing a ClaimReview at #{link}")
+          logger.error(e.full_message) && next
         end
       end
     end
diff --git a/app/views/admin/web_scrapes/index.html.erb b/app/views/admin/web_scrapes/index.html.erb
index dcb20c59..ded8ea91 100644
--- a/app/views/admin/web_scrapes/index.html.erb
+++ b/app/views/admin/web_scrapes/index.html.erb
@@ -1,45 +1,80 @@
 <% @title_tag = "Web Scrapes" %>
 <% @page_id = "web_scrapes" %>
 
 <div>
   <h1>Web Scrapes</h1>
 
   <h2>Sites</h2>
 
   <%= link_to "New Web Site", new_admin_web_scrape_path %>
-  <table>
-    <thead>
-      <tr>
-        <th>Name</th>
-        <th>Url</th>
-        <th>Starting Url</th>
-        <th>Last Run</th>
-        <th>Last Run Time</th>
-        <th>Total Claims Found</th>
-        <th>Created At</th>
-      </tr>
-    </thead>
-    <tbody>
-      <% @sites.each do |site| %>
-        <tr>
-          <td><%= site.name %></td>
-          <td><%= site.url %></td>
-          <td><%= site.starting_url %></td>
-          <td><%= site.last_run %></td>
-          <td><%= integer_to_time_duration(site.last_run_time) unless site.last_run_time.nil? %></td>
-          <td><%= site.number_of_claims_found %></td>
-          <td><%= site.created_at %></td>
-          <td><%= button_to "Scrape Now", admin_web_scrape_scrape_path(site), method: :post %></td>
-          <td><%= button_to "Delete", admin_web_scrape_delete_path(site), method: :delete %></td>
-        </tr>
-      <% end %>
-    </tbody>
-  </table>
+  <%= form_with url: handle_form_admin_web_scrapes_path, class: "form" do |form| %>
+    <div>
+      <div>
+        <%= form.select :filter, ["hello", "goodbye"] %>
+      </div>
+      <div>
+        Filter 3
+      </div>
+    </div>
+    <table>
+      <thead>
+        <tr>
+          <th>
+            <%= check_box_tag "#", "_", false,
+                  { class: "checkbox",
+                    "data-action": "click->admin--web-scrapes#clickAllScrapes" } %>
+          </th>
+          <th>Name</th>
+          <th>Url</th>
+          <th>Starting Url</th>
+          <th>Last Run</th>
+          <th>Last Run Time</th>
+          <th>Total Claims Found</th>
+          <th>Created At</th>
+          <%# %>
+          <%# %>
+        </tr>
+      </thead>
+      <tbody>
+        <% @sites.each do |site| %>
+          <tr>
+            <td><%= check_box_tag "sites_selected[]", site.id, false, class: "checkbox" %></td>
+            <td><%= site.name %></td>
+            <td><%= site.url %></td>
+            <td><%= site.starting_url %></td>
+            <td><%= site.last_run %></td>
+            <td><%= integer_to_time_duration(site.last_run_time) unless site.last_run_time.nil? %></td>
+            <td><%= site.number_of_claims_found %></td>
+            <td><%= site.created_at %></td>
+          </tr>
+        <% end %>
+      </tbody>
+    </table>
+    <%= button_tag 'Scrape Selected', name: 'route_to[scrape_group]', class: "btn btn--text" %>
+    <%= button_tag 'Delete Selected', name: 'route_to[delete_group]', class: "btn btn--text" %>
+  <% end %>
 </div>
diff --git a/config/environments/production.rb b/config/environments/production.rb
index 5498ec57..4e6f9eb1 100644
--- a/config/environments/production.rb
+++ b/config/environments/production.rb
@@ -82,12 +82,24 @@
   # require "syslog/logger"
   # config.logger = ActiveSupport::TaggedLogging.new(Syslog::Logger.new "app-name")
 
+  # if ENV["RAILS_LOG_TO_STDOUT"].present?
+  #   logger = ActiveSupport::Logger.new(STDOUT)
+  #   logger.formatter = config.log_formatter
+  #   config.logger = ActiveSupport::TaggedLogging.new(logger)
+  # end
+
   if ENV["RAILS_LOG_TO_STDOUT"].present?
-    logger = ActiveSupport::Logger.new(STDOUT)
-    logger.formatter = config.log_formatter
-    config.logger = ActiveSupport::TaggedLogging.new(logger)
+    $stdout.sync = true
+    config.rails_semantic_logger.add_file_appender = false
+    config.semantic_logger.add_appender(io: $stdout, formatter: config.rails_semantic_logger.format)
+  end
+
+  if ENV["LOG_LEVEL"].present?
+    config.log_level = ENV["LOG_LEVEL"].downcase.strip.to_sym
   end
 
+  config.semantic_logger.backtrace_level = :error
+
   # Do not dump schema after migrations.
   config.active_record.dump_schema_after_migration = false
diff --git a/config/routes.rb b/config/routes.rb
index 1995b022..9b840f51 100644
--- a/config/routes.rb
+++ b/config/routes.rb
@@ -62,8 +62,10 @@
     post "applicants/:id/reject", to: "applicants#reject", as: "applicant_reject"
     delete "applicants/:id", to: "applicants#delete", as: "applicant_delete"
 
-    resources :web_scrapes, only: [:index, :new, :create] do
-      delete "/", action: "delete", as: "delete"
+    # post "web_scrapes/scrape_selected", action: "web_scrapes#scrape_selected", as: "scrape_selected"
+
+    resources :web_scrapes, only: [:index, :new, :create, :destroy] do
+      post :handle_form, on: :collection
       post "scrape", action: "scrape_now", as: "scrape"
     end
   end