Skip to content

Commit

Permalink
Adding mass scrape and mass scrape site delete
Browse files Browse the repository at this point in the history
  • Loading branch information
cguess committed Nov 22, 2023
1 parent ee70eb5 commit 92d7dd7
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 52 deletions.
5 changes: 5 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -223,4 +223,9 @@ gem "debug"
# Scraping web sites
gem "tanakai"

# For regular scraping
gem "mechanize"

# Better logging printing
gem "amazing_print"
gem "rails_semantic_logger"
9 changes: 9 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ GEM
tzinfo (~> 2.0)
addressable (2.8.5)
public_suffix (>= 2.0.2, < 6.0)
amazing_print (1.5.0)
android_key_attestation (0.3.0)
apparition (0.6.0)
capybara (~> 3.13, < 4)
Expand Down Expand Up @@ -409,6 +410,10 @@ GEM
rails-settings-cached (2.9.2)
activerecord (>= 5.0.0)
railties (>= 5.0.0)
rails_semantic_logger (4.14.0)
rack
railties (>= 5.1)
semantic_logger (~> 4.13)
railties (7.0.8)
actionpack (= 7.0.8)
activesupport (= 7.0.8)
Expand Down Expand Up @@ -495,6 +500,8 @@ GEM
rexml (~> 3.2, >= 3.2.5)
rubyzip (>= 1.2.2, < 3.0)
websocket (~> 1.0)
semantic_logger (4.15.0)
concurrent-ruby (~> 1.0)
set (1.0.3)
sidekiq (6.5.12)
connection_pool (>= 2.2.5, < 3)
Expand Down Expand Up @@ -630,6 +637,7 @@ PLATFORMS
x86_64-linux

DEPENDENCIES
amazing_print
aws-sdk-s3
birdsong!
blueprinter
Expand Down Expand Up @@ -669,6 +677,7 @@ DEPENDENCIES
rack-mini-profiler (~> 2.0)
rails (~> 7.0.8)
rails-settings-cached (~> 2.9)
rails_semantic_logger
rake
redis (~> 4.0)
rolify (~> 6.0)
Expand Down
39 changes: 38 additions & 1 deletion app/controllers/admin/web_scrapes_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def create
end
end

def delete
def destroy
scrapable_site = ScrapableSite.find(params[:web_scrape_id])

if scrapable_site.nil?
Expand All @@ -32,6 +32,19 @@ def delete
end
end

# Dispatches the bulk-action form submitted from the web scrapes index page.
#
# Reads the selected site ids from params[:sites_selected] and picks the action from the
# first key of params[:route_to] ("scrape_group" or "delete_group"). After running the
# matching bulk action (if any) it redirects back, falling back to admin_web_scrapes_path.
# A missing or unrecognised route_to performs no action and only redirects.
def handle_form
  site_ids = params[:sites_selected]

  # Match on strings rather than symbols: this avoids a NoMethodError when route_to is
  # absent or not hash-like, and avoids interning arbitrary user-supplied keys.
  route = params[:route_to]
  action = route.respond_to?(:keys) ? route.keys.first.to_s : nil

  case action
  when "scrape_group"
    scrape_selected(site_ids)
  when "delete_group"
    delete_selected(site_ids)
  end

  redirect_back_or_to admin_web_scrapes_path
end

def scrape_now
scrapable_site = ScrapableSite.find(params[:web_scrape_id])
scrapable_site.scrape
Expand All @@ -40,6 +53,30 @@ def scrape_now

private

# Calls the given block once for every selected site that can be found.
#
# @param site_ids [Enumerable, nil] ids of ScrapableSite records; nil is treated as "nothing selected"
# @yieldparam site [ScrapableSite] each record that was found
# @return [Integer] how many sites the block was called for
#   (the string "No block given" when invoked without a block)
def process_selected(site_ids, &block)
  return "No block given" unless block_given?

  success_count = 0
  site_ids&.each do |site_id|
    # find_by returns nil for an unknown id, whereas find raises
    # ActiveRecord::RecordNotFound; one stale id must not abort the whole batch.
    site = ScrapableSite.find_by(id: site_id)
    next if site.nil?

    block.call(site)
    success_count += 1
  end

  success_count
end

# Starts a scrape for each selected site and stores a flash message reporting
# how many scrapes were started.
def scrape_selected(site_ids)
  scheduled = process_selected(site_ids, &:scrape)
  noun = "scrape".pluralize(scheduled)
  flash[:success] = "Successfully scheduled #{scheduled} new #{noun} to run immediately"
end

# Destroys each selected site and stores a flash message reporting how many
# sites were processed.
def delete_selected(site_ids)
  success_count = process_selected(site_ids) { |site| site.destroy }
  # Pluralize the noun so a single deletion reads "1 scrape organization",
  # matching how scrape_selected words its message.
  flash[:success] = "Successfully deleted #{success_count} #{"scrape organization".pluralize(success_count)}"
end

def scrapable_site_params
params.require(:scrapable_site).permit(
:name,
Expand Down
32 changes: 32 additions & 0 deletions app/javascript/controllers/admin/web_scrapes_controller.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { Controller } from '@hotwired/stimulus'
import { get, post } from "@rails/request.js"

/**
 * Stimulus controller for the admin web scrapes index page.
 * Currently its only live action is the "select all" checkbox in the table header.
 */
export default class extends Controller {
  static targets = [ "checkAll" ]

  /**
   * Copies the checked state of the header checkbox onto every per-site checkbox.
   *
   * @param {Event} event click event whose currentTarget is the header checkbox
   */
  clickAllScrapes(event) {
    const checked = event.currentTarget.checked

    // The per-site checkboxes are rendered with the name "sites_selected[]".
    // Scope the lookup to this controller's element so unrelated inputs are untouched.
    this.element
      .querySelectorAll('input[type="checkbox"][name="sites_selected[]"]')
      .forEach((checkbox) => {
        checkbox.checked = checked
      })
  }

  // async startScrape(event) {
  //   event.preventDefault()

  //   const scrapeStartResponse = await post("/admin/web_scrapes/" + event.currentTarget.id + "/scrape.json", {
  //     body: {},
  //     contentType: "application/json",
  //     responseKind: "json"
  //   })

  //   const scrapeStartResponseBody = await scrapeStartResponse.text
  //   const scrapeStartResponseBodyJson = JSON.parse(scrapeStartResponseBody)

  //   Turbo.visit('/admin/web_scrapes/', { action: "replace", frame: "web_scrapes_list" })
  // }
}
1 change: 0 additions & 1 deletion app/javascript/controllers/delete_account_controller.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ export default class extends Controller {
* Resets email input elements. Waits a short period to ensure that inputs are not wiped before being sent to the backend
*/
confirmDeletion(event) {
console.log('hi')
if (!window.confirm('Are you sure you want to delete your account?')) {
console.log('deleting...')
event.stopPropagation()
Expand Down
18 changes: 9 additions & 9 deletions app/spiders/claim_review_mech.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def process(start_url: nil, scrapable_site: nil)
link = link_stack.pop
links_visited << link

puts "Navigating to new link #{link}"
logger.info "Navigating to new link #{link}"

begin
page = get(link)
Expand All @@ -28,7 +28,7 @@ def process(start_url: nil, scrapable_site: nil)
# Move on
next
else
puts "Error not caught with response code #{e.response_code}"
logger.error "Error not caught with response code #{e.response_code}"
next
end
rescue Mechanize::RedirectLimitReachedError,
Expand All @@ -39,7 +39,7 @@ def process(start_url: nil, scrapable_site: nil)
SocketError
next # Skip all the various things that can go wrong
rescue StandardError => e
puts "Error not caught with error #{e.message}"
logger.error "Error not caught with error #{e.message}"
next
end

Expand Down Expand Up @@ -75,7 +75,7 @@ def process(start_url: nil, scrapable_site: nil)

link_stack.push(url)
rescue StandardError => e
puts "Error not caught with error #{e.message}"
logger.error "Error not caught with error #{e.message}"
end

# Check the page for ClaimReview
Expand All @@ -93,7 +93,7 @@ def process(start_url: nil, scrapable_site: nil)
j
end
rescue StandardError => e
puts "Error not caught with error #{e.message}"
logger.error "Error not caught with error #{e.message}"
end

if json.count.positive? && json.first.is_a?(Array)
Expand Down Expand Up @@ -127,15 +127,15 @@ def process(start_url: nil, scrapable_site: nil)
# ClaimReview.find_duplicates(json_element["claimReviewed"], json_element["link"]), json_element["author"]["name"]
begin
claim_review = ClaimReview.create_or_update_from_claim_review_hash(json_element, "#{link}::#{index}", false)
puts("Created a claim_review at #{link} with id #{claim_review.id}")
logger.info("Created a claim_review at #{link} with id #{claim_review.id}")
found_claims_count += 1
scrapable_site.update(number_of_claims_found: found_claims_count) unless scrapable_site.nil?
rescue ClaimReview::DuplicateError
puts("Error filing a duplicate ClaimReview at #{link}")
logger.error("Error filing a duplicate ClaimReview at #{link}")
# add_event(e.full_message) && return
rescue StandardError => e
puts("Error filing a ClaimReview at #{link}")
puts(e.full_message) && next
logger.error("Error filing a ClaimReview at #{link}")
logger.error(e.full_message) && next
end
end
end
Expand Down
107 changes: 71 additions & 36 deletions app/views/admin/web_scrapes/index.html.erb
Original file line number Diff line number Diff line change
@@ -1,45 +1,80 @@
<% @title_tag = "Web Scrapes" %>
<% @page_id = "web_scrapes" %>

<div class="content" data-controller="admin--applicants">
<div class="content" data-controller="admin--web-scrapes">
<h1>Web Scrapes</h1>
<h2>Sites</h2>

<%= link_to "New Web Site", new_admin_web_scrape_path %>

<table class="table">
<thead>
<tr>
<th>Name</th>
<th>Url</th>
<th>Starting Url</th>
<th>Last Run</th>
<th>Last Run Time</th>
<th>Total Claims Found</th>
<th>Created At</th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<% @sites.each do |site| %>
<td>
<%= site.name %><br>
</td>
<td><%= site.url %></td>
<td><%= site.starting_url %></td>
<td><%= site.last_run %></td>
<td><%= integer_to_time_duration(site.last_run_time) unless site.last_run_time.nil? %></td>
<td><%= site.number_of_claims_found %></td>
<td><%= site.created_at %></td>
<td>
<%= button_to "Scrape Now", admin_web_scrape_scrape_path(site), method: :post %>
</td>
<td>
<%= button_to "Delete", admin_web_scrape_delete_path(site), method: :delete %>
</td>
</tr>
<% end %>
</tbody>
</table>
<%# Bulk-action form: the checked sites are posted as sites_selected[] and the pressed button (route_to[...]) selects the action. %>
<%= form_with url: handle_form_admin_web_scrapes_path, class: "form" do |form| %>
  <div style="display: flex; justify-content: space-around;">
    <div class="field__wrapper--input">
      <%= form.select :filter, ["hello", "goodbye"] %>
    </div>
    <div>
      Filter 3
    </div>
  </div>

  <turbo-frame id="web_scrapes_list">
    <table class="table">
      <thead>
        <tr>
          <th>
            <%= check_box_tag "#", "_", false,
                              { class: "checkbox",
                                "data-action": "click->admin--web-scrapes#clickAllScrapes" } %>
          </th>
          <th style="display: flex;">
            <div>
              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5">
                <path fill-rule="evenodd" d="M14.77 12.79a.75.75 0 01-1.06-.02L10 8.832 6.29 12.77a.75.75 0 11-1.08-1.04l4.25-4.5a.75.75 0 011.08 0l4.25 4.5a.75.75 0 01-.02 1.06z" clip-rule="evenodd" />
              </svg>
              <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor" class="w-5 h-5">
                <path fill-rule="evenodd" d="M5.23 7.21a.75.75 0 011.06.02L10 11.168l3.71-3.938a.75.75 0 111.08 1.04l-4.25 4.5a.75.75 0 01-1.08 0l-4.25-4.5a.75.75 0 01.02-1.06z" clip-rule="evenodd" />
              </svg>
            </div>
            Name
          </th>
          <th>Url</th>
          <th>Starting Url</th>
          <th>Last Run</th>
          <th>Last Run Time</th>
          <th>Total Claims Found</th>
          <th>Created At</th>
          <%# <th></th> %>
          <%# <th></th> %>
        </tr>
      </thead>
      <tbody>
        <% @sites.each do |site| %>
          <%# Each site needs its own opening <tr>; previously only the closing tag was emitted. %>
          <tr>
            <td>
              <%= check_box_tag "sites_selected[]", site.id, false, class: "checkbox"%>
            </td>
            <td>
              <%= site.name %><br>
            </td>
            <td style="word-break:break-all;"><%= site.url %></td>
            <td style="word-break:break-all;"><%= site.starting_url %></td>
            <td><%= site.last_run %></td>
            <td><%= integer_to_time_duration(site.last_run_time) unless site.last_run_time.nil? %></td>
            <td><%= site.number_of_claims_found %></td>
            <td><%= site.created_at %></td>
            <!-- <td>
            <%= link_to "Scrape Now", "#", "data-action": "click->admin--web-scrapes#startScrape", id: site.id %>
            </td> -->
            <!-- <td>
            <%= button_to "Delete", admin_web_scrape_path(site), method: :delete %>
            </td> -->
          </tr>
        <% end %>
      </tbody>
    </table>
    <div>
      <%= button_tag 'Scrape Selected', name: 'route_to[scrape_group]', class: "btn btn--text" %>
      <%= button_tag 'Delete Selected', name: 'route_to[delete_group]', class: "btn btn--text" %>
    </div>
  <%# Close the frame before the form ends so the two elements nest correctly. %>
  </turbo-frame>
<% end %>
</div>
18 changes: 15 additions & 3 deletions config/environments/production.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,24 @@
# require "syslog/logger"
# config.logger = ActiveSupport::TaggedLogging.new(Syslog::Logger.new "app-name")

# if ENV["RAILS_LOG_TO_STDOUT"].present?
# logger = ActiveSupport::Logger.new(STDOUT)
# logger.formatter = config.log_formatter
# config.logger = ActiveSupport::TaggedLogging.new(logger)
# end

if ENV["RAILS_LOG_TO_STDOUT"].present?
logger = ActiveSupport::Logger.new(STDOUT)
logger.formatter = config.log_formatter
config.logger = ActiveSupport::TaggedLogging.new(logger)
$stdout.sync = true
config.rails_semantic_logger.add_file_appender = false
config.semantic_logger.add_appender(io: $stdout, formatter: config.rails_semantic_logger.format)
end

if ENV["LOG_LEVEL"].present?
config.log_level = ENV["LOG_LEVEL"].downcase.strip.to_sym
end

config.semantic_logger.backtrace_level = :error

# Do not dump schema after migrations.
config.active_record.dump_schema_after_migration = false

Expand Down
6 changes: 4 additions & 2 deletions config/routes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,10 @@
post "applicants/:id/reject", to: "applicants#reject", as: "applicant_reject"
delete "applicants/:id", to: "applicants#delete", as: "applicant_delete"

resources :web_scrapes, only: [:index, :new, :create] do
delete "/", action: "delete", as: "delete"
# post "web_scrapes/scrape_selected", action: "web_scrapes#scrape_selected", as: "scrape_selected"

resources :web_scrapes, only: [:index, :new, :create, :destroy] do
post :handle_form, on: :collection
post "scrape", action: "scrape_now", as: "scrape"
end
end
Expand Down

0 comments on commit 92d7dd7

Please sign in to comment.