Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Curiosity Scraper Fix #200

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
81 changes: 42 additions & 39 deletions app/models/curiosity_scraper.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
class CuriosityScraper
require "open-uri"
require 'json'
BASE_URL = "https://mars.nasa.gov/msl/multimedia/raw/"
BASE_URL = "https://mars.nasa.gov/api/v1/raw_image_items/"

attr_reader :rover
def initialize
Expand All @@ -12,17 +12,15 @@ def scrape
create_photos
end

# Fetches the first page of the Curiosity rover raw-image gallery and parses
# it as HTML.
#
# The query string requests page 0 with 50 items per page for mission "msl",
# ordered by sol descending, then instrument, sample type and date taken.
# Every call performs a fresh HTTP request via URI.open; nothing is cached here.
#
# @return the document object produced by Nokogiri::HTML for the fetched page
def main_page
Nokogiri::HTML(URI.open("https://mars.nasa.gov/msl/multimedia/raw-images/?order=sol+desc%2Cinstrument_sort+asc%2Csample_type_sort+asc%2C+date_taken+desc&per_page=50&page=0&mission=msl"))
end

# Builds one listing URL per sol, covering the inclusive range from the
# highest sol already stored for the rover up to the latest sol reported
# upstream.
#
# NOTE(review): this span looks like a rendered diff with the before and after
# lines shown back to back. `latest_sol_available` and `sols_to_scrape` are
# each assigned twice (the later assignment wins), and the `map` block holds
# two string literals of which only the last is the block's value. Confirm
# against the real file which lines are live.
#
# @return [Array<String>] one URL per sol in the range
def collect_links
# Reads "latest_sol" out of the JSON stored in the last element carrying a
# data-react-props attribute on the gallery page.
latest_sol_available = JSON.parse(main_page.css('[data-react-props]').last.attr('data-react-props'))["header_counts"]["latest_sol"].to_i
# Fetch the latest sol available through the API
# per_page=1 with sol-descending order: only the first item's "sol" is read.
response = JSON.parse(URI.open(BASE_URL + "?order=sol%20desc,instrument_sort%20asc,sample_type_sort%20asc,%20date_taken%20desc&per_page=1&page=0&condition_1=msl:mission").read)
latest_sol_available = response["items"].first["sol"].to_i
# `.to_i` turns a nil maximum into 0 (presumably the case when the rover has
# no stored photos yet; verify against the model).
latest_sol_scraped = rover.photos.maximum(:sol).to_i
sols_to_scrape = latest_sol_scraped..latest_sol_available
# The `.to_a.last(10)` limiter below is commented out, so the full range is used.
sols_to_scrape = (latest_sol_scraped..latest_sol_available) #.to_a.last(10) # Only fetch the last 10 sols

sols_to_scrape.map { |sol|
# This first literal is evaluated and discarded; it is not the block's value.
"https://mars.nasa.gov/msl/raw/listimagesraw.cfm?&s=#{sol}"
# Block value: API query for up to 200 items of mission "msl" on this sol.
"#{BASE_URL}?order=sol%20desc,instrument_sort%20asc,sample_type_sort%20asc,%20date_taken%20desc&per_page=200&page=0&condition_1=msl:mission&condition_2=#{sol}:sol:in"
}
end

Expand All @@ -35,44 +33,49 @@ def create_photos
end

def scrape_photo_page(url)
image_page = Nokogiri::HTML(URI.open url)
image_array = image_page.css("div.RawImageCaption a")
.map { |link| link["href"] }
image_array.each do |image|
create_photo(image, url)
begin
response = JSON.parse(URI.open(url).read)
response['items'].each do |image|
create_photo(image) if image['extended'] && image['extended']['sample_type'] == 'full'
end
rescue OpenURI::HTTPError => e
puts "HTTP error occurred: #{e.message} for URL: #{url}. Skipping."
rescue StandardError => e
puts "Error occurred: #{e.message} for URL: #{url}. Skipping."
end
end

def create_photo(image, url)
if !thumbnail?(image)
sol = url.scan(/(?<==)\d+/).first
camera = camera_from_url image
fail "Camera not found. Name: #{camera}" if camera.is_a?(String)
photo = Photo.find_or_initialize_by(sol: sol, camera: camera,
img_src: image, rover: rover)
def create_photo(image)
sol = image['sol']
camera = camera_from_json(image)
link = image['https_url']

if camera.is_a?(String)
puts "WARNING: Camera not found. Name: #{camera}"
else
photo = Photo.find_or_initialize_by(sol: sol, camera: camera, img_src: link, rover: rover)
photo.log_and_save_if_new
end
end

# Lookup table from four-character camera codes (as symbols) to the camera
# names they stand for.
#
# A new Hash is built on every call, so callers cannot mutate shared state.
#
# @return [Hash{Symbol => String}] camera code => camera name
def camera_abbreviations
{
fcam: "FHAZ",
rcam: "RHAZ",
ccam: "CHEMCAM",
mcam: "MAST",
ncam: "NAVCAM",
mhli: "MAHLI",
mrdi: "MARDI"
}
end
def camera_from_json(image)
camera_name = image['instrument']
camera = rover.cameras.find_by(name: camera_name) || rover.cameras.find_by(full_name: camera_name)

def thumbnail?(image_url)
image_url.to_s.include?("_T")
end
if camera.nil?
# Log a warning
puts "WARNING: Camera not found. Name: #{camera_name}. Adding to database."

# Add the new camera to the database
camera = rover.cameras.create(name: camera_name, full_name: camera_name)

if camera.persisted?
puts "New camera added to database: #{camera_name}"
else
puts "Failed to add camera to the database: #{camera_name}"
end
end

def camera_from_url(image_url)
camera_abbreviation = image_url.match(/\/(?<camera>\w{4})\/\w+.(JPG|jpg|PNG|png)/)[:camera]
camera_name = camera_abbreviations[camera_abbreviation.to_sym]
rover.cameras.find_by(name: camera_name) || camera_name || camera_abbreviation
camera
end
end