Skip to content

Commit

Permalink
better escapes the URLs before parsing them
Browse files Browse the repository at this point in the history
  • Loading branch information
matstc committed Jan 27, 2015
1 parent 4344d23 commit 5fb7a0a
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 4 deletions.
11 changes: 7 additions & 4 deletions lib/arachnid.rb
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,12 @@ def crawl(options = {})
no_hash_in_url?(link) &&
extension_not_ignored?(link)

absolute_link = make_absolute(sanitize_link(split_url_at_hash(link)), response.effective_url)
absolute_link = make_absolute(split_url_at_hash(link), response.effective_url)
@global_queue << absolute_link unless @global_queue.include?(absolute_link)
end

rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
$stderr.puts "#{e.class}: ignored link #{link}"
$stderr.puts "#{e.class}: Ignored link #{link} @ #{e.backtrace ? e.backtrace[0] : 'no backtrace'}"
end
end

Expand Down Expand Up @@ -136,11 +136,14 @@ def extension_not_ignored?(url)
end

# Percent-encodes the characters in +url+ that are not safe to hand to a URI
# parser, leaving everything from the first '#' onwards untouched.
#
# Every whitespace character and every non-ASCII character located before the
# first '#' is replaced by the percent-encoding of its bytes (e.g. "é" becomes
# "%C3%A9" for a UTF-8 string). Spaces become "%20" rather than "+": "+" is
# form encoding and denotes a literal plus sign inside a URL path.
#
# @param url [String] the raw link, with or without a '#fragment' part
# @return [String] a new string; +url+ itself is not modified
def sanitize_link(url)
  # partition yields ["whole url", "", ""] when there is no '#', so the
  # concatenation below is a no-op for fragment-less links.
  head, hash_mark, fragment = url.partition('#')
  escaped = head.gsub(/\s|[^[:ascii:]]/) do |char|
    char.bytes.map { |byte| format('%%%02X', byte) }.join
  end
  escaped + hash_mark + fragment
end

# Resolves +href+ against +root+ and returns the resulting URL as a String.
#
# +href+ is first passed through split_url_at_hash (presumably dropping the
# '#fragment' part; confirm against its definition) and then through
# sanitize_link, so that URI.parse only ever sees the escaped form. Parsing
# the unescaped href beforehand would be wasted work and could raise before
# the sanitized link is even looked at.
#
# @param href [String] the link as found in the page, absolute or relative
# @param root [String] the URL the link is resolved against
# @return [String] the merged URL
# @raise [URI::InvalidURIError] if +root+ or the sanitized +href+ cannot be parsed
def make_absolute(href, root)
  cleaned = sanitize_link(split_url_at_hash(href))
  URI.parse(root).merge(URI.parse(cleaned)).to_s
end

end
Expand Down
6 changes: 6 additions & 0 deletions test/arachnid_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
require "minitest/autorun"

class ArachnidTest < Minitest::Test
# Checks that sanitize_link percent-encodes an accented character in the path
# while returning the '#anchor' part of the URL unchanged.
def test_sanitizes_url
  crawler = Arachnid.new('example.com')
  expected = "http://example.com/%C3%A9#anchor"

  assert_equal expected, crawler.sanitize_link("http://example.com/é#anchor")
end

def test_ignores_specified_extensions
arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg']

Expand Down

0 comments on commit 5fb7a0a

Please sign in to comment.