DEV: General refactoring and cleanup (#474)

Included: * DEV: Remove some defunct sites * DEV: Refactor GoogleDocsOnebox * DEV: Refactor PastebinOnebox * DEV: Refactor GfycatOnebox * DEV: Refactor PdfOnebox * DEV: Refactor PubmedOnebox * DEV: Refactor AmazonOnebox * DEV: Minor InstagramOnebox clean up * DEV: Small tweaks to TrelloOnebox Other stuff: * DEV: Don't use `link` and `url` interchangeably * DEV: Roll `Layout#link` into `Layout#uri` * DEV: Properly memoize `Layout#uri` * DEV: Re-use `Layout#uri` * DEV: Rename `get_secure_image` to `secure_image_url` * DEV: Normalize `matches_regexp` calls * DEV: Use path-specific encoding rules * DEV: `Layout#checksum` is no longer used * DEV: Remove double `protected` * DEV: Refactor `Matcher` * DEV: Use the existing `Engine#uri` method * DEV: The first `Engine#initialize` arg is `url` * DEV: Use `match?` * DEV: Simplify allowlistedgeneric template * DEV: Initialize `@options` in json spec * DEV: Simplify conditionals * DEV: Merge attr_readers * DEV: Whitespace, `after()` before tests
discourse · Apr 20, 2021 · b816ada · b816ada
1 parent 907be5b
commit b816ada
Show file tree

Hide file tree

Showing 33 changed files with 369 additions and 248 deletions.
diff --git a/lib/onebox/engine.rb b/lib/onebox/engine.rb
@@ -28,32 +28,28 @@ def self.origins_to_regexes(origins)
       end
     end
 
-    attr_reader :url, :uri
-    attr_reader :timeout
+    attr_reader :url, :uri, :options, :timeout
     attr :errors
 
     DEFAULT = {}
-    def options
-      @options
-    end
 
     def options=(opt)
-      return @options if opt.nil? #make sure options provided
-      opt = opt.to_h  if opt.instance_of?(OpenStruct)
+      return @options if opt.nil? # make sure options provided
+      opt = opt.to_h if opt.instance_of?(OpenStruct)
       @options.merge!(opt)
       @options
     end
 
-    def initialize(link, timeout = nil)
+    def initialize(url, timeout = nil)
       @errors = {}
       @options = DEFAULT
       class_name = self.class.name.split("::").last.to_s
 
       # Set the engine options extracted from global options.
       self.options = Onebox.options[class_name] || {}
 
-      @url = link
-      @uri = URI(link)
+      @url = url
+      @uri = URI(url)
       if always_https?
         @uri.scheme = 'https'
         @url = @uri.to_s

diff --git a/lib/onebox/engine/allowlisted_generic_onebox.rb b/lib/onebox/engine/allowlisted_generic_onebox.rb
@@ -27,7 +27,6 @@ def self.default_allowed_domains
           500px.com
           8tracks.com
           abc.net.au
-          about.com
           answers.com
           arstechnica.com
           ask.com
@@ -36,11 +35,9 @@ def self.default_allowed_domains
           bbs.boingboing.net
           bestbuy.ca
           bestbuy.com
-          blip.tv
           bloomberg.com
           businessinsider.com
           change.org
-          clikthrough.com
           cnet.com
           cnn.com
           codepen.io
@@ -90,24 +87,20 @@ def self.default_allowed_domains
           meetup.com
           mixcloud.com
           mlb.com
-          myshopify.com
           myspace.com
           nba.com
           npr.org
           nytimes.com
           photobucket.com
           pinterest.com
           reference.com
-          revision3.com
           rottentomatoes.com
           samsung.com
-          screenr.com
           scribd.com
           slideshare.net
           sourceforge.net
           speakerdeck.com
           spotify.com
-          squidoo.com
           streamable.com
           techcrunch.com
           ted.com
@@ -124,7 +117,6 @@ def self.default_allowed_domains
           twitpic.com
           usatoday.com
           viddler.com
-          videojug.com
           vine.co
           walmart.com
           washingtonpost.com
@@ -275,7 +267,6 @@ def data
 
       def rewrite_https(html)
         return unless html
-        uri = URI(@url)
         if AllowlistedGenericOnebox.host_matches(uri, AllowlistedGenericOnebox.rewrites)
           html = html.gsub("http://", "https://")
         end

diff --git a/lib/onebox/engine/amazon_onebox.rb b/lib/onebox/engine/amazon_onebox.rb
@@ -19,10 +19,8 @@ def url
         # If possible, fetch the cached HTML body immediately so we can
         # try to grab the canonical URL from that document,
         # rather than guess at the best URL structure to use
-        if body_cacher&.respond_to?('cache_response_body?')
-          if body_cacher.cache_response_body?(uri.to_s) && body_cacher.cached_response_body_exists?(uri.to_s)
-            @raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
-          end
+        if !@raw && has_cached_body
+          @raw = Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
         end
 
         if @raw
@@ -31,7 +29,8 @@ def url
         end
 
         if match && match[:id]
-          return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
+          id = Addressable::URI.encode_component(match[:id], Addressable::URI::CharacterClasses::PATH)
+          return "https://www.amazon.#{tld}/dp/#{id}"
         end
 
         @url
@@ -49,6 +48,12 @@ def http_params
 
       private
 
+      def has_cached_body
+        body_cacher&.respond_to?('cache_response_body?') &&
+          body_cacher.cache_response_body?(uri.to_s) &&
+          body_cacher.cached_response_body_exists?(uri.to_s)
+      end
+
       def match
         @match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
       end
@@ -57,19 +62,21 @@ def image
         if (main_image = raw.css("#main-image")) && main_image.any?
           attributes = main_image.first.attributes
 
-          return attributes["data-a-hires"].to_s if attributes["data-a-hires"]
-
-          if attributes["data-a-dynamic-image"]
+          if attributes["data-a-hires"]
+            return attributes["data-a-hires"].to_s
+          elsif attributes["data-a-dynamic-image"]
             return ::JSON.parse(attributes["data-a-dynamic-image"].value).keys.first
           end
         end
 
         if (landing_image = raw.css("#landingImage")) && landing_image.any?
           attributes = landing_image.first.attributes
 
-          return attributes["data-old-hires"].to_s if attributes["data-old-hires"]
-
-          landing_image.first["src"].to_s
+          if attributes["data-old-hires"]
+            return attributes["data-old-hires"].to_s
+          else
+            return landing_image.first["src"].to_s
+          end
         end
 
         if (ebook_image = raw.css("#ebooksImgBlkFront")) && ebook_image.any?
@@ -91,16 +98,16 @@ def price
       end
 
       def multiple_authors(authors_xpath)
-        author_list = raw.xpath(authors_xpath)
-        authors = []
-        author_list.each { |a| authors << a.inner_text.strip }
-        authors.join(", ")
+        raw
+          .xpath(authors_xpath)
+          .map { |a| a.inner_text.strip }
+          .join(", ")
       end
 
       def data
         og = ::Onebox::OpenGraph.new(raw)
 
-        if raw.at_css('#dp.book_mobile') #printed books
+        if raw.at_css('#dp.book_mobile') # printed books
           title = raw.at("h1#title")&.inner_text
           authors = raw.at_css('#byline_secondary_view_div') ? multiple_authors("//div[@id='byline_secondary_view_div']//span[@class='a-text-bold']") : raw.at("#byline")&.inner_text
           rating = raw.at("#averageCustomerReviews_feature_div .a-icon")&.inner_text || raw.at("#cmrsArcLink .a-icon")&.inner_text

diff --git a/lib/onebox/engine/flickr_onebox.rb b/lib/onebox/engine/flickr_onebox.rb
@@ -32,7 +32,7 @@ def album_html(og)
                   <span class='album-title'>#{album_title}</span>
                 </span>
               </span>
-              <img src='#{og.get_secure_image}' #{og.title_attr} height='#{og.image_height}' width='#{og.image_width}'>
+              <img src='#{og.secure_image_url}' #{og.title_attr} height='#{og.image_height}' width='#{og.image_width}'>
             </a>
           </div>
         HTML
@@ -43,7 +43,7 @@ def image_html(og)
 
         <<-HTML
           <a href='#{escaped_url}' target='_blank' rel='noopener' class="onebox">
-            <img src='#{og.get_secure_image}' #{og.title_attr} alt='Imgur' height='#{og.image_height}' width='#{og.image_width}'>
+            <img src='#{og.secure_image_url}' #{og.title_attr} alt='Imgur' height='#{og.image_height}' width='#{og.image_width}'>
           </a>
         HTML
       end

diff --git a/lib/onebox/engine/gfycat_onebox.rb b/lib/onebox/engine/gfycat_onebox.rb
@@ -9,8 +9,8 @@ class GfycatOnebox
       matches_regexp(/^https?:\/\/gfycat\.com\//)
       always_https
 
+      # This engine should have priority over AllowlistedGenericOnebox.
       def self.priority
-        # This engine should have priority over AllowlistedGenericOnebox.
         1
       end
 
@@ -21,6 +21,7 @@ def to_html
               <img src="https://gfycat.com/static/favicons/favicon-96x96.png" class="site-icon" width="64" height="64">
               <a href="#{data[:url]}" target="_blank" rel="nofollow ugc noopener">Gfycat.com</a>
             </header>
+
             <article class="onebox-body">
               <h4>
                 #{data[:title]} by
@@ -36,11 +37,12 @@ def to_html
                   <img title="Sorry, your browser doesn't support HTML5 video." src="#{data[:posterUrl]}">
                 </video>
               </div>
+
               <p>
                 <span class="label1">#{data[:keywords]}</span>
               </p>
-
             </article>
+
             <div style="clear: both"></div>
           </aside>
         HTML
@@ -61,52 +63,50 @@ def match
         @match ||= @url.match(/^https?:\/\/gfycat\.com\/(gifs\/detail\/)?(?<name>.+)/)
       end
 
-      def nokogiri_page
-        @nokogiri_page ||= begin
-          response = Onebox::Helpers.fetch_response(url, redirect_limit: 10) rescue nil
-          Nokogiri::HTML(response)
-        end
-      end
+      def og_data
+        return @og_data if defined?(@og_data)
 
-      def get_og_data
-        og_data = {}
+        response = Onebox::Helpers.fetch_response(url, redirect_limit: 10) rescue nil
+        page = Nokogiri::HTML(response)
+        script = page.at_css('script[type="application/ld+json"]')
 
-        if json_string = nokogiri_page.at_css('script[type="application/ld+json"]')&.text
-          og_data = Onebox::Helpers.symbolize_keys(::MultiJson.load(json_string))
+        if json_string = script&.text
+          @og_data = Onebox::Helpers.symbolize_keys(::MultiJson.load(json_string))
+        else
+          @og_data = {}
         end
-
-        og_data
       end
 
       def data
-        og_data = get_og_data
+        return @data if defined?(@data)
 
-        response = {
+        @data = {
           name: match[:name],
           title: og_data[:headline] || 'No Title',
           author: og_data[:author],
-          url: @url
+          url: @url,
         }
 
-        keywords = og_data[:keywords]&.split(',')
-        if keywords
-          response[:keywords] = keywords.map { |t| "<a href='https://gfycat.com/gifs/search/#{t}'>##{t}</a>" }.join(' ')
+        if keywords = og_data[:keywords]&.split(',')
+          @data[:keywords] = keywords
+            .map { |keyword| "<a href='https://gfycat.com/gifs/search/#{keyword}'>##{keyword}</a>" }
+            .join(' ')
         end
 
         if og_data[:video]
           content_url = ::Onebox::Helpers.normalize_url_for_output(og_data[:video][:contentUrl])
           video_url = Pathname.new(content_url)
-          response[:webmUrl] = video_url.sub_ext(".webm").to_s
-          response[:mp4Url] = video_url.sub_ext(".mp4").to_s
+          @data[:webmUrl] = video_url.sub_ext(".webm").to_s
+          @data[:mp4Url] = video_url.sub_ext(".mp4").to_s
 
           thumbnail_url = ::Onebox::Helpers.normalize_url_for_output(og_data[:video][:thumbnailUrl])
-          response[:posterUrl] = thumbnail_url
+          @data[:posterUrl] = thumbnail_url
 
-          response[:width] = og_data[:video][:width]
-          response[:height] = og_data[:video][:height]
+          @data[:width] = og_data[:video][:width]
+          @data[:height] = og_data[:video][:height]
         end
 
-        response
+        @data
       end
     end
   end

diff --git a/lib/onebox/engine/github_commit_onebox.rb b/lib/onebox/engine/github_commit_onebox.rb
@@ -10,7 +10,7 @@ class GithubCommitOnebox
       include JSON
       include Onebox::Mixins::GithubBody
 
-      matches_regexp Regexp.new("^https?://(?:www\.)?(?:(?:\w)+\.)?(github)\.com(?:/)?(?:.)*/commit/")
+      matches_regexp(/^https?:\/\/(?:www\.)?(?:(?:\w)+\.)?(github)\.com(?:\/)?(?:.)*\/commit\//)
       always_https
 
       def url

diff --git a/lib/onebox/engine/github_folder_onebox.rb b/lib/onebox/engine/github_folder_onebox.rb
@@ -7,7 +7,7 @@ class GithubFolderOnebox
       include StandardEmbed
       include LayoutSupport
 
-      matches_regexp Regexp.new(/^https?:\/\/(?:www\.)?(?:(?:\w)+\.)?(github)\.com[\:\d]*(\/[^\/]+){2}/)
+      matches_regexp(/^https?:\/\/(?:www\.)?(?:(?:\w)+\.)?(github)\.com[\:\d]*(\/[^\/]+){2}/)
       always_https
 
       def self.priority