From 24fb2549f7b69841e052491439bc8375ed5acfd9 Mon Sep 17 00:00:00 2001 From: Syphax bouazzouni Date: Mon, 27 May 2024 17:08:04 +0200 Subject: [PATCH] Merge to master: Release 2.1.0 - Refactor the cache code (#16) * use $DEBUG_API_CLIENT instead of $DEBUG for debugging request * add $API_CLIENT_INVALIDATE_CACHE option to disable cache * save and re-use the top_level_links http response as the result do not change * deprecate the find method in favor of get method for better performance * disable the refresh_cache on each update call * Refactor: update cache code and add tests (#15) * add tests for the api cache code * refactor the cache middleware code * remove unused function in the cache middleware * fix refactored cache code not getting the ld_obj when updating the cache --- Gemfile | 3 +- Gemfile.lock | 14 ++ config/config.test.rb | 9 +- lib/ontologies_api_client/collection.rb | 2 +- .../middleware/faraday-object-cache.rb | 213 ++++++++---------- test/middleware/test_cache.rb | 141 ++++++++++++ 6 files changed, 263 insertions(+), 119 deletions(-) create mode 100644 test/middleware/test_cache.rb diff --git a/Gemfile b/Gemfile index acdf917..b7f6653 100644 --- a/Gemfile +++ b/Gemfile @@ -4,4 +4,5 @@ gemspec gem 'rake' gem 'pry' -gem 'test-unit' \ No newline at end of file +gem 'test-unit' +gem 'webmock', require: false \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index 77dace9..413fd8c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -20,8 +20,14 @@ GEM i18n (>= 1.6, < 2) minitest (>= 5.1) tzinfo (~> 2.0) + addressable (2.8.6) + public_suffix (>= 2.0.2, < 6.0) + bigdecimal (3.1.7) coderay (1.1.3) concurrent-ruby (1.1.10) + crack (1.0.0) + bigdecimal + rexml excon (0.95.0) faraday (2.0.1) faraday-net_http (~> 2.0) @@ -32,6 +38,7 @@ GEM faraday-multipart (1.0.4) multipart-post (~> 2) faraday-net_http (2.1.0) + hashdiff (1.1.0) i18n (1.12.0) concurrent-ruby (~> 1.0) lz4-ruby (0.3.3) @@ -44,13 +51,19 @@ GEM pry (0.14.1) coderay (~> 1.1) method_source (~> 1.0) + public_suffix (5.0.5) rake (13.0.6) + rexml (3.2.6) ruby2_keywords (0.0.5) spawnling (2.1.5) test-unit (3.5.7) power_assert tzinfo (2.0.5) concurrent-ruby (~> 1.0) + webmock (3.23.0) + addressable (>= 2.8.0) + crack (>= 0.3.2) + hashdiff (>= 0.4.0, < 2.0.0) PLATFORMS x86_64-darwin-21 @@ -62,6 +75,7 @@ DEPENDENCIES pry rake test-unit + webmock BUNDLED WITH 2.4.21 diff --git a/config/config.test.rb b/config/config.test.rb index af6837b..9185c5d 100644 --- a/config/config.test.rb +++ b/config/config.test.rb @@ -3,11 +3,12 @@ # be set via ENV variable UT_APIKEY abort('UT_APIKEY env variable is not set. Canceling tests') unless ENV.include?('UT_APIKEY') abort('UT_APIKEY env variable is set to an empty value. Canceling tests') unless ENV['UT_APIKEY'].size > 5 - +$API_CLIENT_INVALIDATE_CACHE = false +$DEBUG_API_CLIENT = false LinkedData::Client.config do |config| config.rest_url = 'https://data.bioontology.org' - config.apikey = ENV['UT_APIKEY'] -# config.apikey = 'xxxxx-xxxxx-xxxxxxxxxx' + config.apikey = '8b5b7825-538d-40e0-9e9e-5ab9274a9aeb' config.links_attr = 'links' - config.cache = false + config.cache = true + config.debug_client = false end diff --git a/lib/ontologies_api_client/collection.rb b/lib/ontologies_api_client/collection.rb index 6e0e36f..77572bf 100644 --- a/lib/ontologies_api_client/collection.rb +++ b/lib/ontologies_api_client/collection.rb @@ -77,7 +77,7 @@ def where(params = {}, &block) # Find a resource by id # @deprecated replace with "get" def find(id, params = {}) - [get(id, params)] + get(id, params) end ## diff --git a/lib/ontologies_api_client/middleware/faraday-object-cache.rb b/lib/ontologies_api_client/middleware/faraday-object-cache.rb index f5e06d3..080c416 100644 --- a/lib/ontologies_api_client/middleware/faraday-object-cache.rb +++ b/lib/ontologies_api_client/middleware/faraday-object-cache.rb @@ -8,107 +8,47 @@ class ObjectCacheResponse < Faraday::Response attr_accessor :parsed_body end - ## - # This middleware causes Faraday to return an actual object instead of a response - # This is done so that the object is cached instead of the unparsed json body. - # Otherwise, we have to re-parse the json on every cache hit, which is extrememly costly - # when compared to unmarshaling an object. class ObjectCache < Faraday::Middleware def initialize(app, *arguments) super(app) + options = arguments.last.is_a?(Hash) ? arguments.pop : {} + @logger = options.delete(:logger) + @store = options[:store] || ActiveSupport::Cache.lookup_store(nil, options) + end - if arguments.last.is_a? Hash - options = arguments.pop - @logger = options.delete(:logger) - else - options = arguments - end - @store = options[:store] || ActiveSupport::Cache.lookup_store(store, options) + def last_modified_key_id(request_key) + "LM::#{request_key}" + end + + def last_retrieved_key_id(request_key) + "LR::#{request_key}" end def call(env) invalidate_cache = env[:request_headers].delete(:invalidate_cache) - # Add if newer than last modified statement to headers request_key = cache_key_for(create_request(env)) - last_modified_key = "LM::#{request_key}" - last_retrieved_key = "LR::#{request_key}" + last_modified_key = last_modified_key_id(request_key) + last_retrieved_key = last_retrieved_key_id(request_key) - # If we invalidate the cache, then it forces a clean request if invalidate_cache - cache_delete(request_key) - cache_delete(last_modified_key) - cache_delete(last_retrieved_key) + delete_cache_entries(request_key, last_modified_key, last_retrieved_key) env[:request_headers]["Cache-Control"] = "no-cache" puts "Invalidated key #{request_key}" if enable_debug(request_key) end - # If we made the last request within the expiry if cache_exist?(last_retrieved_key) && cache_exist?(request_key) puts "Not expired: #{env[:url].to_s}, key #{request_key}" if enable_debug(request_key) - cached_item = cache_read(request_key) - ld_obj = cached_item.is_a?(Hash) && cached_item.key?(:ld_obj) ? cached_item[:ld_obj] : cached_item - env[:status] = 304 - cached_response = ObjectCacheResponse.new(env) - cached_response.parsed_body = ld_obj - return cached_response + return retrieve_cached_response(request_key) end - last_modified = cache_read(last_modified_key) headers = env[:request_headers] - puts "last modified: " + last_modified.to_s if last_modified && enable_debug(request_key) - headers['If-Modified-Since'] = last_modified if last_modified + headers['If-Modified-Since'] = cache_read(last_modified_key) if cache_read(last_modified_key) @app.call(env).on_complete do |response_env| - # Only cache get and head requests if [:get, :head].include?(response_env[:method]) - puts "Response status for key #{request_key}: " + response_env[:status].to_s if enable_debug(request_key) - last_modified = response_env[:response_headers]["Last-Modified"] - # Generate key using header hash - key = request_key - - # If the last retrieve time is less than expiry - if response_env[:status] == 304 && cache_exist?(key) - stored_obj = cache_read(key) - - # Update if last modified is different - if stored_obj[:last_modified] != last_modified - puts "Updating cache #{response_env[:url].to_s}, key #{request_key}" if enable_debug(request_key) - stored_obj[:last_modified] = last_modified - cache_write(last_modified_key, last_modified) - cache_write(key, stored_obj) - end - - ld_obj = stored_obj[:ld_obj] - else - if response_env[:body].nil? || response_env[:body].empty? - # We got here with an empty body, meaning the object wasn't - # in the cache (weird). So re-do the request. - puts "REDOING REQUEST, NO CACHE ENTRY for #{response_env[:url].to_s}, key #{request_key}" if enable_debug(request_key) - env[:request_headers].delete("If-Modified-Since") - response_env = @app.call(env).env - puts "REDOING REQUEST expiry: #{response_env[:response_headers]["Cache-Control"]}, last_modified: #{last_modified} for key #{request_key}" if enable_debug(request_key) - end - - ld_obj = LinkedData::Client::HTTP.object_from_json(response_env[:body]) - # This stmt was missing in the old code, resulting in repeated calls to REST because object failed to cache - last_modified = response_env[:response_headers]["Last-Modified"] - expiry = response_env[:response_headers]["Cache-Control"].to_s.split("max-age=").last.to_i - puts "Before storing object: expiry: #{expiry}, last_modified: #{last_modified} for key #{request_key}" if enable_debug(request_key) - - if expiry > 0 && last_modified - # This request is cacheable, store it - puts "Storing object: #{response_env[:url].to_s}, key #{request_key}" if enable_debug(request_key) - stored_obj = {last_modified: last_modified, ld_obj: ld_obj} - cache_write(last_modified_key, last_modified) - cache_write(last_retrieved_key, true, expires_in: expiry) - cache_write(key, stored_obj) - end - end - - response = ObjectCacheResponse.new(response_env) - response.parsed_body = ld_obj + response = process_response(response_env, request_key) return response end end @@ -117,8 +57,83 @@ def call(env) private def enable_debug(key) - return true if LinkedData::Client.settings.debug_client && (LinkedData::Client.settings.debug_client_keys.empty? || LinkedData::Client.settings.debug_client_keys.include?(key)) - false + LinkedData::Client.settings.debug_client && (LinkedData::Client.settings.debug_client_keys.empty? || LinkedData::Client.settings.debug_client_keys.include?(key)) + end + + def delete_cache_entries(*keys) + keys.each { |key| cache_delete(key) } + end + + def retrieve_cached_response(request_key) + cached_item = cache_read(request_key) + ld_obj = cached_item.is_a?(Hash) && cached_item.key?(:ld_obj) ? cached_item[:ld_obj] : cached_item + env = { status: 304 } + cached_response = ObjectCacheResponse.new(env) + cached_response.parsed_body = ld_obj + cached_response.env.response_headers = { "x-rack-cache" => 'hit' } + cached_response + end + + def process_response(response_env, request_key) + last_modified = response_env[:response_headers]["Last-Modified"] + key = request_key + cache_state = "miss" + + if response_env[:status] == 304 && cache_exist?(key) + cache_state = "fresh" + ld_obj = update_cache(request_key, last_modified) + else + ld_obj = cache_response(response_env, request_key) + end + + response = ObjectCacheResponse.new(response_env) + response.parsed_body = ld_obj + response.env.response_headers["x-rack-cache"] = cache_state + response + end + + def update_cache(request_key, last_modified) + stored_obj = cache_read(request_key) + if stored_obj[:last_modified] != last_modified + stored_obj[:last_modified] = last_modified + cache_write(last_modified_key_id(request_key), last_modified) + cache_write(request_key, stored_obj) + end + stored_obj.is_a?(Hash) && stored_obj.key?(:ld_obj) ? stored_obj[:ld_obj] : stored_obj + end + + def cache_response(response_env, request_key) + last_modified = response_env[:response_headers]["Last-Modified"] + + if response_env[:body].nil? || response_env[:body].empty? + # We got here with an empty body, meaning the object wasn't + # in the cache (weird). So re-do the request. + puts "REDOING REQUEST, NO CACHE ENTRY for #{response_env[:url].to_s}, key #{request_key}" if enable_debug(request_key) + response_env[:request_headers].delete("If-Modified-Since") + + response_env = @app.call(response_env).env + puts "REDOING REQUEST expiry: #{response_env[:response_headers]["Cache-Control"]}, last_modified: #{last_modified} for key #{request_key}" if enable_debug(request_key) + end + + + return nil if response_env[:body].nil? || response_env[:body].empty? + + ld_obj = LinkedData::Client::HTTP.object_from_json(response_env[:body]) + + expiry = response_env[:response_headers]["Cache-Control"].to_s.split("max-age=").last.to_i + + if expiry > 0 && last_modified + store_cache(request_key, ld_obj, last_modified, expiry) + end + + ld_obj + end + + def store_cache(request_key, ld_obj, last_modified, expiry) + stored_obj = { last_modified: last_modified, ld_obj: ld_obj } + cache_write(last_modified_key_id(request_key), last_modified) + cache_write(last_retrieved_key_id(request_key), true, expires_in: expiry) + cache_write(request_key, stored_obj) end def cache_write(key, obj, *args) @@ -132,12 +147,6 @@ def cache_write(key, obj, *args) if result return result else - # This should still get stored in memcache - # keep it in memory, though, because - # marshal/unmarshal is too slow. - # This way memcache will act as a backup - # and you load from there if it isn't - # in memory yet. @large_object_cache ||= {} @large_object_cache[key] = obj cache_write_compressed(key, obj, *args) @@ -147,11 +156,10 @@ def cache_write(key, obj, *args) def cache_read(key) obj = @store.read(key) - return if obj.nil? + return unless obj + if obj.is_a?(CompressedMemcache) - # Try to get from the large object cache large_obj = @large_object_cache[key] if @large_object_cache - # Fallback to the memcache version large_obj ||= cache_read_compressed(key) obj = large_obj end @@ -162,13 +170,14 @@ def cache_exist?(key) @store.exist?(key) end - class CompressedMemcache; attr_accessor :key; end + class CompressedMemcache + attr_accessor :key + end - ## - # Compress cache entry def cache_write_compressed(key, obj, *args) compressed = LZ4::compress(Marshal.dump(obj)) - return if compressed.nil? + return unless compressed + placeholder = CompressedMemcache.new placeholder.key = "#{key}::#{(Time.now.to_f * 1000).to_i}::LZ4" begin @@ -181,8 +190,6 @@ def cache_write_compressed(key, obj, *args) end end - ## - # Read compressed cache entry def cache_read_compressed(key) obj = @store.read(key) if obj.is_a?(CompressedMemcache) @@ -190,7 +197,6 @@ def cache_read_compressed(key) uncompressed = LZ4::uncompress(@store.read(obj.key)) obj = Marshal.load(uncompressed) rescue StandardError => e - # There is a problem with the stored value, let's remove it so we don't get the error again @store.delete(key) @store.delete(obj.key) raise e @@ -203,35 +209,16 @@ def cache_delete(key) @store.delete(key) end - # Internal: Generates a String key for a given request object. - # The request object is folded into a sorted Array (since we can't count - # on hashes order on Ruby 1.8), encoded as JSON and digested as a `SHA1` - # string. - # - # Returns the encoded String. def cache_key_for(request) array = request.stringify_keys.to_a.sort Digest::SHA1.hexdigest(Marshal.dump(array)) end - # Internal: Creates a new 'Hash' containing the request information. - # - # env - the environment 'Hash' from the Faraday stack. - # - # Returns a 'Hash' containing the ':method', ':url' and 'request_headers' - # entries. def create_request(env) request = env.to_hash.slice(:method, :url, :request_headers) request[:request_headers] = request[:request_headers].dup request end - - def clean_request_headers(request) - request[:request_headers].delete("If-Modified-Since") - request[:request_headers].delete("Expect") - request - end - end end diff --git a/test/middleware/test_cache.rb b/test/middleware/test_cache.rb new file mode 100644 index 0000000..ddea7a9 --- /dev/null +++ b/test/middleware/test_cache.rb @@ -0,0 +1,141 @@ +require_relative '../test_case' +require 'faraday' +require 'active_support' +require 'active_support/cache' +require_relative '../../lib/ontologies_api_client/middleware/faraday-object-cache' +require 'pry' +require 'benchmark' +require 'webmock' + +class FaradayObjectCacheTest < LinkedData::Client::TestCase + def setup + WebMock.disable! + apikey = LinkedData::Client.settings.apikey + @url = "#{LinkedData::Client.settings.rest_url}/ontologies/SNOMEDCT?apikey=#{apikey}" + + @cache_store = ActiveSupport::Cache::MemoryStore.new + @app = Faraday.new(url: @url) do |faraday| + faraday.use Faraday::ObjectCache, store: @cache_store + faraday.adapter :excon + end + end + + def test_cache_hit_for_get_request + body1, body2 = nil + # First request should not hit the cache + time1 = Benchmark.realtime do + response1 = @app.get + assert_equal 200, response1.status + assert uncached?(response1) + + body1 = JSON.parse(response1.body) + end + + time2 = Benchmark.realtime do + # Second request should hit the cache + response2 = @app.get + assert_equal 304, response2.status + assert cached?(response2) + body2 = response2.parsed_body.to_hash.stringify_keys + end + + assert time2 < time1 + + body2.each do |k,v| + k = "@id" if k.eql?('id') + k = "@type" if k.eql?('type') + + next if k.eql?('context') || k.eql?('links') + + assert_equal v, body1[k] + end + end + + + def test_cache_invalidation + # Make a request and cache the response + response1 = @app.get + assert_equal 200, response1.status + + response1 = @app.get + assert_equal 304, response1.status + assert cached?(response1) + + # Invalidate the cache + response2 = @app.get do |req| + req.headers['invalidate_cache'] = true + end + assert_equal 200, response2.status + assert uncached?(response2) + end + + def test_cache_expiration + WebMock.enable! + WebMock.stub_request(:get, @url) + .to_return(headers: { 'Cache-Control': "max-age=1" , 'Last-Modified': Time.now.httpdate}, body: {result: 'hello'}.to_json) + + + # Make a request and cache the response with a short expiry time + response1 = @app.get + assert_equal 200, response1.status + + # Wait for the cache to expire + sleep 2 + + response2 = @app.get + + assert_equal 200, response2.status + assert uncached?(response2) + + sleep 2 + + WebMock.stub_request(:get, @url) + .to_return(headers: { 'Cache-Control': "max-age=100" , 'Last-Modified': Time.now.httpdate}, body: {result: 'hello'}.to_json) + @app.get + + + # Wait for the cache to expire + sleep 2 + + response2 = @app.get + + assert cached?(response2) + + WebMock.disable! + end + + def test_cache_last_modified + WebMock.enable! + # Make a request with Last-Modified header + WebMock.stub_request(:get, @url) + .to_return(headers: { 'Cache-Control': "max-age=1", 'Last-Modified': 3.days.ago.to_time.httpdate}, body: {result: 'hello'}.to_json, status: 304) + + @app.get + + response2 = @app.get + assert cached?(response2) + + sleep 1 + + WebMock.stub_request(:get, @url) + .to_return(headers: { 'Cache-Control': "max-age=10", 'Last-Modified': 1.days.ago.to_time.httpdate}, body: {result: 'hello'}.to_json, status: 304) + + response2 = @app.get + assert refreshed?(response2) + WebMock.disable! + end + + + private + def cached?(response) + response.env.response_headers['x-rack-cache'].eql?('hit') + end + + def uncached?(response) + response.env.response_headers['x-rack-cache'].eql?('miss') + end + + def refreshed?(response) + response.env.response_headers['x-rack-cache'].eql?('fresh') + end +end