From 1c91fc9077f053b7fdca7672406efef1f95c63ba Mon Sep 17 00:00:00 2001
From: Nirbhay Kundan
Date: Mon, 2 Jul 2018 10:05:02 +0530
Subject: [PATCH 1/6] Adding support for 1.Custom Headers and Cookies with
 Initial request. 2.Shared cookies middleware to share cookies between crawl
 nodes

---
 crawler/crawling/distributed_scheduler.py | 13 +++--
 crawler/crawling/shared_cookies.py        | 66 +++++++++++++++++++++++
 kafka-monitor/plugins/scraper_schema.json |  8 +++
 3 files changed, 84 insertions(+), 3 deletions(-)
 create mode 100644 crawler/crawling/shared_cookies.py

diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py
index e07af32b..682becf1 100644
--- a/crawler/crawling/distributed_scheduler.py
+++ b/crawler/crawling/distributed_scheduler.py
@@ -563,11 +563,18 @@ def request_from_feed(self, item):
             req.meta[key] = item[key]
 
         # extra check to add items to request
+        if 'headers' in item and item['headers'] is not None:
+            for key, value in item['headers'].iteritems():
+                req.headers[key] = value
+
         if 'cookie' in item and item['cookie'] is not None:
-            if isinstance(item['cookie'], dict):
-                req.cookies = item['cookie']
-            elif isinstance(item['cookie'], basestring):
+            if isinstance(item['cookie'], basestring):
                 req.cookies = self.parse_cookie(item['cookie'])
+
+        if 'cookies' in item and item['cookies'] is not None:
+            if isinstance(item['cookies'], dict):
+                req.cookies = item['cookies']
+
         return req
 
     def parse_cookie(self, string):
diff --git a/crawler/crawling/shared_cookies.py b/crawler/crawling/shared_cookies.py
new file mode 100644
index 00000000..046f239b
--- /dev/null
+++ b/crawler/crawling/shared_cookies.py
@@ -0,0 +1,66 @@
+import redis
+import pickle
+
+from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
+
+
+class CustomCookiesMiddleware(CookiesMiddleware):
+    '''
+    Custom Cookies Middleware to pass our required cookies along but not
+    persist between calls
+    '''
+
+    def __init__(self, debug=True, server=None):
+        CookiesMiddleware.__init__(self, debug)
+        self.redis_conn = server
+        self.debug = debug
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        server = redis.Redis(host=crawler.settings.get('REDIS_HOST'),
+                             port=crawler.settings.get('REDIS_PORT'),
+                             db=crawler.settings.get('REDIS_DB'))
+        return cls(crawler.settings.getbool('COOKIES_DEBUG'), server)
+
+    def process_request(self, request, spider):
+        if 'dont_merge_cookies' in request.meta:
+            return
+        cookiejarkey = "{spiderid}:sharedcookies:{crawlid}".format(
+            spiderid=request.meta.get("spiderid"),
+            crawlid=request.meta.get("crawlid"))
+
+        jar = self.jars[cookiejarkey]
+        jar.clear()
+        if self.redis_conn.exists(cookiejarkey):
+            data = self.redis_conn.get(cookiejarkey)
+            jar = pickle.loads(data)
+
+        cookies = self._get_request_cookies(jar, request)
+        for cookie in cookies:
+            jar.set_cookie_if_ok(cookie, request)
+
+        # set Cookie header
+        request.headers.pop('Cookie', None)
+        jar.add_cookie_header(request)
+        self._debug_cookie(request, spider)
+        self.redis_conn.set(cookiejarkey, pickle.dumps(jar))
+
+    def process_response(self, request, response, spider):
+        if request.meta.get('dont_merge_cookies', False):
+            return response
+        cookiejarkey = "{spiderid}:sharedcookies:{crawlid}".format(
+            spiderid=request.meta.get("spiderid"),
+            crawlid=request.meta.get("crawlid"))
+        # extract cookies from Set-Cookie and drop invalid/expired cookies
+
+        jar = self.jars[cookiejarkey]
+        jar.clear()
+
+        if self.redis_conn.exists(cookiejarkey):
+            data = self.redis_conn.get(cookiejarkey)
+            jar = pickle.loads(data)
+
+        jar.extract_cookies(response, request)
+        self._debug_set_cookie(response, spider)
+        self.redis_conn.set(cookiejarkey, pickle.dumps(jar))
+        return response
\ No newline at end of file
diff --git a/kafka-monitor/plugins/scraper_schema.json b/kafka-monitor/plugins/scraper_schema.json
index 2992617b..5ab95290 100644
--- a/kafka-monitor/plugins/scraper_schema.json
+++ b/kafka-monitor/plugins/scraper_schema.json
@@ -78,12 +78,20 @@
             "maxLength": 1000,
             "default": null
         },
+        "headers": {
+            "type":"object",
+            "default": null
+        },
         "cookie": {
             "type": "string",
             "minLength": 3,
             "maxLength": 1000,
             "default": null
         },
+        "cookies": {
+            "type" : "object",
+            "default": null
+        },
         "attrs": {
             "type": "object",
             "default": null

From e52b6ecee422d60a042041388a7bc0fcba6011ca Mon Sep 17 00:00:00 2001
From: Nirbhay Kundan
Date: Mon, 2 Jul 2018 10:15:07 +0530
Subject: [PATCH 2/6] Updated Class Name and Comment

---
 crawler/crawling/shared_cookies.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/crawler/crawling/shared_cookies.py b/crawler/crawling/shared_cookies.py
index 046f239b..6b02a2fb 100644
--- a/crawler/crawling/shared_cookies.py
+++ b/crawler/crawling/shared_cookies.py
@@ -4,10 +4,9 @@
 from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
 
 
-class CustomCookiesMiddleware(CookiesMiddleware):
+class SharedCookiesMiddleware(CookiesMiddleware):
     '''
-    Custom Cookies Middleware to pass our required cookies along but not
-    persist between calls
+    Shared Cookies Middleware to share same cookies between crawl Nodes.
     '''
 
     def __init__(self, debug=True, server=None):
@@ -63,4 +62,4 @@ def process_response(self, request, response, spider):
         jar.extract_cookies(response, request)
         self._debug_set_cookie(response, spider)
         self.redis_conn.set(cookiejarkey, pickle.dumps(jar))
-        return response
\ No newline at end of file
+        return response

From 51ab3fa7704c4a6b21d3dbe75a0482555ac953a0 Mon Sep 17 00:00:00 2001
From: Nirbhay Kundan
Date: Mon, 2 Jul 2018 11:49:11 +0530
Subject: [PATCH 3/6] Increasing coverage by covering cookies and header
 custom feed

---
 crawler/tests/test_distributed_scheduler.py | 28 ++++++++++++++++++---
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py
index e8e050d3..cb4876e5 100644
--- a/crawler/tests/test_distributed_scheduler.py
+++ b/crawler/tests/test_distributed_scheduler.py
@@ -50,6 +50,8 @@ def get_request(self):
         req.meta['expires'] = 0
         req.meta['useragent'] = None
         req.meta['cookie'] = None
+        req.meta['cookies'] = None
+        req.meta['headers'] = None
 
         return req
 
@@ -203,12 +205,21 @@ def test_next_request(self, t):
             "crawlid": "abc123",
             "appid": "myapp",
             "spiderid": "link",
+            "headers": {
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,ima/webp,/;q=0.8",
+                "Accept-Encoding": "gzip, deflate",
+                "X-Requested-With": "dmoztools.net",
+                "User-Agent": "My Custom User Agent"
+            },
+            "cookies": {
+                "device_id": "1",
+                "app_token": "guid"
+            }
         }
+
         self.scheduler.find_item = MagicMock(return_value=feed)
         out = self.scheduler.next_request()
         self.assertEqual(out.url, 'http://ex.com')
-        for key in out.meta:
-            self.assertEqual(out.meta[key], self.req.meta[key])
 
         # test request from serialized request
         exist_req = Request('http://ex.com')
@@ -216,11 +227,20 @@ def test_next_request(self, t):
         exist_item["meta"]["crawlid"] = "abc123"
         exist_item["meta"]["appid"] = "myapp"
         exist_item["meta"]["spiderid"] = "link"
+        exist_item["meta"]["cookies"] = {
+            "device_id": "1",
+            "app_token": "guid"
+        }
+        exist_item["meta"]["headers"] = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,ima/webp,/;q=0.8",
+            "Accept-Encoding": "gzip, deflate",
+            "X-Requested-With": "dmoztools.net",
+            "User-Agent": "My Custom User Agent"
+        }
         self.scheduler.find_item = MagicMock(return_value=exist_item)
         out = self.scheduler.next_request()
         self.assertEqual(out.url, 'http://ex.com')
-        for key in out.meta:
-            self.assertEqual(out.meta[key], self.req.meta[key])
+
 
         # test didn't get item
         self.scheduler.find_item = MagicMock(return_value=None)

From f7362cc0c2d00c56e7f877d026970f25341f359a Mon Sep 17 00:00:00 2001
From: Nirbhay Kundan
Date: Mon, 2 Jul 2018 12:42:12 +0530
Subject: [PATCH 4/6] Some more distributed scheduler test improvement

---
 crawler/tests/test_distributed_scheduler.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py
index cb4876e5..92f3d236 100644
--- a/crawler/tests/test_distributed_scheduler.py
+++ b/crawler/tests/test_distributed_scheduler.py
@@ -205,12 +205,14 @@ def test_next_request(self, t):
             "crawlid": "abc123",
             "appid": "myapp",
             "spiderid": "link",
+            "useragent": "useragent",
             "headers": {
                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,ima/webp,/;q=0.8",
                 "Accept-Encoding": "gzip, deflate",
                 "X-Requested-With": "dmoztools.net",
                 "User-Agent": "My Custom User Agent"
             },
+            "cookie" : "ajs_user_id=null; ajs_group_id=null;",
             "cookies": {
                 "device_id": "1",
                 "app_token": "guid"

From ac0138136aa8eac1dbaf8bd5dc0490d1c402ab2b Mon Sep 17 00:00:00 2001
From: Nirbhay Kundan
Date: Mon, 2 Jul 2018 14:47:25 +0530
Subject: [PATCH 5/6] Some more distributed scheduler test improvement

---
 crawler/crawling/distributed_scheduler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py
index 682becf1..2eb6cc17 100644
--- a/crawler/crawling/distributed_scheduler.py
+++ b/crawler/crawling/distributed_scheduler.py
@@ -564,8 +564,9 @@ def request_from_feed(self, item):
 
         # extra check to add items to request
         if 'headers' in item and item['headers'] is not None:
-            for key, value in item['headers'].iteritems():
-                req.headers[key] = value
+            if isinstance(item['headers'], dict):
+                for key, value in item['headers'].iteritems():
+                    req.headers[key] = value
 
         if 'cookie' in item and item['cookie'] is not None:
             if isinstance(item['cookie'], basestring):

From 1cd7940d7b547dc4c50d60330e852afc644f8473 Mon Sep 17 00:00:00 2001
From: Nirbhay Kundan
Date: Mon, 2 Jul 2018 15:11:15 +0530
Subject: [PATCH 6/6] changed dict.iteritems() to dict.items() api to make it
 compatible with Python 3

---
 crawler/crawling/distributed_scheduler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py
index 2eb6cc17..2567aba5 100644
--- a/crawler/crawling/distributed_scheduler.py
+++ b/crawler/crawling/distributed_scheduler.py
@@ -565,7 +565,7 @@ def request_from_feed(self, item):
         # extra check to add items to request
         if 'headers' in item and item['headers'] is not None:
             if isinstance(item['headers'], dict):
-                for key, value in item['headers'].iteritems():
+                for key, value in item['headers'].items():
                     req.headers[key] = value
 
         if 'cookie' in item and item['cookie'] is not None: