Adding support for issue #182 (#190)

Open · wants to merge 6 commits into dev
14 changes: 11 additions & 3 deletions crawler/crawling/distributed_scheduler.py
@@ -563,11 +563,19 @@ def request_from_feed(self, item):
             req.meta[key] = item[key]
 
         # extra check to add items to request
+        if 'headers' in item and item['headers'] is not None:
+            if isinstance(item['headers'], dict):
+                for key, value in item['headers'].items():
+                    req.headers[key] = value
+
         if 'cookie' in item and item['cookie'] is not None:
-            if isinstance(item['cookie'], dict):
-                req.cookies = item['cookie']
-            elif isinstance(item['cookie'], basestring):
+            if isinstance(item['cookie'], basestring):
                 req.cookies = self.parse_cookie(item['cookie'])
+
+        if 'cookies' in item and item['cookies'] is not None:
+            if isinstance(item['cookies'], dict):
+                req.cookies = item['cookies']
+
         return req
 
     def parse_cookie(self, string):
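For readers skimming the diff, here is a standalone sketch of the new field handling (illustrative only, not the PR's code): a plain dict stands in for the Scrapy Request, and the string splitting only approximates the scheduler's existing parse_cookie helper, whose body is not shown in this diff.

# Sketch of how a feed item carrying the new keys is applied to a request.
def apply_extra_fields(item, req):
    # 'headers' is merged key by key into the request headers
    if item.get('headers') is not None and isinstance(item['headers'], dict):
        for key, value in item['headers'].items():
            req['headers'][key] = value

    # 'cookie' is a raw Cookie-style string, e.g. "a=1; b=2;"
    if item.get('cookie') is not None and isinstance(item['cookie'], str):
        pairs = [p for p in item['cookie'].split(';') if '=' in p]
        req['cookies'] = dict(p.strip().split('=', 1) for p in pairs)

    # 'cookies' is already a dict and wins if present, since it is applied last
    if item.get('cookies') is not None and isinstance(item['cookies'], dict):
        req['cookies'] = item['cookies']

    return req


print(apply_extra_fields(
    {"headers": {"User-Agent": "My Custom User Agent"},
     "cookie": "ajs_user_id=null; ajs_group_id=null;",
     "cookies": {"device_id": "1", "app_token": "guid"}},
    {"headers": {}, "cookies": {}}))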
65 changes: 65 additions & 0 deletions crawler/crawling/shared_cookies.py
@@ -0,0 +1,65 @@
import redis
import pickle

from scrapy.downloadermiddlewares.cookies import CookiesMiddleware


class SharedCookiesMiddleware(CookiesMiddleware):
    '''
    Shared Cookies Middleware to share the same cookies between crawl nodes.
    '''

    def __init__(self, debug=True, server=None):
        CookiesMiddleware.__init__(self, debug)
        self.redis_conn = server
        self.debug = debug

    @classmethod
    def from_crawler(cls, crawler):
        server = redis.Redis(host=crawler.settings.get('REDIS_HOST'),
                             port=crawler.settings.get('REDIS_PORT'),
                             db=crawler.settings.get('REDIS_DB'))
        return cls(crawler.settings.getbool('COOKIES_DEBUG'), server)

    def process_request(self, request, spider):
        if 'dont_merge_cookies' in request.meta:
            return
        cookiejarkey = "{spiderid}:sharedcookies:{crawlid}".format(
            spiderid=request.meta.get("spiderid"),
            crawlid=request.meta.get("crawlid"))

        # start from the jar shared in Redis for this crawl, if one exists
        jar = self.jars[cookiejarkey]
        jar.clear()
        if self.redis_conn.exists(cookiejarkey):
            data = self.redis_conn.get(cookiejarkey)
            jar = pickle.loads(data)

        cookies = self._get_request_cookies(jar, request)
        for cookie in cookies:
            jar.set_cookie_if_ok(cookie, request)

        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request, spider)
        # persist the (possibly updated) jar so other nodes see it
        self.redis_conn.set(cookiejarkey, pickle.dumps(jar))

    def process_response(self, request, response, spider):
        if request.meta.get('dont_merge_cookies', False):
            return response
        cookiejarkey = "{spiderid}:sharedcookies:{crawlid}".format(
            spiderid=request.meta.get("spiderid"),
            crawlid=request.meta.get("crawlid"))

        # extract cookies from Set-Cookie and drop invalid/expired cookies
        jar = self.jars[cookiejarkey]
        jar.clear()

        if self.redis_conn.exists(cookiejarkey):
            data = self.redis_conn.get(cookiejarkey)
            jar = pickle.loads(data)

        jar.extract_cookies(response, request)
        self._debug_set_cookie(response, spider)
        self.redis_conn.set(cookiejarkey, pickle.dumps(jar))
        return response
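Nothing in this diff wires the middleware into a crawler, so the following is only a sketch of how it might be enabled, assuming the file is importable as crawling.shared_cookies; the setting names are the ones read in from_crawler above, and 700 is the slot Scrapy's stock CookiesMiddleware normally occupies.

# settings.py (sketch); the middleware path and priority are assumptions,
# only the setting names below come from from_crawler() above.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB = 0
COOKIES_DEBUG = False

DOWNLOADER_MIDDLEWARES = {
    # disable the per-process cookie handling...
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
    # ...and replace it with the Redis-backed shared jar
    'crawling.shared_cookies.SharedCookiesMiddleware': 700,
}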
30 changes: 26 additions & 4 deletions crawler/tests/test_distributed_scheduler.py
@@ -50,6 +50,8 @@ def get_request(self):
        req.meta['expires'] = 0
        req.meta['useragent'] = None
        req.meta['cookie'] = None
        req.meta['cookies'] = None
        req.meta['headers'] = None

        return req

@@ -203,24 +205,44 @@ def test_next_request(self, t):
"crawlid": "abc123",
"appid": "myapp",
"spiderid": "link",
"useragent": "useragent",
"headers": {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,ima/webp,/;q=0.8",
"Accept-Encoding": "gzip, deflate",
"X-Requested-With": "dmoztools.net",
"User-Agent": "My Custom User Agent"
},
"cookie" : "ajs_user_id=null; ajs_group_id=null;",
"cookies": {
"device_id": "1",
"app_token": "guid"
}
}

self.scheduler.find_item = MagicMock(return_value=feed)
out = self.scheduler.next_request()
self.assertEqual(out.url, 'http://ex.com')
for key in out.meta:
self.assertEqual(out.meta[key], self.req.meta[key])

# test request from serialized request
exist_req = Request('http://ex.com')
exist_item = request_to_dict(exist_req)
exist_item["meta"]["crawlid"] = "abc123"
exist_item["meta"]["appid"] = "myapp"
exist_item["meta"]["spiderid"] = "link"
exist_item["meta"]["cookies"] = {
"device_id": "1",
"app_token": "guid"
}
exist_item["meta"]["headers"] = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,ima/webp,/;q=0.8",
"Accept-Encoding": "gzip, deflate",
"X-Requested-With": "dmoztools.net",
"User-Agent": "My Custom User Agent"
}
self.scheduler.find_item = MagicMock(return_value=exist_item)
out = self.scheduler.next_request()
self.assertEqual(out.url, 'http://ex.com')
for key in out.meta:
self.assertEqual(out.meta[key], self.req.meta[key])


# test didn't get item
self.scheduler.find_item = MagicMock(return_value=None)
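Given the spiderid and crawlid used in this test ("link" and "abc123"), the shared-cookies middleware above would keep its jar under the Redis key link:sharedcookies:abc123. A toy sketch of that round trip, with a plain dict standing in for both the Redis server and the pickled CookieJar:

import pickle

fake_redis = {}  # stands in for redis.Redis(...)
key = "{spiderid}:sharedcookies:{crawlid}".format(spiderid="link", crawlid="abc123")

# node A, after process_response: persist the jar so other nodes can see it
fake_redis[key] = pickle.dumps({"device_id": "1", "app_token": "guid"})

# node B, in process_request for the same crawlid: reload the shared state
shared = pickle.loads(fake_redis[key])
print(key, shared)  # link:sharedcookies:abc123 {'device_id': '1', 'app_token': 'guid'}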
8 changes: 8 additions & 0 deletions kafka-monitor/plugins/scraper_schema.json
@@ -78,12 +78,20 @@
"maxLength": 1000,
"default": null
},
"headers": {
"type":"object",
"default": null
},
"cookie": {
"type": "string",
"minLength": 3,
"maxLength": 1000,
"default": null
},
"cookies": {
"type" : "object",
"default": null
},
"attrs": {
"type": "object",
"default": null
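A crawl request exercising the new optional fields might look like the following; a minimal sketch built from the values used in the tests above, assuming requests are still submitted through the kafka-monitor feed (the exact topic and feed command are outside this diff).

import json

# Hypothetical crawl request using the new optional fields; apart from the new
# "headers" and "cookies" objects, the values mirror those used in the tests.
request = {
    "url": "http://ex.com",
    "appid": "myapp",
    "crawlid": "abc123",
    "spiderid": "link",
    "headers": {
        "User-Agent": "My Custom User Agent",
        "Accept-Encoding": "gzip, deflate"
    },
    "cookie": "ajs_user_id=null; ajs_group_id=null;",
    "cookies": {
        "device_id": "1",
        "app_token": "guid"
    }
}

print(json.dumps(request))  # e.g. to paste into the kafka-monitor feed command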