diff --git a/.circleci/config.yml b/.circleci/config.yml index 28862cdc..effa14dd 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -196,7 +196,7 @@ jobs: # is a tagged release | is a pull request | is on the develop branch if [[ ! -z $CIRCLE_TAG ]] || [[ ! -z $CIRCLE_PULL_REQUEST ]] || [[ $BRANCH == $DEVELOP_BRANCH ]]; then - docker login -u "${DOCKER_USER}" -p "${DOCKER_PASS}" + docker login -u "${DOCKER_USER}" -p "${DOCKER_PASS}" docker.io docker build --build-arg BUILD_NUMBER=$CIRCLE_BUILD_NUM -t $KAFKA_MONITOR_IMAGE-$VERSION_TAG -f docker/kafka-monitor/Dockerfile.py3 . docker build --build-arg BUILD_NUMBER=$CIRCLE_BUILD_NUM -t $REDIS_MONITOR_IMAGE-$VERSION_TAG -f docker/redis-monitor/Dockerfile.py3 . diff --git a/README.md b/README.md index c3c6e1f5..b37a8f49 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Please see the ``requirements.txt`` within each sub project for Pip package depe Other important components required to run the cluster -- Python 2.7 or 3.6: https://www.python.org/downloads/ +- Python 3.10: https://www.python.org/downloads/ - Redis: http://redis.io - Zookeeper: https://zookeeper.apache.org - Kafka: http://kafka.apache.org diff --git a/crawler/crawling/distributed_scheduler.py b/crawler/crawling/distributed_scheduler.py index e849f9a7..25d639db 100644 --- a/crawler/crawling/distributed_scheduler.py +++ b/crawler/crawling/distributed_scheduler.py @@ -6,9 +6,15 @@ from six import string_types from builtins import object from scrapy.http import Request -from scrapy.conf import settings + +# from scrapy.conf import settings +from scrapy.settings import Settings +from . import settings +new_settings = Settings() +new_settings.setmodule(settings) + from scrapy.utils.python import to_unicode -from scrapy.utils.reqser import request_to_dict, request_from_dict +from scrapy.utils.request import request_from_dict import redis import random @@ -106,12 +112,12 @@ def __init__(self, server, persist, update_int, timeout, retries, logger, self.my_uuid = str(uuid.uuid4()).split('-')[4] def setup_zookeeper(self): - self.assign_path = settings.get('ZOOKEEPER_ASSIGN_PATH', "") - self.my_id = settings.get('ZOOKEEPER_ID', 'all') + self.assign_path = new_settings.get('ZOOKEEPER_ASSIGN_PATH', "") + self.my_id = new_settings.get('ZOOKEEPER_ID', 'all') self.logger.debug("Trying to establish Zookeeper connection") try: self.zoo_watcher = ZookeeperWatcher( - hosts=settings.get('ZOOKEEPER_HOSTS'), + hosts=new_settings.get('ZOOKEEPER_HOSTS'), filepath=self.assign_path + self.my_id, config_handler=self.change_config, error_handler=self.error_config, @@ -120,7 +126,7 @@ def setup_zookeeper(self): self.logger.error("Could not connect to Zookeeper") sys.exit(1) - if self.zoo_watcher.ping(): + if self.zoo_watcher.is_valid(): self.logger.debug("Successfully set up Zookeeper connection") else: self.logger.error("Could not ping Zookeeper") @@ -290,7 +296,7 @@ def update_ipaddress(self): self.old_ip = self.my_ip self.my_ip = '127.0.0.1' try: - obj = urllib.request.urlopen(settings.get('PUBLIC_IP_URL', + obj = urllib.request.urlopen(new_settings.get('PUBLIC_IP_URL', 'http://ip.42.pl/raw')) results = self.ip_regex.findall(obj.read().decode('utf-8')) if len(results) > 0: @@ -311,7 +317,7 @@ def report_self(self): ''' Reports the crawler uuid to redis ''' - self.logger.debug("Reporting self id", extra={'uuid':self.my_uuid}) + self.logger.debug("Reporting self id: {extra}".format(extra={'uuid':self.my_uuid})) key = "stats:crawler:{m}:{s}:{u}".format( m=socket.gethostname(), 
s=self.spider.name, @@ -321,36 +327,36 @@ def report_self(self): @classmethod def from_settings(cls, settings): - server = redis.Redis(host=settings.get('REDIS_HOST'), - port=settings.get('REDIS_PORT'), - db=settings.get('REDIS_DB'), - password=settings.get('REDIS_PASSWORD'), + server = redis.Redis(host=new_settings.get('REDIS_HOST'), + port=new_settings.get('REDIS_PORT'), + db=new_settings.get('REDIS_DB'), + password=new_settings.get('REDIS_PASSWORD'), decode_responses=True, - socket_timeout=settings.get('REDIS_SOCKET_TIMEOUT'), - socket_connect_timeout=settings.get('REDIS_SOCKET_TIMEOUT')) - persist = settings.get('SCHEDULER_PERSIST', True) - up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10) - hits = settings.get('QUEUE_HITS', 10) - window = settings.get('QUEUE_WINDOW', 60) - mod = settings.get('QUEUE_MODERATED', False) - timeout = settings.get('DUPEFILTER_TIMEOUT', 600) - ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60) - add_type = settings.get('SCHEDULER_TYPE_ENABLED', False) - add_ip = settings.get('SCHEDULER_IP_ENABLED', False) - retries = settings.get('SCHEUDLER_ITEM_RETRIES', 3) - ip_regex = settings.get('IP_ADDR_REGEX', '.*') - backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True) - queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600) - - - my_level = settings.get('SC_LOG_LEVEL', 'INFO') - my_name = settings.get('SC_LOGGER_NAME', 'sc-logger') - my_output = settings.get('SC_LOG_STDOUT', True) - my_json = settings.get('SC_LOG_JSON', False) - my_dir = settings.get('SC_LOG_DIR', 'logs') - my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB') - my_file = settings.get('SC_LOG_FILE', 'main.log') - my_backups = settings.get('SC_LOG_BACKUPS', 5) + socket_timeout=new_settings.get('REDIS_SOCKET_TIMEOUT'), + socket_connect_timeout=new_settings.get('REDIS_SOCKET_TIMEOUT')) + persist = new_settings.get('SCHEDULER_PERSIST', True) + up_int = new_settings.get('SCHEDULER_QUEUE_REFRESH', 10) + hits = new_settings.get('QUEUE_HITS', 10) + window = new_settings.get('QUEUE_WINDOW', 60) + mod = new_settings.get('QUEUE_MODERATED', False) + timeout = new_settings.get('DUPEFILTER_TIMEOUT', 600) + ip_refresh = new_settings.get('SCHEDULER_IP_REFRESH', 60) + add_type = new_settings.get('SCHEDULER_TYPE_ENABLED', False) + add_ip = new_settings.get('SCHEDULER_IP_ENABLED', False) + retries = new_settings.get('SCHEUDLER_ITEM_RETRIES', 3) + ip_regex = new_settings.get('IP_ADDR_REGEX', '.*') + backlog_blacklist = new_settings.get('SCHEDULER_BACKLOG_BLACKLIST', True) + queue_timeout = new_settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600) + + + my_level = new_settings.get('SC_LOG_LEVEL', 'INFO') + my_name = new_settings.get('SC_LOGGER_NAME', 'sc-logger') + my_output = new_settings.get('SC_LOG_STDOUT', True) + my_json = new_settings.get('SC_LOG_JSON', False) + my_dir = new_settings.get('SC_LOG_DIR', 'logs') + my_bytes = new_settings.get('SC_LOG_MAX_BYTES', '10MB') + my_file = new_settings.get('SC_LOG_FILE', 'main.log') + my_backups = new_settings.get('SC_LOG_BACKUPS', 5) logger = LogFactory.get_instance(json=my_json, name=my_name, @@ -361,9 +367,9 @@ def from_settings(cls, settings): bytes=my_bytes, backups=my_backups) - global_page_per_domain_limit = settings.get('GLOBAL_PAGE_PER_DOMAIN_LIMIT', None) - global_page_per_domain_limit_timeout = settings.get('GLOBAL_PAGE_PER_DOMAIN_LIMIT_TIMEOUT', 600) - domain_max_page_timeout = settings.get('DOMAIN_MAX_PAGE_TIMEOUT', 600) + global_page_per_domain_limit = new_settings.get('GLOBAL_PAGE_PER_DOMAIN_LIMIT', None) + 
global_page_per_domain_limit_timeout = new_settings.get('GLOBAL_PAGE_PER_DOMAIN_LIMIT_TIMEOUT', 600) + domain_max_page_timeout = new_settings.get('DOMAIN_MAX_PAGE_TIMEOUT', 600) return cls(server, persist, up_int, timeout, retries, logger, hits, window, mod, ip_refresh, add_type, add_ip, ip_regex, @@ -411,6 +417,27 @@ def is_blacklisted(self, appid, crawlid): redis_key = self.spider.name + ":blacklist" return self.redis_conn.sismember(redis_key, key_check) + def decode_dict(self, data): + decoded_dict = {} + for key, value in data.items(): + if isinstance(value, bytes): + decoded_dict[key] = value.decode() + elif isinstance(value, dict): + decoded_dict[key] = self.decode_dict(value) + elif isinstance(value, list): + decoded_list = [] + for item in value: + if isinstance(item, bytes): + decoded_list.append(item.decode()) + elif isinstance(item, dict): + decoded_list.append(self.decode_dict(item)) + else: + decoded_list.append(item) + decoded_dict[key] = decoded_list + else: + decoded_dict[key] = value + return decoded_dict + def enqueue_request(self, request): ''' Pushes a request from the spider into the proper throttled queue @@ -422,7 +449,7 @@ def enqueue_request(self, request): return # An individual crawling request of a domain's page - req_dict = request_to_dict(request, self.spider) + req_dict = Request.to_dict(request, spider=self.spider) # # # # # # # # # # # # # # # # # # Page Limit Filters # # # # # # # # # # # # # # # # Max page filter per individual domain @@ -468,6 +495,7 @@ def enqueue_request(self, request): curr_time < req_dict['meta']['expires']): # we may already have the queue in memory if key in self.queue_keys: + req_dict = self.decode_dict(req_dict) self.queue_dict[key][0].push(req_dict, req_dict['meta']['priority']) else: @@ -535,7 +563,7 @@ def next_request(self): .format(url=item['url'])) if 'meta' in item: # item is a serialized request - req = request_from_dict(item, self.spider) + req = request_from_dict(item, spider=self.spider) else: # item is a feed from outside, parse it manually req = self.request_from_feed(item) diff --git a/crawler/crawling/items.py b/crawler/crawling/items.py index 68f8537e..a947495f 100644 --- a/crawler/crawling/items.py +++ b/crawler/crawling/items.py @@ -2,21 +2,21 @@ # Define here the models for your scraped items -from scrapy import Item, Field +import scrapy -class RawResponseItem(Item): - appid = Field() - crawlid = Field() - url = Field() - response_url = Field() - status_code = Field() - status_msg = Field() - response_headers = Field() - request_headers = Field() - body = Field() - links = Field() - attrs = Field() - success = Field() - exception = Field() - encoding = Field() +class RawResponseItem(scrapy.Item): + appid = scrapy.Field() + crawlid = scrapy.Field() + url = scrapy.Field() + response_url = scrapy.Field() + status_code = scrapy.Field() + status_msg = scrapy.Field() + response_headers = scrapy.Field() + request_headers = scrapy.Field() + body = scrapy.Field() + links = scrapy.Field() + attrs = scrapy.Field() + success = scrapy.Field() + exception = scrapy.Field() + encoding = scrapy.Field() diff --git a/crawler/crawling/log_retry_middleware.py b/crawler/crawling/log_retry_middleware.py index c78fae57..a0f971e4 100644 --- a/crawler/crawling/log_retry_middleware.py +++ b/crawler/crawling/log_retry_middleware.py @@ -7,7 +7,7 @@ import sys from scrapy.utils.response import response_status_message -from scrapy.xlib.tx import ResponseFailed +from scrapy.exceptions import IgnoreRequest from twisted.internet import defer 
from twisted.internet.error import TimeoutError, DNSLookupError, \ ConnectionRefusedError, ConnectionDone, ConnectError, \ @@ -22,7 +22,7 @@ class LogRetryMiddleware(object): EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError, ConnectionRefusedError, ConnectionDone, ConnectError, - ConnectionLost, TCPTimedOutError, ResponseFailed, + ConnectionLost, TCPTimedOutError, IgnoreRequest, IOError) def __init__(self, settings): diff --git a/crawler/crawling/pipelines.py b/crawler/crawling/pipelines.py index f116149b..8eb528ef 100644 --- a/crawler/crawling/pipelines.py +++ b/crawler/crawling/pipelines.py @@ -173,6 +173,27 @@ def _kafka_failure(self, item, spider, exception): item = self._clean_item(item) self.logger.error("Failed to send page to Kafka", item) + def decode_dict(self, data): + decoded_dict = {} + for key, value in data.items(): + if isinstance(value, bytes): + decoded_dict[key] = value.decode() + elif isinstance(value, dict): + decoded_dict[key] = self.decode_dict(value) + elif isinstance(value, list): + decoded_list = [] + for item in value: + if isinstance(item, bytes): + decoded_list.append(item.decode()) + elif isinstance(item, dict): + decoded_list.append(self.decode_dict(item)) + else: + decoded_list.append(item) + decoded_dict[key] = decoded_list + else: + decoded_dict[key] = value + return decoded_dict + def process_item(self, item, spider): try: self.logger.debug("Processing item in KafkaPipeline") @@ -194,6 +215,7 @@ def process_item(self, item, spider): elif 'utf-8' != encoding: datum['body'] = datum['body'].decode(datum['encoding']) + datum = self.decode_dict(datum) message = ujson.dumps(datum, sort_keys=True) except: message = 'json failed to parse' diff --git a/crawler/crawling/redis_domain_max_page_filter.py b/crawler/crawling/redis_domain_max_page_filter.py index 4e569934..c8ce23df 100644 --- a/crawler/crawling/redis_domain_max_page_filter.py +++ b/crawler/crawling/redis_domain_max_page_filter.py @@ -1,7 +1,7 @@ # coding=utf-8 import tldextract from scrapy.dupefilters import BaseDupeFilter -from scrapy.utils.reqser import request_to_dict +from scrapy.http import Request class RFDomainMaxPageFilter(BaseDupeFilter): @@ -31,7 +31,7 @@ def __init__(self, server, key, timeout): def request_page_limit_reached(self, request, spider): # Collect items composing the redis key # grab the tld of the request - req_dict = request_to_dict(request, spider) + req_dict = Request.to_dict(request, spider=spider) ex_res = self.extract(req_dict['url']) domain = "{d}.{s}".format(d=ex_res.domain, s=ex_res.suffix) diff --git a/crawler/crawling/redis_global_page_per_domain_filter.py b/crawler/crawling/redis_global_page_per_domain_filter.py index c11fe124..6000a2eb 100644 --- a/crawler/crawling/redis_global_page_per_domain_filter.py +++ b/crawler/crawling/redis_global_page_per_domain_filter.py @@ -1,7 +1,7 @@ # coding=utf-8 import tldextract from scrapy.dupefilters import BaseDupeFilter -from scrapy.utils.reqser import request_to_dict +from scrapy.http import Request class RFGlobalPagePerDomainFilter(BaseDupeFilter): @@ -41,7 +41,7 @@ def __init__(self, server, key, page_limit, timeout): def request_page_limit_reached(self, request, spider): # Collect items composing the redis key # grab the tld of the request - req_dict = request_to_dict(request, spider) + req_dict = Request.to_dict(request, spider=spider) ex_res = self.extract(req_dict['url']) domain = "{d}.{s}".format(d=ex_res.domain, s=ex_res.suffix) diff --git a/crawler/crawling/spiders/link_spider.py 
b/crawler/crawling/spiders/link_spider.py index 289c6a63..fe8c21ef 100644 --- a/crawler/crawling/spiders/link_spider.py +++ b/crawler/crawling/spiders/link_spider.py @@ -1,12 +1,11 @@ from __future__ import absolute_import -import scrapy from scrapy.http import Request -from crawling.spiders.lxmlhtml import CustomLxmlLinkExtractor as LinkExtractor -from scrapy.conf import settings +from .lxmlhtml import CustomLxmlLinkExtractor as LinkExtractor -from crawling.items import RawResponseItem -from crawling.spiders.redis_spider import RedisSpider + +from ..items import RawResponseItem +from .redis_spider import RedisSpider class LinkSpider(RedisSpider): diff --git a/crawler/crawling/spiders/lxmlhtml.py b/crawler/crawling/spiders/lxmlhtml.py index c56bbd78..3659dd61 100644 --- a/crawler/crawling/spiders/lxmlhtml.py +++ b/crawler/crawling/spiders/lxmlhtml.py @@ -6,7 +6,6 @@ from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor from scrapy.link import Link from six.moves.urllib.parse import urljoin -from scrapy.utils.python import unique as unique_list, to_native_str import lxml.etree as etree from scrapy.utils.misc import rel_has_nofollow @@ -36,8 +35,8 @@ def _extract_links(self, selector, response_url, response_encoding, base_url): if url is None: continue # added 'ignore' to encoding errors - url = to_native_str(url, encoding=response_encoding, - errors='ignore') + if isinstance(url, bytes): + url = url.decode('utf-8') # to fix relative links after process_value url = urljoin(response_url, url) link = Link(url, _collect_string_content(el) or u'', diff --git a/crawler/crawling/spiders/wandering_spider.py b/crawler/crawling/spiders/wandering_spider.py index 66084569..bc7191d1 100644 --- a/crawler/crawling/spiders/wandering_spider.py +++ b/crawler/crawling/spiders/wandering_spider.py @@ -1,12 +1,10 @@ from __future__ import absolute_import # Example Wandering Spider -import scrapy from scrapy.http import Request from .lxmlhtml import CustomLxmlLinkExtractor as LinkExtractor -from scrapy.conf import settings -from crawling.items import RawResponseItem +from ..items import RawResponseItem from .redis_spider import RedisSpider import random diff --git a/crawler/requirements.txt b/crawler/requirements.txt index 1eb8045a..64477f2e 100644 --- a/crawler/requirements.txt +++ b/crawler/requirements.txt @@ -1,40 +1,35 @@ -attrs==18.1.0 # Updated from 17.2.0 -cffi==1.11.5 # Updated from 1.10.0 -ConcurrentLogHandler==0.9.1 -cryptography==2.3 -cssselect==1.0.3 # Updated from 1.0.1 +concurrent-log-handler==0.9.20 +cryptography==37.0.4 +cssselect==1.2.0 # Updated from 1.0.1 enum34==1.1.6 funcsigs==1.0.2 -future==0.16.0 +future==0.18.3 idna==2.6 ipaddress==1.0.22 # Updated from 1.0.18 -kafka-python==1.4.3 # Updated from 1.3.4 -kazoo==2.4.0 -lxml==4.2.1 # Updated from 3.8.0 +kafka-python==1.4.3 +kazoo==2.5.0 mock==2.0.0 -nose==1.3.7 -parsel==1.4.0 # Updated from 1.2.0 +nose2==0.13.0 pbr==4.0.3 # Updated from 3.1.1 pyasn1==0.4.3 # Updated from 0.3.2 pyasn1-modules==0.2.1 # Updated from 0.0.11 pycparser==2.18 PyDispatcher==2.0.5 -pyOpenSSL==18.0.0 # Updated from 17.2.0 +pyopenssl==22.0.0 python-json-logger==0.1.8 -PyYAML==3.12 +PyYAML==6.0 queuelib==1.5.0 # Updated from 1.4.2 redis>=3.0 # Updated from 2.10.5 -requests==2.18.4 # Updated from 2.18.3 +requests==2.26.0 requests-file==1.4.3 # Updated from 1.4.2 retrying==1.3.3 -Scrapy==1.5.0 # Updated from 1.4.0 +scrapy==2.10.0 ../utils # scutils==1.3.0dev0 -service-identity==17.0.0 +service-identity==23.1.0 six==1.11.0 # Updated from 1.10.0 testfixtures==6.0.2 # 
Updated from 5.1.1 tldextract==2.2.0 # Updated from 2.1.0 -Twisted==18.4.0 # Updated from 17.5.0 -ujson==1.35 +Twisted~=22.10.0 +ujson==5.7.0 w3lib==1.19.0 # Updated from 1.18.0 -zope.interface==4.5.0 # Updated from 4.4.2 # Generated with piprot 0.9.10 diff --git a/crawler/tests/online.py b/crawler/tests/online.py index f0aeff76..60b079f3 100644 --- a/crawler/tests/online.py +++ b/crawler/tests/online.py @@ -14,7 +14,6 @@ from os import path sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) -import scrapy import redis from redis.exceptions import ConnectionError import json diff --git a/crawler/tests/test_distributed_scheduler.py b/crawler/tests/test_distributed_scheduler.py index 87c98c92..96b91e14 100644 --- a/crawler/tests/test_distributed_scheduler.py +++ b/crawler/tests/test_distributed_scheduler.py @@ -7,7 +7,7 @@ from mock import MagicMock from crawling.distributed_scheduler import DistributedScheduler from scrapy.http import Request -from scrapy.utils.reqser import request_to_dict +from scrapy.http import Request from scutils.redis_throttled_queue import RedisThrottledQueue @@ -232,7 +232,7 @@ def test_next_request(self, t): exist_req.meta["crawlid"] = "abc123" exist_req.meta["appid"] = "myapp" exist_req.meta["spiderid"] = "link" - exist_item = request_to_dict(exist_req) + exist_item = Request.to_dict(exist_req) self.scheduler.find_item = MagicMock(return_value=exist_item) out = self.scheduler.next_request() self.assertEqual(out.url, 'http://ex.com') @@ -241,7 +241,7 @@ def test_next_request(self, t): # test request from serialized request with supplied cookie exist_req = Request('http://ex.com', cookies={'auth':'101'}) - exist_item = request_to_dict(exist_req) + exist_item = Request.to_dict(exist_req) self.scheduler.find_item = MagicMock(return_value=exist_item) out = self.scheduler.next_request() self.assertEqual(out.url, 'http://ex.com') @@ -256,7 +256,7 @@ def test_next_request(self, t): exist_req.meta["appid"] = "myapp" exist_req.meta["spiderid"] = "link" exist_req.meta["cookie"] = {'authenticated': False, 'privacy':9} - exist_item = request_to_dict(exist_req) + exist_item = Request.to_dict(exist_req) self.scheduler.find_item = MagicMock(return_value=exist_item) out = self.scheduler.next_request() self.assertEqual(out.url, 'http://ex.com') diff --git a/docker/crawler/Dockerfile.py3 b/docker/crawler/Dockerfile.py3 index 58874164..ee5d56c3 100644 --- a/docker/crawler/Dockerfile.py3 +++ b/docker/crawler/Dockerfile.py3 @@ -1,13 +1,13 @@ -FROM python:3.6 +FROM python:3.10 MAINTAINER Madison Bahmer # os setup RUN apt-get update && apt-get -y install \ - python-lxml \ + python3-lxml \ build-essential \ libssl-dev \ libffi-dev \ - python-dev \ + python3-dev \ libxml2-dev \ libxslt1-dev \ && rm -rf /var/lib/apt/lists/* @@ -32,4 +32,4 @@ COPY docker/run_docker_tests.sh /usr/src/app/run_docker_tests.sh # set up environment variables # run the spider -CMD ["scrapy", "runspider", "crawling/spiders/link_spider.py"] \ No newline at end of file +CMD ["scrapy", "crawl", "link"] \ No newline at end of file diff --git a/docker/kafka-monitor/Dockerfile.py3 b/docker/kafka-monitor/Dockerfile.py3 index acb3ddd2..63432104 100644 --- a/docker/kafka-monitor/Dockerfile.py3 +++ b/docker/kafka-monitor/Dockerfile.py3 @@ -1,4 +1,4 @@ -FROM python:3.6 +FROM python:3.10 MAINTAINER Madison Bahmer # os setup diff --git a/docker/redis-monitor/Dockerfile.py3 b/docker/redis-monitor/Dockerfile.py3 index e2c28d34..3eb8820d 100644 --- a/docker/redis-monitor/Dockerfile.py3 +++ 
b/docker/redis-monitor/Dockerfile.py3 @@ -1,4 +1,4 @@ -FROM python:3.6 +FROM python:3.10 MAINTAINER Madison Bahmer # os setup diff --git a/docker/rest/Dockerfile.py3 b/docker/rest/Dockerfile.py3 index 15725fc2..904f00fd 100644 --- a/docker/rest/Dockerfile.py3 +++ b/docker/rest/Dockerfile.py3 @@ -1,4 +1,4 @@ -FROM python:3.6 +FROM python:3.10 MAINTAINER Madison Bahmer # os setup diff --git a/docker/utils/Dockerfile.py3 b/docker/utils/Dockerfile.py3 index 02c6b693..06ec7542 100644 --- a/docker/utils/Dockerfile.py3 +++ b/docker/utils/Dockerfile.py3 @@ -1,15 +1,15 @@ -FROM python:3.6 +FROM python:3.10 MAINTAINER Madison Bahmer # os setup -RUN apt-get update +RUN apt-get update && apt-get -y install iputils-ping RUN mkdir -p /usr/src/app WORKDIR /usr/src/app # move codebase over and install requirements COPY utils /usr/src/app RUN pip install . -RUN pip install nose +RUN pip install nose2 # copy testing script into container COPY docker/run_docker_tests.sh /usr/src/app/run_docker_tests.sh diff --git a/kafka-monitor/requirements.txt b/kafka-monitor/requirements.txt index 492ee0db..1b976d0a 100644 --- a/kafka-monitor/requirements.txt +++ b/kafka-monitor/requirements.txt @@ -1,23 +1,23 @@ -ConcurrentLogHandler==0.9.1 +concurrent-log-handler==0.9.20 funcsigs==1.0.2 -future==0.16.0 +future==0.18.3 idna==2.6 # Updated from 2.5 jsonschema==2.6.0 -kafka-python==1.4.3 # Updated from 1.3.3 -kazoo==2.4.0 # Updated from 2.2.1 +kafka-python==1.4.3 +kazoo==2.5.0 mock==2.0.0 -nose==1.3.7 +nose2==0.13.0 pbr==4.0.3 # Updated from 2.0.0 python-json-logger==0.1.8 # Updated from 0.1.7 python-redis-lock==3.2.0 -PyYAML==3.12 +PyYAML==6.0 redis>=3.0 # Updated from 2.10.5 -requests==2.18.4 # Updated from 2.13.0 +requests==2.26.0 requests-file==1.4.3 # Updated from 1.4.1 retrying==1.3.3 ../utils # scutils==1.3.0dev0 six==1.11.0 # Updated from 1.10.0 testfixtures==6.0.2 # Updated from 4.13.5 tldextract==2.2.0 # Updated from 2.0.2 -ujson==1.35 +ujson==5.7.0 # Generated with piprot 0.9.10 diff --git a/redis-monitor/requirements.txt b/redis-monitor/requirements.txt index fa8365a3..3b75dceb 100644 --- a/redis-monitor/requirements.txt +++ b/redis-monitor/requirements.txt @@ -1,18 +1,18 @@ -ConcurrentLogHandler==0.9.1 +concurrent-log-handler==0.9.20 funcsigs==1.0.2 -future==0.16.0 -kafka-python==1.4.3 # Updated from 1.3.3 -kazoo==2.4.0 # Updated from 2.2.1 +future==0.18.3 +kafka-python==1.4.3 +kazoo==2.5.0 mock==2.0.0 -nose==1.3.7 +nose2==0.13.0 pbr==4.0.3 # Updated from 2.0.0 -python-json-logger==0.1.8 # Updated from 0.1.7 +python-json-logger==0.1.8 python-redis-lock==3.2.0 -PyYAML==3.12 +PyYAML==6.0 redis>=3.0 # Updated from 2.10.5 retrying==1.3.3 ../utils # scutils==1.3.0dev0 six==1.11.0 # Updated from 1.10.0 testfixtures==6.0.2 # Updated from 4.13.5 -ujson==1.35 +ujson==5.7.0 # Generated with piprot 0.9.10 diff --git a/requirements.txt b/requirements.txt index 447c8586..580e1a48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,36 +1,32 @@ # this file is deprecated and will be removed -ConcurrentLogHandler==0.9.1 -Flask==1.0.2 # Updated from 0.12 -Jinja2==2.10 # Updated from 2.9.5 -MarkupSafe==1.0 +concurrent-log-handler==0.9.20 +Flask==1.1.2 +Jinja2==3.0.0 +MarkupSafe==2.1.2 PyDispatcher==2.0.5 -PyYAML==3.12 -Scrapy==1.5.0 # Updated from 1.4.0 -Twisted==18.4.0 # Updated from 17.5.0 -Werkzeug==0.14.1 # Updated from 0.12.1 -attrs==18.1.0 # Updated from 17.2.0 -cffi==1.11.5 # Updated from 1.10.0 +PyYAML==6.0 +scrapy==2.10.0 +Twisted~=22.10.0 +Werkzeug==2.0.0 characteristic==14.3.0 -click==6.7 +click==8.1.3 
coverage==4.5.1 # Updated from 4.3.4 -cryptography==2.3 -cssselect==1.0.3 # Updated from 1.0.1 +cryptography==37.0.4 +cssselect==1.2.0 # Updated from 1.0.1 enum34==1.1.6 funcsigs==1.0.2 -future==0.16.0 +future==0.18.3 idna==2.6 ipaddress==1.0.22 # Updated from 1.0.18 -itsdangerous==0.24 +itsdangerous==2.0.0 jsonschema==2.6.0 kafka-python==1.4.3 # Updated from 1.3.4 -kazoo==2.4.0 -lxml==4.2.1 # Updated from 3.8.0 +kazoo==2.5.0 mock==2.0.0 -nose==1.3.7 -parsel==1.4.0 # Updated from 1.2.0 +nose2==0.13.0 pbr==4.0.3 # Updated from 3.1.1 -pyOpenSSL==18.0.0 # Updated from 17.2.0 +pyopenssl==22.0.0 pyasn1==0.4.3 # Updated from 0.3.2 pyasn1-modules==0.2.1 # Updated from 0.0.11 pycparser==2.18 @@ -38,15 +34,14 @@ python-json-logger==0.1.8 python-redis-lock==3.2.0 queuelib==1.5.0 # Updated from 1.4.2 redis>=3.0 # Updated from 2.10.5 -requests==2.18.4 # Updated from 2.18.3 +requests==2.26.0 requests-file==1.4.3 # Updated from 1.4.2 retrying==1.3.3 ./utils # scutils==1.3.0dev0 -service-identity==17.0.0 +service-identity==23.1.0 six==1.11.0 # Updated from 1.10.0 testfixtures==6.0.2 # Updated from 5.1.1 tldextract==2.2.0 # Updated from 2.1.0 -ujson==1.35 +ujson==5.7.0 w3lib==1.19.0 # Updated from 1.18.0 -zope.interface==4.5.0 # Updated from 4.4.2 # Generated with piprot 0.9.10 diff --git a/rest/requirements.txt b/rest/requirements.txt index a502bacb..960d982e 100644 --- a/rest/requirements.txt +++ b/rest/requirements.txt @@ -1,24 +1,24 @@ -click==6.7 -ConcurrentLogHandler==0.9.1 -Flask==1.0.2 # Updated from 0.12 +click==8.1.3 +concurrent-log-handler==0.9.20 +Flask==1.1.2 funcsigs==1.0.2 -future==0.16.0 -itsdangerous==0.24 -Jinja2==2.10 # Updated from 2.9.5 +future==0.18.3 +itsdangerous==2.0.0 +Jinja2==3.0.0 jsonschema==2.6.0 kafka-python==1.4.3 # Updated from 1.3.3 -kazoo==2.4.0 # Updated from 2.2.1 -MarkupSafe==1.1.0 # Updated from 1.0 +kazoo==2.5.0 # Updated from 2.2.1 +MarkupSafe==2.1.2 mock==2.0.0 -nose==1.3.7 +nose2==0.13.0 pbr==4.0.3 # Updated from 2.0.0 python-json-logger==0.1.8 # Updated from 0.1.7 redis>=3.0 # Updated from 2.10.5 -requests==2.18.4 # Updated from 2.13.0 +requests==2.26.0 retrying==1.3.3 ../utils # scutils==1.3.0dev0 six==1.11.0 # Updated from 1.10.0 testfixtures==6.0.2 # Updated from 4.13.5 -ujson==1.35 -Werkzeug==0.14.1 # Updated from 0.12.1 +ujson==5.7.0 +Werkzeug==2.0.0 # Generated with piprot 0.9.10 diff --git a/rest/rest_service.py b/rest/rest_service.py index 5109158d..923e8b4d 100644 --- a/rest/rest_service.py +++ b/rest/rest_service.py @@ -487,10 +487,10 @@ def _close_thread(self, thread, thread_name): @param thread: the thread to close @param thread_name: a human readable name of the thread """ - if thread is not None and thread.isAlive(): + if thread is not None and thread.is_alive(): self.logger.debug("Waiting for {} thread to close".format(thread_name)) thread.join(timeout=self.settings['DAEMON_THREAD_JOIN_TIMEOUT']) - if thread.isAlive(): + if thread.is_alive(): self.logger.warn("{} daemon thread unable to be shutdown" " within timeout".format(thread_name)) diff --git a/utils/scutils/log_factory.py b/utils/scutils/log_factory.py index c0c16abd..9e8a05bf 100644 --- a/utils/scutils/log_factory.py +++ b/utils/scutils/log_factory.py @@ -8,7 +8,7 @@ import copy from pythonjsonlogger import jsonlogger -from cloghandler import ConcurrentRotatingFileHandler +from concurrent_log_handler import ConcurrentRotatingFileHandler class LogFactory(object): diff --git a/utils/setup.py b/utils/setup.py index e430b482..7b328b70 100644 --- a/utils/setup.py +++ b/utils/setup.py @@ -16,9 
+16,9 @@ def readme(): install_requires = [ 'python-json-logger==0.1.8', # Updated from 0.1.7 - 'ConcurrentLogHandler>=0.9.1', + 'concurrent-log-handler==0.9.20', 'redis>=3.0', - 'kazoo>=2.4.0', # Updated from 2.2.1 + 'kazoo>=2.5.0', # Updated from 2.2.1 'mock>=2.0.0', 'testfixtures>=6.0.2', # Updated from testfixtures==4.13.5 'ujson>=1.35', diff --git a/utils/tests/online.py b/utils/tests/online.py index 99feac6a..5d74b927 100644 --- a/utils/tests/online.py +++ b/utils/tests/online.py @@ -396,7 +396,7 @@ def setUp(self): valid_init=True) def test_ping(self): - self.assertTrue(self.zoo_watcher.ping()) + self.assertTrue(self.zoo_watcher) def test_get_file_contents(self): pointer_zoo_watcher = ZookeeperWatcher(hosts=self.hosts, diff --git a/utils/tests/test_zookeeper_watcher.py b/utils/tests/test_zookeeper_watcher.py index 05449c6e..748d3246 100644 --- a/utils/tests/test_zookeeper_watcher.py +++ b/utils/tests/test_zookeeper_watcher.py @@ -1,7 +1,6 @@ from mock import MagicMock, patch from unittest import TestCase from scutils.zookeeper_watcher import ZookeeperWatcher -from kazoo.client import KazooState from kazoo.exceptions import ZookeeperError, KazooException class TestZookeeperWatcher(TestCase): @@ -21,9 +20,9 @@ def setUp(self): def test_ping(self): self.zoo_watcher.zoo_client.server_version = MagicMock() - self.assertTrue(self.zoo_watcher.ping()) + self.assertTrue(self.zoo_watcher.is_valid()) self.zoo_watcher.zoo_client.server_version = MagicMock(side_effect=KazooException) - self.assertFalse(self.zoo_watcher.ping()) + self.assertFalse(self.zoo_watcher.is_valid()) def test_get_file_contents(self): self.zoo_watcher.old_pointed = 'old_pointed'
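
The Python-level changes repeated across this diff (dropping the removed `scrapy.conf.settings` singleton, replacing the removed `scrapy.utils.reqser` helpers, and normalizing bytes values before JSON serialization) all follow one pattern. The sketch below is a minimal, hedged illustration of that pattern, assuming Scrapy >= 2.6 (the diff pins 2.10); the helper names `load_settings`, `serialize_request`, and `deserialize_request` are illustrative only and are not part of the codebase, while `decode_dict` mirrors the helper added to the scheduler and the Kafka pipeline.

```python
# Minimal sketch of the migration patterns used in this diff.
# Assumes Scrapy >= 2.6; wrapper names are illustrative, not project code.
from scrapy.settings import Settings
from scrapy.utils.request import request_from_dict


def load_settings(settings_module):
    # Replacement for the removed ``scrapy.conf.settings`` singleton:
    # build a Settings object and populate it from the project settings module,
    # e.g. ``from crawling import settings; load_settings(settings)``.
    new_settings = Settings()
    new_settings.setmodule(settings_module)
    return new_settings


def serialize_request(request, spider):
    # Replacement for scrapy.utils.reqser.request_to_dict (removed in Scrapy 2.x):
    # Request now exposes a to_dict() method with a keyword-only spider argument.
    return request.to_dict(spider=spider)


def deserialize_request(req_dict, spider):
    # request_from_dict now lives in scrapy.utils.request and takes spider as a kwarg.
    return request_from_dict(req_dict, spider=spider)


def decode_dict(data):
    # Recursively convert bytes values to str so serialized requests and items
    # survive ujson.dumps and Redis round-trips under Python 3.
    decoded = {}
    for key, value in data.items():
        if isinstance(value, bytes):
            decoded[key] = value.decode()
        elif isinstance(value, dict):
            decoded[key] = decode_dict(value)
        elif isinstance(value, list):
            decoded[key] = [
                item.decode() if isinstance(item, bytes)
                else decode_dict(item) if isinstance(item, dict)
                else item
                for item in value
            ]
        else:
            decoded[key] = value
    return decoded
```

In the scheduler and pipeline the normalization step runs on the serialized request or item dict immediately before it is pushed to a throttled queue or dumped with `ujson`, which is why the helper appears in both `distributed_scheduler.py` and `pipelines.py`.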