From 4882f379e3846908f8fccfd0801c8bc836ddc91e Mon Sep 17 00:00:00 2001 From: anderson Date: Wed, 5 Jun 2019 15:33:07 +0800 Subject: [PATCH 1/3] support python3 --- firmware/loader.py | 10 +- firmware/pipelines.py | 63 ++++++---- firmware/settings.py | 2 +- firmware/spiders/360.py | 107 ++++++++-------- firmware/spiders/actiontec.py | 4 +- firmware/spiders/airlink101.py | 4 +- firmware/spiders/asus.py | 8 +- firmware/spiders/att.py | 4 +- firmware/spiders/belkin.py | 8 +- firmware/spiders/buffalo.py | 2 +- firmware/spiders/centurylink.py | 4 +- firmware/spiders/dlink.py | 9 +- firmware/spiders/huawei_en.py | 10 +- firmware/spiders/huawei_zh.py | 10 +- firmware/spiders/linksys.py | 7 +- firmware/spiders/mercury.py | 201 +++++++++++++++--------------- firmware/spiders/mikrotik.py | 4 +- firmware/spiders/netcore.py | 4 +- firmware/spiders/netgear.py | 6 +- firmware/spiders/openwrt.py | 8 +- firmware/spiders/pfsense.py | 6 +- firmware/spiders/phicomm.py | 119 +++++++++--------- firmware/spiders/polycom.py | 16 ++- firmware/spiders/qnap.py | 8 +- firmware/spiders/synology.py | 8 +- firmware/spiders/tenda_en.py | 5 +- firmware/spiders/tenda_zh.py | 3 +- firmware/spiders/ti.py | 8 +- firmware/spiders/tomato-shibby.py | 5 +- firmware/spiders/tomato.py | 5 +- firmware/spiders/tp-link_en.py | 11 +- firmware/spiders/tp-link_zh.py | 7 +- firmware/spiders/trendnet.py | 7 +- firmware/spiders/ubiquiti.py | 10 +- firmware/spiders/ublox.py | 3 +- firmware/spiders/verizon.py | 5 +- firmware/spiders/xerox.py | 5 +- firmware/spiders/zyxel.py | 5 +- 38 files changed, 345 insertions(+), 366 deletions(-) diff --git a/firmware/loader.py b/firmware/loader.py index 58e4b07..2d00cd7 100644 --- a/firmware/loader.py +++ b/firmware/loader.py @@ -4,7 +4,7 @@ import datetime import re import string -import urlparse +from urllib.parse import urlparse, urljoin class FirmwareLoader(ItemLoader): @@ -38,18 +38,18 @@ def find_date(self, text): fmt = "(" + re.escape(fmt).replace("\%b", "[a-zA-Z]{3}").replace("\%B", "[a-zA-Z]+").replace( "\%m", "\d{1,2}").replace("\%d", "\d{1,2}").replace("\%y", "\d{2}").replace("\%Y", "\d{4}") + ")" match = re.search(fmt, "".join(text).strip()) - res = filter(lambda x: x, match.groups()) if match else None + res = list(filter(lambda x: x, match.groups())) if match else None if res: return res[0] return None def clean(s): - return filter(lambda x: x in string.printable, s).replace("\r", "").replace("\n", "").replace(u"\xa0", " ").strip() + return ''.join(list(filter(lambda x: x in string.printable, s))).replace("\r", "").replace("\n", "").replace(u"\xa0", " ").strip() def fix_url(url, loader_context): - if not urlparse.urlparse(url).netloc: - return urlparse.urljoin(loader_context.get("response").url, url) + if not urlparse(url).netloc: + return urljoin(loader_context.get("response").url, url) return url def parse_date(date, loader_context): diff --git a/firmware/pipelines.py b/firmware/pipelines.py index d00889d..9665144 100644 --- a/firmware/pipelines.py +++ b/firmware/pipelines.py @@ -5,11 +5,12 @@ import os import hashlib import logging -import urlparse -import urllib +from urllib.parse import urlparse, urlsplit, urlunsplit, unquote +from urllib.request import urlopen logger = logging.getLogger(__name__) + class FirmwarePipeline(FilesPipeline): def __init__(self, store_uri, download_func=None, settings=None): if settings and "SQL_SERVER" in settings: @@ -20,7 +21,8 @@ def __init__(self, store_uri, download_func=None, settings=None): else: self.database = None - super(FirmwarePipeline, self).__init__(store_uri, download_func,settings) + super(FirmwarePipeline, self).__init__(store_uri, download_func, settings) + @classmethod def from_settings(cls, settings): store_uri = settings['FILES_STORE'] @@ -33,7 +35,7 @@ def from_settings(cls, settings): # overrides function from FilesPipeline def file_path(self, request, response=None, info=None): extension = os.path.splitext(os.path.basename( - urlparse.urlsplit(request.url).path))[1] + urlsplit(request.url).path))[1] return "%s/%s%s" % (request.meta["vendor"], hashlib.sha1(request.url).hexdigest(), extension) @@ -43,23 +45,24 @@ def get_media_requests(self, item, info): for x in ["vendor", "url"]: if x not in item: raise DropItem( - "Missing required field '%s' for item: " % (x, item)) + "Missing required field '%s' for item %s: " % (x, item)) # resolve dynamic redirects in urls for x in ["mib", "sdk", "url"]: if x in item: - split = urlparse.urlsplit(item[x]) + split = urlsplit(item[x]) # remove username/password if only one provided if split.username or split.password and not (split.username and split.password): - item[x] = urlparse.urlunsplit( + item[x] = urlunsplit( (split[0], split[1][split[1].find("@") + 1:], split[2], split[3], split[4])) if split.scheme == "http": - item[x] = urllib.urlopen(item[x]).geturl() + item[x] = urlopen(item[x]).geturl() # check for filtered url types in path - url = urlparse.urlparse(item["url"]) - if any(url.path.endswith(x) for x in [".pdf", ".php", ".txt", ".doc", ".rtf", ".docx", ".htm", ".html", ".md5", ".sha1", ".torrent"]): + url = urlparse(item["url"]) + if any(url.path.endswith(x) for x in + [".pdf", ".php", ".txt", ".doc", ".rtf", ".docx", ".htm", ".html", ".md5", ".sha1", ".torrent"]): raise DropItem("Filtered path extension: %s" % url.path) elif any(x in url.path for x in ["driver", "utility", "install", "wizard", "gpl", "login"]): raise DropItem("Filtered path type: %s" % url.path) @@ -69,7 +72,9 @@ def get_media_requests(self, item, info): for x in ["mib", "url"] if x in item] # pass vendor so we can generate the correct file path and name - return [Request(x, meta={"ftp_user": "anonymous", "ftp_password": "chrome@example.com", "vendor": item["vendor"]}) for x in item[self.files_urls_field]] + return [ + Request(x, meta={"ftp_user": "anonymous", "ftp_password": "chrome@example.com", "vendor": item["vendor"]}) + for x in item[self.files_urls_field]] # overrides function from FilesPipeline def item_completed(self, results, item, info): @@ -86,7 +91,7 @@ def item_completed(self, results, item, info): for ok, x in results: for y in ["mib", "url", "sdk"]: # verify URL's are the same after unquoting - if ok and y in item and urllib.unquote(item[y]) == urllib.unquote(x["url"]): + if ok and y in item and unquote(item[y]) == unquote(x["url"]): status[y] = x elif y not in status: status[y] = {"checksum": None, "path": None} @@ -97,25 +102,26 @@ def item_completed(self, results, item, info): # attempt to find a matching image_id cur.execute("SELECT id FROM image WHERE hash=%s", - (status["url"]["checksum"], )) + (status["url"]["checksum"],)) image_id = cur.fetchone() if not image_id: - cur.execute("SELECT id FROM brand WHERE name=%s", (item["vendor"], )) + cur.execute("SELECT id FROM brand WHERE name=%s", (item["vendor"],)) brand_id = cur.fetchone() if not brand_id: - cur.execute("INSERT INTO brand (name) VALUES (%s) RETURNING id", (item["vendor"], )) + cur.execute("INSERT INTO brand (name) VALUES (%s) RETURNING id", (item["vendor"],)) brand_id = cur.fetchone() logger.info("Inserted database entry for brand: %d!" % brand_id) - cur.execute("INSERT INTO image (filename, description, brand_id, hash) VALUES (%s, %s, %s, %s) RETURNING id", - (status["url"]["path"], item.get("description", None), brand_id, status["url"]["checksum"])) + cur.execute( + "INSERT INTO image (filename, description, brand_id, hash) VALUES (%s, %s, %s, %s) RETURNING id", + (status["url"]["path"], item.get("description", None), brand_id, status["url"]["checksum"])) image_id = cur.fetchone() logger.info("Inserted database entry for image: %d!" % image_id) else: cur.execute("SELECT filename FROM image WHERE hash=%s", - (status["url"]["checksum"], )) + (status["url"]["checksum"],)) path = cur.fetchone() logger.info( @@ -127,19 +133,28 @@ def item_completed(self, results, item, info): status["url"]["path"]) # attempt to find a matching product_id - cur.execute("SELECT id FROM product WHERE iid=%s AND product IS NOT DISTINCT FROM %s AND version IS NOT DISTINCT FROM %s AND build IS NOT DISTINCT FROM %s", - (image_id, item.get("product", None), item.get("version", None), item.get("build", None))) + cur.execute( + "SELECT id FROM product WHERE iid=%s AND product IS NOT DISTINCT FROM %s AND version IS NOT DISTINCT FROM %s AND build IS NOT DISTINCT FROM %s", + (image_id, item.get("product", None), item.get("version", None), item.get("build", None))) product_id = cur.fetchone() if not product_id: - cur.execute("INSERT INTO product (iid, url, mib_filename, mib_url, mib_hash, sdk_filename, sdk_url, sdk_hash, product, version, build, date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id", - (image_id, item["url"], status["mib"]["path"], item.get("mib", None), status["mib"]["checksum"], status["sdk"]["path"], item.get("sdk", None), status["sdk"]["checksum"], item.get("product", None), item.get("version", None), item.get("build", None), item.get("date", None))) + cur.execute( + "INSERT INTO product (iid, url, mib_filename, mib_url, mib_hash, sdk_filename, sdk_url, sdk_hash, product, version, build, date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id", + (image_id, item["url"], status["mib"]["path"], item.get("mib", None), status["mib"]["checksum"], + status["sdk"]["path"], item.get("sdk", None), status["sdk"]["checksum"], + item.get("product", None), item.get("version", None), item.get("build", None), + item.get("date", None))) product_id = cur.fetchone() logger.info( "Inserted database entry for product: %d!" % product_id) else: - cur.execute("UPDATE product SET (iid, url, mib_filename, mib_url, mib_hash, sdk_filename, sdk_url, sdk_hash, product, version, build, date) = (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) WHERE id=%s", - (image_id, item["url"], status["mib"]["path"], item.get("mib", None), status["mib"]["checksum"], status["sdk"]["path"], item.get("sdk", None), status["sdk"]["checksum"], item.get("product", None), item.get("version", None), item.get("build", None), item.get("date", None), image_id)) + cur.execute( + "UPDATE product SET (iid, url, mib_filename, mib_url, mib_hash, sdk_filename, sdk_url, sdk_hash, product, version, build, date) = (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) WHERE id=%s", + (image_id, item["url"], status["mib"]["path"], item.get("mib", None), status["mib"]["checksum"], + status["sdk"]["path"], item.get("sdk", None), status["sdk"]["checksum"], + item.get("product", None), item.get("version", None), item.get("build", None), + item.get("date", None), image_id)) logger.info("Updated database entry for product: %d!" % product_id) self.database.commit() diff --git a/firmware/settings.py b/firmware/settings.py index 99aa7db..1ac57a5 100644 --- a/firmware/settings.py +++ b/firmware/settings.py @@ -25,4 +25,4 @@ ROBOTSTXT_OBEY = False USER_AGENT = "FirmwareBot/1.0 (+https://github.com/firmadyne/scraper)" -#SQL_SERVER = "127.0.0.1" +SQL_SERVER = "127.0.0.1" diff --git a/firmware/spiders/360.py b/firmware/spiders/360.py index 0b67f35..a1a7c49 100644 --- a/firmware/spiders/360.py +++ b/firmware/spiders/360.py @@ -1,56 +1,53 @@ -#coding:utf-8 -from scrapy import Spider -from scrapy.http import Request - -from firmware.items import FirmwareImage -from firmware.loader import FirmwareLoader - -import urlparse -import json - -class A360Spider(Spider): - name = "360" - #allowed_domains = ["luyou.360.cn"] - json_url = "http://s7.qhres.com/static/ef5bacdd3d93fa90/common_info.js" - start_urls = ["http://luyou.360.cn/download_center.html?from=nav"] - - def parse(self, response): - yield Request( - url=self.json_url, - headers={"Referer": response.url}, - callback=self.parse_product) - - def parse_product(self, response): - js = response.text - if js.startswith("var commonInfo"): - print response.url - print js - - p_product = u"id:\"(?P.*?)\"" - p_description = u"title:\"(?P.*?)\"" - p_version = u"romVersions:\"(?P.*?)\"" - p_url = u"romUrl:\"(?P.*?)\"" - p_date = u"updateDate:\"(?P.*?)\"" - - import re - products = re.findall(p_version, js) - descriptions = re.findall(p_description, js) - versions = re.findall(p_version, js) - urls = re.findall(p_url, js) - dates = re.findall(p_date, js) - - for i in xrange(len(products)): - product = products[i] - url = urls[i] - version = versions[i] - description = descriptions[i] - date = dates[i] - - item = FirmwareLoader( - item=FirmwareImage(), response=response) - item.add_value("url", url) - item.add_value("product", product) - item.add_value("description", description) - item.add_value("date", date) - item.add_value("vendor", self.name) +#coding:utf-8 +from scrapy import Spider +from scrapy.http import Request + +from firmware.items import FirmwareImage +from firmware.loader import FirmwareLoader + +class A360Spider(Spider): + name = "360" + #allowed_domains = ["luyou.360.cn"] + json_url = "http://s7.qhres.com/static/ef5bacdd3d93fa90/common_info.js" + start_urls = ["http://luyou.360.cn/download_center.html?from=nav"] + + def parse(self, response): + yield Request( + url=self.json_url, + headers={"Referer": response.url}, + callback=self.parse_product) + + def parse_product(self, response): + js = response.text + if js.startswith("var commonInfo"): + print(response.url) + print(js) + + p_product = u"id:\"(?P.*?)\"" + p_description = u"title:\"(?P.*?)\"" + p_version = u"romVersions:\"(?P.*?)\"" + p_url = u"romUrl:\"(?P.*?)\"" + p_date = u"updateDate:\"(?P.*?)\"" + + import re + products = re.findall(p_version, js) + descriptions = re.findall(p_description, js) + versions = re.findall(p_version, js) + urls = re.findall(p_url, js) + dates = re.findall(p_date, js) + + for i in xrange(len(products)): + product = products[i] + url = urls[i] + version = versions[i] + description = descriptions[i] + date = dates[i] + + item = FirmwareLoader( + item=FirmwareImage(), response=response) + item.add_value("url", url) + item.add_value("product", product) + item.add_value("description", description) + item.add_value("date", date) + item.add_value("vendor", self.name) yield item.load_item() \ No newline at end of file diff --git a/firmware/spiders/actiontec.py b/firmware/spiders/actiontec.py index 2d2cbda..5136285 100644 --- a/firmware/spiders/actiontec.py +++ b/firmware/spiders/actiontec.py @@ -5,7 +5,7 @@ from firmware.loader import FirmwareLoader import re -import urlparse +from urllib.parse import urljoin class ActiontecSpider(Spider): @@ -23,7 +23,7 @@ def parse(self, response): product = actual.group(1).replace("(", "").replace(")", "") yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, link.xpath(".//@href").extract()[0]), headers={"Referer": response.url}, meta={"product": product}, diff --git a/firmware/spiders/airlink101.py b/firmware/spiders/airlink101.py index 55d647b..653f7b5 100644 --- a/firmware/spiders/airlink101.py +++ b/firmware/spiders/airlink101.py @@ -4,7 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse +from urllib.parse import urljoin class Airlink101Spider(Spider): @@ -23,7 +23,7 @@ def parse(self, response): if "_a=download" not in href: yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, meta={"product": text.strip().split(' ')}, callback=self.parse) diff --git a/firmware/spiders/asus.py b/firmware/spiders/asus.py index f49831b..2c68eb3 100644 --- a/firmware/spiders/asus.py +++ b/firmware/spiders/asus.py @@ -3,7 +3,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse +from urllib.parse import urljoin class AsusSpider(Spider): @@ -18,7 +18,7 @@ def parse(self, response): if "cid" not in response.meta: for category in response.xpath("//div[@class='product-category']//a/@l1_id").extract(): yield Request( - url=urlparse.urljoin(response.url, "/support/utilities/GetProducts.aspx?ln=%s&p=%s" % (self.region, category)), + url=urljoin(response.url, "/support/utilities/GetProducts.aspx?ln=%s&p=%s" % (self.region, category)), meta={"cid": category}, headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -27,7 +27,7 @@ def parse(self, response): elif "sid" not in response.meta: for series in response.xpath("//table/id/text()").extract(): yield Request( - url=urlparse.urljoin(response.url, "/support/utilities/GetProducts.aspx?ln=%s&p=%s&s=%s" % (self.region, response.meta["cid"], series)), + url=urljoin(response.url, "/support/utilities/GetProducts.aspx?ln=%s&p=%s&s=%s" % (self.region, response.meta["cid"], series)), meta={"cid": response.meta["cid"], "sid": series}, headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -41,7 +41,7 @@ def parse(self, response): # choose "Others" = 8 yield Request( - url=urlparse.urljoin(response.url, "/support/Download/%s/%s/%s/%s/%d" % (response.meta["cid"], response.meta["sid"], pid, mid, 8)), + url=urljoin(response.url, "/support/Download/%s/%s/%s/%s/%d" % (response.meta["cid"], response.meta["sid"], pid, mid, 8)), meta={"product": product}, headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, diff --git a/firmware/spiders/att.py b/firmware/spiders/att.py index 9c1592b..87b23c5 100644 --- a/firmware/spiders/att.py +++ b/firmware/spiders/att.py @@ -4,7 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse +from urllib.parse import urljoin class ATTSpider(Spider): @@ -23,6 +23,6 @@ def parse(self, response): yield item.load_item() elif "/" in href: yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, callback=self.parse) diff --git a/firmware/spiders/belkin.py b/firmware/spiders/belkin.py index 627ea9c..e6c3e0b 100644 --- a/firmware/spiders/belkin.py +++ b/firmware/spiders/belkin.py @@ -4,7 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse +from urllib.parse import urljoin class BelkinSpider(Spider): @@ -31,7 +31,7 @@ def parse(self, response): else: for product in response.xpath("//form[@id='productSearchForm']/div[3]//ul[@class='select-options']//a/@data-id").extract(): yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/us/support-product?pid=%s" % (product)), headers={"Referer": response.url}, callback=self.parse_product) @@ -40,7 +40,7 @@ def parse_product(self, response): for item in response.xpath("//div[@id='main-content']//a"): if "firmware" in item.xpath(".//text()").extract()[0].lower(): yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, item.xpath(".//@href").extract()[0]), headers={"Referer": response.url}, meta={"product": response.xpath("//p[@class='product-part-number']/text()").extract()[0].split(' ')[-1]}, @@ -61,7 +61,7 @@ def parse_redirect(self, response): for text in response.body.split('\''): if "articles/" in text.lower() and "download/" in text.lower(): yield Request( - url=urlparse.urljoin(response.url, text), + url=urljoin(response.url, text), headers={"Referer": response.url}, meta={"product": response.meta["product"]}, callback=self.parse_kb) diff --git a/firmware/spiders/buffalo.py b/firmware/spiders/buffalo.py index 86c3eb2..9c16acb 100644 --- a/firmware/spiders/buffalo.py +++ b/firmware/spiders/buffalo.py @@ -24,7 +24,7 @@ def parse_product(self, response): #

Firmware

if response.xpath('//h3[@class="firm"]').extract(): for tr in response.xpath('//*[@id="tab-downloads"]/table[1]/tbody/tr'): - print tr.extract() + print(tr.extract()) url = tr.xpath("./td[2]/a/@href").extract()[0] date = tr.xpath("./td[4]/text()").extract()[0] version = tr.xpath("./td[5]/text()").extract()[0] diff --git a/firmware/spiders/centurylink.py b/firmware/spiders/centurylink.py index 6725005..36b3c35 100644 --- a/firmware/spiders/centurylink.py +++ b/firmware/spiders/centurylink.py @@ -4,7 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse +from urllib.parse import urljoin # http://home.centurytel.net/ihd/ @@ -24,7 +24,7 @@ def parse(self, response): for link in section.xpath(".//a/@href").extract(): if link.endswith(".html"): yield Request( - url=urlparse.urljoin(response.url, link), + url=urljoin(response.url, link), meta={"product": product, "version": FirmwareLoader.find_version(text)}, headers={"Referer": response.url}, diff --git a/firmware/spiders/dlink.py b/firmware/spiders/dlink.py index 5dd83ab..736f320 100644 --- a/firmware/spiders/dlink.py +++ b/firmware/spiders/dlink.py @@ -5,7 +5,7 @@ from firmware.loader import FirmwareLoader import json -import urlparse +from urllib.parse import urljoin class DLinkSpider(Spider): @@ -22,7 +22,7 @@ def start_requests(self): def parse(self, response): for entry in response.xpath("//tr/td[1]/a/@alt").extract(): yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "ProductInfo.aspx?m=%s" % entry), headers={"Referer": response.url}, meta={"product": entry}, @@ -35,7 +35,7 @@ def parse_product(self, response): if val: yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/ajax/ajax.ashx?action=productfile&ver=%s" % val), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -49,8 +49,7 @@ def parse_json(self, response): for entry in reversed(json_response["item"]): for file in reversed(entry["file"]): - if file["filetypename"].lower() == "firmware" or file[ - "isFirmF"] == "1": + if file["filetypename"].lower() == "firmware" or file["isFirmF"] == "1": item = FirmwareLoader(item=FirmwareImage(), response=response, date_fmt=["%m/%d/%y"]) diff --git a/firmware/spiders/huawei_en.py b/firmware/spiders/huawei_en.py index 3a61d83..e3a090b 100644 --- a/firmware/spiders/huawei_en.py +++ b/firmware/spiders/huawei_en.py @@ -6,9 +6,7 @@ import datetime import json -import urlparse - - +from urllib.parse import urljoin class HuaweiENSpider(Spider): name = "huawei_en" vendor = "huawei" @@ -18,7 +16,7 @@ class HuaweiENSpider(Spider): def parse(self, response): yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/support/services/service/product/category?siteCode=%s" % (self.region)), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -29,7 +27,7 @@ def parse_category(self, response): for category in json_response: yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/support/services/service/product/list?productID=%s&siteCode=%s" % (category["productId"], self.region)), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -40,7 +38,7 @@ def parse_product(self, response): for product in json_response: yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/support/services/service/file/list?productID=%s&siteCode=%s" % (product["productId"], self.region)), meta={"product": product["productCode"]}, headers={"Referer": response.url, diff --git a/firmware/spiders/huawei_zh.py b/firmware/spiders/huawei_zh.py index b712563..1c9c84b 100644 --- a/firmware/spiders/huawei_zh.py +++ b/firmware/spiders/huawei_zh.py @@ -6,9 +6,7 @@ import datetime import json -import urlparse - - +from urllib.parse import urljoin class HuaweiZHSpider(Spider): name = "huawei_zh" vendor = "huawei" @@ -18,7 +16,7 @@ class HuaweiZHSpider(Spider): def parse(self, response): yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/support/services/service/product/category?siteCode=%s" % (self.region)), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -29,7 +27,7 @@ def parse_category(self, response): for category in json_response: yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/support/services/service/product/list?productID=%s&siteCode=%s" % (category["productId"], self.region)), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -40,7 +38,7 @@ def parse_product(self, response): for product in json_response: yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/support/services/service/file/list?productID=%s&siteCode=%s" % (product["productId"], self.region)), meta={"product": product["productCode"]}, headers={"Referer": response.url, diff --git a/firmware/spiders/linksys.py b/firmware/spiders/linksys.py index df95a12..9df59a5 100644 --- a/firmware/spiders/linksys.py +++ b/firmware/spiders/linksys.py @@ -4,8 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - +from urllib.parse import urljoin # see: http://www.dd-wrt.com/phpBB2/viewtopic.php?t=145255&postdays=0&postorder=asc&start=0 # and http://download.modem-help.co.uk/mfcs-L/LinkSys/ @@ -18,7 +17,7 @@ class LinksysSpider(Spider): def parse(self, response): for link in response.xpath("//div[@class='item']//a/@href").extract(): yield Request( - url=urlparse.urljoin(response.url, link), + url=urljoin(response.url, link), headers={"Referer": response.url}, callback=self.parse_support) @@ -29,7 +28,7 @@ def parse_support(self, response): if "download" in text.lower(): yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), meta={"product": response.xpath( "//span[@class='part-number']/text()").extract()[0].replace("SKU", "").strip()}, headers={"Referer": response.url}, diff --git a/firmware/spiders/mercury.py b/firmware/spiders/mercury.py index 37d5402..d6ff8bd 100644 --- a/firmware/spiders/mercury.py +++ b/firmware/spiders/mercury.py @@ -1,102 +1,101 @@ -#coding:utf-8 - -from scrapy import Spider -from scrapy.http import Request - -from firmware.items import FirmwareImage -from firmware.loader import FirmwareLoader -import urlparse - -class MercurySpider(Spider): - name = "mercury" - vendor = "mercury" - allowed_domains = ["mercurycom.com.cn"] - start_urls = ["http://service.mercurycom.com.cn/download-list.html"] - download_path = "http://service.mercurycom.com.cn" - - def parse(self, response): - end_page = int(response.xpath("//*[@class='pagebar']//a[last()]//text()").extract()[0]) - cur_page = 0 - while cur_page < end_page: - cur_page += 1 - url = 'http://service.mercurycom.com.cn/download-tip-software-{}-0-1.html'.format(cur_page) - yield Request( - url = url, - headers={"Referer": response.url}, - callback = self.parse_list) - - def parse_list(self, response): - href = response.xpath("//tbody//a//@href").extract()[0] - yield Request( - url = urlparse.urljoin(self.download_path, href), - headers={"Referer": response.url}, - callback = self.parse_product - ) - - def parse_product(self, response): - - tmp = [] - for p in response.xpath("//table//tr//td[2]"): - tmp.append(p) - - title = tmp[0].xpath("./p/text()").extract()[0] - url = urlparse.urljoin(self.download_path, tmp[3].xpath("./a/@href").extract()[0]) - - def parse(title): - - print title - product = version = date = None - - tmp = title.split(' ') - product = tmp[0] - - if len(tmp) == 2: - #MR814v1_070807 升级程序 - if '_' in tmp[0]: - tmp2 = tmp[0].split('_') - version = tmp2[0] - date = tmp2[1][:6] - #MWR300T V1(081210)标准版 - elif tmp[1][0] in ['v', 'V']: - pass - else: - tmp2 = tmp[1].split('_') - version = tmp2[0] - date = tmp2[1][:6] - - elif len(tmp) == 3: - tmp2 = tmp[1].split('_') - version = tmp2[0] - date = tmp2[1] - - if version: - if version[0] not in ['v', 'V']: - if 'v' in product: - t = product.split('v') - product = t[0] - version = t[1] - - #MR814v1_070807 升级程序 - if product.count('_'): - tmp = product.split('_') - product = tmp[0] - if product.count('v'): - product = product.split('v')[0] - elif product.count('V'): - product = product.split('v')[0] - - - return product, version, date - - product, version, date = parse(title) - - item = FirmwareLoader( - item=FirmwareImage()) - item.add_value("url", url), - item.add_value("product", product), - #item.add_value("date", date), - #item.add_value("version", version), - item.add_value("vendor", self.vendor), - item.add_value("description", title) - +#coding:utf-8 + +from scrapy import Spider +from scrapy.http import Request + +from firmware.items import FirmwareImage +from firmware.loader import FirmwareLoader +from urllib.parse import urljoin +class MercurySpider(Spider): + name = "mercury" + vendor = "mercury" + allowed_domains = ["mercurycom.com.cn"] + start_urls = ["http://service.mercurycom.com.cn/download-list.html"] + download_path = "http://service.mercurycom.com.cn" + + def parse(self, response): + end_page = int(response.xpath("//*[@class='pagebar']//a[last()]//text()").extract()[0]) + cur_page = 0 + while cur_page < end_page: + cur_page += 1 + url = 'http://service.mercurycom.com.cn/download-tip-software-{}-0-1.html'.format(cur_page) + yield Request( + url = url, + headers={"Referer": response.url}, + callback = self.parse_list) + + def parse_list(self, response): + href = response.xpath("//tbody//a//@href").extract()[0] + yield Request( + url = urljoin(self.download_path, href), + headers={"Referer": response.url}, + callback = self.parse_product + ) + + def parse_product(self, response): + + tmp = [] + for p in response.xpath("//table//tr//td[2]"): + tmp.append(p) + + title = tmp[0].xpath("./p/text()").extract()[0] + url = urljoin(self.download_path, tmp[3].xpath("./a/@href").extract()[0]) + + def parse(title): + + print(title) + product = version = date = None + + tmp = title.split(' ') + product = tmp[0] + + if len(tmp) == 2: + #MR814v1_070807 升级程序 + if '_' in tmp[0]: + tmp2 = tmp[0].split('_') + version = tmp2[0] + date = tmp2[1][:6] + #MWR300T V1(081210)标准版 + elif tmp[1][0] in ['v', 'V']: + pass + else: + tmp2 = tmp[1].split('_') + version = tmp2[0] + date = tmp2[1][:6] + + elif len(tmp) == 3: + tmp2 = tmp[1].split('_') + version = tmp2[0] + date = tmp2[1] + + if version: + if version[0] not in ['v', 'V']: + if 'v' in product: + t = product.split('v') + product = t[0] + version = t[1] + + #MR814v1_070807 升级程序 + if product.count('_'): + tmp = product.split('_') + product = tmp[0] + if product.count('v'): + product = product.split('v')[0] + elif product.count('V'): + product = product.split('v')[0] + + + return product, version, date + + product, version, date = parse(title) + + item = FirmwareLoader( + item=FirmwareImage()) + item.add_value("url", url), + item.add_value("product", product), + #item.add_value("date", date), + #item.add_value("version", version), + item.add_value("vendor", self.vendor), + item.add_value("description", title) + yield item.load_item() \ No newline at end of file diff --git a/firmware/spiders/mikrotik.py b/firmware/spiders/mikrotik.py index 42f2b55..4424b0b 100644 --- a/firmware/spiders/mikrotik.py +++ b/firmware/spiders/mikrotik.py @@ -4,9 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - - +from urllib.parse import urljoin class MikrotikSpider(Spider): name = "mikrotik" allowed_domains = ["mikrotik.com"] diff --git a/firmware/spiders/netcore.py b/firmware/spiders/netcore.py index 23f2c6c..1b499ee 100644 --- a/firmware/spiders/netcore.py +++ b/firmware/spiders/netcore.py @@ -6,9 +6,7 @@ from firmware.loader import FirmwareLoader import re -import urlparse - - +from urllib.parse import urljoin class NetcoreSpider(Spider): name = "netcore" allowed_domains = ["netcoretec.com"] diff --git a/firmware/spiders/netgear.py b/firmware/spiders/netgear.py index b84e87a..05d2437 100644 --- a/firmware/spiders/netgear.py +++ b/firmware/spiders/netgear.py @@ -4,9 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - - +from urllib.parse import urljoin class NetgearSpider(Spider): name = "netgear" allowed_domains = ["netgear.com"] @@ -60,7 +58,7 @@ def parse(self, response): yield item.load_item() elif "mib" in " ".join(text).lower(): - mib = urlparse.urljoin(response.url, href[0].strip()) + mib = urljoin(response.url, href[0].strip()) elif "" not in response.xpath("//select[@name='ctl00$ctl00$ctl00$mainContent$localizedContent$bodyCenter$adsPanel$lbProduct']/option/@value").extract(): for entry in response.xpath( diff --git a/firmware/spiders/openwrt.py b/firmware/spiders/openwrt.py index bf6bf07..8d68aac 100644 --- a/firmware/spiders/openwrt.py +++ b/firmware/spiders/openwrt.py @@ -4,9 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - - +from urllib.parse import urljoin class OpenWRTSpider(Spider): name = "openwrt" allowed_domains = ["downloads.openwrt.org"] @@ -18,7 +16,7 @@ def parse(self, response): href = link.xpath("@href").extract()[0] yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, meta={"version": FirmwareLoader.find_version_period(text)}, callback=self.parse_url) @@ -35,7 +33,7 @@ def parse_url(self, response): product = "%s-%s" % (response.meta["product"], text[0: -1]) if "product" in response.meta else text[0: -1] yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, meta={"version": response.meta[ "version"], "product": product}, diff --git a/firmware/spiders/pfsense.py b/firmware/spiders/pfsense.py index accd390..ab24a16 100644 --- a/firmware/spiders/pfsense.py +++ b/firmware/spiders/pfsense.py @@ -5,9 +5,7 @@ from firmware.loader import FirmwareLoader import os -import urlparse - - +from urllib.parse import urljoin class PfSenseSpider(Spider): name = "pfsense" allowed_domains = ["pfsense.org"] @@ -22,7 +20,7 @@ def parse(self, response): continue elif href.endswith('/'): yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, callback=self.parse) elif href.endswith(".gz") and ".iso" not in href: diff --git a/firmware/spiders/phicomm.py b/firmware/spiders/phicomm.py index 444015f..a89dfb0 100644 --- a/firmware/spiders/phicomm.py +++ b/firmware/spiders/phicomm.py @@ -1,61 +1,60 @@ -#coding:utf-8 - -from scrapy import Spider -from scrapy.http import Request - -from firmware.items import FirmwareImage -from firmware.loader import FirmwareLoader -import urlparse - -class PhicommSpider(Spider): - name = "phicomm" - vendor = "phicomm" - allowed_domains = ["phicomm.com"] - #Routers are K serials - start_urls = ["http://www.phicomm.com/cn/support.php/Soho/search_support/col/6,/keys/K.html"] - download_path = "http://www.phicomm.com/" - - def parse(self, response): - head = False - for tr in response.xpath("//table//tr"): - if not head: - head = True - continue - - description = tr.xpath("./td[2]/text()").extract()[0] - product = description.split(u'(')[0] - version = tr.xpath("./td[4]/text()").extract()[0] - #2017-03-14 - date = tr.xpath("./td[6]/p/text()").extract()[0] - downloadid = tr.xpath("./td[7]/a/@downloadid").extract()[0] - - #http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id=437 - firmware_url = "http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id={}".format(downloadid) - yield Request( - url = firmware_url, - headers={"Referer": response.url}, - meta={ - "product":product, - "version":version, - "date":date, - 'description':description - }, - callback = self.parse_product) - - def parse_product(self, response): - import re - #/cn/Uploads/files/20161024/K1_V22.4.2.15.bin - print response.text - path = re.findall(u"(/cn/Uploads/files/.*?\.bin)", response.text)[0] - url = "http://www.phicomm.com/{}".format(path) - - item = FirmwareLoader( - item=FirmwareImage()) - item.add_value("url", url), - item.add_value("product", response.meta['product']), - item.add_value("date", response.meta['date']), - item.add_value("version", response.meta['version']), - item.add_value("vendor", self.vendor), - item.add_value("description", response.meta['description']), - +#coding:utf-8 + +from scrapy import Spider +from scrapy.http import Request + +from firmware.items import FirmwareImage +from firmware.loader import FirmwareLoader +from urllib.parse import urljoin +class PhicommSpider(Spider): + name = "phicomm" + vendor = "phicomm" + allowed_domains = ["phicomm.com"] + #Routers are K serials + start_urls = ["http://www.phicomm.com/cn/support.php/Soho/search_support/col/6,/keys/K.html"] + download_path = "http://www.phicomm.com/" + + def parse(self, response): + head = False + for tr in response.xpath("//table//tr"): + if not head: + head = True + continue + + description = tr.xpath("./td[2]/text()").extract()[0] + product = description.split(u'(')[0] + version = tr.xpath("./td[4]/text()").extract()[0] + #2017-03-14 + date = tr.xpath("./td[6]/p/text()").extract()[0] + downloadid = tr.xpath("./td[7]/a/@downloadid").extract()[0] + + #http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id=437 + firmware_url = "http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id={}".format(downloadid) + yield Request( + url = firmware_url, + headers={"Referer": response.url}, + meta={ + "product":product, + "version":version, + "date":date, + 'description':description + }, + callback = self.parse_product) + + def parse_product(self, response): + import re + #/cn/Uploads/files/20161024/K1_V22.4.2.15.bin + print(response.text) + path = re.findall(u"(/cn/Uploads/files/.*?\.bin)", response.text)[0] + url = "http://www.phicomm.com/{}".format(path) + + item = FirmwareLoader( + item=FirmwareImage()) + item.add_value("url", url), + item.add_value("product", response.meta['product']), + item.add_value("date", response.meta['date']), + item.add_value("version", response.meta['version']), + item.add_value("vendor", self.vendor), + item.add_value("description", response.meta['description']), + yield item.load_item() \ No newline at end of file diff --git a/firmware/spiders/polycom.py b/firmware/spiders/polycom.py index 42faaf6..5b49660 100644 --- a/firmware/spiders/polycom.py +++ b/firmware/spiders/polycom.py @@ -4,14 +4,17 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse +from urllib.parse import urljoin, urlparse class PolycomSpider(Spider): name = "polycom" allowed_domains = ["polycom.com"] - start_urls = ["http://support.polycom.com/PolycomService/support/us/support/video/index.html", "http://support.polycom.com/PolycomService/support/us/support/voice/index.html", "http://support.polycom.com/PolycomService/support/us/support/network/index.html", - "http://support.polycom.com/PolycomService/support/us/support/cloud_hosted_solutions/index.html", "http://support.polycom.com/PolycomService/support/us/support/strategic_partner_solutions/index.html"] + start_urls = ["http://support.polycom.com/PolycomService/support/us/support/video/index.html", + "http://support.polycom.com/PolycomService/support/us/support/voice/index.html", + "http://support.polycom.com/PolycomService/support/us/support/network/index.html", + "http://support.polycom.com/PolycomService/support/us/support/cloud_hosted_solutions/index.html", + "http://support.polycom.com/PolycomService/support/us/support/strategic_partner_solutions/index.html"] download = "/PolycomService/support/us" @@ -45,15 +48,16 @@ def parse(self, response): href = entry.xpath("./a/@href").extract()[0].strip() date = entry.xpath("./span//text()").extract() - path = urlparse.urlparse(href).path + path = urlparse(href).path if any(x in text.lower() for x in ["end user license agreement", "eula", "release notes", - "mac os", "windows", "guide", "(pdf)", "sample"]) or href.endswith(".pdf"): + "mac os", "windows", "guide", "(pdf)", "sample"]) or href.endswith( + ".pdf"): continue elif any(path.endswith(x) for x in [".htm", ".html"]) or "(html)" in text.lower(): yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, PolycomSpider.fix_url(href)), meta={"product": response.meta["product"] if "product" in response.meta else text, "date": date, "version": FirmwareLoader.find_version_period([text]), "description": text}, diff --git a/firmware/spiders/qnap.py b/firmware/spiders/qnap.py index 624fb41..8d1440e 100644 --- a/firmware/spiders/qnap.py +++ b/firmware/spiders/qnap.py @@ -4,9 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - - +from urllib.parse import urljoin class QNAPSpider(Spider): name = "qnap" allowed_domains = ["qnap.com"] @@ -14,7 +12,7 @@ class QNAPSpider(Spider): def parse(self, response): yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/i/useng/product_x_down/ajax/get_module.php"), headers={"Referer": response.url}, callback=self.parse_products) @@ -27,7 +25,7 @@ def parse_products(self, response): if value: yield Request( # firmware = 1, utility = 4, etc - url=urlparse.urljoin( + url=urljoin( response.url, "/i/useng/product_x_down/product_down.php?II=%s&cat_choose=%d" % (value[0], 1)), meta={"product": text[0]}, callback=self.parse_product) diff --git a/firmware/spiders/synology.py b/firmware/spiders/synology.py index 7dddf77..f3973f0 100644 --- a/firmware/spiders/synology.py +++ b/firmware/spiders/synology.py @@ -5,9 +5,7 @@ from firmware.loader import FirmwareLoader import os -import urlparse - - +from urllib.parse import urljoin class SynologySpider(Spider): name = "synology" allowed_domains = ["synology.com"] @@ -32,7 +30,7 @@ def parse(self, response): build = text[0: -1] yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, meta={"build": build, "version": version}, callback=self.parse) @@ -64,7 +62,7 @@ def parse(self, response): version, build = text[0: -1].split("-") yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, meta={"build": build, "version": version}, callback=self.parse) diff --git a/firmware/spiders/tenda_en.py b/firmware/spiders/tenda_en.py index 6de1610..bc49035 100644 --- a/firmware/spiders/tenda_en.py +++ b/firmware/spiders/tenda_en.py @@ -6,8 +6,7 @@ from firmware.loader import FirmwareLoader import json -import urlparse - +from urllib.parse import urljoin class TendaENSpider(Spider): name = "tenda_en" vendor = "tenda" @@ -39,7 +38,7 @@ def parse_product(self, response): #O3 Firmware V1.0.0.3_EN #i6 Firmware V1.0.0.9(3857)_EN import re - p = ur'^(?P([a-uw-zA-UW-Z0-9])+)[\(\uff08]?(V\d\.0)?' + p = r'^(?P([a-uw-zA-UW-Z0-9])+)[\(\uff08]?(V\d\.0)?' try: ret = re.search(p, items[0].decode('utf-8')) diff --git a/firmware/spiders/tenda_zh.py b/firmware/spiders/tenda_zh.py index be2fc70..5bfddff 100644 --- a/firmware/spiders/tenda_zh.py +++ b/firmware/spiders/tenda_zh.py @@ -7,8 +7,7 @@ from firmware.loader import FirmwareLoader import json -import urlparse - +from urllib.parse import urljoin class TendaZHSpider(Spider): name = "tenda_zh" vendor = "tenda" diff --git a/firmware/spiders/ti.py b/firmware/spiders/ti.py index 566ddf6..1448838 100644 --- a/firmware/spiders/ti.py +++ b/firmware/spiders/ti.py @@ -4,9 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - - +from urllib.parse import urljoin class TISpider(Spider): name = "ti" allowed_domains = ["ti.com"] @@ -16,7 +14,7 @@ def parse(self, response): for product in response.xpath( "//select[@id='placeholdersitebody_0_ctl02_ctl00_ddlClassification']/option[position() > 1]"): yield Request( - url=urlparse.urljoin(response.url + "/", + url=urljoin(response.url + "/", product.xpath("./@value").extract()[0]), meta={"product": product.xpath("./text()").extract()[0]}, callback=self.parse_product) @@ -27,7 +25,7 @@ def parse_product(self, response): if link.xpath("./th[1]/a/text()").extract() and "Operating System" in link.xpath( "./th[1]/a/text()").extract()[0]: yield Request( - url=urlparse.urljoin(response.url, link.xpath( + url=urljoin(response.url, link.xpath( "./th[1]/a/@href").extract()[0]), meta={"product": response.meta["product"]}, callback=self.parse_link) diff --git a/firmware/spiders/tomato-shibby.py b/firmware/spiders/tomato-shibby.py index 1370239..f27e4ff 100644 --- a/firmware/spiders/tomato-shibby.py +++ b/firmware/spiders/tomato-shibby.py @@ -5,8 +5,7 @@ from firmware.loader import FirmwareLoader import os -import urlparse - +from urllib.parse import urljoin class TomatoShibbySpider(Spider): name = "tomato-shibby" allowed_domains = ["tomato.groov.pl"] @@ -32,7 +31,7 @@ def parse(self, response): build = text.replace("build", "") yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, meta={"build": build, "product": product}, callback=self.parse) diff --git a/firmware/spiders/tomato.py b/firmware/spiders/tomato.py index 1370239..f27e4ff 100644 --- a/firmware/spiders/tomato.py +++ b/firmware/spiders/tomato.py @@ -5,8 +5,7 @@ from firmware.loader import FirmwareLoader import os -import urlparse - +from urllib.parse import urljoin class TomatoShibbySpider(Spider): name = "tomato-shibby" allowed_domains = ["tomato.groov.pl"] @@ -32,7 +31,7 @@ def parse(self, response): build = text.replace("build", "") yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, meta={"build": build, "product": product}, callback=self.parse) diff --git a/firmware/spiders/tp-link_en.py b/firmware/spiders/tp-link_en.py index df6612a..5314b29 100644 --- a/firmware/spiders/tp-link_en.py +++ b/firmware/spiders/tp-link_en.py @@ -5,8 +5,7 @@ from firmware.loader import FirmwareLoader import json -import urlparse - +from urllib.parse import urljoin class TPLinkENSpider(Spider): name = "tp-link_en" vendor = "tp-link" @@ -19,7 +18,7 @@ def parse(self, response): for cid in response.xpath( "//select[@id='slcProductCat']//option/@value").extract(): yield Request( - url=urlparse.urljoin( + url=urljoin( self.base_path, "/getMenuList.html?action=getsubcatlist&catid=%s&appPath=us" % cid), meta={"cid": cid}, headers={"Referer": response.url, @@ -32,7 +31,7 @@ def parse_json(self, response): if json_response: for entry in json_response: yield Request( - url=urlparse.urljoin( + url=urljoin( self.base_path, "/getMenuList.html?action=getsubcatlist&catid=%s&appPath=us" % entry["id"]), meta={"cid": entry["id"]}, headers={"Referer": response.url, @@ -40,7 +39,7 @@ def parse_json(self, response): callback=self.parse_json) else: yield Request( - url=urlparse.urljoin( + url=urljoin( self.base_path, "phppage/down-load-model-list.html?showEndLife=false&catid={}&appPath=us".format(response.meta["cid"])), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -52,7 +51,7 @@ def parse_products(self, response): #description = json_response[0]['title'] for row in json_response[0]['row']: yield Request( - url = urlparse.urljoin(self.base_path, row['href']), + url = urljoin(self.base_path, row['href']), meta = {"product": row['model'], }, callback = self.parse_product_version) diff --git a/firmware/spiders/tp-link_zh.py b/firmware/spiders/tp-link_zh.py index 1f2acb6..b667911 100644 --- a/firmware/spiders/tp-link_zh.py +++ b/firmware/spiders/tp-link_zh.py @@ -6,8 +6,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - +from urllib.parse import urljoin class TPLinkZHSpider(Spider): name = "tp-link_zh" vendor = "tp-link" @@ -19,13 +18,13 @@ def parse(self, response): for product in response.xpath( "//table[@id='mainlist']//a/@href").extract(): yield Request( - url=urlparse.urljoin(response.url, product), + url=urljoin(response.url, product), headers={"Referer": response.url}, callback=self.parse_product) for page in response.xpath("//div[@id='paging']/a/@href").extract(): yield Request( - url=urlparse.urljoin(response.url, page), + url=urljoin(response.url, page), headers={"Referer": response.url}, callback=self.parse) diff --git a/firmware/spiders/trendnet.py b/firmware/spiders/trendnet.py index bbc543a..b405648 100644 --- a/firmware/spiders/trendnet.py +++ b/firmware/spiders/trendnet.py @@ -4,8 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - +from urllib.parse import urljoin class TrendnetSpider(Spider): name = "trendnet" allowed_domains = ["trendnet.com"] @@ -18,7 +17,7 @@ def parse(self, response): href = entry.xpath("./@value").extract()[0] yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), meta={"product": text}, headers={"Referer": response.url}, callback=self.parse_product) @@ -30,7 +29,7 @@ def parse_product(self, response): if "downloads" in text.lower(): yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), meta={"product": response.meta["product"]}, headers={"Referer": response.url}, callback=self.parse_download) diff --git a/firmware/spiders/ubiquiti.py b/firmware/spiders/ubiquiti.py index 4f0d022..8c62487 100644 --- a/firmware/spiders/ubiquiti.py +++ b/firmware/spiders/ubiquiti.py @@ -5,9 +5,7 @@ from firmware.loader import FirmwareLoader import json -import urlparse - - +from urllib.parse import urljoin class UbiquitiSpider(Spider): name = "ubiquiti" allowed_domains = ["ubnt.com"] @@ -17,7 +15,7 @@ def parse(self, response): for platform in response.xpath( "//a[@data-ga-category='download-nav']/@data-slug").extract(): yield Request( - url=urlparse.urljoin(response.url, "?group=%s" % (platform)), + url=urljoin(response.url, "?group=%s" % (platform)), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, callback=self.parse_json) @@ -28,7 +26,7 @@ def parse_json(self, response): if "products" in json_response: for product in json_response["products"]: yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "?product=%s" % (product["slug"])), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, @@ -54,7 +52,7 @@ def parse_json(self, response): if entry["sdk__id"]: yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "?gpl=%s&eula=True" % (entry["sdk__id"])), headers={"Referer": response.url, "X-Requested-With": "XMLHttpRequest"}, diff --git a/firmware/spiders/ublox.py b/firmware/spiders/ublox.py index 31f1727..2efbe0f 100644 --- a/firmware/spiders/ublox.py +++ b/firmware/spiders/ublox.py @@ -4,8 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - +from urllib.parse import urljoin class UbloxSpider(Spider): name = "ublox" allowed_domains = ["u-blox.com"] diff --git a/firmware/spiders/verizon.py b/firmware/spiders/verizon.py index aca262b..e9e209b 100644 --- a/firmware/spiders/verizon.py +++ b/firmware/spiders/verizon.py @@ -4,8 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - +from urllib.parse import urljoin # http://www.fiosfaq.com/index.php?action=cat&catnum=12 # http://verizontest.blogspot.com/p/general-info.html # https://github.com/jameshilliard/jungo-image @@ -29,7 +28,7 @@ def parse(self, response): "//select[@id='router']/option/@value").extract(): if product and product != "allrouters": yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "?router=%s" % (product)), headers={"Referer": response.url}, callback=self.parse) diff --git a/firmware/spiders/xerox.py b/firmware/spiders/xerox.py index 4371e7b..869ae57 100644 --- a/firmware/spiders/xerox.py +++ b/firmware/spiders/xerox.py @@ -4,8 +4,7 @@ from firmware.items import FirmwareImage from firmware.loader import FirmwareLoader -import urlparse - +from urllib.parse import urljoin class XeroxSpider(Spider): name = "xerox" allowed_domains = ["xerox.com"] @@ -17,7 +16,7 @@ def parse(self, response): "//div[@class='productResults a2z']//a/@href").extract(): if "downloads" in href: yield Request( - url=urlparse.urljoin(response.url, href), + url=urljoin(response.url, href), headers={"Referer": response.url}, callback=self.parse_download) diff --git a/firmware/spiders/zyxel.py b/firmware/spiders/zyxel.py index d64931b..573f83c 100644 --- a/firmware/spiders/zyxel.py +++ b/firmware/spiders/zyxel.py @@ -5,8 +5,7 @@ from firmware.loader import FirmwareLoader import json -import urlparse - +from urllib.parse import urljoin class ZyXELSpider(Spider): name = "zyxel" allowed_domains = ["zyxel.com"] @@ -19,7 +18,7 @@ def parse(self, response): "//div[@id='searchDropUlWrap']/script//text()").extract()[0].split('=')[2].strip()[0: -1]) for entry in script: yield Request( - url=urlparse.urljoin( + url=urljoin( response.url, "/us/en/support/SearchResultTab.shtml?c=us&l=en&t=dl&md=%s&mt=Firmware&mt=MIBFile" % script[entry][1]), headers={"Referer": response.url}, meta={"product": script[entry][1]}, From a30fd394eecf2e3abf7f57de7ff2f6469a6bb6d8 Mon Sep 17 00:00:00 2001 From: Anderson Date: Wed, 5 Jun 2019 18:28:33 +0800 Subject: [PATCH 2/3] update xrange to range. --- firmware/spiders/360.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/firmware/spiders/360.py b/firmware/spiders/360.py index a1a7c49..a4e1a09 100644 --- a/firmware/spiders/360.py +++ b/firmware/spiders/360.py @@ -36,7 +36,7 @@ def parse_product(self, response): urls = re.findall(p_url, js) dates = re.findall(p_date, js) - for i in xrange(len(products)): + for i in range(len(products)): product = products[i] url = urls[i] version = versions[i] @@ -50,4 +50,4 @@ def parse_product(self, response): item.add_value("description", description) item.add_value("date", date) item.add_value("vendor", self.name) - yield item.load_item() \ No newline at end of file + yield item.load_item() From b8971edc389eb8315b8ac841f5ea68cd5d38ba54 Mon Sep 17 00:00:00 2001 From: Anderson Date: Wed, 5 Jun 2019 18:34:04 +0800 Subject: [PATCH 3/3] fix bug. fix bug and add version to item. --- firmware/spiders/360.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/firmware/spiders/360.py b/firmware/spiders/360.py index a4e1a09..9a271c2 100644 --- a/firmware/spiders/360.py +++ b/firmware/spiders/360.py @@ -30,7 +30,7 @@ def parse_product(self, response): p_date = u"updateDate:\"(?P.*?)\"" import re - products = re.findall(p_version, js) + products = re.findall(p_product, js) descriptions = re.findall(p_description, js) versions = re.findall(p_version, js) urls = re.findall(p_url, js) @@ -46,6 +46,7 @@ def parse_product(self, response): item = FirmwareLoader( item=FirmwareImage(), response=response) item.add_value("url", url) + item.add_value("version", version) item.add_value("product", product) item.add_value("description", description) item.add_value("date", date)