Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support python3 #14

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions firmware/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import datetime
import re
import string
import urlparse
from urllib.parse import urlparse, urljoin


class FirmwareLoader(ItemLoader):
Expand Down Expand Up @@ -38,18 +38,18 @@ def find_date(self, text):
fmt = "(" + re.escape(fmt).replace("\%b", "[a-zA-Z]{3}").replace("\%B", "[a-zA-Z]+").replace(
"\%m", "\d{1,2}").replace("\%d", "\d{1,2}").replace("\%y", "\d{2}").replace("\%Y", "\d{4}") + ")"
match = re.search(fmt, "".join(text).strip())
res = filter(lambda x: x, match.groups()) if match else None
res = list(filter(lambda x: x, match.groups())) if match else None

if res:
return res[0]
return None

def clean(s):
    """Normalize scraped text: keep only printable ASCII, drop CR/LF, and trim.

    Note: the trailing non-breaking-space replace runs after the printable
    filter has already removed "\xa0", so it is kept only for parity with the
    original implementation (effectively a no-op).
    """
    # filter() returns an iterator in Python 3, so join it back into a str;
    # a generator expression avoids the intermediate list() of the PR version.
    return "".join(c for c in s if c in string.printable).replace("\r", "").replace("\n", "").replace(u"\xa0", " ").strip()

def fix_url(url, loader_context):
    """Return *url* absolutized against the response URL when it is relative.

    A URL with no network location (netloc) is treated as relative and joined
    onto loader_context["response"].url; absolute URLs pass through unchanged.
    """
    if urlparse(url).netloc:
        return url
    return urljoin(loader_context.get("response").url, url)

def parse_date(date, loader_context):
Expand Down
63 changes: 39 additions & 24 deletions firmware/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import os
import hashlib
import logging
import urlparse
import urllib
from urllib.parse import urlparse, urlsplit, urlunsplit, unquote
from urllib.request import urlopen

logger = logging.getLogger(__name__)


class FirmwarePipeline(FilesPipeline):
def __init__(self, store_uri, download_func=None, settings=None):
if settings and "SQL_SERVER" in settings:
Expand All @@ -20,7 +21,8 @@ def __init__(self, store_uri, download_func=None, settings=None):
else:
self.database = None

super(FirmwarePipeline, self).__init__(store_uri, download_func,settings)
super(FirmwarePipeline, self).__init__(store_uri, download_func, settings)

@classmethod
def from_settings(cls, settings):
store_uri = settings['FILES_STORE']
Expand All @@ -33,7 +35,7 @@ def from_settings(cls, settings):
# overrides function from FilesPipeline
def file_path(self, request, response=None, info=None):
    """Build the stored file path as "<vendor>/<sha1-of-url><ext>".

    The vendor comes from request.meta (set in get_media_requests); the
    extension is taken from the URL path so downloaded files keep their type.
    """
    extension = os.path.splitext(os.path.basename(
        urlsplit(request.url).path))[1]
    # hashlib.sha1 requires bytes in Python 3; request.url is a str, so it
    # must be encoded (the PR left the py2 str-hashing call in place).
    return "%s/%s%s" % (request.meta["vendor"],
                        hashlib.sha1(request.url.encode("utf-8")).hexdigest(),
                        extension)

Expand All @@ -43,23 +45,24 @@ def get_media_requests(self, item, info):
for x in ["vendor", "url"]:
if x not in item:
raise DropItem(
"Missing required field '%s' for item: " % (x, item))
"Missing required field '%s' for item %s: " % (x, item))

# resolve dynamic redirects in urls
for x in ["mib", "sdk", "url"]:
if x in item:
split = urlparse.urlsplit(item[x])
split = urlsplit(item[x])
# remove username/password if only one provided
if split.username or split.password and not (split.username and split.password):
item[x] = urlparse.urlunsplit(
item[x] = urlunsplit(
(split[0], split[1][split[1].find("@") + 1:], split[2], split[3], split[4]))

if split.scheme == "http":
item[x] = urllib.urlopen(item[x]).geturl()
item[x] = urlopen(item[x]).geturl()

# check for filtered url types in path
url = urlparse.urlparse(item["url"])
if any(url.path.endswith(x) for x in [".pdf", ".php", ".txt", ".doc", ".rtf", ".docx", ".htm", ".html", ".md5", ".sha1", ".torrent"]):
url = urlparse(item["url"])
if any(url.path.endswith(x) for x in
[".pdf", ".php", ".txt", ".doc", ".rtf", ".docx", ".htm", ".html", ".md5", ".sha1", ".torrent"]):
raise DropItem("Filtered path extension: %s" % url.path)
elif any(x in url.path for x in ["driver", "utility", "install", "wizard", "gpl", "login"]):
raise DropItem("Filtered path type: %s" % url.path)
Expand All @@ -69,7 +72,9 @@ def get_media_requests(self, item, info):
for x in ["mib", "url"] if x in item]

# pass vendor so we can generate the correct file path and name
return [Request(x, meta={"ftp_user": "anonymous", "ftp_password": "[email protected]", "vendor": item["vendor"]}) for x in item[self.files_urls_field]]
return [
Request(x, meta={"ftp_user": "anonymous", "ftp_password": "[email protected]", "vendor": item["vendor"]})
for x in item[self.files_urls_field]]

# overrides function from FilesPipeline
def item_completed(self, results, item, info):
Expand All @@ -86,7 +91,7 @@ def item_completed(self, results, item, info):
for ok, x in results:
for y in ["mib", "url", "sdk"]:
# verify URL's are the same after unquoting
if ok and y in item and urllib.unquote(item[y]) == urllib.unquote(x["url"]):
if ok and y in item and unquote(item[y]) == unquote(x["url"]):
status[y] = x
elif y not in status:
status[y] = {"checksum": None, "path": None}
Expand All @@ -97,25 +102,26 @@ def item_completed(self, results, item, info):

# attempt to find a matching image_id
cur.execute("SELECT id FROM image WHERE hash=%s",
(status["url"]["checksum"], ))
(status["url"]["checksum"],))
image_id = cur.fetchone()

if not image_id:
cur.execute("SELECT id FROM brand WHERE name=%s", (item["vendor"], ))
cur.execute("SELECT id FROM brand WHERE name=%s", (item["vendor"],))
brand_id = cur.fetchone()

if not brand_id:
cur.execute("INSERT INTO brand (name) VALUES (%s) RETURNING id", (item["vendor"], ))
cur.execute("INSERT INTO brand (name) VALUES (%s) RETURNING id", (item["vendor"],))
brand_id = cur.fetchone()
logger.info("Inserted database entry for brand: %d!" % brand_id)

cur.execute("INSERT INTO image (filename, description, brand_id, hash) VALUES (%s, %s, %s, %s) RETURNING id",
(status["url"]["path"], item.get("description", None), brand_id, status["url"]["checksum"]))
cur.execute(
"INSERT INTO image (filename, description, brand_id, hash) VALUES (%s, %s, %s, %s) RETURNING id",
(status["url"]["path"], item.get("description", None), brand_id, status["url"]["checksum"]))
image_id = cur.fetchone()
logger.info("Inserted database entry for image: %d!" % image_id)
else:
cur.execute("SELECT filename FROM image WHERE hash=%s",
(status["url"]["checksum"], ))
(status["url"]["checksum"],))
path = cur.fetchone()

logger.info(
Expand All @@ -127,19 +133,28 @@ def item_completed(self, results, item, info):
status["url"]["path"])

# attempt to find a matching product_id
cur.execute("SELECT id FROM product WHERE iid=%s AND product IS NOT DISTINCT FROM %s AND version IS NOT DISTINCT FROM %s AND build IS NOT DISTINCT FROM %s",
(image_id, item.get("product", None), item.get("version", None), item.get("build", None)))
cur.execute(
"SELECT id FROM product WHERE iid=%s AND product IS NOT DISTINCT FROM %s AND version IS NOT DISTINCT FROM %s AND build IS NOT DISTINCT FROM %s",
(image_id, item.get("product", None), item.get("version", None), item.get("build", None)))
product_id = cur.fetchone()

if not product_id:
cur.execute("INSERT INTO product (iid, url, mib_filename, mib_url, mib_hash, sdk_filename, sdk_url, sdk_hash, product, version, build, date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id",
(image_id, item["url"], status["mib"]["path"], item.get("mib", None), status["mib"]["checksum"], status["sdk"]["path"], item.get("sdk", None), status["sdk"]["checksum"], item.get("product", None), item.get("version", None), item.get("build", None), item.get("date", None)))
cur.execute(
"INSERT INTO product (iid, url, mib_filename, mib_url, mib_hash, sdk_filename, sdk_url, sdk_hash, product, version, build, date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id",
(image_id, item["url"], status["mib"]["path"], item.get("mib", None), status["mib"]["checksum"],
status["sdk"]["path"], item.get("sdk", None), status["sdk"]["checksum"],
item.get("product", None), item.get("version", None), item.get("build", None),
item.get("date", None)))
product_id = cur.fetchone()
logger.info(
"Inserted database entry for product: %d!" % product_id)
else:
cur.execute("UPDATE product SET (iid, url, mib_filename, mib_url, mib_hash, sdk_filename, sdk_url, sdk_hash, product, version, build, date) = (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) WHERE id=%s",
(image_id, item["url"], status["mib"]["path"], item.get("mib", None), status["mib"]["checksum"], status["sdk"]["path"], item.get("sdk", None), status["sdk"]["checksum"], item.get("product", None), item.get("version", None), item.get("build", None), item.get("date", None), image_id))
cur.execute(
"UPDATE product SET (iid, url, mib_filename, mib_url, mib_hash, sdk_filename, sdk_url, sdk_hash, product, version, build, date) = (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) WHERE id=%s",
(image_id, item["url"], status["mib"]["path"], item.get("mib", None), status["mib"]["checksum"],
status["sdk"]["path"], item.get("sdk", None), status["sdk"]["checksum"],
item.get("product", None), item.get("version", None), item.get("build", None),
item.get("date", None), image_id))
logger.info("Updated database entry for product: %d!" % product_id)

self.database.commit()
Expand Down
2 changes: 1 addition & 1 deletion firmware/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@
ROBOTSTXT_OBEY = False
USER_AGENT = "FirmwareBot/1.0 (+https://github.com/firmadyne/scraper)"

#SQL_SERVER = "127.0.0.1"
SQL_SERVER = "127.0.0.1"
110 changes: 54 additions & 56 deletions firmware/spiders/360.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,54 @@
#coding:utf-8
from scrapy import Spider
from scrapy.http import Request

from firmware.items import FirmwareImage
from firmware.loader import FirmwareLoader

import urlparse
import json

class A360Spider(Spider):
name = "360"
#allowed_domains = ["luyou.360.cn"]
json_url = "http://s7.qhres.com/static/ef5bacdd3d93fa90/common_info.js"
start_urls = ["http://luyou.360.cn/download_center.html?from=nav"]

def parse(self, response):
yield Request(
url=self.json_url,
headers={"Referer": response.url},
callback=self.parse_product)

def parse_product(self, response):
js = response.text
if js.startswith("var commonInfo"):
print response.url
print js

p_product = u"id:\"(?P<product>.*?)\""
p_description = u"title:\"(?P<description>.*?)\""
p_version = u"romVersions:\"(?P<version>.*?)\""
p_url = u"romUrl:\"(?P<url>.*?)\""
p_date = u"updateDate:\"(?P<date>.*?)\""

import re
products = re.findall(p_version, js)
descriptions = re.findall(p_description, js)
versions = re.findall(p_version, js)
urls = re.findall(p_url, js)
dates = re.findall(p_date, js)

for i in xrange(len(products)):
product = products[i]
url = urls[i]
version = versions[i]
description = descriptions[i]
date = dates[i]

item = FirmwareLoader(
item=FirmwareImage(), response=response)
item.add_value("url", url)
item.add_value("product", product)
item.add_value("description", description)
item.add_value("date", date)
item.add_value("vendor", self.name)
yield item.load_item()
#coding:utf-8
from scrapy import Spider
from scrapy.http import Request

from firmware.items import FirmwareImage
from firmware.loader import FirmwareLoader

# Spider for 360 (Qihoo) router firmware: requests a JS metadata blob and
# scrapes product/version/url/date fields out of it with regexes.
# NOTE(review): source indentation was lost in this diff view; block nesting
# below (e.g. what falls under the startswith guard) should be confirmed
# against the repository before relying on it.
class A360Spider(Spider):
name = "360"
#allowed_domains = ["luyou.360.cn"]
# JS file ("var commonInfo = ...") listing all downloadable firmware images.
json_url = "http://s7.qhres.com/static/ef5bacdd3d93fa90/common_info.js"
start_urls = ["http://luyou.360.cn/download_center.html?from=nav"]

# Fetch the JS metadata file, passing the download-center page as Referer.
def parse(self, response):
yield Request(
url=self.json_url,
headers={"Referer": response.url},
callback=self.parse_product)

# Extract parallel field lists from the JS blob and emit one FirmwareImage
# item per product entry.
def parse_product(self, response):
js = response.text
if js.startswith("var commonInfo"):
# NOTE(review): debug prints left in by the PR — presumably temporary;
# consider removing or switching to the spider's logger.
print(response.url)
print(js)

# Regexes matching key:"value" pairs inside the JS object literal.
p_product = u"id:\"(?P<product>.*?)\""
p_description = u"title:\"(?P<description>.*?)\""
p_version = u"romVersions:\"(?P<version>.*?)\""
p_url = u"romUrl:\"(?P<url>.*?)\""
p_date = u"updateDate:\"(?P<date>.*?)\""

import re
# (Fixed relative to the old version, which mistakenly reused p_version
# for the products list.)
products = re.findall(p_product, js)
descriptions = re.findall(p_description, js)
versions = re.findall(p_version, js)
urls = re.findall(p_url, js)
dates = re.findall(p_date, js)

# The findall results are parallel lists; iterate them by shared index.
for i in range(len(products)):
product = products[i]
url = urls[i]
version = versions[i]
description = descriptions[i]
date = dates[i]

item = FirmwareLoader(
item=FirmwareImage(), response=response)
item.add_value("url", url)
item.add_value("version", version)
item.add_value("product", product)
item.add_value("description", description)
item.add_value("date", date)
item.add_value("vendor", self.name)
yield item.load_item()
4 changes: 2 additions & 2 deletions firmware/spiders/actiontec.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from firmware.loader import FirmwareLoader

import re
import urlparse
from urllib.parse import urljoin


class ActiontecSpider(Spider):
Expand All @@ -23,7 +23,7 @@ def parse(self, response):
product = actual.group(1).replace("(", "").replace(")", "")

yield Request(
url=urlparse.urljoin(
url=urljoin(
response.url, link.xpath(".//@href").extract()[0]),
headers={"Referer": response.url},
meta={"product": product},
Expand Down
4 changes: 2 additions & 2 deletions firmware/spiders/airlink101.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from firmware.items import FirmwareImage
from firmware.loader import FirmwareLoader

import urlparse
from urllib.parse import urljoin


class Airlink101Spider(Spider):
Expand All @@ -23,7 +23,7 @@ def parse(self, response):

if "_a=download" not in href:
yield Request(
url=urlparse.urljoin(response.url, href),
url=urljoin(response.url, href),
headers={"Referer": response.url},
meta={"product": text.strip().split(' ')},
callback=self.parse)
Expand Down
8 changes: 4 additions & 4 deletions firmware/spiders/asus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from firmware.items import FirmwareImage
from firmware.loader import FirmwareLoader
import urlparse
from urllib.parse import urljoin


class AsusSpider(Spider):
Expand All @@ -18,7 +18,7 @@ def parse(self, response):
if "cid" not in response.meta:
for category in response.xpath("//div[@class='product-category']//a/@l1_id").extract():
yield Request(
url=urlparse.urljoin(response.url, "/support/utilities/GetProducts.aspx?ln=%s&p=%s" % (self.region, category)),
url=urljoin(response.url, "/support/utilities/GetProducts.aspx?ln=%s&p=%s" % (self.region, category)),
meta={"cid": category},
headers={"Referer": response.url,
"X-Requested-With": "XMLHttpRequest"},
Expand All @@ -27,7 +27,7 @@ def parse(self, response):
elif "sid" not in response.meta:
for series in response.xpath("//table/id/text()").extract():
yield Request(
url=urlparse.urljoin(response.url, "/support/utilities/GetProducts.aspx?ln=%s&p=%s&s=%s" % (self.region, response.meta["cid"], series)),
url=urljoin(response.url, "/support/utilities/GetProducts.aspx?ln=%s&p=%s&s=%s" % (self.region, response.meta["cid"], series)),
meta={"cid": response.meta["cid"], "sid": series},
headers={"Referer": response.url,
"X-Requested-With": "XMLHttpRequest"},
Expand All @@ -41,7 +41,7 @@ def parse(self, response):

# choose "Others" = 8
yield Request(
url=urlparse.urljoin(response.url, "/support/Download/%s/%s/%s/%s/%d" % (response.meta["cid"], response.meta["sid"], pid, mid, 8)),
url=urljoin(response.url, "/support/Download/%s/%s/%s/%s/%d" % (response.meta["cid"], response.meta["sid"], pid, mid, 8)),
meta={"product": product},
headers={"Referer": response.url,
"X-Requested-With": "XMLHttpRequest"},
Expand Down
4 changes: 2 additions & 2 deletions firmware/spiders/att.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from firmware.items import FirmwareImage
from firmware.loader import FirmwareLoader

import urlparse
from urllib.parse import urljoin


class ATTSpider(Spider):
Expand All @@ -23,6 +23,6 @@ def parse(self, response):
yield item.load_item()
elif "/" in href:
yield Request(
url=urlparse.urljoin(response.url, href),
url=urljoin(response.url, href),
headers={"Referer": response.url},
callback=self.parse)
Loading