Skip to content
This repository has been archived by the owner on Oct 23, 2019. It is now read-only.

Commit

Permalink
Merge pull request #13 from Girbons/development
Browse files Browse the repository at this point in the history
Added support for mangahere
  • Loading branch information
Girbons authored Oct 31, 2018
2 parents 124006c + e7edba8 commit 6245a83
Show file tree
Hide file tree
Showing 13 changed files with 120 additions and 17 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
### 0.5 [2018-10-31]

* Added support for http://www.mangahere.cc/

### 0.4.1 [2018-10-30]

* updated requests (#8 -- thanks @michaelbukachi)
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

- http://www.comicextra.com/
- https://www.mangareader.net/
- http://www.mangahere.cc/
- http://readcomiconline.to/

Didn't find your site? Open an issue and it will be added or submit a pull request.
Expand Down
2 changes: 1 addition & 1 deletion comics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__title__ = 'comics-downloader'
__autor__ = 'Alessandro De Angelis'
__version__ = '0.4.1'
__version__ = '0.5'
9 changes: 4 additions & 5 deletions comics/core/comics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@

import requests
import img2pdf
import validators

from bs4 import BeautifulSoup
from natsort import natsorted
from tqdm import tqdm
from natsort import natsorted

from .scraper import Scraper

from ..compat import TemporaryDirectory
from ..settings import comics_settings
from ..utils import create_and_change_dir
from ..utils import create_and_change_dir, is_url_valid


class BaseComics(object):
Expand All @@ -40,7 +39,7 @@ def images_links(self, response):
"""
:param response: is the response instance.
from a response contet extract images link
from a response content extract images link
and return a list of link
exclude .gif
"""
Expand All @@ -49,7 +48,7 @@ def images_links(self, response):
links = []

for link in match:
if not link.endswith('.gif') and validators.url(link):
if is_url_valid(link):
links.append(link)

return links
Expand Down
8 changes: 5 additions & 3 deletions comics/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@


DEFAULT_SETTINGS = {
'comicextra.com': 'comics.sites.comicextra.ComicExtra',
'mangareader.net': 'comics.sites.mangareader.MangaReader',
'readcomiconline.to': 'comics.sites.readcomiconline.ReadComicOnline',
'comicextra.com': 'comics.sites.ComicExtra',
'mangahere.cc': 'comics.sites.MangaHere',
'mangareader.net': 'comics.sites.MangaReader',
'readcomiconline.to': 'comics.sites.ReadComicOnline',

# is the path where you will find all your downloaded comics
# divided by domain
Expand All @@ -18,6 +19,7 @@
IMPORT_STRINGS = (
'comicextra.com',
'mangareader.net',
'mangahere.cc',
'readcomiconline.to',
)

Expand Down
5 changes: 5 additions & 0 deletions comics/sites/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .comicextra import ComicExtra # noqa
from .mangahere import MangaHere # noqa
from .mangareader import MangaReader # noqa
from .readcomiconline import ReadComicOnline # noqa
from .readcomicsio import ReadComics # noqa
46 changes: 46 additions & 0 deletions comics/sites/mangahere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import re

import requests

from bs4 import BeautifulSoup

from ..core.comics import BaseComics
from ..utils import is_url_valid


class MangaHere(BaseComics):
    """
    Comics downloader for http://www.mangahere.cc/

    Expects chapter URLs of the form
    http://www.mangahere.cc/manga/<name>/<chapter>/ so the manga name
    and issue number can be read from fixed URL segments.
    """
    def __init__(self, url):
        """
        :param string url: the chapter URL to download from.
        """
        self.base_url = 'http://www.mangahere.cc/'
        # captures the src attribute of any <img> tag
        self._image_regex = r'<img[^>]+src="([^">]+)"'
        self.antibot = False
        super(MangaHere, self).__init__(url)

    @property
    def name(self):
        # URL segment right after '/manga/' (index 4 of the split URL)
        return self.splitted_url[4]

    @property
    def issue_number(self):
        # chapter segment, e.g. 'c048'
        return self.splitted_url[5]

    def images_links(self, response):
        """
        :param response: is the response instance for the chapter page.
        Each <option> in the page points to a single-image page; visit
        every one and collect the comic image URL it contains.
        Returns the list of valid image links.
        """
        session = requests.Session()
        soup = BeautifulSoup(response.content, 'html.parser')

        # retrieve the <option>s in page; each one links to one image page
        links = []
        for option in soup.findAll('option'):
            value = option.get('value')
            # skip <option> tags without a value attribute
            if not value:
                continue
            # page links are protocol-relative ('//...'); only prepend
            # the scheme when it is actually missing, so already-absolute
            # URLs are not mangled into 'http:http://...'
            if value.startswith('//'):
                value = 'http:{}'.format(value)
            links.append(value)

        images_links = []
        for link in links:
            response = session.get(link)
            matches = re.findall(self._image_regex, response.text)
            # the second <img> on the page is the comic image; guard
            # against pages that expose fewer matches
            if len(matches) < 2:
                continue
            image_url = matches[1]
            if is_url_valid(image_url):
                images_links.append(image_url)

        return images_links
4 changes: 3 additions & 1 deletion comics/sites/mangareader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from bs4 import BeautifulSoup

from ..core.comics import BaseComics
from ..utils import is_url_valid


class MangaReader(BaseComics):
Expand Down Expand Up @@ -40,6 +41,7 @@ def images_links(self, response):
response = session.get(link)
# we'll find only 1 image
image_url = re.findall(self._image_regex, response.text)[0]
images_links.append(image_url)
if is_url_valid(image_url):
images_links.append(image_url)

return images_links
14 changes: 14 additions & 0 deletions comics/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os

import validators


def create_and_change_dir(dir_name):
"""
Expand All @@ -11,3 +13,15 @@ def create_and_change_dir(dir_name):
os.mkdir(dir_name)

os.chdir(dir_name)


def is_url_valid(url):
    """
    :param string url: is a url
    Return True when the given url is syntactically valid and does not
    point to a .gif image, False otherwise.
    """
    if url.endswith('.gif'):
        return False

    # validators.url returns True on success and a falsy failure object
    # otherwise; bool() normalizes that to a strict True/False
    return bool(validators.url(url))
10 changes: 10 additions & 0 deletions tests/test_comics_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,13 @@ def test_mangareader():
'comics-download', '-u', 'https://www.mangareader.net/naruto/1'
])
assert os.path.isfile("downloaded_comics/MangaReader/naruto/1.pdf")


def test_mangahere():
    """
    Test download from http://www.mangahere.cc via the CLI entry point.
    """
    url = 'http://www.mangahere.cc/manga/shingeki_no_kyojin_before_the_fall/c048/'
    subprocess.call(['comics-download', '-u', url])

    expected = "downloaded_comics/MangaHere/shingeki_no_kyojin_before_the_fall/c048.pdf"
    assert os.path.isfile(expected)
2 changes: 1 addition & 1 deletion tests/tests_unit/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from comics.core.downloader import Downloader
from comics.exceptions import NotSupportedSite

from comics.sites.comicextra import ComicExtra
from comics.sites import ComicExtra
from comics.settings import comics_settings


Expand Down
23 changes: 19 additions & 4 deletions tests/tests_unit/test_sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
import pytest

from comics.core.comics import BaseComics
from comics.sites.readcomiconline import ReadComicOnline
from comics.sites.comicextra import ComicExtra
from comics.sites.readcomicsio import ReadComics
from comics.sites.mangareader import MangaReader

from comics.sites import (
ComicExtra,
ReadComics,
ReadComicOnline,
MangaHere,
MangaReader,
)


def test_base_comic():
Expand Down Expand Up @@ -55,3 +59,14 @@ def test_mangareader():
assert comics.name == 'naruto'
assert comics.issue_number == '1'
assert len(result) == 53


def test_mangahere():
    """
    MangaHere should parse the manga name and chapter from the URL and
    scrape the expected number of image links.
    """
    url = 'http://www.mangahere.cc/manga/shingeki_no_kyojin_before_the_fall/c048/'
    manga = MangaHere(url)

    links = manga.images_links(manga.scraper.scrape_comic(False))

    assert manga.name == 'shingeki_no_kyojin_before_the_fall'
    assert manga.issue_number == 'c048'
    assert len(links) == 132
9 changes: 7 additions & 2 deletions tests/tests_unit/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import os


from comics.utils import create_and_change_dir
from comics.utils import create_and_change_dir, is_url_valid


def test_create_and_change():
    """
    create_and_change_dir should create the directory and chdir into it.
    """
    parent = os.getcwd()
    create_and_change_dir('test_dir')
    current_wd = os.getcwd()
    assert 'test_dir' in current_wd
    # move back out before deleting so the process is never left inside
    # a removed directory (which breaks later os.getcwd() calls), and
    # use rmdir rather than removedirs so empty parent directories are
    # never climbed into and deleted as a side effect
    os.chdir(parent)
    os.rmdir(current_wd)


def test_valid_url():
    """
    is_url_valid accepts well-formed non-gif urls and rejects gifs and
    non-url strings.
    """
    well_formed = 'http://example.com'
    gif = 'http://foo.gif'
    garbage = 'aaaaaa'

    assert is_url_valid(well_formed) is True
    assert is_url_valid(gif) is False
    assert is_url_valid(garbage) is False

0 comments on commit 6245a83

Please sign in to comment.