From e7319bc5dec6b943850b561126825fbe0d83de1a Mon Sep 17 00:00:00 2001 From: vgalin <47885533+vgalin@users.noreply.github.com> Date: Wed, 30 Jun 2021 00:18:59 +0200 Subject: [PATCH 1/5] change the project structure to facilitate the addition of new browsers chrome flags are now stored in a class attribute and easily modified --- html2image/browsers/__init__.py | 0 html2image/browsers/browser.py | 22 ++++ html2image/browsers/chrome.py | 183 ++++++++++++++++++++++++++++ html2image/browsers/firefox.py | 21 ++++ html2image/html2image.py | 203 ++++++-------------------------- tests/test_main.py | 11 ++ 6 files changed, 270 insertions(+), 170 deletions(-) create mode 100644 html2image/browsers/__init__.py create mode 100644 html2image/browsers/browser.py create mode 100644 html2image/browsers/chrome.py create mode 100644 html2image/browsers/firefox.py diff --git a/html2image/browsers/__init__.py b/html2image/browsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/html2image/browsers/browser.py b/html2image/browsers/browser.py new file mode 100644 index 0000000..2cd90df --- /dev/null +++ b/html2image/browsers/browser.py @@ -0,0 +1,22 @@ +from abc import ABC, abstractmethod + + +class Browser(ABC): + """Abstract class representing a web browser.""" + + def __init__(self, flags): + pass + + @property + @abstractmethod + def executable_path(self): + pass + + @executable_path.setter + @abstractmethod + def executable_path(self, value): + pass + + @abstractmethod + def screenshot(self, *args, **kwargs): + pass diff --git a/html2image/browsers/chrome.py b/html2image/browsers/chrome.py new file mode 100644 index 0000000..7c5df2e --- /dev/null +++ b/html2image/browsers/chrome.py @@ -0,0 +1,183 @@ +from .browser import Browser + +import subprocess +import platform +import os +import shutil + + +def _find_chrome(user_given_path=None): + """ Finds a Chrome executable. + + Search Chrome on a given path. 
If no path given, + try to find Chrome or Chromium-browser on a Windows or Unix system. + + Raises + ------ + - `FileNotFoundError` + + If a suitable chrome executable could not be found. + + Returns + ------- + - str + + Path of the chrome executable on the current machine. + """ + + # TODO when other browsers will be available: + # Ensure that the given executable is a chrome one. + + if user_given_path is not None: + if os.path.isfile(user_given_path): + return user_given_path + else: + raise FileNotFoundError('Could not find chrome in the given path.') + + if platform.system() == 'Windows': + prefixes = [ + os.getenv('PROGRAMFILES(X86)'), + os.getenv('PROGRAMFILES'), + os.getenv('LOCALAPPDATA'), + ] + + suffix = "Google\\Chrome\\Application\\chrome.exe" + + for prefix in prefixes: + path_candidate = os.path.join(prefix, suffix) + if os.path.isfile(path_candidate): + return path_candidate + + elif platform.system() == "Linux": + + # search google-chrome + version_result = subprocess.check_output( + ["google-chrome", "--version"] + ) + + if 'Google Chrome' in str(version_result): + return "google-chrome" + + # else search chromium-browser + + # snap seems to be a special case? 
+ # see https://stackoverflow.com/q/63375327/12182226 + version_result = subprocess.check_output( + ["chromium-browser", "--version"] + ) + if 'snap' in str(version_result): + chrome_snap = ( + '/snap/chromium/current/usr/lib/chromium-browser/chrome' + ) + if os.path.isfile(chrome_snap): + return chrome_snap + else: + which_result = shutil.which('chromium-browser') + if which_result is not None and os.path.isfile(which_result): + return which_result + + elif platform.system() == "Darwin": + # MacOS system + chrome_app = ( + '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome' + ) + version_result = subprocess.check_output( + [chrome_app, "--version"] + ) + if "Google Chrome" in str(version_result): + return chrome_app + + raise FileNotFoundError( + 'Could not find a Chrome executable on this ' + 'machine, please specify it yourself.' + ) + + +class ChromeHeadless(Browser): + """ + Chrome/Chromium browser wrapper. + + Parameters + ---------- + - `executable_path` : str, optional + + Path to a chrome executable. + + - `flags` : list of str + + Flags to be used by the headless browser. + + Default flags are : + - '--default-background-color=0' + - '--hide-scrollbars' + - `print_command` : bool + + Whether or not to print the command used to take a screenshot. + """ + + def __init__(self, executable_path=None, flags=None, print_command=False): + self.executable_path = executable_path + if not flags: + flags = [ + '--default-background-color=0', + '--hide-scrollbars', + ] + self.flags = [flags] if isinstance(flags, str) else flags + self.print_command = print_command + + @property + def executable_path(self): + return self._executable_path + + @executable_path.setter + def executable_path(self, value): + self._executable_path = _find_chrome(value) + + def screenshot( + self, + input_file, + output_path, + output_file='screenshot.png', + size=(1920, 1080), + ): + """ Calls Chrome or Chromium headless to take a screenshot. 
+ + Parameters + ---------- + - `output_file`: str + + Name as which the screenshot will be saved. + + File extension (e.g. .png) has to be included. + + Default is screenshot.png + - `input`: str + + File or url that will be screenshotted. + + Cannot be None + - `size`: (int, int), optional + + Two values representing the window size of the headless + + browser and by extention, the screenshot size. + + These two values must be greater than 0. + Raises + ------ + - `ValueError` + + If the value of `size` is incorrect. + + If `input` is empty. + """ + + if not input_file: + raise ValueError('The `input` parameter is empty.') + + if size[0] < 1 or size[1] < 1: + raise ValueError( + f'Could not screenshot "{output_file}" ' + f'with a size of {size}:\n' + 'A valid size consists of two integers greater than 0.' + ) + + # command used to launch chrome in + # headless mode and take a screenshot + command = [ + f'{self.executable_path}', + '--headless', + f'--screenshot={os.path.join(output_path, output_file)}', + f'--window-size={size[0]},{size[1]}', + *self.flags, + f'{input_file}', + ] + + if self.print_command: + print(f'{command}\n') + + subprocess.run(command) diff --git a/html2image/browsers/firefox.py b/html2image/browsers/firefox.py new file mode 100644 index 0000000..dce303d --- /dev/null +++ b/html2image/browsers/firefox.py @@ -0,0 +1,21 @@ +from .browser import Browser + + +class FirefoxHeadless(Browser): + + def __init__(self): + raise NotImplementedError( + "Could not make screenshot work on Firefox headless yet ...\n" + "See https://bugzilla.mozilla.org/show_bug.cgi?id=1715450" + ) + + @property + def executable_path(self): + pass + + @executable_path.setter + def executable_path(self, value): + pass + + def render(self, **kwargs): + pass diff --git a/html2image/html2image.py b/html2image/html2image.py index a9d35d8..cfb12a9 100644 --- a/html2image/html2image.py +++ b/html2image/html2image.py @@ -9,93 +9,21 @@ """ import os -import platform import shutil 
-import subprocess from textwrap import dedent +from html2image.browsers import chrome, firefox -def _find_chrome(user_given_path=None): - """ Finds a Chrome executable. - - Search Chrome on a given path. If no path given, - try to find Chrome or Chromium-browser on a Windows or Unix system. - - Raises - ------ - - `FileNotFoundError` - + If a suitable chrome executable could not be found. - - Returns - ------- - - str - + Path of the chrome executable on the current machine. - """ - - if user_given_path is not None: - if os.path.isfile(user_given_path): - return user_given_path - else: - raise FileNotFoundError('Could not find chrome in the given path.') - - if platform.system() == 'Windows': - prefixes = [ - os.getenv('PROGRAMFILES(X86)'), - os.getenv('PROGRAMFILES'), - os.getenv('LOCALAPPDATA'), - ] - - suffix = "Google\\Chrome\\Application\\chrome.exe" - - for prefix in prefixes: - path_candidate = os.path.join(prefix, suffix) - if os.path.isfile(path_candidate): - return path_candidate - - elif platform.system() == "Linux": - - # search google-chrome - version_result = subprocess.check_output( - ["google-chrome", "--version"] - ) - - if 'Google Chrome' in str(version_result): - return "google-chrome" - - # else search chromium-browser - - # snap seems to be a special case? 
- # see https://stackoverflow.com/q/63375327/12182226 - version_result = subprocess.check_output( - ["chromium-browser", "--version"] - ) - if 'snap' in str(version_result): - chrome_snap = ( - '/snap/chromium/current/usr/lib/chromium-browser/chrome' - ) - if os.path.isfile(chrome_snap): - return chrome_snap - else: - which_result = shutil.which('chromium-browser') - if which_result is not None and os.path.isfile(which_result): - return which_result - - elif platform.system() == "Darwin": - # MacOS system - chrome_app = ( - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome' - ) - version_result = subprocess.check_output( - [chrome_app, "--version"] - ) - if "Google Chrome" in str(version_result): - return chrome_app - - raise FileNotFoundError( - 'Could not find a Chrome executable on this ' - 'machine, please specify it yourself.' - ) +browser_map = { + 'chrome': chrome.ChromeHeadless, + 'chromium': chrome.ChromeHeadless, + 'google-chrome': chrome.ChromeHeadless, + 'googlechrome': chrome.ChromeHeadless, + 'firefox': firefox.FirefoxHeadless, + 'mozilla-firefox': firefox.FirefoxHeadless, + 'mozilla firefox': firefox.FirefoxHeadless, +} class Html2Image(): @@ -109,11 +37,8 @@ class Html2Image(): + Type of the browser that will be used to take screenshots. + Default is Chrome. - - `chrome_path` : str, optional - + Path to a Chrome/Chromium executable. - - - `firefox_path` : str, optional - + Path to a Firefox executable. + - `browser_path` : str, optional + + Path to a browser executable. - `output_path` : str, optional + Path to a directory in which the taken screenshots will be saved. 
@@ -139,42 +64,28 @@ class Html2Image(): def __init__( self, browser='chrome', - chrome_path=None, - firefox_path=None, + browser_path=None, output_path=os.getcwd(), size=(1920, 1080), temp_path=None, custom_flags=[], - print_command=False ): - self.browser = browser + if browser.lower() not in browser_map: + raise ValueError( + f'"{browser}" is not a browser known by HTML2Image.' + ) + + browser_class = browser_map[browser.lower()] + self.browser = browser_class(executable_path=browser_path) + self.output_path = output_path self.size = size self.temp_path = temp_path - self.print_command = print_command self.custom_flags = ( [custom_flags] if isinstance(custom_flags, str) else custom_flags ) - # TODO : add @property + setter on self.browser to do the following - if self.browser == "chrome": - self._render = self._chrome_render - self.chrome_path = chrome_path - - elif self.browser == "firefox": - raise NotImplementedError - else: - raise NotImplementedError - - @property - def chrome_path(self): - return self._chrome_path - - @chrome_path.setter - def chrome_path(self, value): - self._chrome_path = _find_chrome(value) - @property def temp_path(self): return self._temp_path @@ -206,64 +117,6 @@ def output_path(self, value): self._output_path = value - def _chrome_render( - self, output_file='render.png', input_file='', size=None - ): - """ Calls Chrome or Chromium headless to take a screenshot. - - Parameters - ---------- - - `output_file`: str - + Name as which the screenshot will be saved. - + File extension (e.g. .png) has to be included. - + Default is screenshot.png - - `input_file`: str - + File (or url...) that will be screenshotted. - - `size`: (int, int), optional - + Two values representing the window size of the headless - + browser and by extention, the screenshot size. - + These two values must be greater than 0. - Raises - ------ - - `ValueError` - + If the value of `size` is incorrect. 
- """ - - if size is None: - size = self.size - - if size[0] < 1 or size[1] < 1: - raise ValueError( - f'Could not screenshot "{output_file}" ' - f'with a size of {size}:\n' - 'A valid size consists of two integers greater than 0.' - ) - - # command used to launch chrome in - # headless mode and take a screenshot - command = [ - f'{self.chrome_path}', - '--headless', - f'--screenshot={os.path.join(self.output_path, output_file)}', - f'--window-size={size[0]},{size[1]}', - '--default-background-color=0', - '--hide-scrollbars', - # TODO : make it possible to choose to display the scrollbar or not - *self.custom_flags, - f'{input_file}', - ] - - if self.print_command: - print(command) - print() - - subprocess.run(command) - - def _firefox_render(self, output_file='render.png', input_file=''): - """ Not implemented. - """ - raise NotImplementedError - def load_str(self, content, as_filename): """ Loads a string containing HTML or CSS so that html2image can use it @@ -337,7 +190,12 @@ def screenshot_loaded_file( "modifying the output_path attribute." ) - self._render(output_file=output_file, input_file=file, size=size) + self.browser.screenshot( + output_path=self.output_path, + output_file=output_file, + input=file, + size=size, + ) def screenshot_url(self, url, output_file='screenshot.png', size=None): """ Takes a screenshot of a given URL. @@ -370,7 +228,12 @@ def screenshot_url(self, url, output_file='screenshot.png', size=None): "modifying the output_path attribute." 
) - self._render(input_file=url, output_file=output_file, size=size) + self.browser.screenshot( + output_path=self.output_path, + output_file=output_file, + input=url, + size=size + ) @staticmethod def _extend_save_as_param(save_as, desired_length): diff --git a/tests/test_main.py b/tests/test_main.py index 444b200..a0e37a8 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,9 +1,20 @@ from html2image import Html2Image from PIL import Image +import pytest + OUTPUT_PATH = "tests_output" +def test_bad_browser(): + with pytest.raises(ValueError): + Html2Image(browser='watergoupil') + + +def test_good_browser(): + Html2Image(browser='cHrOme') + + def test_screenshot_url(): hti = Html2Image(output_path=OUTPUT_PATH) From c0eacdbf235f467aaa25afdab914376a94ef8b35 Mon Sep 17 00:00:00 2001 From: vgalin <47885533+vgalin@users.noreply.github.com> Date: Wed, 30 Jun 2021 21:44:35 +0200 Subject: [PATCH 2/5] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 258edb3..a15b818 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "html2image" -version = "1.1.3" +version = "2.0.0" description = "Package acting as a wrapper around the headless mode of existing web browsers to generate images from URLs and from HTML+CSS strings or files." 
authors = ["vgalin"] license = "MIT" From f0507f0459c4c4ad96902fec9a76a8c1d9889f3a Mon Sep 17 00:00:00 2001 From: vgalin <47885533+vgalin@users.noreply.github.com> Date: Wed, 30 Jun 2021 21:48:05 +0200 Subject: [PATCH 3/5] fix parameter name and flags not being used --- html2image/browsers/chrome.py | 14 ++++++++------ html2image/html2image.py | 10 +++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/html2image/browsers/chrome.py b/html2image/browsers/chrome.py index 7c5df2e..6ab4f98 100644 --- a/html2image/browsers/chrome.py +++ b/html2image/browsers/chrome.py @@ -112,11 +112,13 @@ class ChromeHeadless(Browser): def __init__(self, executable_path=None, flags=None, print_command=False): self.executable_path = executable_path if not flags: - flags = [ + self.flags = [ '--default-background-color=0', '--hide-scrollbars', ] - self.flags = [flags] if isinstance(flags, str) else flags + else: + self.flags = [flags] if isinstance(flags, str) else flags + self.print_command = print_command @property @@ -129,7 +131,7 @@ def executable_path(self, value): def screenshot( self, - input_file, + input, output_path, output_file='screenshot.png', size=(1920, 1080), @@ -156,7 +158,7 @@ def screenshot( + If `input` is empty. """ - if not input_file: + if not input: raise ValueError('The `input` parameter is empty.') if size[0] < 1 or size[1] < 1: @@ -174,10 +176,10 @@ def screenshot( f'--screenshot={os.path.join(output_path, output_file)}', f'--window-size={size[0]},{size[1]}', *self.flags, - f'{input_file}', + f'{input}', ] if self.print_command: - print(f'{command}\n') + print(' '.join(command)) subprocess.run(command) diff --git a/html2image/html2image.py b/html2image/html2image.py index cfb12a9..d8f361f 100644 --- a/html2image/html2image.py +++ b/html2image/html2image.py @@ -76,14 +76,14 @@ def __init__( f'"{browser}" is not a browser known by HTML2Image.' 
) - browser_class = browser_map[browser.lower()] - self.browser = browser_class(executable_path=browser_path) - self.output_path = output_path self.size = size self.temp_path = temp_path - self.custom_flags = ( - [custom_flags] if isinstance(custom_flags, str) else custom_flags + + browser_class = browser_map[browser.lower()] + self.browser = browser_class( + executable_path=browser_path, + flags=custom_flags, ) @property From 50cf80d67d6fb5e6af6b69df3d84159ab5f7873b Mon Sep 17 00:00:00 2001 From: vgalin <47885533+vgalin@users.noreply.github.com> Date: Wed, 30 Jun 2021 21:48:49 +0200 Subject: [PATCH 4/5] add documentation for flags and updated parameter names --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5520fa5..e2b1abe 100644 --- a/README.md +++ b/README.md @@ -49,10 +49,10 @@ hti = Html2Image() Multiple arguments can be passed to the constructor (click to expand): - `browser` : Browser that will be used, set by default to `'chrome'` (the only browser supported by HTML2Image at the moment) -- `chrome_path` and `firefox_path` : The path or the command that can be used to find the executable of a specific browser. +- `browser_path` : The path or the command that can be used to find the executable of a specific browser. - `output_path` : Path to the folder to which taken screenshots will be outputed. Default is the current working directory of your python program. - `size` : 2-Tuple reprensenting the size of the screenshots that will be taken. Default value is `(1920, 1080)`. -- `temp_path` : Path that will be used by html2image to put together different resources *loaded* with the `load_str` and `load_file` methods. Default value is `%TEMP%/html2image` on Windows, and `/tmp/html2image` on Linux and MacOS. +- `temp_path` : Path that will be used to put together different resources when screenshotting strings of files. 
Default value is `%TEMP%/html2image` on Windows, and `/tmp/html2image` on Linux and MacOS. Example: ```python @@ -208,6 +208,62 @@ print(paths) # >>> ['D:\\myFiles\\letters_0.png', 'D:\\myFiles\\letters_1.png', 'D:\\myFiles\\letters_2.png'] ``` +--- + +#### Change browser flags +In some cases, you may need to change the *flags* that are used to run the headless mode of a browser. + +Flags can be used to: +- Change the default background color of the pages; +- Hide the scrollbar; +- Add delay before taking a screenshot; +- Allow you to use Html2Image when you're root, as you will have to specify the `--no-sandbox` flag; + +You can find the full list of Chrome / Chromium flags [here](https://peter.sh/experiments/chromium-command-line-switches/). + +There are two ways to specify custom flags: +```python +# At the object instantiation +hti = Html2Image(custom_flags=['--my_flag', '--my_other_flag=value']) + +# Afterwards +hti.browser.flags = ['--my_flag', '--my_other_flag=value'] +``` + +- **Flags example use-case: adding a delay before taking a screenshot** + +With Chrome / Chromium, screenshots are fired directly after there are no more "pending network fetches", but you may sometimes want to add a delay before taking a screenshot, to wait for animations to end for example. +There is a flag for this purpose, `--virtual-time-budget=VALUE_IN_MILLISECONDS`. You can use it like so: + +```python +hti = Html2Image( + custom_flags=['--virtual-time-budget=10000', '--hide-scrollbars'] +) + +hti.screenshot(url='http://example.org') +``` + +- **Default flags** + +For ease of use, some flags are set by default. However, default flags are not used if you decide to specify `custom_flags` or change the value of `browser.flags`: + +```python +# Taking a look at the default flags +>>> hti = Html2Image() +>>> hti.browser.flags +['--default-background-color=0', '--hide-scrollbars'] + +# Changing the value of browser.flags gets rid of the default flags. 
+>>> hti.browser.flags = ['--1', '--2'] +>>> hti.browser.flags +['--1', '--2'] + +# Using the custom_flags parameter gets rid of the default flags. +>>> hti = Html2Image(custom_flags=['--a', '--b']) +>>> hti.browser.flags +['--a', '--b'] +``` + ## Using the CLI HTML2image comes with a Command Line Interface which you can use to generate screenshots from files and urls on the go. From c9caab6e4c4e8c9f140c830719a17f232076b962 Mon Sep 17 00:00:00 2001 From: vgalin <47885533+vgalin@users.noreply.github.com> Date: Wed, 30 Jun 2021 23:22:06 +0200 Subject: [PATCH 5/5] add FAQ to readme + minor changes --- README.md | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e2b1abe..6f3d8de 100644 --- a/README.md +++ b/README.md @@ -290,16 +290,32 @@ You can call it by typing `hti` or `html2image` into a terminal. ## Testing -Only basic testing is available at the moment. To run tests, run PyTest at the root of the project: -``` +Only basic testing is available at the moment. To run tests, install the requirements (Pillow) and run PyTest at the root of the project: +``` console +pip install -r requirements-test.txt python -m pytest ``` + +## FAQ + +- Can I automatically take a full page screenshot? +**Sadly no**, it is not easily possible. Html2Image relies on the headless mode of Chrome/Chromium browsers to take screenshots and there is no way to "ask" for a full page screenshot at the moment. If you know a way to take one (by estimating the page size for example) I would be happy to see it, so please open an issue or a discussion! + +- Can I add delay before taking a screenshot? +**Yes** you can, please take a look at the `Change browser flags` section of the readme. + +- Can I speed up the screenshot taking process? +**Yes**, when you are taking a lot of screenshots, you can achieve better "performances" using Parallel Processing or Multiprocessing methods. 
You can find an [example of it here](https://github.com/vgalin/html2image/issues/28#issuecomment-862608053). + +- Can I make a cookie modal disappear? +**Yes and no**. **No** because there is no option to do it magically and [extensions are not supported in headless Chrome](https://bugs.chromium.org/p/chromium/issues/detail?id=706008#c5) (The [`I don't care about cookies`](https://www.i-dont-care-about-cookies.eu/) extension would have been useful in this case). **Yes** because you can make any element of a page disappear by retrieving its source code, modifying it as you wish, and finally screenshotting the modified source code. ## TODO List -- A nice CLI (Currently in a WIP state) - - A better way to name the CLI's outputed files ? -- Support of other browsers, such as Firefox -- More extensive doc + comments +- A nice CLI (currently in a WIP state). +- Support of other browsers (such as Firefox, once its screenshot feature works). - PDF generation? -- Testing on push/PR with GitHub Actions -- Use threads or multiprocessing to speed up screenshot taking \ No newline at end of file +- Contributing, issue templates, pull request template, code of conduct. + +--- + +*If you see any typos or notice things that are oddly said, feel free to create an issue or a pull request.* \ No newline at end of file