diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f77e34e..9dfe149 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8, 3.9, "3.10"] poetry-version: [1.1.4] os: [ubuntu-18.04, macos-latest, windows-latest] runs-on: ${{ matrix.os }} @@ -33,6 +33,7 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest + pip install -r requirements.txt pip install -r requirements-test.txt - name: Flake8 linting run: | diff --git a/.gitignore b/.gitignore index 31dc65f..3af9e71 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,7 @@ dmypy.json # Added .vscode/ -tests_output/ \ No newline at end of file +tests_output/ + +# .lock file not relevant for libraries +poetry.lock \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 607545f..67b8ce5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,12 @@ FROM python -RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python - +RUN curl -sSL https://install.python-poetry.org | python3 - WORKDIR /pkgs/html2image COPY . . -RUN $HOME/.poetry/bin/poetry install -RUN $HOME/.poetry/bin/poetry build +RUN $HOME/.local/bin/poetry install +RUN $HOME/.local/bin/poetry build RUN pip install dist/*.whl RUN apt-get update -y && apt-get install -y chromium @@ -19,4 +19,4 @@ RUN echo 'export CHROMIUM_FLAGS="$CHROMIUM_FLAGS --no-sandbox"' >> /etc/chromium # MOTD RUN echo " \n =============HTML2IMAGE============= \n Welcome to the html2image CLI container ! \n Type html2image -h for help :)" >> /etc/motd RUN echo "clear" >> /root/.bashrc -RUN echo "cat /etc/motd" >> /root/.bashrc \ No newline at end of file +RUN echo "cat /etc/motd" >> /root/.bashrc diff --git a/README.md b/README.md index 8d98e3a..f939a40 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,36 @@ -html2image logo -# HTML2Image -[ + + +

+ + html2image logo + +

+ + +
+ ![PyPI](https://img.shields.io/pypi/v/html2image.svg) ![PyPI](https://img.shields.io/pypi/pyversions/html2image.svg) ![PyPI](https://img.shields.io/github/license/vgalin/html2image.svg) -](https://pypi.org/project/html2image/) -[ ![GitHub](https://img.shields.io/github/v/release/vgalin/html2image?include_prereleases) ![GitHub](https://img.shields.io/github/languages/code-size/vgalin/html2image) -](https://github.com/vgalin/html2image) -HTML2Image is a lightweight Python package that acts as a wrapper around the headless mode of existing web browsers to generate images from URLs and from HTML+CSS strings or files. + +|[PyPI Package](https://pypi.org/project/html2image/)|[GitHub Repository](https://github.com/vgalin/html2image)| +|-|-| + +**A lightweight Python package acting a wrapper around the headless mode of existing web browsers, allowing images generation from HTML/CSS strings, files and URLs.** + +
+  This package has been tested on Windows, Ubuntu (desktop and server) and MacOS. It is currently in a work in progress stage. If you encounter any problem or difficulties while using it, feel free to open an issue on the GitHub page of this project. Feedback is also welcome! ## Principle + Most web browsers have a Headless Mode, which is a way to run them without displaying any graphical interface. Headless mode is mainly used for automated testings but also comes in handy if you want to take screenshots of web pages that are exact replicas of what you would see on your screen if you were using the browser yourself. However, for the sake of taking screenshots, headless mode is not very convenient to use. HTML2Image aims to hide the inconveniences of the browsers' headless modes while adding useful features such as allowing to create an image from as little as a string. @@ -36,6 +49,7 @@ pip install --upgrade html2image In addition to this package, at least one of the following browsers **must** be installed on your machine : - Google Chrome (Windows, MacOS) - Chromium Browser (Linux) +- Microsoft Edge ## Usage @@ -45,14 +59,14 @@ from html2image import Html2Image hti = Html2Image() ``` -
- Multiple arguments can be passed to the constructor (click to expand): +Multiple arguments can be passed to the constructor: -- `browser` : Browser that will be used, set by default to `'chrome'` (the only browser supported by HTML2Image at the moment) +- `browser` : Browser that will be used, can be set to `'chrome'` (default) or `'edge'`. - `browser_executable` : The path or the command that can be used to find the executable of a specific browser. - `output_path` : Path to the folder to which taken screenshots will be outputed. Default is the current working directory of your python program. - `size` : 2-Tuple representing the size of the screenshots that will be taken. Default value is `(1920, 1080)`. - `temp_path` : Path that will be used to put together different resources when screenshotting strings of files. Default value is `%TEMP%/html2image` on Windows, and `/tmp/html2image` on Linux and MacOS. +- `keep_temp_files` : Pass True to this argument to not automatically remove temporary files created in `temp_path`. Default is False. Example: ```python @@ -63,8 +77,6 @@ You can also change these values later: ``` python hti.size = (500, 200) ``` -
-
### Then take a screenshot diff --git a/html2image/browsers/browser.py b/html2image/browsers/browser.py index 9586592..06ed590 100644 --- a/html2image/browsers/browser.py +++ b/html2image/browsers/browser.py @@ -4,7 +4,7 @@ class Browser(ABC): """Abstract class representing a web browser.""" - def __init__(self, flags): + def __init__(self, flags, disable_logging): pass @property @@ -20,3 +20,24 @@ def executable(self, value): @abstractmethod def screenshot(self, *args, **kwargs): pass + + @abstractmethod + def __enter__(self): + pass + + @abstractmethod + def __exit__(self, *exc): + pass + + @property + @abstractmethod + def disable_logging(self): + pass + + +class CDPBrowser(Browser): + """A web browser that can be interacted with via Chrome DevTools Protocol. + """ + + def __init__(self, flags, cdp_port, disable_logging): + pass diff --git a/html2image/browsers/chrome.py b/html2image/browsers/chrome.py index 141fd57..e678f09 100644 --- a/html2image/browsers/chrome.py +++ b/html2image/browsers/chrome.py @@ -1,10 +1,10 @@ -from .browser import Browser +from .chromium import ChromiumHeadless from .search_utils import get_command_origin, find_first_defined_env_var import subprocess -import platform import os import shutil +import platform ENV_VAR_LOOKUP_TOGGLE = 'HTML2IMAGE_TOGGLE_ENV_VAR_LOOKUP' @@ -15,7 +15,6 @@ 'CHROME_EXE', ] - def _find_chrome(user_given_executable=None): """ Finds a Chrome executable. @@ -161,8 +160,7 @@ def _find_chrome(user_given_executable=None): 'machine, please specify it yourself.' ) - -class ChromeHeadless(Browser): +class ChromeHeadless(ChromiumHeadless): """ Chrome/Chromium browser wrapper. @@ -174,23 +172,16 @@ class ChromeHeadless(Browser): - `flags` : list of str + Flags to be used by the headless browser. + Default flags are : - - '--default-background-color=000000' + - '--default-background-color=00000000' - '--hide-scrollbars' - `print_command` : bool + Whether or not to print the command used to take a screenshot. + - `disable_logging` : bool + + Whether or not to disable Chrome's output. """ - def __init__(self, executable=None, flags=None, print_command=False): - self.executable = executable - if not flags: - self.flags = [ - '--default-background-color=000000', - '--hide-scrollbars', - ] - else: - self.flags = [flags] if isinstance(flags, str) else flags - - self.print_command = print_command + def __init__(self, executable=None, flags=None, print_command=False, disable_logging=False): + super().__init__(executable=executable, flags=flags, print_command=print_command, disable_logging=disable_logging) @property def executable(self): @@ -199,58 +190,3 @@ def executable(self): @executable.setter def executable(self, value): self._executable = _find_chrome(value) - - def screenshot( - self, - input, - output_path, - output_file='screenshot.png', - size=(1920, 1080), - ): - """ Calls Chrome or Chromium headless to take a screenshot. - - Parameters - ---------- - - `output_file`: str - + Name as which the screenshot will be saved. - + File extension (e.g. .png) has to be included. - + Default is screenshot.png - - `input`: str - + File or url that will be screenshotted. - + Cannot be None - - `size`: (int, int), optional - + Two values representing the window size of the headless - + browser and by extention, the screenshot size. - + These two values must be greater than 0. - Raises - ------ - - `ValueError` - + If the value of `size` is incorrect. - + If `input` is empty. - """ - - if not input: - raise ValueError('The `input` parameter is empty.') - - if size[0] < 1 or size[1] < 1: - raise ValueError( - f'Could not screenshot "{output_file}" ' - f'with a size of {size}:\n' - 'A valid size consists of two integers greater than 0.' - ) - - # command used to launch chrome in - # headless mode and take a screenshot - command = [ - f'{self.executable}', - '--headless', - f'--screenshot={os.path.join(output_path, output_file)}', - f'--window-size={size[0]},{size[1]}', - *self.flags, - f'{input}', - ] - - if self.print_command: - print(' '.join(command)) - - subprocess.run(command) diff --git a/html2image/browsers/chrome_cdp.py b/html2image/browsers/chrome_cdp.py new file mode 100644 index 0000000..06a85ad --- /dev/null +++ b/html2image/browsers/chrome_cdp.py @@ -0,0 +1,217 @@ +from .browser import CDPBrowser +from .search_utils import find_chrome + +import os +import subprocess + +import requests +import json +from websocket import create_connection +import base64 + +import websocket +# websocket.enableTrace(True) + + +class ChromeCDP(CDPBrowser): + + def __init__( + self, executable=None, flags=None, + print_command=False, cdp_port=9222, + disable_logging=False, + ): + self.executable = executable + if not flags: + # for some reason, default-background-color prevents + # the browser from running + self.flags = [ + '--hide-scrollbars', + ] + else: + self.flags = [flags] if isinstance(flags, str) else flags + + self.print_command = print_command + self.cdp_port = cdp_port + self._disable_logging = disable_logging + + self._ws = None # Websocket connection + self.proc = None # Headless browser Popen object + + self.__id = 0 + + @property + def executable(self): + return self._executable + + @executable.setter + def executable(self, value): + self._executable = find_chrome(value) + + @property + def disable_logging(self): + return self._disable_logging + + @disable_logging.setter + def disable_logging(self, value): + self._disable_logging = value + + @property + def ws(self): + + if not self._ws: + print(f'----------- http://localhost:{self.cdp_port}/json/version') + r = requests.get(f'http://localhost:{self.cdp_port}/json') # TODO use page websocket instead of browser one + print(f'{r.json()=}') + print(f'Using ws url= {r.json()[0]["webSocketDebuggerUrl"]}') + self._ws = create_connection(r.json()[0]['webSocketDebuggerUrl']) + print('Successfully connected to ws.') + return self._ws + + @property + def _id(self): + self.__id += 1 + return self.__id + + def cdp_send(self, method, **params): + """ + """ + print(f'cdp_send: {method=} {params=}') + return self.ws.send( + json.dumps({ + 'id': self._id, + 'method': method, + 'params': params, + }) + ) + + def screenshot( + self, + input, + output_path, + output_file='screenshot.png', + size=(1920, 1080), + ): + """ + """ + # Useful documentation about the Chrome DevTools Protocol: + # https://chromedevtools.github.io/devtools-protocol/ + + # "Enabling" the page allows to receive the Page.loadEventFired event + self.cdp_send('Page.enable') + + self.cdp_send('Page.navigate', url=input) + + print('wait for page to load') + + # Wait for page to load entirely + while True: + message = json.loads(self.ws.recv()) + method = message.get('method') + print(f'{method=}') + if method == 'Page.loadEventFired': + break + + print('page disable') + self.cdp_send('Page.disable') + + self.cdp_send( + 'Emulation.setDeviceMetricsOverride', + width=size[0], + height=size[1], + deviceScaleFactor=0, # 0 disables the override + mobile=False, + ) + + print('send Page.captureScreenshot') + + self.cdp_send( + 'Page.captureScreenshot', + # captureBeyondViewport=True, + # clip={ + # 'width': size[0], + # 'height': size[1], + # 'x': 500, + # 'y': 200, + # 'scale': 4 + # } + ) + + print('writing to file..') + + # get screenshot data when ready, + # while potentially skipping unneeded messages + while True: + message = json.loads(self.ws.recv()) + # todo capture and display errors ? + if 'result' in message and 'data' in message['result']: + # retrive base64 encoded image data + img_data = message['result']['data'] + break + + # Decode and write image data to file + with open(os.path.join(output_path, output_file), 'wb') as f: + f.write(base64.b64decode(img_data)) + + def get_page_infos(self): + """ + """ + self.cdp_send('Page.getLayoutMetrics') + + while True: + message = json.loads(self.ws.recv()) + print(f'{message=}') + if 'result' in message and 'layoutViewport' in message['result']: + return message['result'] + + def print_pdf(): + # TODO : Page.printToPDF + pass + + def __enter__(self): + """ + """ + if not self.disable_logging: + print( + 'Starting headless Chrome with ' + f'--remote-debugging-port={self.cdp_port}.' + ) + + self.flags.append('--remote-allow-origins=*') + + command = [ + f'{self.executable}', + '--window-size=1920,1080', + f'--remote-debugging-port={self.cdp_port}', + '--headless=new', + '--no-first-run', + '--no-default-browser-check', + *self.flags, + ] + + # if self.print_command: + if True: + print(' '.join(command)) + + self.proc = subprocess.Popen(command, shell=True) + + def __exit__(self, *exc): + """ + """ + if self.disable_logging: + print(f'Closing headless Chrome instance on port {self.cdp_port}.') + + # check if the process is still running + if self.proc.poll() is None: + # ensure that it is properly killed + try: + self.cdp_send('Browser.close') + self.ws.close() + print('(TODO) Closed CDP and WebSocket connections properly.') + except Exception: + print('Could not properly close the CDP and WebSocket connections.') + + try: + self.proc.terminate() + print('Closed Chrome properly.') + except Exception: + print('Could not properly kill Chrome.') diff --git a/html2image/browsers/chromium.py b/html2image/browsers/chromium.py new file mode 100644 index 0000000..d27601c --- /dev/null +++ b/html2image/browsers/chromium.py @@ -0,0 +1,97 @@ +from .browser import Browser + +import os +import subprocess + +class ChromiumHeadless(Browser): + def __init__(self, executable=None, flags=None, print_command=False, disable_logging=False): + self.executable = executable + if not flags: + self.flags = [ + '--default-background-color=00000000', + '--hide-scrollbars', + ] + else: + self.flags = [flags] if isinstance(flags, str) else flags + + self.print_command = print_command + self.disable_logging = disable_logging + + def screenshot( + self, + input, + output_path, + output_file='screenshot.png', + size=(1920, 1080), + ): + """ Calls Chrome or Chromium headless to take a screenshot. + + Parameters + ---------- + - `output_file`: str + + Name as which the screenshot will be saved. + + File extension (e.g. .png) has to be included. + + Default is screenshot.png + - `input`: str + + File or url that will be screenshotted. + + Cannot be None + - `size`: (int, int), optional + + Two values representing the window size of the headless + + browser and by extention, the screenshot size. + + These two values must be greater than 0. + Raises + ------ + - `ValueError` + + If the value of `size` is incorrect. + + If `input` is empty. + """ + + if not input: + raise ValueError('The `input` parameter is empty.') + + if size[0] < 1 or size[1] < 1: + raise ValueError( + f'Could not screenshot "{output_file}" ' + f'with a size of {size}:\n' + 'A valid size consists of two integers greater than 0.' + ) + + # command used to launch chrome in + # headless mode and take a screenshot + command = [ + f'{self.executable}', + '--headless', + f'--screenshot={os.path.join(output_path, output_file)}', + f'--window-size={size[0]},{size[1]}', + *self.flags, + f'{input}', + ] + + if self.print_command: + print(' '.join(command)) + + subprocess.run(command, **self._subprocess_run_kwargs) + + @property + def disable_logging(self): + return self._disable_logging + + @disable_logging.setter + def disable_logging(self, value): + self._disable_logging = value + + # dict that will be passed unpacked as a parameter + # to the subprocess.call() method to take a screenshot + self._subprocess_run_kwargs = { + 'stdout': subprocess.DEVNULL, + 'stderr': subprocess.DEVNULL, + } if value else {} + + def __enter__(self): + print( + 'Context manager (with ... as:) is', + f'not supported for {__class__.__name__}.' + ) + + def __exit__(self, *exc): + pass diff --git a/html2image/browsers/edge.py b/html2image/browsers/edge.py new file mode 100644 index 0000000..38f2332 --- /dev/null +++ b/html2image/browsers/edge.py @@ -0,0 +1,172 @@ +from .chromium import ChromiumHeadless +from .search_utils import get_command_origin, find_first_defined_env_var + +import subprocess +import platform +import os +import shutil + +ENV_VAR_LOOKUP_TOGGLE = 'HTML2IMAGE_TOGGLE_ENV_VAR_LOOKUP' + +EDGE_EXECUTABLE_ENV_VAR_CANDIDATES = [ + 'HTML2IMAGE_EDGE_BIN', + 'HTML2IMAGE_EDGE_EXE', + 'EDGE_BIN', + 'EDGE_EXE', +] + + +def _find_edge(user_given_executable=None): + """ Finds a edge executable. + + Search Edge on a given path. If no path given, + try to find Edge or Chromium-browser on a Windows or Unix system. + + Parameters + ---------- + - `user_given_executable`: str (optional) + + A filepath leading to a Edge executable + + Or a filename found in the current working directory + + Or a keyword that executes Edge/ Chromium, ex: + - 'msedge' on linux and windows systems (typing `start msedge` in a windows cmd works) + + Raises + ------ + - `FileNotFoundError` + + If a suitable edge executable could not be found. + + Returns + ------- + - str + + Path of the edge executable on the current machine. + """ + + # try to find a edge bin/exe in ENV + path_from_env = find_first_defined_env_var( + env_var_list=EDGE_EXECUTABLE_ENV_VAR_CANDIDATES, + toggle=ENV_VAR_LOOKUP_TOGGLE + ) + + if path_from_env: + print( + f'Found a potential edge executable in the {path_from_env} ' + f'environment variable:\n{path_from_env}\n' + ) + return path_from_env + + # if an executable is given, try to use it + if user_given_executable is not None: + + # On Windows, we cannot "safely" validate that user_given_executable + # seems to be a edge executable, as we cannot run it with + # the --version flag. + # https://bugs.chromium.org/p/chromium/issues/detail?id=158372 + # + # We thus do the "bare minimum" and check if user_given_executable + # is a file, a filepath, or corresponds to a keyword that can be used + # with the start command, like so: `start user_given_executable` + if platform.system() == 'Windows': + command_origin = get_command_origin(user_given_executable) + if command_origin: + return command_origin + + # cannot validate user_given_executable + raise FileNotFoundError() + + # On a non-Windows OS, we can validate in a basic way that + # user_given_executable leads to a Edge executable, + # or is a command, using the --version flag + else: + try: + if 'edge' in subprocess.check_output( + [user_given_executable, '--version'] + ).decode('utf-8').lower(): + return user_given_executable + except Exception: + pass + + # We got a user_given_executable but couldn't validate it + raise FileNotFoundError( + 'Failed to find a seemingly valid edge executable ' + 'in the given path.' + ) + + # Executable not in ENV or given by the user, try to find it + # Search for executable on a Windows OS + if platform.system() == 'Windows': + prefixes = [ + os.getenv('PROGRAMFILES(X86)'), + os.getenv('PROGRAMFILES'), + os.getenv('LOCALAPPDATA'), + ] + + suffix = "Microsoft\\Edge\\Application\\msedge.exe" + + for prefix in prefixes: + path_candidate = os.path.join(prefix, suffix) + if os.path.isfile(path_candidate): + return path_candidate + + # Search for executable on a Linux OS + elif platform.system() == "Linux": + + edge_commands = [ + 'msedge', + '/opt/microsoft/msedge/msedge' + ] + + for edge_command in edge_commands: + if shutil.which(edge_command): + # check the --version for "edge" ? + return edge_command + + # Search for executable on MacOS + elif platform.system() == "Darwin": + # MacOS system + edge_app = ( + '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge' + ) + + try: + version_result = subprocess.check_output( + [edge_app, "--version"] + ) + if "Microsoft Edge" in str(version_result): + return edge_app + except Exception: + pass + + # Couldn't find an executable (or OS not in Windows, Linux or Mac) + raise FileNotFoundError( + 'Could not find a Edge executable on this ' + 'machine, please specify it yourself.' + ) + +class EdgeHeadless(ChromiumHeadless): + """ + Edge browser wrapper. + + Parameters + ---------- + - `executable` : str, optional + + Path to a edge executable. + + - `flags` : list of str + + Flags to be used by the headless browser. + + Default flags are : + - '--default-background-color=00000000' + - '--hide-scrollbars' + - `print_command` : bool + + Whether or not to print the command used to take a screenshot. + """ + + def __init__(self, executable=None, flags=None, print_command=False, disable_logging=False): + super().__init__(executable=executable, flags=flags, print_command=print_command, disable_logging=disable_logging) + + @property + def executable(self): + return self._executable + + @executable.setter + def executable(self, value): + self._executable = _find_edge(value) diff --git a/html2image/browsers/firefox.py b/html2image/browsers/firefox.py index 95f2d0f..30cb9a5 100644 --- a/html2image/browsers/firefox.py +++ b/html2image/browsers/firefox.py @@ -19,3 +19,9 @@ def executable(self, value): def render(self, **kwargs): pass + + def __enter__(self): + pass + + def __exit__(self, *exc): + pass diff --git a/html2image/browsers/firefox_cdp.py b/html2image/browsers/firefox_cdp.py new file mode 100644 index 0000000..4790616 --- /dev/null +++ b/html2image/browsers/firefox_cdp.py @@ -0,0 +1,167 @@ +from .browser import Browser +from .search_utils import find_firefox + +import os +import subprocess + +import requests +import json +from websocket import create_connection +import base64 + + +class FirefoxCDP(Browser): + def __init__(self, executable=None, flags=None, print_command=False): + self.executable = executable + if not flags: + self.flags = [ + # '--default-background-color=0', + # '--hide-scrollbars', + ] + else: + self.flags = [flags] if isinstance(flags, str) else flags + + self.print_command = print_command + + self.ws = None # Websocket connection + self.proc = None # Headless browser Popen object + + self.__id = 0 + + @property + def executable(self): + return self._executable + + @executable.setter + def executable(self, value): + self._executable = find_firefox(value) + + @property + def _id(self): + self.__id += 1 + return self.__id + + def cdp_send(self, method, **params): + """ + """ + return self.ws.send( + json.dumps({ + 'id': self._id, + 'method': method, + 'params': params, + }) + ) + + def screenshot( + self, + input, + output_path, + output_file='screenshot.png', + size=(1920, 1080), + ): + """ + """ + # Useful documentation about the Chrome DevTools Protocol: + # https://chromedevtools.github.io/devtools-protocol/ + + # enabling the page allow to receive the Page.loadEventFired event + self.cdp_send('Page.enable') + print(self.ws.recv()) + + self.cdp_send('Page.navigate', url=input) + print('Navigated to:', input) + + # wait for page to load entirely + while True: + print('Waiting for page to load...') + message = json.loads(self.ws.recv()) + print(message) + if message.get('method') == 'Page.loadEventFired': + break + + print('Page loaded.') + + self.cdp_send('Page.disable') + self.cdp_send( + 'Emulation.setDeviceMetricsOverride', + width=size[0], + height=size[1], + deviceScaleFactor=0, # 0 disables the override + mobile=False, + ) + self.cdp_send('Page.captureScreenshot') + + # get screenshot data when ready, + # while potentially skipping unneeded messages + while True: + message = json.loads(self.ws.recv()) + if 'result' in message and 'data' in message['result']: + # retrive base64 encoded image data + img_data = message['result']['data'] + break + + # decode and write image data to file + with open(os.path.join(output_path, output_file), 'wb') as f: + f.write(base64.b64decode(img_data)) + + def __enter__(self): + """ + """ + print('Entering context. (ChromeHeadlessServer)') + + temp_dir = os.environ['TMP'] if os.name == 'nt' else '/tmp' + temp_dir = os.path.join(temp_dir, 'firefox-html2image-profile') + + # create a user.js file that overrides Firefox's default config + # http://kb.mozillazine.org/User.js_file + os.makedirs(temp_dir, exist_ok=True) + with open(os.path.join(temp_dir, 'user.js'), 'w') as f: + f.write( + '// File generated by html2image\n' + 'user_pref("devtools.chrome.enabled", true);\n' + 'user_pref("devtools.debugger.prompt-connection", false);\n' + 'user_pref("devtools.debugger.remote-enabled", true);\n' + '\n' + '// https://bugzilla.mozilla.org/show_bug.cgi?id=1746154#c4 \n' + 'user_pref("fission.bfcacheInParent", false);\n' + 'user_pref("fission.webContentIsolationStrategy", 0);\n' + ) + + command = [ + f'{self.executable}', + # '--headless', + *self.flags, + '--profile', temp_dir, + '--new-instance', + '--remote-debugging-port=9223', + # 'about:home', + # browser.newtabpage.activity-stream.asrouter.providers.onboarding + ] + + if self.print_command or True: + print(' '.join(command)) + + self.proc = subprocess.Popen(command) + + r = requests.get('http://localhost:9223/json/list') + + self.ws = create_connection( + r.json()[0]['webSocketDebuggerUrl'], + + # Firefox 97.0 requires an empty origin header + suppress_origin=True, + ) + # except ConnectionRefusedError as e: + # # devtools.chrome.enabled true + # # devtools.debugger.prompt-connection false + # # devtools.debugger.remote-enabled true + # print('Could not connect to Firefox cdp instance.') + + def __exit__(self, *exc): + """ + """ + print(f'Exiting context. (ChromeHeadlessServer), {exc}') + + self.cdp_send('Browser.close') + self.ws.close() + self.proc.terminate() # ensure that the process is terminated diff --git a/html2image/browsers/search_utils.py b/html2image/browsers/search_utils.py index 793734d..070194d 100644 --- a/html2image/browsers/search_utils.py +++ b/html2image/browsers/search_utils.py @@ -1,5 +1,8 @@ -import shutil import os +import platform +import shutil +import subprocess + try: from winreg import ConnectRegistry, OpenKey, QueryValueEx,\ HKEY_LOCAL_MACHINE, HKEY_CURRENT_USER, KEY_READ @@ -7,6 +10,22 @@ # os is not Windows, and there is no need for winreg pass +ENV_VAR_LOOKUP_TOGGLE = 'HTML2IMAGE_TOGGLE_ENV_VAR_LOOKUP' + +CHROME_EXECUTABLE_ENV_VAR_CANDIDATES = [ + 'HTML2IMAGE_CHROME_BIN', + 'HTML2IMAGE_CHROME_EXE', + 'CHROME_BIN', + 'CHROME_EXE', +] + +FIREFOX_EXECUTABLE_ENV_VAR_CANDIDATES = [ + 'HTML2IMAGE_FIREFOX_BIN', + 'HTML2IMAGE_FIREFOX_EXE', + 'FIREFOX_BIN', + 'FIREFOX_EXE', +] + def get_command_origin(command): ''' Finds the path of a given command (windows only). @@ -93,3 +112,260 @@ def find_first_defined_env_var(env_var_list, toggle): if value: return value return None + + +def find_chrome(user_given_executable=None): + """ Finds a Chrome executable. + + Search Chrome on a given path. If no path given, + try to find Chrome or Chromium-browser on a Windows or Unix system. + + Parameters + ---------- + - `user_given_executable`: str (optional) + + A filepath leading to a Chrome/ Chromium executable + + Or a filename found in the current working directory + + Or a keyword that executes Chrome/ Chromium, ex: + - 'chromium' on linux systems + - 'chrome' on windows (if typing `start chrome` in a cmd works) + + Raises + ------ + - `FileNotFoundError` + + If a suitable chrome executable could not be found. + + Returns + ------- + - str + + Path of the chrome executable on the current machine. + """ + + # try to find a chrome bin/exe in ENV + path_from_env = find_first_defined_env_var( + env_var_list=CHROME_EXECUTABLE_ENV_VAR_CANDIDATES, + toggle=ENV_VAR_LOOKUP_TOGGLE + ) + + if path_from_env: + print( + f'Found a potential chrome executable in the {path_from_env} ' + f'environment variable:\n{path_from_env}\n' + ) + return path_from_env + + # if an executable is given, try to use it + if user_given_executable is not None: + + # On Windows, we cannot "safely" validate that user_given_executable + # seems to be a chrome executable, as we cannot run it with + # the --version flag. + # https://bugs.chromium.org/p/chromium/issues/detail?id=158372 + # + # We thus do the "bare minimum" and check if user_given_executable + # is a file, a filepath, or corresponds to a keyword that can be used + # with the start command, like so: `start user_given_executable` + if platform.system() == 'Windows': + command_origin = get_command_origin(user_given_executable) + if command_origin: + return command_origin + + # cannot validate user_given_executable + raise FileNotFoundError() + + # On a non-Windows OS, we can validate in a basic way that + # user_given_executable leads to a Chrome / Chromium executable, + # or is a command, using the --version flag + else: + try: + if 'chrom' in subprocess.check_output( + [user_given_executable, '--version'] + ).decode('utf-8').lower(): + return user_given_executable + except Exception: + pass + + # We got a user_given_executable but couldn't validate it + raise FileNotFoundError( + 'Failed to find a seemingly valid chrome executable ' + 'in the given path.' + ) + + # Executable not in ENV or given by the user, try to find it + # Search for executable on a Windows OS + if platform.system() == 'Windows': + prefixes = [ + os.getenv('PROGRAMFILES(X86)'), + os.getenv('PROGRAMFILES'), + os.getenv('LOCALAPPDATA'), + ] + + suffix = "Google\\Chrome\\Application\\chrome.exe" + + for prefix in prefixes: + path_candidate = os.path.join(prefix, suffix) + if os.path.isfile(path_candidate): + return path_candidate + + # Search for executable on a Linux OS + elif platform.system() == "Linux": + + chrome_commands = [ + 'chromium', + 'chromium-browser', + 'chrome', + 'google-chrome' + ] + + for chrome_command in chrome_commands: + if shutil.which(chrome_command): + # check the --version for "chrom" ? + return chrome_command + + # snap seems to be a special case? + # see https://stackoverflow.com/q/63375327/12182226 + + try: + version_result = subprocess.check_output( + ["chromium-browser", "--version"] + ) + if 'snap' in str(version_result): + chrome_snap = ( + '/snap/chromium/current/usr/lib/chromium-browser/chrome' + ) + if os.path.isfile(chrome_snap): + return chrome_snap + except Exception: + pass + + # Search for executable on MacOS + elif platform.system() == "Darwin": + # MacOS system + chrome_app = ( + '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome' + ) + + try: + version_result = subprocess.check_output( + [chrome_app, "--version"] + ) + if "Google Chrome" in str(version_result): + return chrome_app + except Exception: + pass + + # Couldn't find an executable (or OS not in Windows, Linux or Mac) + raise FileNotFoundError( + 'Could not find a Chrome executable on this ' + 'machine, please specify it yourself.' + ) + +def find_firefox(user_given_executable=None): + """ Finds a Firefox executable. + + Search Firefox on a given path. If no path given, + try to find Firefox on a Windows or Unix system. + + Parameters + ---------- + - `user_given_executable`: str (optional) + + A filepath leading to a Firefox executable + + Or a filename found in the current working directory + + Or a keyword that executes Firefox, ex: + - 'firefox' on linux systems + - 'firefox' on windows (if typing `start firefox` in a cmd works) + + Raises + ------ + - `FileNotFoundError` + + If a suitable Firefox executable could not be found. + + Returns + ------- + - str + + Path of the Firefox executable on the current machine. + """ + + # try to find a firefox bin/exe in ENV + path_from_env = find_first_defined_env_var( + env_var_list=FIREFOX_EXECUTABLE_ENV_VAR_CANDIDATES, + toggle=ENV_VAR_LOOKUP_TOGGLE + ) + + if path_from_env: + print( + f'Found a potential Firefox executable in the {path_from_env} ' + f'environment variable:\n{path_from_env}\n' + ) + return path_from_env + + # if an executable is given, try to use it + if user_given_executable is not None: + + if platform.system() == 'Windows': + user_given_executable = get_command_origin(user_given_executable) + + try: + version_output = subprocess.check_output( + [user_given_executable, '--version'] + ).decode('utf-8').lower() + + if 'Mozilla Firefox' in version_output: + return user_given_executable + else: + print( + 'Could not validate Firefox executable', + '(--version does not contains "Mozilla Firefox").' + ) + except Exception: + pass + + # We got a user_given_executable but couldn't validate it + raise FileNotFoundError( + 'Failed to find a seemingly valid Firefox executable ' + 'in the given path.' + ) + + # Executable not in ENV or given by the user, try to find it + # Search for executable on a Windows OS + if platform.system() == 'Windows': + prefixes = [ + os.getenv('PROGRAMFILES(X86)'), + os.getenv('PROGRAMFILES'), + os.getenv('LOCALAPPDATA'), + ] + + suffix = 'Mozilla Firefox\\firefox.exe' + + for prefix in prefixes: + path_candidate = os.path.join(prefix, suffix) + if os.path.isfile(path_candidate): + return path_candidate + + # Search for executable on a Linux OS + elif platform.system() == 'Linux': + if shutil.which('firefox'): + return 'firefox' + + # Search for executable on MacOS + elif platform.system() == 'Darwin': + # MacOS system + + # TODO : check if this is the right path + firefox_app = ( + '/Applications/Firefox.app/Contents/MacOS/firefox' # ? + ) + + try: + version_result = subprocess.check_output( + [firefox_app, '--version'] + ) + if 'Mozilla Firefox' in str(version_result): + return firefox_app + except Exception: + pass + + # Couldn't find an executable (or OS not in Windows, Linux or Mac) + raise FileNotFoundError( + 'Could not find a Chrome executable on this ' + 'machine, please specify it yourself.' + ) diff --git a/html2image/cli.py b/html2image/cli.py index 2ab556a..f4c8ca5 100644 --- a/html2image/cli.py +++ b/html2image/cli.py @@ -17,13 +17,6 @@ def size_type(string): f"size should be int,int, instead got {string}" ) - try: - hti = Html2Image() - except Exception as e: - print('Could not instanciate html2image.') - print(e) - exit(1) - parser = argparse.ArgumentParser() parser.add_argument('-U', '--url', nargs='*', required=False, default=[]) @@ -43,15 +36,23 @@ def size_type(string): parser.add_argument('-q', '--quiet', required=False, action="store_true") parser.add_argument('-v', '--verbose', required=False, action="store_true") - parser.add_argument('--browser', required=False) + # parser.add_argument('--browser', required=False) parser.add_argument('--chrome_path', required=False) # parser.add_argument('--firefox_path', required=False) parser.add_argument('--temp_path', required=False) args = parser.parse_args() + try: + hti = Html2Image(disable_logging=args.quiet) + except Exception as e: + print('Could not instanciate html2image.') + print(e) + exit(1) + if args.verbose: print(f'args = {args}') + hti.browser.print_command = True if args.output_path: hti.output_path = args.output_path @@ -64,7 +65,8 @@ def size_type(string): paths = hti.screenshot( html_file=args.html, css_file=args.css, other_file=args.other, - url=args.url, save_as=args.save_as, size=args.size + url=args.url, save_as=args.save_as, size=args.size, + browser_executable=args.chrome_path, ) if not args.quiet: diff --git a/html2image/html2image.py b/html2image/html2image.py index e0e8437..ca34d0b 100644 --- a/html2image/html2image.py +++ b/html2image/html2image.py @@ -13,16 +13,21 @@ from textwrap import dedent -from html2image.browsers import chrome, firefox +from html2image.browsers import chrome, chrome_cdp, edge # , firefox, firefox_cdp +from html2image.browsers.browser import Browser, CDPBrowser + browser_map = { 'chrome': chrome.ChromeHeadless, 'chromium': chrome.ChromeHeadless, 'google-chrome': chrome.ChromeHeadless, 'googlechrome': chrome.ChromeHeadless, - 'firefox': firefox.FirefoxHeadless, - 'mozilla-firefox': firefox.FirefoxHeadless, - 'mozilla firefox': firefox.FirefoxHeadless, + 'edge': edge.EdgeHeadless, + 'chrome-cdp': chrome_cdp.ChromeCDP, + 'chromium-cdp': chrome_cdp.ChromeCDP, + # 'firefox': firefox.FirefoxHeadless, + # 'mozilla-firefox': firefox.FirefoxHeadless, + # 'firefox-cdp': firefox_cdp.FirefoxCDP, } @@ -51,6 +56,9 @@ class Html2Image(): - `temp_path` : str, optional + Path to a directory that will be used to store temporary files. + - `keep_temp_files` : bool, optional + + If True, will not automatically remove temporary files created. + - `custom_flags`: list of str or str, optional + Additional custom flags for the headless browser. @@ -65,10 +73,13 @@ def __init__( self, browser='chrome', browser_executable=None, + browser_cdp_port=None, output_path=os.getcwd(), size=(1920, 1080), temp_path=None, + keep_temp_files=False, custom_flags=None, + disable_logging=False, ): if browser.lower() not in browser_map: @@ -79,12 +90,24 @@ def __init__( self.output_path = output_path self.size = size self.temp_path = temp_path + self.keep_temp_files = keep_temp_files + self.browser: Browser = None browser_class = browser_map[browser.lower()] - self.browser = browser_class( - executable=browser_executable, - flags=custom_flags, - ) + + if isinstance(browser_class, CDPBrowser): + self.browser = browser_class( + executable=browser_executable, + flags=custom_flags, + cdp_port=browser_cdp_port, + disable_logging=disable_logging, + ) + else: + self.browser = browser_class( + executable=browser_executable, + flags=custom_flags, + disable_logging=disable_logging, + ) @property def temp_path(self): @@ -161,6 +184,21 @@ def load_file(self, src, as_filename=None): dest = os.path.join(self.temp_path, as_filename) shutil.copyfile(src, dest) + def _remove_temp_file(self, filename): + """ Removes a file in the tmp directory. + + This function is used after a temporary file is created in order to + load an HTML string. + This prevents the temp directory to end up bloated by temp files. + + Parameters + ---------- + - `filename`: str + + Filename of the file to be removed + + (path is the temp_path directory) + """ + os.remove(os.path.join(self.temp_path, filename)) + def screenshot_loaded_file( self, file, output_file='screenshot.png', size=None ): @@ -390,7 +428,7 @@ def screenshot( other_file=[], url=[], save_as='screenshot.png', - size=[] + size=[], ): """ Takes a screeshot using different resources. @@ -482,6 +520,8 @@ def screenshot( output_file=name, size=current_size, ) + if not self.keep_temp_files: + self._remove_temp_file(html_filename) screenshot_paths.append(os.path.join(self.output_path, name)) @@ -515,6 +555,13 @@ def screenshot( return screenshot_paths + def __enter__(self): + self.browser.__enter__() + return self + + def __exit__(self, *exc): + self.browser.__exit__(*exc) + if __name__ == '__main__': pass diff --git a/pyproject.toml b/pyproject.toml index 84f5d3a..cd0384d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "html2image" -version = "2.0.3" +version = "2.0.4" description = "Package acting as a wrapper around the headless mode of existing web browsers to generate images from URLs and from HTML+CSS strings or files." authors = ["vgalin"] license = "MIT" @@ -26,8 +26,11 @@ include = [ [tool.poetry.dependencies] python = "^3.6" +websocket-client = "1.2.3" +requests = "*" [tool.poetry.dev-dependencies] +Pillow = "^8.2.0" [build-system] requires = ["poetry>=0.12"] diff --git a/requirements-test.txt b/requirements-test.txt index 128a5cd..d7f52e4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,2 +1,2 @@ -Pillow -pytest +Pillow>=8.2.0 +pytest \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..49bc7f8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +websocket-client==1.2.3 +requests \ No newline at end of file diff --git a/tests/test_main.py b/tests/test_main.py index a0e37a8..a38137c 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -5,26 +5,27 @@ OUTPUT_PATH = "tests_output" +TEST_BROWSERS = ["edGe", "cHrOme"] def test_bad_browser(): with pytest.raises(ValueError): Html2Image(browser='watergoupil') +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_good_browser(browser): + Html2Image(browser=browser) -def test_good_browser(): - Html2Image(browser='cHrOme') - - -def test_screenshot_url(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_url(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) paths = hti.screenshot(url='https://www.python.org', save_as="pyorg.png") img = Image.open(paths[0]) assert (1920, 1080) == img.size # default size - -def test_screenshot_multiple_urls(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_multiple_urls(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) paths = hti.screenshot( url=['https://www.python.org', "https://www.example.org/"], save_as="mixed_urls.png", @@ -34,9 +35,9 @@ def test_screenshot_multiple_urls(): img = Image.open(path) assert (1920, 1080) == img.size # default size - -def test_screenshot_url_custom_size(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_url_custom_size(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) test_size = (334, 485) @@ -49,9 +50,9 @@ def test_screenshot_url_custom_size(): img = Image.open(paths[0]) assert test_size == img.size # default size - -def test_screenshot_url_custom_sizes(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_url_custom_sizes(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) test_sizes = [ (100, 100), @@ -73,9 +74,9 @@ def test_screenshot_url_custom_sizes(): img = Image.open(path) assert wanted_size == img.size - -def test_screenshot_url_sizes_missing_custom_names(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_url_sizes_missing_custom_names(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) test_sizes = [ (100, 100), @@ -100,9 +101,9 @@ def test_screenshot_url_sizes_missing_custom_names(): img = Image.open(path) assert wanted_size == img.size - -def test_screenshot_string(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_string(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) html = "Hello" css = "body{background: blue; font-size: 50px;}" @@ -119,9 +120,9 @@ def test_screenshot_string(): # check colors at top left corner assert pixels[0, 0] == (0, 0, 255, 255) # blue + no transparency - -def test_screenshot_string_different_sizes(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_string_different_sizes(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) test_sizes = [ (100, 100), @@ -141,9 +142,9 @@ def test_screenshot_string_different_sizes(): img = Image.open(path) assert wanted_size == img.size - -def test_screenshot_other_svg(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_other_svg(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) paths = hti.screenshot( other_file='./examples/star.svg', save_as="star_svg.png" @@ -157,9 +158,9 @@ def test_screenshot_other_svg(): # check colors at top left corner assert pixels[0, 0] == (0, 0, 0, 0) # full transparency no color - -def test_screenshot_file(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_file(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) paths = hti.screenshot( html_file="./examples/blue_page.html", @@ -175,9 +176,9 @@ def test_screenshot_file(): # check colors at top left corner assert pixels[0, 0] == (0, 0, 255, 255) # blue + no transparency - -def test_screenshot_file_different_sizes(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_screenshot_file_different_sizes(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) test_sizes = [ (100, 100), @@ -195,9 +196,9 @@ def test_screenshot_file_different_sizes(): img = Image.open(path) assert wanted_size == img.size - -def test_extend_size_param(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_extend_size_param(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) assert hti._extend_size_param([(50, 50)], 1) \ == [(50, 50)] @@ -211,9 +212,9 @@ def test_extend_size_param(): assert hti._extend_size_param([], 3) \ == [(1920, 1080), (1920, 1080), (1920, 1080)] - -def test_extend_save_as_param(): - hti = Html2Image(output_path=OUTPUT_PATH) +@pytest.mark.parametrize("browser", TEST_BROWSERS) +def test_extend_save_as_param(browser): + hti = Html2Image(browser=browser, output_path=OUTPUT_PATH) assert hti._extend_save_as_param(['a.png', 'b.png'], 2) == \ ['a.png', 'b.png']