Commit
* Added force parameter to ignore versions and just overwrite
* Include JS scripts
* Exclude /_export/... urls
Netzvamp committed Dec 12, 2018
1 parent cb5317d commit 35f2d81
Showing 3 changed files with 66 additions and 25 deletions.
34 changes: 25 additions & 9 deletions README.md
@@ -1,30 +1,39 @@
# Dokuwiki offline
[![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) [![Open Source Love svg2](https://badges.frapsoft.com/os/v2/open-source.svg?v=103)](https://github.com/ellerbrock/open-source-badges/) [![GPLv3 license](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0.html)

A tool to download and store all pages and media files from a DokuWiki instance through the XML-RPC API, made possible by the excellent Python dokuwiki package (https://pypi.org/project/dokuwiki/)!

## Preparation

# Prepare a venv in the current directory and install the requirements with
pip install -r requirements.txt
## Precompiled binaries
[Download Releases](https://github.com/Netzvamp/dokuwiki_offline/releases)

## Usage

python dokuwiki_offline.py [OPTIONS]
dokuwiki_offline [OPTIONS]

Options:
--url TEXT URL to a DokuWiki, e.g. https://mywiki.example
--username TEXT Login username
--password TEXT Login password
--skipcert Skip https certificate checks
--force Skip local version check and force download
--help Show this message and exit.

Example:

python dokuwiki_offline.py --url https://mywiki.example --username Testuser --password mypassword123 --skipcert
dokuwiki_offline --url https://mywiki.example --username Testuser --password mypassword123 --skipcert

This will create a dump folder containing a subfolder named after the wiki's domain, with all pages saved as HTML.
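The resulting on-disk layout follows directly from the page ids: a minimal sketch of the mapping (folder and page names here are hypothetical), given that DokuWiki uses `:` as its namespace separator and the dump mirrors namespaces as directories:

```python
# DokuWiki page ids use ":" as the namespace separator; each page becomes
# an .html file in a matching directory tree under the dump folder.
def page_filename(dump_folder, page_id):
    return "{}/{}.html".format(dump_folder, str(page_id).replace(":", "/"))

print(page_filename("dump/mywiki.example", "wiki:syntax"))
# dump/mywiki.example/wiki/syntax.html
```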

Tested with Python 3.7.
## Development / Build

## Build a binary package / executable with PyInstaller
### Preparation

Developed with Python 3.7, but should also work with Python 3.5.

# Prepare a venv in the current directory and install the requirements with
pip install -r requirements.txt

### Build a binary package / executable with PyInstaller

# Install pyinstaller
pip install pyinstaller
@@ -37,6 +46,11 @@ copied and distributed.

## History

* 1.0: 2018-12-12
* Added force parameter to ignore versions and just overwrite
* Include JS scripts
* Exclude /_export/... urls

* 1.0-r1: 2018-12-05
* Only download new and updated files. File modification dates are stored in json file in the dump
* Added the wiki title, download and last modification date to the template file
@@ -45,5 +59,7 @@ copied and distributed.

* 1.0-beta1: 2018-12-03
* Initial release

## Todo

- [ ] Output folder as parameter
42 changes: 29 additions & 13 deletions dokuwiki_offline.py
Original file line number Diff line number Diff line change
@@ -48,7 +48,8 @@ def validate_url(ctx, param, url: str):
@click.option('--username', prompt=True, help="Login username")
@click.option('--password', prompt=True, help="Login password")
@click.option('--skipcert', is_flag=True, default=False, help="Skip https certificate checks")
def dump(url: str, username: str, password: str, skipcert: bool) -> None:
@click.option('--force', is_flag=True, default=False, help="Skip local version check and force download")
def dump(url: str, username: str, password: str, skipcert: bool, force: bool) -> None:
"""
A tool to download and store all pages and media files from DokuWiki instances through the XML-RPC API.
@@ -77,7 +78,10 @@ def dump(url: str, username: str, password: str, skipcert: bool) -> None:

# Load JSON file with file versions
try:
versions = json.load(open(os.path.join(dump_folder, "versions.json")))
if not force:
versions = json.load(open(os.path.join(dump_folder, "versions.json")))
else:
versions = {"pages": {}, "medias": {}}
except FileNotFoundError:
versions = {"pages": {}, "medias": {}}
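The effect of `--force` can be restated as a small predicate (a sketch only; the committed code instead resets the `versions` dict so every file looks new, and the revision numbers below are hypothetical):

```python
# A page or media file is (re)downloaded when --force is set, when it was
# never dumped before, or when the stored revision differs from the wiki's.
def needs_update(versions, page_id, page_version, force=False):
    return force or versions["pages"].get(page_id) != page_version

versions = {"pages": {"start": 1544569200}, "medias": {}}
assert needs_update(versions, "start", 1544569200) is False   # unchanged
assert needs_update(versions, "wiki:syntax", 1544569200)      # never dumped
assert needs_update(versions, "start", 1544569200, force=True)
```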

@@ -90,8 +94,8 @@ def dump(url: str, username: str, password: str, skipcert: bool) -> None:
filename = "{}/{}.html".format(dump_folder, str(page['id']).replace(":", "/"))
os.makedirs(os.path.dirname(filename), exist_ok=True)

# build a dot path by traversing down to root (../../../../)
root_path = ""
# traverse down to root
for fdown in range(0, filename.count("/") - 1):
root_path = "../{}".format(root_path)
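The loop above builds one `../` per nesting level, so links in deeply nested pages resolve back to top-level assets such as `style.css`. A simplified equivalent, taking the page's path relative to the dump folder (the committed code counts slashes in the full filename instead):

```python
def root_prefix(rel_path):
    """One '../' per directory level of the page's path inside the dump."""
    return "../" * rel_path.count("/")

print(root_prefix("wiki/syntax.html"))  # ../
print(root_prefix("start.html"))        # empty string: already at the root
```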

@@ -104,10 +108,15 @@ def dump(url: str, username: str, password: str, skipcert: bool) -> None:
for url_match in re.finditer(r"<a[\s]+(?:[^>]*?\s+)?href=([\"'])(/.*?)\1", html, re.MULTILINE):
new_url = urlparse(url_match.group(2)[1:].replace(":", "/")).path + ".html"

if "_media" in new_url: # fix media download urls
new_url = new_url.replace("_media/", "")
new_url = new_url.replace(".html", "")
new_url = root_path + new_url # fix absolute paths with relative urls
# filter unusable paths
ignore_paths = ["_export/"] # to be extended
if any(ignore_path in new_url[:len(ignore_path)] for ignore_path in ignore_paths):
new_url = url + url_match.group(2) # Keep link to online page ... better than nothing
else:
if "_media" in new_url: # fix media download urls
new_url = new_url.replace("_media/", "")
new_url = new_url.replace(".html", "")
new_url = root_path + new_url # fix absolute paths with relative urls
html = html.replace(url_match.group(2), new_url)
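The regex rewrite can be exercised in isolation (the anchor markup below is a made-up sample, assuming a wiki configured with URL rewriting so internal links look like `/wiki:syntax`):

```python
import re
from urllib.parse import urlparse

link_re = r"<a[\s]+(?:[^>]*?\s+)?href=([\"'])(/.*?)\1"
html = '<a class="wikilink1" href="/wiki:syntax">Formatting Syntax</a>'

match = re.search(link_re, html)
# strip the leading "/", turn namespaces into directories, append ".html"
new_url = urlparse(match.group(2)[1:].replace(":", "/")).path + ".html"
print(new_url)  # wiki/syntax.html
```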

# fix img paths
@@ -126,6 +135,7 @@ def dump(url: str, username: str, password: str, skipcert: bool) -> None:
html = html.replace("{{download_date}}", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
html = html.replace("{{last_modified}}", str(date(pages.info(page['id'])['lastModified'])))

file.write('\ufeff') # Write unicode BOM for Firefox
file.write(html)

versions['pages'][page['id']] = page_version
@@ -141,8 +151,6 @@ def dump(url: str, username: str, password: str, skipcert: bool) -> None:

logger.info("Updating media {}".format(media['id']))

# print(wiki.medias.info(media['id']))

filename = "{}/{}".format(dump_folder, str(media['id']).replace(":", "/"))
os.makedirs(os.path.dirname(filename), exist_ok=True)

@@ -154,15 +162,23 @@ def dump(url: str, username: str, password: str, skipcert: bool) -> None:
else:
logger.info("No update needed for media {}".format(media['id']))

# Download stylesheet
# Download stylesheet, jQuery and JavaScript
try:
if skipcert:
data = request.urlopen("{}/lib/exe/css.php".format(url), context=_create_unverified_context()).read()
css_data = request.urlopen("{}/lib/exe/css.php".format(url), context=_create_unverified_context()).read()
jquery_data = request.urlopen("{}/lib/exe/jquery.php".format(url), context=_create_unverified_context()).read()
js_data = request.urlopen("{}/lib/exe/js.php".format(url), context=_create_unverified_context()).read()
else:
data = request.urlopen("{}/lib/exe/css.php".format(url)).read()
css_data = request.urlopen("{}/lib/exe/css.php".format(url)).read()
jquery_data = request.urlopen("{}/lib/exe/jquery.php".format(url)).read()
js_data = request.urlopen("{}/lib/exe/js.php".format(url)).read()

with open("{}/style.css".format(dump_folder), "wb") as stylesheet:
stylesheet.write(data)
stylesheet.write(css_data)
with open("{}/jquery.js".format(dump_folder), "wb") as stylesheet:
stylesheet.write(jquery_data)
with open("{}/javascript.js".format(dump_folder), "wb") as stylesheet:
stylesheet.write(js_data)
except Exception as err:
logger.error("Stylesheet download error: {}".format(err))

15 changes: 12 additions & 3 deletions template.tpl
Original file line number Diff line number Diff line change
@@ -2,15 +2,16 @@
<html>
<head>
<title>{{page_title}} [{{wiki_title}}]</title>
<link rel="stylesheet" type="text/css" href="{{root_link}}style.css"/>
<meta name="viewport" content="width=device-width,initial-scale=1"/>
<link rel="stylesheet" type="text/css" href="{{root_link}}style.css"/>
</head>
<body>
<div id="dokuwiki__site">
<div id="dokuwiki__top" class="site dokuwiki mode_show tpl_dokuwiki loggedIn home ">
<div id="dokuwiki__header">
<div class="headings group">
<h1><a href="{{root_link}}start.html"><img src="{{root_link}}wiki/dokuwiki-128.png" width="64" height="64" alt=""><span>{{wiki_title}}</span></a></h1>
<h1><a href="{{root_link}}start.html"><img src="{{root_link}}wiki/dokuwiki-128.png" width="64"
height="64" alt=""><span>{{wiki_title}}</span></a></h1>
</div>
<div class="breadcrumbs">
<div class="youarehere"><span class="home">{{path}}</span></div>
@@ -26,9 +27,17 @@
</div>
<div id="dokuwiki__footer">
<div class="pad">
Download: {{download_date}} - Last modified: {{last_modified}} - Created with <a href="https://github.com/Netzvamp/dokuwiki_offline" target="_blank">dokuwiki_offline</a>
Download: {{download_date}} - Last modified: {{last_modified}} - Created with <a
href="https://github.com/Netzvamp/dokuwiki_offline" target="_blank">dokuwiki_offline</a>
</div>
</div>
</div>
</div>
<script type="text/javascript" charset="utf-8" src="{{root_link}}jquery.js"></script>
<script type="text/javascript" charset="utf-8" src="{{root_link}}javascript.js"></script>
<script type="text/javascript">/*<![CDATA[*/
DOKU_BASE='{{root_link}}';
JSINFO = {};
/*]]>*/</script>
</body>
</html>
