From b33a4fb4ae543333717caff83652f82466dfb590 Mon Sep 17 00:00:00 2001 From: Dmitriy Bogdanov Date: Sun, 19 Apr 2020 12:14:18 +0400 Subject: [PATCH 1/4] TTS: minor parsing improvements --- app/src/main/res/raw/tts_parser.js | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/app/src/main/res/raw/tts_parser.js b/app/src/main/res/raw/tts_parser.js index 525c55d5d..50c8b1446 100644 --- a/app/src/main/res/raw/tts_parser.js +++ b/app/src/main/res/raw/tts_parser.js @@ -119,11 +119,10 @@ function parseDocumentText() { var accumulatedText = ''; var extras = []; - var currentElement = null; var emphasisStarts = []; - var checkForSentenceEnd = function() { + var checkForSentenceEnd = function(currentElement) { if (!accumulatedText || accumulatedText.trim().length === 0) return; var currentElementText = currentElement.textContent; @@ -143,7 +142,7 @@ function parseDocumentText() { var relevantExtras = values[1]; if (s.length !== 0) { - range.setEnd(currentElement, index); + range.setEnd(currentElement, index); // TODO: should subtract trimmed from end // console.log('checkForSentenceEnd()'); // console.log('Range: ' + range); var rect = range.getBoundingClientRect(); @@ -157,25 +156,26 @@ function parseDocumentText() { } range.setEnd(currentElement, currentElementLength); + + if (accumulatedText.trim().length === 0) accumulatedText = ''; }; var flushCurrentText = function() { if (accumulatedText && accumulatedText.trim().length > 0) { // console.log('flushCurrentText()'); // console.log('Range: ' + range); - var rect = range.getBoundingClientRect(); var values = prepareTextAndExtras(accumulatedText, extras, emphasisStarts, accumulatedText.length); var s = values[0]; var relevantExtras = values[1]; + var rect = range.getBoundingClientRect(); cmdText(s, rect.top, rect.bottom, serializeExtras(relevantExtras)); } accumulatedText = ''; extras = []; - currentElement = null; emphasisStarts = []; }; @@ -185,7 +185,7 @@ function parseDocumentText() { 
emphasisStarts.push(accumulatedText.length); } else { var lastStart = emphasisStarts.pop(); - if (lastStart !== undefined) { + if (lastStart !== undefined && accumulatedText.length > 0) { extras.push({type: 'emphasis', start: lastStart, end: accumulatedText.length}); } } @@ -210,15 +210,16 @@ function parseDocumentText() { if (element.nodeType === Node.TEXT_NODE) { if (!accumulatedText || accumulatedText.trim().length === 0) { + accumulatedText = ''; range.setStart(element, 0); } - accumulatedText += element.textContent; + accumulatedText += element.textContent + .replace(/[\r\n\x0B\x0C\u0085\u2028\u2029]/g, " "); - currentElement = element; range.setEnd(element, element.textContent.length); - checkForSentenceEnd(); + checkForSentenceEnd(element); } else if (element.nodeName === 'IMG') { flushCurrentText(); From 41c8c41289ecfed12c64a1a20ff18bdf8a6d351c Mon Sep 17 00:00:00 2001 From: Dmitriy Bogdanov Date: Sun, 19 Apr 2020 17:28:32 +0400 Subject: [PATCH 2/4] TTS: split sentences on ellipsis --- app/src/main/res/raw/tts_parser.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/src/main/res/raw/tts_parser.js b/app/src/main/res/raw/tts_parser.js index 50c8b1446..887073913 100644 --- a/app/src/main/res/raw/tts_parser.js +++ b/app/src/main/res/raw/tts_parser.js @@ -128,7 +128,7 @@ function parseDocumentText() { var currentElementText = currentElement.textContent; var currentElementLength = currentElementText.length; - var regex = /[.?!]+\s/g; + var regex = /[.?!\u2026]+\s/g; var match; while ((match = regex.exec(currentElementText)) !== null) { From 97d7a513dbc65164d9f4ec3d0232772ef48d6d9b Mon Sep 17 00:00:00 2001 From: Dmitriy Bogdanov Date: Sat, 18 Apr 2020 14:15:49 +0400 Subject: [PATCH 3/4] WIP: ranges --- app/src/main/assets/xpath-range.js | 518 ++++++++++++++++++ .../gaulupeau/apps/Poche/tts/GenericItem.java | 28 +- .../gaulupeau/apps/Poche/tts/ImageItem.java | 4 +- .../apps/Poche/tts/JsTtsController.java | 18 +- 
.../fr/gaulupeau/apps/Poche/tts/TextItem.java | 4 +- .../gaulupeau/apps/Poche/tts/WebViewText.java | 130 ++++- .../apps/Poche/ui/ReadArticleActivity.java | 2 + app/src/main/res/raw/tts_parser.js | 76 ++- 8 files changed, 749 insertions(+), 31 deletions(-) create mode 100644 app/src/main/assets/xpath-range.js diff --git a/app/src/main/assets/xpath-range.js b/app/src/main/assets/xpath-range.js new file mode 100644 index 000000000..94af958fd --- /dev/null +++ b/app/src/main/assets/xpath-range.js @@ -0,0 +1,518 @@ +(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.xpathRange = f()}})(function(){var define,module,exports;return (function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i 0) throw indexSize('start'); + so += sc.length; + } + + var ec = xpath.toNode(endPath, root); + if (ec === null) throw notFound('end'); + + var ei = document.createNodeIterator(ec, SHOW_TEXT); + var eo = endOffset - (0, _domSeek2['default'])(ei, endOffset); + + ec = ei.referenceNode; + if (!ei.pointerBeforeReferenceNode) { + if (eo > 0) throw indexSize('end'); + eo += ec.length; + } + + var range = document.createRange(); + range.setStart(sc, so); + range.setEnd(ec, eo); + + return range; + + function notFound(which) { + var error = new Error('The ' + which + ' node was not found.'); + error.name = 'NotFoundError'; + return error; + } + + function indexSize(which) { + var error = new 
Error('There is no text at the requested ' + which + ' offset.'); + error.name = 'IndexSizeError'; + return error; + } +} + +},{"dom-seek":3,"get-document":5,"simple-xpath-position":7}],2:[function(require,module,exports){ +module.exports = parents + +function parents(node, filter) { + var out = [] + + filter = filter || noop + + do { + out.push(node) + node = node.parentNode + } while(node && node.tagName && filter(node)) + + return out.slice(1) +} + +function noop(n) { + return true +} + +},{}],3:[function(require,module,exports){ +module.exports = require('./lib')['default']; + +},{"./lib":4}],4:[function(require,module,exports){ +'use strict'; + +exports.__esModule = true; +exports['default'] = seek; + +var _ancestors = require('ancestors'); + +var _ancestors2 = _interopRequireDefault(_ancestors); + +var _indexOf = require('index-of'); + +var _indexOf2 = _interopRequireDefault(_indexOf); + +function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { 'default': obj }; } + +var E_SHOW = 'Argument 1 of seek must use filter NodeFilter.SHOW_TEXT.'; +var E_WHERE = 'Argument 2 of seek must be a number or a Text Node.'; + +var SHOW_TEXT = 4; +var TEXT_NODE = 3; + +function seek(iter, where) { + if (iter.whatToShow !== SHOW_TEXT) { + throw new Error(E_SHOW); + } + + var count = 0; + var node = iter.referenceNode; + var predicates = null; + + if (isNumber(where)) { + predicates = { + forward: function forward() { + return count < where; + }, + backward: function backward() { + return count > where; + } + }; + } else if (isText(where)) { + var forward = before(node, where) ? 
function () { + return false; + } : function () { + return node !== where; + }; + var backward = function backward() { + return node != where || !iter.pointerBeforeReferenceNode; + }; + predicates = { forward: forward, backward: backward }; + } else { + throw new Error(E_WHERE); + } + + while (predicates.forward() && (node = iter.nextNode()) !== null) { + count += node.nodeValue.length; + } + + while (predicates.backward() && (node = iter.previousNode()) !== null) { + count -= node.nodeValue.length; + } + + return count; +} + +function isNumber(n) { + return !isNaN(parseInt(n)) && isFinite(n); +} + +function isText(node) { + return node.nodeType === TEXT_NODE; +} + +function before(ref, node) { + if (ref === node) return false; + + var common = null; + var left = [ref].concat((0, _ancestors2['default'])(ref)).reverse(); + var right = [node].concat((0, _ancestors2['default'])(node)).reverse(); + + while (left[0] === right[0]) { + common = left.shift(); + right.shift(); + } + + left = left[0]; + right = right[0]; + + var l = (0, _indexOf2['default'])(common.childNodes, left); + var r = (0, _indexOf2['default'])(common.childNodes, right); + + return l > r; +} + +},{"ancestors":2,"index-of":6}],5:[function(require,module,exports){ + +/** + * Module exports. + */ + +module.exports = getDocument; + +// defined by w3c +var DOCUMENT_NODE = 9; + +/** + * Returns `true` if `w` is a Document object, or `false` otherwise. + * + * @param {?} d - Document object, maybe + * @return {Boolean} + * @private + */ + +function isDocument (d) { + return d && d.nodeType === DOCUMENT_NODE; +} + +/** + * Returns the `document` object associated with the given `node`, which may be + * a DOM element, the Window object, a Selection, a Range. Basically any DOM + * object that references the Document in some way, this function will find it. 
+ * + * @param {Mixed} node - DOM node, selection, or range in which to find the `document` object + * @return {Document} the `document` object associated with `node` + * @public + */ + +function getDocument(node) { + if (isDocument(node)) { + return node; + + } else if (isDocument(node.ownerDocument)) { + return node.ownerDocument; + + } else if (isDocument(node.document)) { + return node.document; + + } else if (node.parentNode) { + return getDocument(node.parentNode); + + // Range support + } else if (node.commonAncestorContainer) { + return getDocument(node.commonAncestorContainer); + + } else if (node.startContainer) { + return getDocument(node.startContainer); + + // Selection support + } else if (node.anchorNode) { + return getDocument(node.anchorNode); + } +} + +},{}],6:[function(require,module,exports){ +/*! + * index-of + * + * Copyright (c) 2014-2015 Jon Schlinkert. + * Licensed under the MIT license. + */ + +'use strict'; + +module.exports = function indexOf(arr, ele, start) { + start = start || 0; + var idx = -1; + + if (arr == null) return idx; + var len = arr.length; + var i = start < 0 + ? 
(len + start) + : start; + + if (i >= arr.length) { + return -1; + } + + while (i < len) { + if (arr[i] === ele) { + return i; + } + i++; + } + + return -1; +}; + +},{}],7:[function(require,module,exports){ +module.exports = require('./lib/xpath') + +},{"./lib/xpath":9}],8:[function(require,module,exports){ +"use strict"; + +exports.__esModule = true; + +function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } + +var DOMException = function DOMException(message, name) { + _classCallCheck(this, DOMException); + + this.message = message; + this.name = name; + this.stack = new Error().stack; +}; + +exports["default"] = DOMException; + + +DOMException.prototype = new Error(); + +DOMException.prototype.toString = function () { + return this.name + ": " + this.message; +}; + +},{}],9:[function(require,module,exports){ +'use strict'; + +exports.__esModule = true; +exports.fromNode = fromNode; +exports.toNode = toNode; + +var _getDocument = require('get-document'); + +var _getDocument2 = _interopRequireDefault(_getDocument); + +var _domException = require('./dom-exception'); + +var _domException2 = _interopRequireDefault(_domException); + +function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { 'default': obj }; } + +// https://developer.mozilla.org/en-US/docs/XPathResult +var FIRST_ORDERED_NODE_TYPE = 9; + +// Default namespace for XHTML documents +var HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'; + +/** + * Compute an XPath expression for the given node. + * + * If the optional parameter `root` is supplied, the computed XPath expression + * will be relative to it. Otherwise, the root element is the root of the + * document to which `node` belongs. + * + * @param {Node} node The node for which to compute an XPath expression. + * @param {Node} [root] The root context for the XPath expression. 
+ * @returns {string} + */ +function fromNode(node) { + var root = arguments.length <= 1 || arguments[1] === undefined ? null : arguments[1]; + + if (node === undefined) { + throw new Error('missing required parameter "node"'); + } + + root = root || (0, _getDocument2['default'])(node); + + var path = '/'; + while (node !== root) { + if (!node) { + var message = 'The supplied node is not contained by the root node.'; + var name = 'InvalidNodeTypeError'; + throw new _domException2['default'](message, name); + } + path = '/' + nodeName(node) + '[' + nodePosition(node) + ']' + path; + node = node.parentNode; + } + return path.replace(/\/$/, ''); +} + +/** + * Find a node using an XPath relative to the given root node. + * + * The XPath expressions are evaluated relative to the Node argument `root`. + * + * If the optional parameter `resolver` is supplied, it will be used to resolve + * any namespaces within the XPath. + * + * @param {string} path An XPath String to evaluate. + * @param {Node} root The root context for the XPath expression. + * @returns {Node|null} The first matching Node or null if none is found. + */ +function toNode(path, root) { + var resolver = arguments.length <= 2 || arguments[2] === undefined ? null : arguments[2]; + + if (path === undefined) { + throw new Error('missing required parameter "path"'); + } + if (root === undefined) { + throw new Error('missing required parameter "root"'); + } + + // Make the path relative to the root, if not the document. + var document = (0, _getDocument2['default'])(root); + if (root !== document) path = path.replace(/^\//, './'); + + // Make a default resolver. 
+ var documentElement = document.documentElement; + if (resolver === null && documentElement.lookupNamespaceURI) { + (function () { + var defaultNS = documentElement.lookupNamespaceURI(null) || HTML_NAMESPACE; + resolver = function resolver(prefix) { + var ns = { '_default_': defaultNS }; + return ns[prefix] || documentElement.lookupNamespaceURI(prefix); + }; + })(); + } + + return resolve(path, root, resolver); +} + +// Get the XPath node name. +function nodeName(node) { + switch (node.nodeName) { + case '#text': + return 'text()'; + case '#comment': + return 'comment()'; + case '#cdata-section': + return 'cdata-section()'; + default: + return node.nodeName.toLowerCase(); + } +} + +// Get the ordinal position of this node among its siblings of the same name. +function nodePosition(node) { + var name = node.nodeName; + var position = 1; + while (node = node.previousSibling) { + if (node.nodeName === name) position += 1; + } + return position; +} + +// Find a single node with XPath `path` +function resolve(path, root, resolver) { + try { + // Add a default value to each path part lacking a prefix. + var nspath = path.replace(/\/(?!\.)([^\/:\(]+)(?=\/|$)/g, '/_default_:$1'); + return platformResolve(nspath, root, resolver); + } catch (err) { + return fallbackResolve(path, root); + } +} + +// Find a single node with XPath `path` using the simple, built-in evaluator. +function fallbackResolve(path, root) { + var steps = path.split("/"); + var node = root; + while (node) { + var step = steps.shift(); + if (step === undefined) break; + if (step === '.') continue; + + var _step$split = step.split(/[\[\]]/); + + var name = _step$split[0]; + var position = _step$split[1]; + + name = name.replace('_default_:', ''); + position = position ? parseInt(position) : 1; + node = findChild(node, name, position); + } + return node; +} + +// Find a single node with XPath `path` using `document.evaluate`. 
+function platformResolve(path, root, resolver) { + var document = (0, _getDocument2['default'])(root); + var r = document.evaluate(path, root, resolver, FIRST_ORDERED_NODE_TYPE, null); + return r.singleNodeValue; +} + +// Find the child of the given node by name and ordinal position. +function findChild(node, name, position) { + for (node = node.firstChild; node; node = node.nextSibling) { + if (nodeName(node) === name && --position === 0) break; + } + return node; +} + +},{"./dom-exception":8,"get-document":5}]},{},[1])(1) +}); diff --git a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/GenericItem.java b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/GenericItem.java index 68a3a5269..bf2db79d5 100644 --- a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/GenericItem.java +++ b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/GenericItem.java @@ -2,13 +2,39 @@ abstract class GenericItem { + static class Range { + String start; + long startOffset; + String end; + long endOffset; + + Range(String start, long startOffset, String end, long endOffset) { + this.start = start; + this.startOffset = startOffset; + this.end = end; + this.endOffset = endOffset; + } + + @Override + public String toString() { + return "Range{" + + "start='" + start + '\'' + + ", startOffset=" + startOffset + + ", end='" + end + '\'' + + ", endOffset=" + endOffset + + '}'; + } + } + + Range range; float top; // top location in the web view float bottom; // bottom location in the web view long timePosition; // in milliseconds from the beginning of the document GenericItem() {} - GenericItem(float top, float bottom) { + GenericItem(Range range, float top, float bottom) { + this.range = range; this.top = top; this.bottom = bottom; } diff --git a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/ImageItem.java b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/ImageItem.java index ef6f932f9..63114d19b 100644 --- a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/ImageItem.java +++ 
b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/ImageItem.java @@ -6,8 +6,8 @@ class ImageItem extends GenericItem { String title; String src; - ImageItem(String altText, String title, String src, float top, float bottom) { - super(top, bottom); + ImageItem(String altText, String title, String src, Range range, float top, float bottom) { + super(range, top, bottom); this.altText = altText; this.title = title; this.src = src; diff --git a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/JsTtsController.java b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/JsTtsController.java index 3a1908af6..776f482e4 100644 --- a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/JsTtsController.java +++ b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/JsTtsController.java @@ -34,16 +34,24 @@ public void onEnd() { @SuppressWarnings("unused") @JavascriptInterface - public void onText(String text, String topString, String bottomString, String extras) { - post(() -> webViewText.onDocumentParseText(text, - Float.parseFloat(topString), Float.parseFloat(bottomString), extras)); + public void onText(String text, String extras, String range, + String topString, String bottomString) { + post(() -> webViewText.onDocumentParseText(text, extras, range, + Float.parseFloat(topString), Float.parseFloat(bottomString))); } @SuppressWarnings("unused") @JavascriptInterface - public void onImage(String altText, String title, String src, + public void onImage(String altText, String title, String src, String range, String topString, String bottomString) { - post(() -> webViewText.onDocumentParseImage(altText, title, src, + post(() -> webViewText.onDocumentParseImage(altText, title, src, range, + Float.parseFloat(topString), Float.parseFloat(bottomString))); + } + + @SuppressWarnings("unused") + @JavascriptInterface + public void onRangeInfoResponse(String requestId, String topString, String bottomString) { + post(() -> webViewText.onRangeInfoResponse(requestId, Float.parseFloat(topString), 
Float.parseFloat(bottomString))); } diff --git a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/TextItem.java b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/TextItem.java index 59db8947f..4801100d7 100644 --- a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/TextItem.java +++ b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/TextItem.java @@ -38,8 +38,8 @@ static Type getType(String type) { String text; List extras; - TextItem(String text, float top, float bottom, List extras) { - super(top, bottom); + TextItem(String text, Range range, float top, float bottom, List extras) { + super(range, top, bottom); this.text = text; this.extras = extras; } diff --git a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/WebViewText.java b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/WebViewText.java index 3969f6541..c3a7a62e9 100644 --- a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/WebViewText.java +++ b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/WebViewText.java @@ -20,9 +20,19 @@ */ class WebViewText implements TextInterface { + private static class RangeVisibilityRequest { + RangeVisibilityRequest(int requestId, boolean canMoveBackward) { + this.requestId = requestId; + this.canMoveBackward = canMoveBackward; + } + + int requestId; + boolean canMoveBackward; + } + private static final String TAG = WebViewText.class.getSimpleName(); - private static final String JS_PARSE_DOCUMENT_SCRIPT + private static final String JS_PARSE_DOCUMENT_SCRIPT // TODO: rename or reorganize = StorageHelper.readRawString(R.raw.tts_parser); private final Handler handler = new Handler(); @@ -40,6 +50,10 @@ class WebViewText implements TextInterface { private Runnable readFinishedCallback; private Runnable parsingFinishedCallback; + private int requestCounter; + + private RangeVisibilityRequest rangeVisibilityRequest; + WebViewText(TtsConverter ttsConverter, TtsHost ttsHost) { this.ttsConverter = ttsConverter; this.ttsHost = ttsHost; @@ -58,9 +72,12 @@ void parseWebViewDocument(Runnable callback) { 
parsingFinishedCallback = callback; + runJs(JS_PARSE_DOCUMENT_SCRIPT + "; parseDocumentText();"); + } + + private void runJs(String jsToRun) { ttsHost.getJsTtsController().setWebViewText(this); - ttsHost.getWebView().evaluateJavascript("javascript:" + JS_PARSE_DOCUMENT_SCRIPT - + ";parseDocumentText();", null); + ttsHost.getWebView().evaluateJavascript("javascript:" + jsToRun, null); } void onDocumentParseStart() { @@ -75,15 +92,16 @@ void onDocumentParseEnd() { } } - void onDocumentParseText(String text, float top, float bottom, String extras) { + void onDocumentParseText(String text, String extras, String range, float top, float bottom) { top = convertWebViewToScreenY(top); bottom = convertWebViewToScreenY(bottom); if (BuildConfig.DEBUG) { - Log.v(TAG, String.format("onDocumentParseText(%s, %f, %f)", text, top, bottom)); + Log.v(TAG, String.format("onDocumentParseText(%s, %s, %f, %f)", + text, range, top, bottom)); } - addItem(new TextItem(text, top, bottom, parseTextItemExtras(extras))); + addItem(new TextItem(text, parseRange(range), top, bottom, parseTextItemExtras(extras))); } private List parseTextItemExtras(String extrasString) { @@ -112,16 +130,56 @@ private List parseTextItemExtras(String extrasString) { return result; } - void onDocumentParseImage(String altText, String title, String src, float top, float bottom) { + void onDocumentParseImage(String altText, String title, String src, String range, + float top, float bottom) { top = convertWebViewToScreenY(top); bottom = convertWebViewToScreenY(bottom); if (BuildConfig.DEBUG) { - Log.v(TAG, String.format("onDocumentParseImage(%s, %s, %s, %f, %f)", - altText, title, src, top, bottom)); + Log.v(TAG, String.format("onDocumentParseImage(%s, %s, %s, %s, %f, %f)", + altText, title, src, range, top, bottom)); + } + + addItem(new ImageItem(altText, title, src, parseRange(range), top, bottom)); + } + + private GenericItem.Range parseRange(String rangeString) { + if (TextUtils.isEmpty(rangeString)) return null; + + 
Log.v(TAG, "parseRange() rangeString: " + rangeString); + + try { + JSONObject jsonRange = new JSONObject(rangeString); + + return new GenericItem.Range( + jsonRange.getString("start"), + jsonRange.getLong("startOffset"), + jsonRange.getString("end"), + jsonRange.getLong("endOffset")); + } catch (Exception e) { + Log.w(TAG, "parseRange()", e); + } + + return null; + } + + private String serializeRange(GenericItem.Range range) { + if (range == null) return null; + + JSONObject jsonRange = new JSONObject(); + + try { + jsonRange.put("start", range.start); + jsonRange.put("startOffset", range.startOffset); + jsonRange.put("end", range.end); + jsonRange.put("endOffset", range.endOffset); + + return jsonRange.toString(); + } catch (Exception e) { + Log.w(TAG, "serializeRange()", e); } - addItem(new ImageItem(altText, title, src, top, bottom)); + return null; } private void addItem(GenericItem item) { @@ -133,6 +191,20 @@ private void addItem(GenericItem item) { textList.add(item); } + void onRangeInfoResponse(String requestId, float top, float bottom) { + RangeVisibilityRequest request = rangeVisibilityRequest; + if (request == null || Integer.parseInt(requestId) != request.requestId) { + Log.d(TAG, "onRangeInfoResponse() no request or id didn't match"); + return; + } + rangeVisibilityRequest = null; + + top = convertWebViewToScreenY(top); + bottom = convertWebViewToScreenY(bottom); + + ensureTextRangeVisibleOnScreen(request.canMoveBackward, top, bottom); + } + @Override public synchronized int getCurrentIndex() { return current; @@ -316,14 +388,46 @@ public long getTotalDuration() { } private void ensureTextRangeVisibleOnScreen(boolean canMoveBackward) { + highlightRange(); // TODO: move + if (ttsHost == null) return; GenericItem item = textList.get(current); - if (item.bottom > ttsHost.getScrollY() + ttsHost.getViewHeight() - || canMoveBackward && item.top < ttsHost.getScrollY()) { + + if (item.range == null) { + Log.w(TAG, "ensureTextRangeVisibleOnScreen() range is 
null"); + return; + } + + rangeVisibilityRequest = new RangeVisibilityRequest(requestCounter++, canMoveBackward); + + // TODO: ensure the base is injected + runJs("getRangeInfo(" + rangeVisibilityRequest.requestId + + ", '" + serializeRange(item.range) + "');"); + } + + private void ensureTextRangeVisibleOnScreen(boolean canMoveBackward, float top, float bottom) { + if (ttsHost == null) return; + + if (bottom > ttsHost.getScrollY() + ttsHost.getViewHeight() + || canMoveBackward && top < ttsHost.getScrollY()) { // TODO: check: call directly? - handler.post(() -> ttsHost.scrollTo((int) item.top)); + handler.post(() -> ttsHost.scrollTo((int) top)); + } + } + + private void highlightRange() { + if (ttsHost == null) return; + + GenericItem item = textList.get(current); + + if (item.range == null) { + Log.w(TAG, "highlightRange() range is null"); + return; } + + // TODO: ensure the base is injected + runJs("highlightRange('" + serializeRange(item.range) + "');"); } private float convertWebViewToScreenY(float y) { diff --git a/app/src/main/java/fr/gaulupeau/apps/Poche/ui/ReadArticleActivity.java b/app/src/main/java/fr/gaulupeau/apps/Poche/ui/ReadArticleActivity.java index 741825fb9..3e2c5a1b7 100644 --- a/app/src/main/java/fr/gaulupeau/apps/Poche/ui/ReadArticleActivity.java +++ b/app/src/main/java/fr/gaulupeau/apps/Poche/ui/ReadArticleActivity.java @@ -900,6 +900,8 @@ private String getExtraHead() { "\t\t"; } + extra += "\n" + "\t\t"; + return extra; } diff --git a/app/src/main/res/raw/tts_parser.js b/app/src/main/res/raw/tts_parser.js index 887073913..88e5da91d 100644 --- a/app/src/main/res/raw/tts_parser.js +++ b/app/src/main/res/raw/tts_parser.js @@ -4,11 +4,14 @@ function cmdStart() { function cmdEnd() { hostWebViewTextController.onEnd(); } -function cmdText(text, top, bottom, extras) { - hostWebViewTextController.onText(text, top, bottom, extras); +function cmdText(text, extras, range, top, bottom) { + hostWebViewTextController.onText(text, extras, 
JSON.stringify(range), top, bottom); } -function cmdImg(altText, title, src, top, bottom) { - hostWebViewTextController.onImage(altText, title, src, top, bottom); +function cmdImg(altText, title, src, range, top, bottom) { + hostWebViewTextController.onImage(altText, title, src, JSON.stringify(range), top, bottom); +} +function cmdRangeInfo(requestId, top, bottom) { + hostWebViewTextController.onRangeInfoResponse(requestId, top, bottom); } function traverse(element, callback) { @@ -112,7 +115,14 @@ function shiftExtras(extras, emphasisStarts, amount) { } } +function serializeRange(range, root) { + // uses the `xpath-range` library + return xpathRange.fromRange(range, root); +} + function parseDocumentText() { + var root = document.getElementById('article'); + var range = document.createRange(); // var stack = []; @@ -146,7 +156,7 @@ function parseDocumentText() { // console.log('checkForSentenceEnd()'); // console.log('Range: ' + range); var rect = range.getBoundingClientRect(); - cmdText(s, rect.top, rect.bottom, serializeExtras(relevantExtras)); + cmdText(s, serializeExtras(relevantExtras), serializeRange(range, root), rect.top, rect.bottom); } accumulatedText = accumulatedText.substring(end); @@ -171,7 +181,7 @@ function parseDocumentText() { var relevantExtras = values[1]; var rect = range.getBoundingClientRect(); - cmdText(s, rect.top, rect.bottom, serializeExtras(relevantExtras)); + cmdText(s, serializeExtras(relevantExtras), serializeRange(range, root), rect.top, rect.bottom); } accumulatedText = ''; @@ -225,7 +235,7 @@ function parseDocumentText() { range.selectNode(element); var rect = range.getBoundingClientRect(); - cmdImg(element.alt, element.title, element.src, rect.top, rect.bottom); + cmdImg(element.alt, element.title, element.src, serializeRange(range, root), rect.top, rect.bottom); } else if (shouldBreak(element)) { flushCurrentText(); } @@ -246,7 +256,7 @@ function parseDocumentText() { cmdStart(); - traverse(document.getElementById('article'), 
parserCallback); + traverse(root, parserCallback); cmdEnd(); } @@ -259,3 +269,53 @@ function shouldBreak(element) { //function stackToString(stack) { // return stack.map(e => e.nodeName.toLowerCase()).join(', '); //} + + +function deserializeRange(rangeString, root) { + var rangeObj = JSON.parse(rangeString); + return xpathRange.toRange(rangeObj.start, rangeObj.startOffset, + rangeObj.end, rangeObj.endOffset, root); +} + +function getRangeInfo(requestId, rangeString) { + var range = deserializeRange(rangeString, document.getElementById('article')); + + var rect = range.getBoundingClientRect(); + cmdRangeInfo(requestId, rect.top, rect.bottom); +} + +function highlightRange(rangeString) { + var range = deserializeRange(rangeString, document.getElementById('article')); + + document.getSelection().removeAllRanges(); + document.getSelection().addRange(range); +} + + +/* +// for debugging in a web-browser: +// just uncomment this and copy-paste everything into the browser console + +//function serializeRange(range, root) { +// return null; +//} + +function cmdStart() { + console.log('parse start'); +} +function cmdEnd() { + console.log('parse end'); +} +function cmdText(text, extras, range, top, bottom) { + console.log('TEXT: ' + text); + if (range !== null) console.log('RANGE: ' + JSON.stringify(range)); + if (extras !== null) console.log('EXTRAS: ' + extras); +} +function cmdImg(altText, title, src, range, top, bottom) { + console.log('IMG: ' + altText + ', title: ' + title + ', src: ' + src); + if (range !== null) console.log('RANGE: ' + JSON.stringify(range)); +} + + +parseDocumentText(); +*/ From 3931fec9c126f286883ee118ac6c1403a839f6d1 Mon Sep 17 00:00:00 2001 From: Dmitriy Bogdanov Date: Sun, 19 Apr 2020 17:37:37 +0400 Subject: [PATCH 4/4] WIP: java parser --- app/build.gradle | 1 + .../fr/gaulupeau/apps/Poche/tts/Parser.java | 455 ++++++++++++++++++ .../apps/Poche/ui/ReadArticleActivity.java | 7 + 3 files changed, 463 insertions(+) create mode 100644 
app/src/main/java/fr/gaulupeau/apps/Poche/tts/Parser.java

diff --git a/app/build.gradle b/app/build.gradle
index 61948c1c5..2d86cdbc9 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -74,4 +74,5 @@ dependencies {
     implementation 'com.mikepenz:aboutlibraries:7.1.0'
     implementation 'com.github.di72nn.wallabag-api-wrapper:api-wrapper:v2.0.0-beta.5'
     implementation 'org.slf4j:slf4j-android:1.7.30'
+    implementation 'org.jsoup:jsoup:1.13.1'
 }
diff --git a/app/src/main/java/fr/gaulupeau/apps/Poche/tts/Parser.java b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/Parser.java
new file mode 100644
index 000000000..7c131d939
--- /dev/null
+++ b/app/src/main/java/fr/gaulupeau/apps/Poche/tts/Parser.java
@@ -0,0 +1,529 @@
+package fr.gaulupeau.apps.Poche.tts;
+
+import android.util.Log;
+
+import org.jsoup.Jsoup;
+import org.jsoup.internal.StringUtil;
+import org.jsoup.nodes.Comment;
+import org.jsoup.nodes.DataNode;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeVisitor;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Work-in-progress HTML parser for text-to-speech: walks an HTML fragment with jsoup
+ * and cuts its text into sentence-sized pieces, each paired with a node/offset range
+ * and the emphasis spans that overlap it. Results are currently only logged.
+ */
+public class Parser {
+
+    private static final String TAG = Parser.class.getSimpleName();
+
+    // A run of sentence-ending characters followed by one whitespace character.
+    private static final Pattern SENTENCE_END_PATTERN = Pattern.compile("[.?!\u2026]+\\s");
+
+    // Inline tags whose text is reported as an emphasis extra.
+    private static final Set<String> EMPHASIS_TAGS = new HashSet<>(Arrays.asList(
+            "b", "i", "strong", "em"
+    ));
+
+    /**
+     * Turns jsoup's head/tail callbacks into enter/leave/leaf events and suppresses
+     * every event for a skipped node and its descendants.
+     */
+    private static abstract class VisitorAdapter implements NodeVisitor {
+
+        // Root of the subtree currently being ignored; null when nothing is skipped.
+        private Node currentlySkipping;
+
+        @Override
+        public void head(Node node, int depth) {
+            if (currentlySkipping != null) return;
+
+            if (shouldSkip(node)) {
+                currentlySkipping = node;
+                return;
+            }
+
+            if (node.childNodeSize() > 0) {
+                enterNode(node);
+            } else {
+                processLeafNode(node);
+            }
+        }
+
+        @Override
+        public void tail(Node node, int depth) {
+            if (node == currentlySkipping) {
+                currentlySkipping = null;
+                return;
+            }
+
+            // Descendants of a skipped node never got enterNode(), so they must not
+            // get leaveNode() either.
+            if (currentlySkipping != null) return;
+
+            if (node.childNodeSize() > 0) {
+                leaveNode(node);
+            }
+        }
+
+        boolean shouldSkip(Node node) {
+            return false;
+        }
+
+        abstract void enterNode(Node node);
+
+        abstract void leaveNode(Node node);
+
+        abstract void processLeafNode(Node node);
+
+    }
+
+    /** Start/end node pair with an offset into each. */
+    static class Range {
+        Node start;
+        int startOffset;
+        Node end;
+        int endOffset;
+
+        Range copy() {
+            Range range = new Range();
+            range.start = start;
+            range.startOffset = startOffset;
+            range.end = end;
+            range.endOffset = endOffset;
+            return range;
+        }
+
+        @Override
+        public String toString() {
+            return "Range{" +
+                    "start=" + start +
+                    ", startOffset=" + startOffset +
+                    ", end=" + end +
+                    ", endOffset=" + endOffset +
+                    '}';
+        }
+    }
+
+    /** Emphasized span expressed as [start, end) character offsets into a text. */
+    static class EmphasisExtra {
+        int start, end;
+
+        EmphasisExtra(int start, int end) {
+            this.start = start;
+            this.end = end;
+        }
+
+        EmphasisExtra copy() {
+            return new EmphasisExtra(start, end);
+        }
+
+        @Override
+        public String toString() {
+            return "EmphasisExtra{" +
+                    "start=" + start +
+                    ", end=" + end +
+                    '}';
+        }
+    }
+
+    // First "article" element entered during traversal; XPaths are built up to this node.
+    private Node rootNode;
+
+    // Text collected since the last emitted item.
+    private StringBuilder accumulatedText = new StringBuilder();
+    // Node range covered by accumulatedText.
+    private Range currentRange = new Range();
+    // Closed emphasis spans, as offsets into accumulatedText.
+    private List<EmphasisExtra> extras = new LinkedList<>();
+    // Offsets at which still-open emphasis tags began; the innermost one is first.
+    private LinkedList<Integer> emphasisStarts = new LinkedList<>();
+
+    /** Replaces every CR/LF in sb, from startOffset on, with a space (in place). */
+    private static void replaceNewLinesWithSpaces(StringBuilder sb, int startOffset) {
+        for (int i = startOffset; i < sb.length(); i++) {
+            char c = sb.charAt(i);
+            if (c == '\n' || c == '\r') {
+                sb.setCharAt(i, ' ');
+            }
+        }
+    }
+
+    /** Returns true when s is empty or consists only of whitespace/invisible characters. */
+    private static boolean isBlank(CharSequence s) {
+        if (s.length() == 0) return true;
+
+        return countWhitespacesFromStart(s) == s.length();
+    }
+
+    /** Counts the leading whitespace/invisible characters of s. */
+    private static int countWhitespacesFromStart(CharSequence s) {
+        int i;
+        for (i = 0; i < s.length(); i++) {
+            int c = s.charAt(i);
+            if (!StringUtil.isActuallyWhitespace(c) && !StringUtil.isInvisibleChar(c)) break;
+        }
+        return i;
+    }
+
+    /** Counts the trailing whitespace/invisible characters of s within [start, end). */
+    private static int countWhitespacesFromEnd(CharSequence s, int start, int end) {
+        int i;
+        for (i = end - 1; i >= start; i--) {
+            int c = s.charAt(i);
+            if (!StringUtil.isActuallyWhitespace(c) && !StringUtil.isInvisibleChar(c)) break;
+        }
+        return end - i - 1;
+    }
+
+    /**
+     * Parses an HTML body fragment and reports its text and images through
+     * addText()/addImage().
+     *
+     * @param html HTML fragment; the first "article" element entered becomes the XPath root
+     */
+    public void parse(String html) {
+        Document document = Jsoup.parseBodyFragment(html); // TODO: check
+
+        document.traverse(new VisitorAdapter() {
+            @Override
+            boolean shouldSkip(Node node) {
+                return node instanceof DataNode || node instanceof Comment; // TODO: check
+            }
+
+            @Override
+            void enterNode(Node node) {
+                if (rootNode == null && isRoot(node)) {
+                    rootNode = node;
+                }
+                processBoundary(node, true);
+            }
+
+            @Override
+            void leaveNode(Node node) {
+                processBoundary(node, false);
+            }
+
+            @Override
+            void processLeafNode(Node node) {
+                Parser.this.processLeafNode(node);
+            }
+
+            boolean isRoot(Node node) { // TODO: fix hardcode
+                return node instanceof Element && "article".equals(((Element) node).normalName());
+            }
+        });
+    }
+
+    /** Sink for a finished text item; for now it only logs the item and its XPath range. */
+    private void addText(String text, List<EmphasisExtra> extras, Range range) {
+//        Log.i(TAG, String.format("addText(%s, %s, %s)", text, extras, range));
+        Log.i(TAG, String.format("addText(%s, %s)", text, extras));
+        GenericItem.Range xPathRange = toXPathRange(range);
+        Log.d(TAG, "addText() XPath range: " + xPathRange);
+        // TODO
+    }
+
+    /** Sink for an image item; for now it only logs the image and its XPath range. */
+    private void addImage(String altText, String title, String src, Range range) {
+//        Log.i(TAG, String.format("addImage(%s, %s, %s, %s)", altText, title, src, range));
+        Log.i(TAG, String.format("addImage(%s, %s, %s)", altText, title, src));
+        GenericItem.Range xPathRange = toXPathRange(range);
+        Log.d(TAG, "addImage() XPath range: " + xPathRange);
+        // TODO
+    }
+
+    /** Converts a node-based range into one whose ends are XPath strings. */
+    private GenericItem.Range toXPathRange(Range range) {
+        return new GenericItem.Range(
+                getXPathString(range.start, rootNode), range.startOffset,
+                getXPathString(range.end, rootNode), range.endOffset
+        );
+    }
+
+    /** Flushes pending text at block-level/br boundaries; otherwise tracks emphasis tags. */
+    private void processBoundary(Node node, boolean enter) {
+        if (shouldBreak(node)) {
+            flushCurrentText();
+        } else {
+            handleFormatting(node, enter);
+        }
+    }
+
+    /** Handles a childless node: accumulates text, emits images, or flushes on a break. */
+    private void processLeafNode(Node node) {
+        if (node instanceof TextNode) {
+            TextNode textNode = (TextNode) node;
+            if (!textNode.isBlank()) {
+                if (accumulatedText.length() == 0) {
+                    currentRange.start = node;
+                    currentRange.startOffset = 0;
+                }
+
+                int oldLength = accumulatedText.length();
+                accumulatedText.append(textNode.getWholeText());
+                replaceNewLinesWithSpaces(accumulatedText, oldLength);
+
+                currentRange.end = node;
+                currentRange.endOffset = textNode.getWholeText().length();
+
+                checkForSentenceEnd(textNode);
+            }
+        } else if (node instanceof Element && "img".equals(((Element) node).normalName())) {
+            flushCurrentText();
+
+            Node parent = node.parent();
+
+            Range range = new Range();
+            range.start = parent;
+            range.end = parent;
+            range.startOffset = indexOf(node);
+            range.endOffset = range.startOffset + 1;
+
+            Element img = ((Element) node);
+
+            addImage(img.attr("alt"), img.attr("title"), // TODO: check title
+                    img.attr("src"), range);
+        } else if (shouldBreak(node)) {
+            flushCurrentText();
+        }
+    }
+
+    /** Returns true for block-level elements and for br. */
+    private boolean shouldBreak(Node node) { // TODO: note: downgrade - no computed style
+        if (node instanceof Element) {
+            Element element = (Element) node;
+
+            return element.isBlock() || "br".equals(element.normalName());
+        }
+
+        return false;
+    }
+
+    /**
+     * Emits one text item for every sentence end found in the text node that was just
+     * appended and drops the emitted prefix from the accumulated state.
+     */
+    private void checkForSentenceEnd(TextNode currentNode) {
+        if (accumulatedText.length() == 0) return;
+
+        String currentNodeText = currentNode.getWholeText();
+        int currentNodeTextLength = currentNodeText.length();
+
+        boolean found = false;
+
+        Matcher matcher = SENTENCE_END_PATTERN.matcher(currentNodeText);
+        while (matcher.find()) {
+            int index = matcher.end();
+
+            // Translate the offset inside the node into an offset inside accumulatedText,
+            // whose tail always holds the not-yet-emitted remainder of this node.
+            int end = accumulatedText.length() - (currentNodeTextLength - index);
+
+            Range range = currentRange.copy();
+            range.endOffset = index;
+
+            processText(range, end);
+
+            accumulatedText.delete(0, end);
+            shiftExtras(end);
+
+            currentRange.start = currentNode;
+            currentRange.startOffset = index;
+
+            found = true;
+        }
+
+        if (found && isBlank(accumulatedText)) {
+            accumulatedText.delete(0, accumulatedText.length());
+        }
+    }
+
+    /** Emits whatever text is pending and resets the accumulated text and emphasis state. */
+    private void flushCurrentText() {
+        int length = accumulatedText.length();
+        if (length > 0) {
+            processText(currentRange.copy(), length);
+        }
+
+        accumulatedText.delete(0, accumulatedText.length());
+        extras.clear();
+        emphasisStarts.clear();
+    }
+
+    /**
+     * Emits the first {@code length} accumulated characters as a text item, trimmed of
+     * surrounding whitespace; nothing is emitted when only whitespace remains.
+     */
+    private void processText(Range range, int length) {
+        int trimFromStart = countWhitespacesFromStart(accumulatedText);
+        int trimFromEnd = countWhitespacesFromEnd(accumulatedText, trimFromStart, length);
+
+        if (length - trimFromStart - trimFromEnd > 0) {
+            range.startOffset += trimFromStart;
+            range.endOffset -= trimFromEnd;
+
+            // Extras are clipped to the trimmed window so their offsets index into s.
+            List<EmphasisExtra> relevantExtras
+                    = getRelevantExtras(trimFromStart, length - trimFromEnd);
+
+            String s = accumulatedText.substring(trimFromStart, length - trimFromEnd);
+
+            addText(s, relevantExtras, range);
+        }
+    }
+
+    /** Records where emphasis tags open and turns them into extras when they close. */
+    private void handleFormatting(Node node, boolean start) {
+        if (node instanceof Element && EMPHASIS_TAGS.contains(((Element) node).normalName())) {
+            if (start) {
+                emphasisStarts.push(accumulatedText.length());
+            } else {
+                Integer lastStart = emphasisStarts.poll();
+                if (lastStart != null && accumulatedText.length() > 0) {
+                    extras.add(new EmphasisExtra(lastStart, accumulatedText.length()));
+                }
+            }
+        }
+    }
+
+    /**
+     * Collects the emphasis spans that overlap accumulatedText[startOffset, end),
+     * clipped to that window and re-based so that offset 0 is startOffset.
+     *
+     * @return the clipped spans, or null when there are none
+     */
+    private List<EmphasisExtra> getRelevantExtras(int startOffset, int end) {
+        List<EmphasisExtra> result = null;
+
+        for (EmphasisExtra extra : extras) {
+            EmphasisExtra clipped = clipExtra(extra.start, extra.end, startOffset, end);
+            if (clipped != null) {
+                if (result == null) result = new ArrayList<>();
+                result.add(clipped);
+            }
+        }
+
+        if (!emphasisStarts.isEmpty()) {
+            // The outermost still-open emphasis runs to the end of the window.
+            EmphasisExtra open = clipExtra(emphasisStarts.getLast(), end, startOffset, end);
+            if (open != null) {
+                if (result == null) result = new ArrayList<>(1);
+                result.add(open);
+            }
+        }
+
+        return result;
+    }
+
+    /** Intersects [start, end) with [from, to), re-based to from; null when empty. */
+    private static EmphasisExtra clipExtra(int start, int end, int from, int to) {
+        int clippedStart = Math.max(start, from);
+        int clippedEnd = Math.min(end, to);
+        if (clippedStart >= clippedEnd) return null;
+
+        return new EmphasisExtra(clippedStart - from, clippedEnd - from);
+    }
+
+    /** Moves all emphasis offsets left by amount after that many characters were dropped. */
+    private void shiftExtras(int amount) {
+        if (amount == 0) return;
+
+        for (ListIterator<Integer> it = emphasisStarts.listIterator(); it.hasNext(); ) {
+            it.set(Math.max(it.next() - amount, 0));
+        }
+
+        for (ListIterator<EmphasisExtra> it = extras.listIterator(); it.hasNext(); ) {
+            EmphasisExtra extra = it.next();
+
+            extra.start = Math.max(extra.start - amount, 0);
+
+            extra.end -= amount;
+            if (extra.end <= 0) it.remove();
+        }
+    }
+
+    /** Builds a path string for node by walking up its parents through rootNode. */
+    private String getXPathString(Node node, Node rootNode) {
+        StringBuilder sb = new StringBuilder(); // TODO: optimize?
+
+        do {
+            int index = indexOf(node);
+            sb.insert(0, getXPathPart(node, index)); // TODO: optimize?
+        } while (node != rootNode && (node = node.parent()) != null);
+
+        return sb.toString();
+    }
+
+    /** Formats one path step with a 1-based index; null for unsupported node types. */
+    private String getXPathPart(Node node, int index) {
+        index++; // 1-based indexing
+
+        if (node instanceof TextNode) {
+            return "/text()[" + index + "]";
+        } else if (node instanceof Element) {
+            String name = ((Element) node).normalName();
+            return "/" + name + "[" + index + "]";
+        } else {
+            Log.e(TAG, "getXPath() don't know how to deal with " + node);
+            return null;
+        }
+    }
+
+    /**
+     * Counts the siblings before node that are of the same kind (text nodes, or elements
+     * with the same tag name); 0 when node has no parent or is of an unsupported type.
+     */
+    private int indexOf(Node node) {
+        Node parent = node.parent();
+
+        if (parent == null) return 0;
+
+        boolean textNode = node instanceof TextNode;
+        String name = node instanceof Element ? ((Element) node).normalName() : null;
+
+        if (!textNode && name == null) {
+            Log.e(TAG, "indexOf() node type is not supported: " + node);
+            return 0;
+        }
+
+        boolean found = false;
+        int sameTypeSiblings = 0;
+
+        for (Node n : parent.childNodes()) {
+            if (n == node) {
+                found = true;
+                break;
+            }
+
+            if (textNode) {
+                if (n instanceof TextNode) {
+                    sameTypeSiblings++;
+                }
+            } else if (n instanceof Element && name.equals(((Element) n).normalName())) {
+                sameTypeSiblings++;
+            }
+        }
+
+        if (!found) {
+            Log.w(TAG, "getXPath() node index wasn't found");
+        }
+
+        return sameTypeSiblings;
+    }
+
+}
diff --git a/app/src/main/java/fr/gaulupeau/apps/Poche/ui/ReadArticleActivity.java b/app/src/main/java/fr/gaulupeau/apps/Poche/ui/ReadArticleActivity.java
index 3e2c5a1b7..04a3e6ec2 100644
--- a/app/src/main/java/fr/gaulupeau/apps/Poche/ui/ReadArticleActivity.java
+++ b/app/src/main/java/fr/gaulupeau/apps/Poche/ui/ReadArticleActivity.java
@@ -60,6 +60,7 @@
 import fr.gaulupeau.apps.Poche.network.ImageCacheUtils;
 import fr.gaulupeau.apps.Poche.service.OperationsHelper;
 import fr.gaulupeau.apps.Poche.tts.JsTtsController;
+import fr.gaulupeau.apps.Poche.tts.Parser;
 import fr.gaulupeau.apps.Poche.tts.TtsFragment;
 import fr.gaulupeau.apps.Poche.tts.TtsHost;
 
@@ -1292,6 +1293,12 @@ private boolean loadArticle(long id) {
         Log.v(TAG, "loadArticle() articleProgress: " + articleProgress);
         Log.v(TAG, "loadArticle() articleLanguage: " + article.getLanguage());
 
+        article.getContent();
+
+        Log.v(TAG, "loadArticle() parsing started");
+        new Parser().parse("<article>" + article.getContent() + "</article>");
+        Log.v(TAG, "loadArticle() parsing ended");
+
         return true;
     }
 