diff --git a/lib/cleaner.js b/lib/cleaner.js index 382b944..eed2140 100644 --- a/lib/cleaner.js +++ b/lib/cleaner.js @@ -1,6 +1,6 @@ // Generated by CoffeeScript 2.0.0-beta7 void function () { - var _, cleanArticleTags, cleanBadTags, cleanCodeBlocks, cleanEmTags, cleaner, cleanErrantLinebreaks, cleanParaSpans, cleanUnderlines, divToPara, getReplacementNodes, removeBodyClasses, removeDropCaps, removeNodesRegex, removeScriptsStyles, replaceWithPara; + var _, cleanArticleTags, cleanBadTags, cleanCodeBlocks, cleanEmTags, cleaner, cleanErrantLinebreaks, cleanParaSpans, cleanSelector, cleanUnderlines, divToPara, getReplacementNodes, removeBodyClasses, removeDropCaps, removeNodesRegex, removeScriptsStyles, replaceWithPara; _ = require('lodash'); module.exports = cleaner = function (doc) { removeBodyClasses(doc); @@ -16,6 +16,7 @@ void function () { removeNodesRegex(doc, /[^-]facebook/); removeNodesRegex(doc, /facebook-broadcasting/); removeNodesRegex(doc, /[^-]twitter/); + cleanSelector(doc, 'blockquote.instagram-media'); cleanParaSpans(doc); cleanUnderlines(doc); cleanErrantLinebreaks(doc); @@ -68,6 +69,9 @@ void function () { }); return doc(comments).remove(); }; + cleanSelector = function (doc, selector) { + return doc(selector).remove(); + }; cleanBadTags = function (doc) { var re, removeNodesRe, toRemove; removeNodesRe = '^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|partner-gravity-ad|video-full-transcript|storytopbar-bucket|utility-bar|inline-share-tools|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_replies'; diff --git a/lib/extractor.js b/lib/extractor.js index 0c954a7..510138b 100644 --- a/lib/extractor.js +++ b/lib/extractor.js @@ -1,6 +1,6 @@ // Generated by CoffeeScript 2.0.0-beta7 void function () { - var _, addSiblings, biggestTitleChunk, cleanText, cleanTitle, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, rawTitle, stopwords, updateNodeCount, updateScore; + var _, addSiblings, biggestTitleChunk, cleanNull, cleanText, cleanTitle, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, rawTitle, stopwords, updateNodeCount, updateScore; _ = require('lodash'); stopwords = require('./stopwords'); formatter = require('./formatter'); @@ -8,7 +8,7 @@ void function () { date: function (doc) { var cache$, cache$1, cache$2, cache$3, cache$4, dateCandidates; dateCandidates = doc("meta[property='article:published_time'], meta[itemprop*='datePublished'], meta[name='dcterms.modified'], meta[name='dcterms.date'], meta[name='DC.date.issued'], meta[name='dc.date.issued'], meta[name='dc.date.modified'], meta[name='dc.date.created'], meta[name='DC.date'], meta[name='DC.Date'], meta[name='dc.date'], meta[name='date'], time[itemprop*='pubDate'], time[itemprop*='pubdate'], span[itemprop*='datePublished'], span[property*='datePublished'], p[itemprop*='datePublished'], p[property*='datePublished'], div[itemprop*='datePublished'], div[property*='datePublished'], li[itemprop*='datePublished'], li[property*='datePublished'], time, span[class*='date'], p[class*='date'], div[class*='date']"); - return (null != dateCandidates && null != (cache$ = dateCandidates.first()) && null != (cache$1 = cache$.attr('content')) ? cache$1.trim() : void 0) || (null != dateCandidates && null != (cache$2 = dateCandidates.first()) && null != (cache$3 = cache$2.attr('datetime')) ? cache$3.trim() : void 0) || cleanText(null != dateCandidates && null != (cache$4 = dateCandidates.first()) ? cache$4.text() : void 0) || null; + return (null != (cache$ = cleanNull(null != dateCandidates && null != (cache$1 = dateCandidates.first()) ? cache$1.attr('content') : void 0)) ? cache$.trim() : void 0) || (null != (cache$2 = cleanNull(null != dateCandidates && null != (cache$3 = dateCandidates.first()) ? cache$3.attr('datetime') : void 0)) ? cache$2.trim() : void 0) || cleanText(null != dateCandidates && null != (cache$4 = dateCandidates.first()) ? cache$4.text() : void 0) || null; }, copyright: function (doc) { var cache$, copyright, copyrightCandidates, text; @@ -28,7 +28,7 @@ void function () { authorList = []; authorCandidates.each(function () { var author, cache$, cache$1; - author = null != (cache$ = doc(this)) && null != (cache$1 = cache$.attr('content')) ? cache$1.trim() : void 0; + author = null != (cache$ = cleanNull(null != (cache$1 = doc(this)) ? cache$1.attr('content') : void 0)) ? cache$.trim() : void 0; if (author) return authorList.push(author); }); @@ -42,8 +42,7 @@ void function () { publisher: function (doc) { var cache$, cache$1, publisherCandidates; publisherCandidates = doc("meta[property='og:site_name'], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']"); - if (null != publisherCandidates && null != (cache$ = publisherCandidates.first()) && null != (cache$1 = cache$.attr('content'))) - return cache$1.trim(); + return (null != (cache$ = cleanNull(null != publisherCandidates && null != (cache$1 = publisherCandidates.first()) ? cache$1.attr('content') : void 0)) ? cache$.trim() : void 0) || null; }, title: function (doc) { var titleText; @@ -75,8 +74,8 @@ void function () { image: function (doc) { var images; images = doc("meta[property='og:image'], meta[itemprop=image], meta[name='twitter:image:src'], meta[name='twitter:image'], meta[name='twitter:image0']"); - if (images.length > 0 && images.first().attr('content')) - return images.first().attr('content'); + if (images.length > 0 && cleanNull(images.first().attr('content'))) + return cleanNull(images.first().attr('content')); return null; }, links: function (doc, topNode, lang) { @@ -157,20 +156,18 @@ void function () { description: function (doc) { var cache$, cache$1, tag; tag = doc("meta[name=description], meta[property='og:description']"); - if (null != tag && null != (cache$ = tag.first()) && null != (cache$1 = cache$.attr('content'))) - return cache$1.trim(); + if (null != (cache$ = cleanNull(null != tag && null != (cache$1 = tag.first()) ? cache$1.attr('content') : void 0))) + return cache$.trim(); }, keywords: function (doc) { var tag; tag = doc('meta[name=keywords]'); - if (null != tag) - return tag.attr('content'); + return cleanNull(null != tag ? tag.attr('content') : void 0); }, canonicalLink: function (doc) { var tag; tag = doc('link[rel=canonical]'); - if (null != tag) - return tag.attr('href'); + return cleanNull(null != tag ? tag.attr('href') : void 0); }, tags: function (doc) { var elements, tags; @@ -471,8 +468,11 @@ void function () { }); return node; }; + cleanNull = function (text) { + return null != text ? text.replace(/^null$/g, '') : void 0; + }; cleanText = function (text) { - return text.replace(/[\r\n\t]/g, ' ').replace(/\s\s+/g, ' ').replace(//g, '').replace(/�/g, '').trim(); + return null != text ? text.replace(/[\r\n\t]/g, ' ').replace(/\s\s+/g, ' ').replace(//g, '').replace(/�/g, '').trim() : void 0; }; cleanTitle = function (title, delimiters) { var titleText, usedDelimeter; diff --git a/src/cleaner.coffee b/src/cleaner.coffee index 781f189..81d4a43 100644 --- a/src/cleaner.coffee +++ b/src/cleaner.coffee @@ -14,6 +14,7 @@ module.exports = cleaner = (doc) -> removeNodesRegex(doc, /[^-]facebook/) removeNodesRegex(doc, /facebook-broadcasting/) removeNodesRegex(doc, /[^-]twitter/) + cleanSelector(doc, 'blockquote.instagram-media') cleanParaSpans(doc) cleanUnderlines(doc) cleanErrantLinebreaks(doc) @@ -57,6 +58,9 @@ removeScriptsStyles = (doc) -> doc(comments).remove() +cleanSelector = (doc, selector) -> + doc(selector).remove() + cleanBadTags = (doc) -> removeNodesRe = "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|partner-gravity-ad|video-full-transcript|storytopbar-bucket|utility-bar|inline-share-tools|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_replies" re = new RegExp(removeNodesRe, "i");