diff --git a/api/resolvers/item.js b/api/resolvers/item.js index 6f0f2856d..50d4d3e67 100644 --- a/api/resolvers/item.js +++ b/api/resolvers/item.js @@ -499,7 +499,6 @@ export default { const doc = domino.createWindow(html).document const metadata = getMetadata(doc, url, { title: metadataRuleSets.title }) const datedata = extractArticlePublishedDate({ url, doc }) - console.log(datedata, (new Date() - datedata.date) / (1000 * 60 * 60 * 24)) const dateHint = (datedata && (new Date() - datedata.date) / (1000 * 60 * 60 * 24) > 365) ? ` (${datedata.date.getFullYear()})` : '' diff --git a/lib/timedate-scraper.js b/lib/timedate-scraper.js index 210ea4cd8..66334bcd8 100644 --- a/lib/timedate-scraper.js +++ b/lib/timedate-scraper.js @@ -47,6 +47,9 @@ exports.extractFromLDJson = function ({ url, doc }) { // returns { date, source } or undefined exports.extractFromMeta = function ({ url, doc }) { const dateRules = { + // note a couple of sad things: + // 1. meta names are case sensitive + // 2. rules stop if selector matches, even when callback returns false rules: [ ['meta[property="article:published_time"]', node => node.getAttribute('content')], ['meta[name="pubdate"]', node => node.getAttribute('content')], @@ -63,11 +66,11 @@ exports.extractFromMeta = function ({ url, doc }) { ['meta[name="article_date_original"]', node => node.getAttribute('content')], ['meta[name="cxenseparse:recs:publishtime"]', node => node.getAttribute('content')], ['meta[name="date_published"]', node => node.getAttribute('content')], + ['meta[itemprop="datePublished"]', node => node.getAttribute('content')], ['meta[itemprop="datepublished"]', node => node.getAttribute('content')], ['meta[itemprop="datecreated"]', node => node.getAttribute('content')], ['meta[http-equiv="date"]', node => node.getAttribute('content')], - // note: rules stop if selector matches, even when callback returns false ['meta[property="og:image"]', node => exports.extractFromURL(node.getAttribute('content'))], ['meta[itemprop="image"]', node => exports.extractFromURL(node.getAttribute('content'))] ] @@ -91,11 +94,11 @@ exports.extractFromHTMLTag = function ({ url, doc }) { ({ date } = getMetadata(doc, url, { date: { rules: [ - ['span[itemprop]="datePublished"]', node => node.getAttribute('content') || exports.parseStrDate(node.innerHTML)] + ['span[itemprop="datePublished"]', node => node.getAttribute('content') || exports.parseStrDate(node.innerHTML)] ] } })) - if (date) return { date, source: 'span[itemprop]="datePublished"]' } + if (date) return { date, source: 'span[itemprop="datePublished"]' } for (const tag of ['span', 'p', 'div']) { for (const className of ['pubdate', 'timestamp', 'article_date', 'articledate', 'date']) { @@ -112,22 +115,25 @@ exports.extractFromHTMLTag = function ({ url, doc }) { // returns { date, source } or undefined exports.extractArticlePublishedDate = function ({ url, doc }) { console.log('Extracting date from', url) - let articleDate + let foundDate try { - articleDate = { date: exports.extractFromURL(url), source: 'url' } - let possibleDate = exports.extractFromLDJson({ url, doc }) + // establish a default from the URL if possible + let possibleDate = exports.extractFromURL(url) + if (possibleDate) foundDate = { date: possibleDate, source: 'url' } + + // try to get date from various sources in order of precedence + possibleDate = exports.extractFromLDJson({ url, doc }) if (!possibleDate) possibleDate = exports.extractFromMeta({ url, doc }) if (!possibleDate) possibleDate = exports.extractFromHTMLTag({ url, doc }) - if (possibleDate) articleDate = possibleDate + if (possibleDate) foundDate = possibleDate } catch (e) { console.log('Exception in extractArticlePublishedDate for', url) - console.log(e) } - if (articleDate) { + if (foundDate) { try { - const d = new Date(articleDate.date) - if (!isNaN(d)) articleDate.date = d - } catch {} + const d = new Date(foundDate.date) + if (!isNaN(d)) foundDate.date = d + } catch { } } - return articleDate + return foundDate }