-
-
Notifications
You must be signed in to change notification settings - Fork 114
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Attempt to get publication year when auto-titling links (#520)
* port date scraper from python * bug fixes and cleanup * bug fixes and cleanup * refactor * address comments * make it intuitive * Update timedate-scraper.js - lint * address review comments * cleanup * simplify and don't use side effects --------- Co-authored-by: rleed <[email protected]>
- Loading branch information
Showing
2 changed files
with
95 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
// Date rule for use with page-metadata-parser.
// Based on https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
// Usage: import ruleSet and pass it as a custom rule in a call similar to: getMetadata(doc, url, { publicationDate: ruleSet })
// Some example URLs for testing purposes:

// ld+json example from 2018:
// https://mhagemann.medium.com/how-to-add-structured-json-ld-data-to-nuxt-js-8bb5f7c8a2d

// meta tag example from 2011:
// https://www.newyorker.com/magazine/2011/10/10/the-crypto-currency

// YouTube example from 2018:
// https://www.youtube.com/watch?v=YSUVRj8iznU

// A news article from 2023 (less than a year old, so no date should be displayed):
// https://edition.cnn.com/politics/live-news/matt-gaetz-kevin-mccarthy-house-speakership-10-03-23/index.html
|
||
/**
 * Pulls the digit-delimited portion out of a string and parses it as a date.
 *
 * The candidate is the span from the first digit to the last digit on the
 * same line (`.` does not cross newlines), so surrounding text or markup such
 * as "Published: 2018-05-04 by staff" is stripped before parsing.
 *
 * @param {string} dateString - Text that may contain a date.
 * @returns {Date|undefined} The parsed date, or undefined when the input is
 *   not a string, contains fewer than two digits, or the candidate cannot be
 *   parsed by the Date constructor.
 */
function cleanDateStr (dateString) {
  if (typeof dateString !== 'string') return undefined

  const candidate = dateString.match(/[0-9].*[0-9]/)
  if (!candidate) return undefined

  const parsed = new Date(candidate[0])
  // An Invalid Date object is still truthy, so callers that only test the
  // result for truthiness would treat unparseable text as a hit. Report
  // "no date" as undefined instead.
  return Number.isNaN(parsed.getTime()) ? undefined : parsed
}
|
||
// Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
// Compiled once at module load instead of on every call.
const URL_DATE_PATTERN = new RegExp(String.raw`([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?`)

/**
 * Looks for a date embedded in a URL (a 19xx/20xx year followed by a month
 * and an optional day, separated by `.`, `/`, `-` or `_`) and hands the
 * matched text to cleanDateStr for parsing.
 *
 * @param {string} url - URL to inspect. Non-string values (for example the
 *   null returned by getAttribute for a missing attribute) yield undefined
 *   rather than throwing.
 * @returns {Date|undefined} Whatever cleanDateStr produces for the matched
 *   portion, or undefined when the URL contains no date-like segment.
 */
export function extractFromURL (url) {
  if (typeof url !== 'string') return undefined

  const m = url.match(URL_DATE_PATTERN)
  return m ? cleanDateStr(m[0]) : undefined
}
|
||
/**
 * Converts a value to a Date, returning undefined instead of an Invalid Date.
 *
 * @param {*} str - Usually a date string; anything the Date constructor
 *   accepts (including an existing Date) also works. Falsy values are
 *   treated as "no date".
 * @returns {Date|undefined} A valid Date, or undefined when the value is
 *   falsy or cannot be parsed.
 */
function asDate (str) {
  if (!str) return undefined

  try {
    const d = new Date(str)
    return Number.isNaN(d.getTime()) ? undefined : d
  } catch {
    // The Date constructor throws for values it cannot coerce (e.g. a
    // Symbol); for this helper that simply means "not a date".
    return undefined
  }
}
|
||
/**
 * Parses the JSON-LD payload of a <script type="application/ld+json"> node.
 * Returns undefined instead of throwing when the payload is empty or is not
 * valid JSON, so one malformed script tag cannot abort date extraction.
 */
function parseLdJson (node) {
  try {
    return JSON.parse(node.innerHTML)
  } catch {
    return undefined
  }
}

// <meta> selectors whose content attribute is expected to hold the date
// itself. The order is kept exactly as originally written; the consumer
// presumably ranks earlier rules higher — confirm against page-metadata-parser.
const META_CONTENT_SELECTORS = [
  'meta[property="article:published_time"]',
  'meta[name="pubdate"]',
  'meta[name="publishdate"]',
  'meta[name="timestamp"]',
  'meta[name="dc.date.issued"]',
  'meta[name="date"]',
  'meta[property="bt:pubdate"]',
  'meta[name="parsely-pub-date"]',
  'meta[name="sailthru.date"]',
  'meta[name="article.published"]',
  'meta[name="published-date"]',
  'meta[name="article.created"]',
  'meta[name="article_date_original"]',
  'meta[name="cxenseparse:recs:publishtime"]',
  'meta[name="date_published"]',
  'meta[itemprop="datePublished"]',
  'meta[itemprop="datepublished"]',
  'meta[itemprop="datecreated"]',
  'meta[http-equiv="date"]'
]

/**
 * Rule set for extracting a publication date from a document.
 *
 * `rules` is a list of [CSS selector, handler] pairs; each handler receives a
 * matched node and returns a Date or undefined. `scorers` are functions of
 * (element, score) that return an adjusted score, or 0 when they have nothing
 * to say about the element.
 */
export const ruleSet = {
  // note meta names are case sensitive, and scorers must not favor rules when they will not return good results.
  rules: [
    ['script[type="application/ld+json"]', node => asDate(parseLdJson(node)?.datePublished)],
    ['script[type="application/ld+json"]', node => asDate(parseLdJson(node)?.dateCreated)],
    ['script[type="application/ld+json"]', node => asDate(parseLdJson(node)?.dateModified)],

    ...META_CONTENT_SELECTORS.map(selector => [selector, node => asDate(node.getAttribute('content'))]),
    // Image URLs frequently embed the upload date in their path.
    ['meta[property="og:image"]', node => asDate(extractFromURL(node.getAttribute('content')))],
    ['meta[itemprop="image"]', node => asDate(extractFromURL(node.getAttribute('content')))],

    // Prefer the machine-readable datetime attribute; fall back to the text of <time class="timestamp">.
    ['time', node => asDate(node.getAttribute('datetime') || (node.getAttribute('class') === 'timestamp' && node.innerHTML))],
    ['span[itemprop="datePublished"]', node => asDate(node.getAttribute('content') || cleanDateStr(node.innerHTML))],
    // One rule per tag/class pair, ordered span -> p -> div and, within each tag, by class name as listed.
    ...['span', 'p', 'div'].flatMap(tag =>
      ['pubdate', 'timestamp', 'article_date', 'articledate', 'date'].map(className =>
        [`${tag}[class="${className}"]`, node => asDate(cleanDateStr(node.innerHTML))]
      )
    )
  ],
  scorers: [
    // JSON-LD scripts carrying any of the three date fields get the largest boost.
    // NOTE(review): the boost applies to all three ld+json rules whenever any
    // one field is present, so the datePublished rule is boosted even when
    // only dateModified exists — confirm how the consumer handles a handler
    // that then returns undefined.
    (el, score) => {
      const isLdJson = el.localName === 'script' && el.getAttribute('type') === 'application/ld+json'
      if (!isLdJson || !el.innerHTML) return 0
      const data = parseLdJson(el)
      return data?.datePublished || data?.dateCreated || data?.dateModified ? 1000000 + score : 0
    },
    // Meta tags whose content looks like a date get a smaller boost.
    (el, score) => el.localName === 'meta' && el.getAttribute('content') && cleanDateStr(el.getAttribute('content')) ? 1000 + score : 0,
    // Everything else keeps the score it was given.
    (el, score) => !['script', 'meta'].includes(el.localName) ? score : 0
  ]
}