Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Attempt to get publication year when auto-titling links #520

Merged
merged 10 commits into from
Oct 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion api/resolvers/item.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { ensureProtocol, removeTracking } from '../../lib/url'
import { serializeInvoicable } from './serial'
import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
import { getMetadata, metadataRuleSets } from 'page-metadata-parser'
import { ruleSet as publicationDateRuleSet } from '../../lib/timedate-scraper'
import domino from 'domino'
import {
ITEM_SPAM_INTERVAL, ITEM_FILTER_THRESHOLD,
Expand All @@ -17,6 +18,7 @@ import { advSchema, amountSchema, bountySchema, commentSchema, discussionSchema,
import { sendUserNotification } from '../webPush'
import { defaultCommentSort } from '../../lib/item'
import { notifyItemParents, notifyUserSubscribers, notifyZapped } from '../../lib/push-notifications'
import { datePivot } from '../../lib/time'

export async function commentFilterClause (me, models) {
let clause = ` AND ("Item"."weightedVotes" - "Item"."weightedDownVotes" > -${ITEM_FILTER_THRESHOLD}`
Expand Down Expand Up @@ -540,8 +542,12 @@ export default {
const response = await fetch(ensureProtocol(url), { redirect: 'follow' })
const html = await response.text()
const doc = domino.createWindow(html).document
const metadata = getMetadata(doc, url, { title: metadataRuleSets.title })
const metadata = getMetadata(doc, url, { title: metadataRuleSets.title, publicationDate: publicationDateRuleSet })
const dateHint = ` (${metadata.publicationDate?.getFullYear()})`
const moreThanOneYearAgo = metadata.publicationDate && metadata.publicationDate < datePivot(new Date(), { years: -1 })

res.title = metadata?.title
if (moreThanOneYearAgo) res.title += dateHint
} catch { }

try {
Expand Down
88 changes: 88 additions & 0 deletions lib/timedate-scraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Date rule for use with page-metadata-parser.
// Based on https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
// Usage: import ruleSet and use in a call similar to: getMetadata(doc, url, { publicationDate: ruleSet.publicationDate })
// Some example URLs for testing purposes:

// ld+json example from 2018:
// https://mhagemann.medium.com/how-to-add-structured-json-ld-data-to-nuxt-js-8bb5f7c8a2d

// meta tag example from 2011:
// https://www.newyorker.com/magazine/2011/10/10/the-crypto-currency

// YouTube example from 2018:
// https://www.youtube.com/watch?v=YSUVRj8iznU

// A news article from 2023 (< 1 year, should not display a date):
// https://edition.cnn.com/politics/live-news/matt-gaetz-kevin-mccarthy-house-speakership-10-03-23/index.html

function cleanDateStr (dateString) {
try {
return new Date(dateString.match(String.raw`[0-9].*[0-9]`)[0])
} catch {}
}

export function extractFromURL (url) {
// Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
const m = url.match(String.raw`([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?`)
if (m) {
return cleanDateStr(m[0])
}
}

function asDate (str) {
if (str) {
try {
const d = new Date(str)
if (!isNaN(d)) return d
} catch { }
}
}

export const ruleSet = {
// note meta names are case sensitive, and scorers must not favor rules when they will not return good results.
rules: [
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.datePublished)],
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateCreated)],
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateModified)],

['meta[property="article:published_time"]', node => asDate(node.getAttribute('content'))],
['meta[name="pubdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="publishdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="timestamp"]', node => asDate(node.getAttribute('content'))],
['meta[name="dc.date.issued"]', node => asDate(node.getAttribute('content'))],
['meta[name="date"]', node => asDate(node.getAttribute('content'))],
['meta[property="bt:pubdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="parsely-pub-date"]', node => asDate(node.getAttribute('content'))],
['meta[name="sailthru.date"]', node => asDate(node.getAttribute('content'))],
['meta[name="article.published"]', node => asDate(node.getAttribute('content'))],
['meta[name="published-date"]', node => asDate(node.getAttribute('content'))],
['meta[name="article.created"]', node => asDate(node.getAttribute('content'))],
['meta[name="article_date_original"]', node => asDate(node.getAttribute('content'))],
['meta[name="cxenseparse:recs:publishtime"]', node => asDate(node.getAttribute('content'))],
['meta[name="date_published"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datePublished"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datepublished"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datecreated"]', node => asDate(node.getAttribute('content'))],
['meta[http-equiv="date"]', node => asDate(node.getAttribute('content'))],
['meta[property="og:image"]', node => asDate(extractFromURL(node.getAttribute('content')))],
['meta[itemprop="image"]', node => asDate(extractFromURL(node.getAttribute('content')))],

['time', node => asDate(node.getAttribute('datetime') || (node.getAttribute('class') === 'timestamp' && node.innerHTML))],
['span[itemprop="datePublished"]', node => asDate(node.getAttribute('content') || cleanDateStr(node.innerHTML))],
...['span', 'p', 'div'].map(tag => {
return ['pubdate', 'timestamp', 'article_date', 'articledate', 'date'].map(className => {
return [`${tag}[class="${className}"]`, node => asDate(cleanDateStr(node.innerHTML))]
})
}).flat()
],
scorers: [
(el, score) => {
if (el.localName === 'script' && el.getAttribute('type') === 'application/ld+json' && el.innerHTML) {
const data = JSON.parse(el.innerHTML)
return data?.datePublished || data?.dateCreated || data?.dateModified ? 1000000 + score : 0
}
},
(el, score) => el.localName === 'meta' && el.getAttribute('content') && cleanDateStr(el.getAttribute('content')) ? 1000 + score : 0,
(el, score) => !['script', 'meta'].includes(el.localName) ? score : 0
]
}