-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraperToDB.js
190 lines (169 loc) · 8.88 KB
/
scraperToDB.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
// ******** ------- Scrape data from art-world news sites and send it to the database ------- ********
let cron = require('node-cron');
//*** ---------- Connections to mongo server and schema file ----------- *
const mongo = require('./mongo');
// const mongoose = require('mongoose');
const articleSchema = require('./schemas/article-schema');
//*** ---------- URL links to art news resources ----------- *
const puppeteer = require('puppeteer');
const artNpapUrl = 'https://www.theartnewspaper.com/news';
const artNUrl = 'https://www.artnews.com/c/art-news/news/';
const artNetUrl = 'https://news.artnet.com/';
const hyperAUrl = 'https://hyperallergic.com/category/art/';
const artsyUrl = 'https://www.artsy.net/articles';
const artForUrl ='https://www.artforum.com/news';
//*** ----------------- Main News Scraper -------------------- *
function scrape () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch({
headless: true,
args: ["--no-sandbox"]
});
const page = await browser.newPage();
// *** --------- The Art Newspaper ------- *
await page.goto(artNpapUrl);
let urls1 = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('.cp-comp');
items.forEach((item) => {
articleUrl = item.querySelector('.cp-link').getAttribute('href');//<-- fix relative url --
results.push({
source: 'artNewsPaper',
linkUrl: `https://www.theartnewspaper.com${articleUrl}`,
imgUrl:item.querySelector('.cp-thumbnail-cont .cp-thumbnail').getAttribute('data-bg'),
title: item.querySelector('.cp-details .cp-preview-headline').innerText,
descript: item.querySelector('.cp-details .cp-excerpt').innerText,
});
});
return results.slice(0, 6);
});
// *** --------- Art News ------- *
await page.goto(artNUrl);
let urls2 = await page.evaluate(() => {
let results2 = [];
let items2 = document.querySelectorAll('article.story');
items2.forEach((item) => {
results2.push({
source: 'artNews',
linkUrl: item.querySelector('.lrv-a-grid .a-span2 h3 a').getAttribute('href'),
imgUrl:item.querySelector('.lrv-a-grid img.c-lazy-image__img').getAttribute('data-lazy-src'),
title: item.querySelector('.lrv-a-grid .a-span2 h3 a').innerText,
descript: item.querySelector('.lrv-a-grid .a-span2 p').innerText,
});
});
return results2.slice(0, 6);
});
// *** --------- ArtNet ------- *
await page.goto(artNetUrl);
let urls3 = await page.evaluate(() => {
let results3 = [];
let items3 = document.querySelectorAll('.media .teaser');
items3.forEach((item) => {
results3.push({
source: 'artNet',
linkUrl: item.querySelector('.teaser-info a').getAttribute('href'),
imgUrl:item.querySelector('.teaser-image .image-wrapper img').getAttribute('src'),
title: item.querySelector('.teaser-info a .teaser-title').innerText,
descript: item.querySelector('.teaser-info a .teaser-blurb').innerText,
});
});
return results3.slice(0, 6);
});
// *** --------- Hyperallergic ------- *
await page.goto(hyperAUrl);
let urls4 = await page.evaluate(() => {
let results4 = [];
let items4 = document.querySelectorAll('article.post');
items4.forEach((item) => {
results4.push({
source: 'hyperAllergic',
linkUrl: item.querySelector('.entry-container .entry-header .entry-title a').getAttribute('href'),
imgUrl:item.querySelector('.post figure a amp-img').getAttribute('src'),
title: item.querySelector('.entry-container .entry-header .entry-title').innerText,
descript: item.querySelector('.entry-container .entry-content p').innerText,
});
});
return results4.slice(0, 6);
});
// *** --------- Artsy ------- *
await page.goto(artsyUrl);
let urls5 = await page.evaluate(() => {
let results5 = [];
let items5 = document.querySelectorAll('.article-figure-container');
items5.forEach((item) => {
articleUrl = item.querySelector('a').getAttribute('href');//<-- fix relative url --
results5.push({
source: 'artsy',
linkUrl: `https://www.artsy.net/${articleUrl}`,
imgUrl:item.querySelector('a.article-figure-img-container .article-figure-img').getAttribute('style'),
title: item.querySelector('.article-figure-figcaption .article-figure-title').innerText,
descript: "Artsy does not provide a description"
});
});
return results5.slice(0, 6);
});
// *** --------- Art Forum ------- *
await page.goto(artForUrl);
let urls6 = await page.evaluate(() => {
let results6 = [];
let items6 = document.querySelectorAll('.news-list__main');
items6.forEach((item) => {
results6.push({
source: 'artForum',
linkUrl: item.querySelector('a').getAttribute('href'),
imgUrl:item.querySelector('.news-list__main .image-container img').getAttribute('src'),
title: item.querySelector('.news-list__main .news-list__words .news__title a').innerText,
descript: item.querySelector('.news-list__main .news-list__words .news-list__content p').innerText
});
});
return results6.slice(0, 6);
});
//*** -------------------- A collection of Objects with Arrays of news links for each Url ---------------------- *
// browser.close();
return resolve({artNP:urls1, artNews:urls2, artNet:urls3, hyperA:urls4, artsy:urls5, artForum:urls6});
} catch (e) {
return reject(e);
}
})
}
cron.schedule('0 */8 * * * ', () => { // '0 */8 * * * ' <----- cron - run 3 times a day **** ###
scrape().then(function(value) {
let artColArray = value;
let artNewsPa = artColArray.artNP;
let artNews = artColArray.artNews;
let artNet = artColArray.artNet;
let hyperA = artColArray.hyperA;
let artsy = artColArray.artsy;
let artForum = artColArray.artForum;
addToMongoDB(artNewsPa, artNews, artNet, hyperA, artsy, artForum);
}).catch(console.error);
}); // <----- cron **** ###
const addToMongoDB = async (artNewsPa, artNews, artNet, hyperA, artsy, artForum) => {
await mongo().then(async (mongoose) => {
try {
console.log('Connected to mongoDb!');
// *-- Remove older links from database --------*
await articleSchema.artNewsP.deleteMany({ "source": "artNewsPaper" });
await articleSchema.artNews.deleteMany({ "source": "artNews" });
await articleSchema.artNet.deleteMany({ "source": "artNet" });
await articleSchema.hyperAll.deleteMany({ "source": "hyperAll" });
await articleSchema.artsy.deleteMany({ "source": "artsy" });
await articleSchema.artForums.deleteMany({ "source": "artForum" });
// *-- Call addNewData and add to database ---------*
await addNewData(artNewsPa, artNews, artNet, hyperA, artsy, artForum);
console.log(artNewsPa);
} finally {
mongoose.connection.close()
}
})
};
// *-- Add new links to database ---------*
async function addNewData(src1, src2, src3, src4, src5, src6){
await articleSchema.artNewsP.insertMany(src1);
await articleSchema.artNews.insertMany(src2);
await articleSchema.artNet.insertMany(src3);
await articleSchema.hyperAll.insertMany(src4);
await articleSchema.artsy.insertMany(src5);
await articleSchema.artForums.insertMany(src6);
};