-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews-scraper.js
64 lines (59 loc) · 2.32 KB
/
news-scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/*
* Copyright (c) 2020 Gareth McNicol
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
"use strict";
const puppeteer = require("puppeteer");
/**
 * Waits for the not-logged-in popup's button to be present, then calls the
 * page's own `NonLoginPrivateUser()` function inside the browser context.
 *
 * NOTE(review): `NonLoginPrivateUser` is defined by the target site, not by
 * this file; presumably it dismisses the popup as an anonymous user — confirm.
 *
 * @param {object} page - Puppeteer page on which the popup is expected.
 * @returns {Promise<void>} Settles once the in-page call has been evaluated.
 */
async function clickAgree(page) {
  const popupButtonSelector =
    "div#UtypeNotLoginPopup.login-popup-container div.login-popup.popup-sub div.content p.login-popu-para span.login-btn-normal";

  // Only the presence of the button is awaited; the element itself is never
  // clicked — the site's handler is invoked directly instead.
  await page.waitForSelector(popupButtonSelector);

  await page.evaluate(() => {
    NonLoginPrivateUser();
  });
}
/**
 * Reads the `textContent` of a paragraph element and normalises it.
 *
 * Each line break (`\r\n`, `\n` or `\r`) becomes a single space, surrounding
 * whitespace is trimmed, and the result is lower-cased.
 *
 * @param {object} page - Puppeteer page used to evaluate in the browser context.
 * @param {object} currentParagraph - Handle of the element whose text is read.
 * @returns {Promise<string>} The normalised, lower-case text (may be empty).
 */
async function getParagraphText(page, currentParagraph) {
  const rawText = await page.evaluate((node) => node.textContent, currentParagraph);
  const singleLine = rawText.replace(/\r?\n|\r/g, " ");
  return singleLine.trim().toLowerCase();
}
/**
 * Collects the non-empty paragraphs found inside `#ArticleContent`.
 *
 * Waits for the article container, then reads every `<p>` beneath it one at
 * a time, in document order, via `getParagraphText`. Paragraphs whose
 * normalised text is empty are skipped.
 *
 * @param {object} page - Puppeteer page showing the article.
 * @returns {Promise<Array<{text: string}>>} One entry per non-empty paragraph.
 */
async function processContent(page) {
  await page.waitForSelector("#ArticleContent");
  const paragraphHandles = await page.$$("#ArticleContent p");

  const collected = [];
  for (const handle of paragraphHandles) {
    const normalised = await getParagraphText(page, handle);
    if (!normalised) {
      // Nothing left after normalisation; skip this paragraph.
      continue;
    }
    collected.push({ text: normalised });
  }
  return collected;
}
module.exports = async (url) => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setViewport({width: 3840, height: 2160});
await page.goto(url.href);
await clickAgree(page);
let paragraphs = await processContent(page);
await browser.close();
return paragraphs;
};