From 12b7f55f7061451a56b1e5c56faa013530ade7c7 Mon Sep 17 00:00:00 2001 From: Zhoukun Cheng Date: Thu, 2 May 2024 19:49:40 +0800 Subject: [PATCH 01/10] fix(route/linkedin): handle missing elements in jobs parsing (#15438) --- lib/routes/linkedin/utils.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/routes/linkedin/utils.ts b/lib/routes/linkedin/utils.ts index d069ddb2280ae7..40ef3db80e55f1 100644 --- a/lib/routes/linkedin/utils.ts +++ b/lib/routes/linkedin/utils.ts @@ -87,11 +87,11 @@ function parseJobSearch(data) { const jobs = $('li') .map((i, elem) => { const elemHtml = $(elem); - const link = elemHtml.find('a.base-card__full-link').attr('href').split('?')[0]; - const title = elemHtml.find('h3.base-search-card__title').text().trim(); - const company = elemHtml.find('h4.base-search-card__subtitle').text().trim(); - const location = elemHtml.find('span.job-search-card__location').text().trim(); - const pubDate = elemHtml.find('time').attr('datetime'); + const link = elemHtml.find('a.base-card__full-link, a.base-card--link')?.attr('href')?.split('?')[0]; + const title = elemHtml.find('h3.base-search-card__title')?.text()?.trim(); + const company = elemHtml.find('h4.base-search-card__subtitle')?.text()?.trim(); + const location = elemHtml.find('span.job-search-card__location')?.text()?.trim(); + const pubDate = elemHtml.find('time')?.attr('datetime'); return new Job(title, link, company, location, pubDate); }) From 1f72365451acd8394c8d0374a4e380b8197b7f9f Mon Sep 17 00:00:00 2001 From: sddzhyc <41501986+sddzhyc@users.noreply.github.com> Date: Thu, 2 May 2024 20:42:28 +0800 Subject: [PATCH 02/10] =?UTF-8?q?feat(route):=20Add=20=E4=B8=AD=E5=9B=BD?= =?UTF-8?q?=E7=9F=B3=E6=B2=B9=E5=A4=A7=E5=AD=A6=EF=BC=88=E5=8D=8E=E4=B8=9C?= =?UTF-8?q?=EF=BC=89=E6=95=99=E5=8A=A1=E5=A4=84=E9=80=9A=E7=9F=A5=E5=85=AC?= =?UTF-8?q?=E5=91=8A=20(#15427)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 添加upc/jwc路由 * 
规范代码风格 * Apply suggestions from code review Co-authored-by: Tony * Fixing the issues from reviews * fix link and timezone conversion --------- --- lib/routes/upc/jwc.ts | 121 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 lib/routes/upc/jwc.ts diff --git a/lib/routes/upc/jwc.ts b/lib/routes/upc/jwc.ts new file mode 100644 index 00000000000000..e96772d2551a90 --- /dev/null +++ b/lib/routes/upc/jwc.ts @@ -0,0 +1,121 @@ +// 导入必要的模组 +import { Route } from '@/types'; +import got from '@/utils/got'; +import { load } from 'cheerio'; +import { parseDate } from '@/utils/parse-date'; +import cache from '@/utils/cache'; +import timezone from '@/utils/timezone'; + +const typeDict = { + tzgg: '', // 默认为所有通知 + '18519': '教学·运行-', // 教学·运行 + '18520': '学业·学籍-', // 学业·学籍 + '18521': '教学·研究-', // 教学·研究 + '18522': '课程·教材-', // 课程·教材 + '18523': '实践·教学-', // 实践·教学 + '18524': '创新·创业-', // 创新·创业 + yywwz: '语言·文字-', // 语言·文字 + jxwjy: '继续·教育-', // 继续·教育 + bkwzs: '本科·招生-', // 本科·招生 +}; + +// module.exports = async (ctx) => { +const handler = async (ctx) => { + // 从 URL 参数中获取通知分类 + const { type = 'tzgg' } = ctx.req.param(); + // console.log(type); + const baseUrl = 'https://jwc.upc.edu.cn'; + const { data: response } = await got(`${baseUrl}/${type}/list.htm`); + // console.log(`${baseUrl}/${typeDict[type]}/list.htm`); + const $ = load(response); + // const listItems = $('ul.news_list').find('li'); + // console.log(`List item count: ${listItems.length}`); + // const list = $('ul.news_list') 只会得到第一个li + const list = $('ul.news_list') + .find('li') + // 使用“toArray()”方法将选择的所有 DOM 元素以数组的形式返回。 + .toArray() + // 使用“map()”方法遍历数组,并从每个元素中解析需要的数据。 + .map((item) => { + // console.log(item); + item = $(item); + const a = item.find('a').first(); + let linkStr = a.attr('href'); + // 若链接不是以http开头,则加上前缀 + if (a.attr('href').startsWith('http://')) { + // 改为https访问 + linkStr = linkStr.replace('http://', 'https://'); + } else { + linkStr = `${baseUrl}${a.attr('href')}`; + } + 
return { + title: a.text(), + link: linkStr, + pubDate: timezone(parseDate(item.find('.news_meta').text()), +8), // 添加发布日期查询 + }; + }); + + const items = await Promise.all( + list.map((item) => + cache.tryGet(item.link, async () => { + const { data: response } = await got(item.link); + const $ = load(response); + // 选择类名为“comment-body”的第一个元素 + item.description = $('.read').first().html(); + // item.pubDate = $('.arti_update').html() === null ? '' : $('.arti_update').html().slice(5, 15); + // item.publisher = $('.arti_publisher').html(); + item.author = $('.arti_publisher').html(); + // console.log($('.arti_update').html().slice(5, 15)); + // 上面每个列表项的每个属性都在此重用, + // 并增加了一个新属性“description” + return item; + }) + ) + ); + + /* ctx.state.data = { + // 源标题 + title: `${typeDict[type]}教务处通知-中国石油大学(华东)`, + // 源链接 + link: `https://jwc.upc.edu.cn/tzgg/list.htm`, + // 源文章 + item: items, + }; */ + + return { + // 源标题 + title: `${typeDict[type]}教务处通知-中国石油大学(华东)`, + // 源链接 + link: `${baseUrl}/${type}/list.htm`, + // 源文章 + item: items, + }; +}; + +export const route: Route = { + path: '/jwc/:type?', + categories: ['university'], + example: '/upc/jwc/tzgg', + parameters: { type: '分类,见下表,其值与对应网页url路径参数一致,默认为所有通知' }, + features: { + requireConfig: false, + requirePuppeteer: false, + antiCrawler: true, + supportBT: false, + supportPodcast: false, + supportScihub: false, + }, + radar: [ + { + source: ['jwc.upc.edu.cn', 'jwc.upc.edu.cn/:type/list.htm'], + target: '/jwc/:type?', + }, + ], + name: '教务处通知公告', + maintainers: ['sddzhyc'], + description: `| 所有通知 | 教学·运行 | 学业·学籍 | 教学·研究 | 课程·教材 | 实践·教学 | 创新·创业 | 语言·文字 | 继续·教育 | 本科·招生 | + | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | + | tzgg | 18519 | 18520 | 18521 | 18522 | 18523 | 18524 | yywwz | jxwjy | bkwzs |`, + url: 'jwc.upc.edu.cn/tzgg/list.htm', + handler, +}; From 4a8c56122e4d5542913091bee3705b62ae395c71 Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 2 May 2024 
06:08:24 -0700 Subject: [PATCH 03/10] fix(route): techcrunch cleanup useless nav element (#15441) --- lib/routes/techcrunch/news.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/routes/techcrunch/news.ts b/lib/routes/techcrunch/news.ts index 5780e78f6b99eb..09ff9ab9031b85 100644 --- a/lib/routes/techcrunch/news.ts +++ b/lib/routes/techcrunch/news.ts @@ -44,6 +44,8 @@ async function handler() { const description = $('#root'); description.find('.article__title').remove(); description.find('.article__byline__meta').remove(); + description.find('.mobile-header-nav').remove(); + description.find('.desktop-nav').remove(); return { title: item.title, pubDate: item.pubDate, From b251ae16ff245ca8f3c6e7b8c2f35e43a3e4d9c7 Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 2 May 2024 06:50:19 -0700 Subject: [PATCH 04/10] feat(route): magnumphotos (#15442) --- lib/routes/magnumphotos/magazine.ts | 63 ++++++++++++++++++++++++++++ lib/routes/magnumphotos/namespace.ts | 6 +++ 2 files changed, 69 insertions(+) create mode 100644 lib/routes/magnumphotos/magazine.ts create mode 100644 lib/routes/magnumphotos/namespace.ts diff --git a/lib/routes/magnumphotos/magazine.ts b/lib/routes/magnumphotos/magazine.ts new file mode 100644 index 00000000000000..6ca90ef7c8d6fe --- /dev/null +++ b/lib/routes/magnumphotos/magazine.ts @@ -0,0 +1,63 @@ +import { Route } from '@/types'; +import cache from '@/utils/cache'; +import parser from '@/utils/rss-parser'; +import ofetch from '@/utils/ofetch'; +import { load } from 'cheerio'; +const host = 'https://www.magnumphotos.com'; +export const route: Route = { + path: '/magazine', + categories: ['picture'], + example: '/magnumphotos/magazine', + parameters: {}, + features: { + requireConfig: false, + requirePuppeteer: false, + antiCrawler: false, + supportBT: false, + supportPodcast: false, + supportScihub: false, + }, + radar: [ + { + source: ['magnumphotos.com/'], + }, + ], + name: 'Magazine', + maintainers: ['EthanWng97'], + handler, + url: 
'magnumphotos.com/', +}; + +async function handler() { + const rssUrl = `${host}/feed/`; + const feed = await parser.parseURL(rssUrl); + const items = await Promise.all( + feed.items.map((item) => + cache.tryGet(item.link, async () => { + if (!item.link) { + return; + } + const data = await ofetch(item.link); + const $ = load(data); + const description = $('#content'); + description.find('ul.share').remove(); + description.find('h1').remove(); + + return { + title: item.title, + pubDate: item.pubDate, + link: item.link, + category: item.categories, + description: description.html(), + }; + }) + ) + ); + + return { + title: 'Magnum Photos', + link: host, + description: 'Magnum is a community of thought, a shared human quality, a curiosity about what is going on in the world, a respect for what is going on and a desire to transcribe it visually', + item: items, + }; +} diff --git a/lib/routes/magnumphotos/namespace.ts b/lib/routes/magnumphotos/namespace.ts new file mode 100644 index 00000000000000..36e2f12075e8f9 --- /dev/null +++ b/lib/routes/magnumphotos/namespace.ts @@ -0,0 +1,6 @@ +import type { Namespace } from '@/types'; + +export const namespace: Namespace = { + name: 'Magnum Photos', + url: 'magnumphotos.com', +}; From 5c687c718988d02fe62dad67b0f60e268b405d90 Mon Sep 17 00:00:00 2001 From: Tony Date: Thu, 2 May 2024 23:28:12 +0800 Subject: [PATCH 05/10] style(eslint): enable no-array-callback-reference --- .eslintrc.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.eslintrc.json b/.eslintrc.json index 6b758a500b3a95..4bfca3e1dcb3ad 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -59,7 +59,7 @@ "unicorn/explicit-length-check": 0, "unicorn/filename-case": ["error", { "case": "kebabCase", "ignore": [".*\\.(yaml|yml)$", "RequestInProgress\\.js$"] }], "unicorn/new-for-builtins": 0, - "unicorn/no-array-callback-reference": 0, + "unicorn/no-array-callback-reference": 1, "unicorn/no-array-reduce": 1, "unicorn/no-await-expression-member": 0, 
"unicorn/no-empty-file": 1, From aecdcd98222be865ba741f87a417451de23f04e5 Mon Sep 17 00:00:00 2001 From: Ethan Shen <42264778+nczitzk@users.noreply.github.com> Date: Fri, 3 May 2024 01:44:39 +0800 Subject: [PATCH 06/10] =?UTF-8?q?fix(route):=20IT=E4=B9=8B=E5=AE=B6?= =?UTF-8?q?=E4=B8=93=E9=A2=98=20(#15446)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(route): IT之家专题 * docs: add link * Update lib/routes/ithome/zt.ts --------- --- lib/routes/ithome/templates/description.art | 13 ++ lib/routes/ithome/zt.ts | 174 +++++++++++++------- 2 files changed, 131 insertions(+), 56 deletions(-) create mode 100644 lib/routes/ithome/templates/description.art diff --git a/lib/routes/ithome/templates/description.art b/lib/routes/ithome/templates/description.art new file mode 100644 index 00000000000000..0a7f83a6f60fb1 --- /dev/null +++ b/lib/routes/ithome/templates/description.art @@ -0,0 +1,13 @@ +{{ if images }} + {{ each images image }} + {{ if image?.src }} +
+ {{ image.alt }} +
+ {{ /if }} + {{ /each }} +{{ /if }} \ No newline at end of file diff --git a/lib/routes/ithome/zt.ts b/lib/routes/ithome/zt.ts index a33e5270c88780..4e7a251dd26cb6 100644 --- a/lib/routes/ithome/zt.ts +++ b/lib/routes/ithome/zt.ts @@ -1,88 +1,150 @@ import { Route } from '@/types'; +import { getCurrentPath } from '@/utils/helpers'; +const __dirname = getCurrentPath(import.meta.url); + import cache from '@/utils/cache'; import got from '@/utils/got'; import { load } from 'cheerio'; import timezone from '@/utils/timezone'; import { parseDate } from '@/utils/parse-date'; +import { art } from '@/utils/render'; +import path from 'node:path'; -export const route: Route = { - path: '/zt/:id', - categories: ['new-media'], - example: '/ithome/zt/xijiayi', - parameters: { id: '专题 id' }, - features: { - requireConfig: false, - requirePuppeteer: false, - antiCrawler: false, - supportBT: false, - supportPodcast: false, - supportScihub: false, - }, - radar: [ - { - source: ['ithome.com/zt/:id'], - }, - ], - name: '专题', - maintainers: ['nczitzk'], - handler, - description: `所有专题请见[此处](https://www.ithome.com/zt)`, -}; - -async function handler(ctx) { - const id = ctx.req.param('id'); +export const handler = async (ctx) => { + const { id = 'xijiayi' } = ctx.req.param(); + const limit = ctx.req.query('limit') ? Number.parseInt(ctx.req.query('limit'), 10) : 50; const rootUrl = 'https://www.ithome.com'; - const currentUrl = `${rootUrl}/zt/${id}`; + const currentUrl = new URL(`zt/${id}`, rootUrl).href; + + const { data: response } = await got(currentUrl); - const response = await got({ - method: 'get', - url: currentUrl, - }); + const $ = load(response); - const $ = load(response.data); + const author = 'IT之家'; + const language = 'zh'; - const list = $('.newsbody a') - .map((_, item) => { + let items = $('div.newsbody') + .slice(0, limit) + .toArray() + .map((item) => { item = $(item); + const title = item.find('h2').text(); + const image = item.find('img').prop('data-original') ?? 
item.find('img').prop('src'); + return { - title: item.text(), - link: item.attr('href'), + title, + pubDate: timezone( + parseDate( + item + .find('span.time script') + .text() + .match(/'(.*?)'/)?.[1] + ), + +8 + ), + link: item.find('a').first().prop('href'), + author: item.find('div.editor').contents().first().text(), + image, + banner: image, + language, }; - }) - .get(); + }); - const items = await Promise.all( - list.map((item) => + items = await Promise.all( + items.map((item) => cache.tryGet(item.link, async () => { - const detailResponse = await got({ - method: 'get', - url: item.link, - }); + const { data: detailResponse } = await got(item.link); + + const $$ = load(detailResponse); + + $$('p.ad-tips, a.topic-bar').remove(); - const content = load(detailResponse.data); - const post = content('.post_content'); + $$('div#paragraph p img').each((_, el) => { + el = $$(el); - post.find('img[data-original]').each((_, ele) => { - ele = $(ele); - ele.attr('src', ele.attr('data-original')); - ele.removeAttr('class'); - ele.removeAttr('data-original'); + const src = el.prop('data-original'); + + if (src) { + el.replaceWith( + art(path.join(__dirname, 'templates/description.art'), { + images: [ + { + src, + alt: el.prop('alt'), + }, + ], + }) + ); + } }); - item.description = post.html(); - item.author = content('#author_baidu').text().replace('作者:', ''); - item.pubDate = timezone(parseDate(content('#pubtime_baidu').text()), +8); + const title = $$('h1').text(); + const description = $$('div#paragraph').html(); + const image = $$('div#paragraph img').first().prop('src'); + + item.title = title; + item.description = description; + item.pubDate = timezone(parseDate($$('span#pubtime_baidu').text()), +8); + item.category = $$('div.cv a') + .toArray() + .map((c) => $$(c).text()) + .slice(1); + item.author = $$('span#author_baidu').contents().last().text() || $$('span#source_baidu').contents().last().text() || $$('span#editor_baidu').contents().last().text(); + item.content = 
{ + html: description, + text: $$('div#paragraph').text(), + }; + item.image = image; + item.banner = image; + item.language = language; return item; }) ) ); + const image = new URL($('meta[property="og:image"]').prop('content'), rootUrl).href; + return { - title: `${$('title').text()} - IT之家`, + title: `${author} - ${$('title').text()}`, + description: $('meta[name="description"]').prop('content'), link: currentUrl, item: items, + allowEmpty: true, + image, + author, + language, }; -} +}; + +export const route: Route = { + path: '/zt/:id?', + name: '专题', + url: 'ithome.com', + maintainers: ['nczitzk'], + handler, + example: '/ithome/zt/xijiayi', + parameters: { id: '专题 id,默认为 xijiayi,即 [喜加一](https://www.ithome.com/zt/xijiayi),可在对应专题页 URL 中找到' }, + description: `:::tip + 更多专题请见 [IT之家专题](https://www.ithome.com/zt) + :::`, + categories: ['new-media'], + + features: { + requireConfig: false, + requirePuppeteer: false, + antiCrawler: false, + supportRadar: true, + supportBT: false, + supportPodcast: false, + supportScihub: false, + }, + radar: [ + { + source: ['ithome.com/zt/:id'], + target: '/zt/:id', + }, + ], +}; From c2510e0e2e1d332e6bc08c5b72bcef0a8b0acd81 Mon Sep 17 00:00:00 2001 From: Tony Date: Fri, 3 May 2024 04:12:36 +0800 Subject: [PATCH 07/10] fix(route): set preload to metadata as suggested by the [spec](https://html.spec.whatwg.org/multipage/media.html#attr-media-preload) (#15448) --- lib/routes/caixin/templates/article.art | 2 +- lib/routes/douyin/templates/embed.art | 2 +- lib/routes/fansly/templates/media.art | 2 +- lib/routes/ifeng/templates/video.art | 2 +- lib/routes/instagram/templates/video.art | 2 +- lib/routes/kcna/utils.ts | 2 +- lib/routes/mingpao/templates/fancybox.art | 2 +- lib/routes/missav/templates/preview.art | 2 +- lib/routes/pikabu/templates/video.art | 2 +- lib/routes/pornhub/templates/description.art | 2 +- lib/routes/sina/templates/video.art | 2 +- lib/routes/tiktok/templates/user.art | 2 +- lib/routes/zhihu/topic.ts | 2 +- 
13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/routes/caixin/templates/article.art b/lib/routes/caixin/templates/article.art index 6b33c31ead9f40..fd30a943eb5570 100644 --- a/lib/routes/caixin/templates/article.art +++ b/lib/routes/caixin/templates/article.art @@ -15,7 +15,7 @@ <% const video = $('script').text().match(/initPlayer\('(.*?)','(.*?)'\)/); %> <% const videoUrl = video[1]; %> <% const poster = video[2]; %> - +
{{ /if }} diff --git a/lib/routes/douyin/templates/embed.art b/lib/routes/douyin/templates/embed.art index 530226b263a806..e8f32e69091d14 100644 --- a/lib/routes/douyin/templates/embed.art +++ b/lib/routes/douyin/templates/embed.art @@ -1,4 +1,4 @@ -