-
Notifications
You must be signed in to change notification settings - Fork 7.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix(route): CCDI.gov.cn route broken (#10879)
* fix ccdi site * fix doc * upgrade rate limit countermeasure and sort the router alphabetically * set anticrawler flag in docs
- Loading branch information
Showing
8 changed files
with
113 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
/* eslint-disable no-await-in-loop */ | ||
const { rootUrl, parseNewsList, parseArticle } = require('./utils'); | ||
|
||
// Returns a pseudo-random integer between the two bounds after rounding min up and max down.
// The lower bound is inclusive, the upper bound is exclusive.
const getRandomInt = (min, max) => {
    const lower = Math.ceil(min);
    const span = Math.floor(max) - lower;
    return Math.floor(lower + Math.random() * span);
};
|
||
module.exports = async (ctx) => { | ||
const defaultPath = '/yaowenn/'; | ||
|
||
let pathname = ctx.path.replace(/(^\/ccdi|\/$)/g, ''); | ||
pathname = pathname === '' ? defaultPath : pathname.endsWith('/') ? pathname : pathname + '/'; | ||
const currentUrl = `${rootUrl}${pathname}`; | ||
|
||
const { list, title } = await parseNewsList(currentUrl, '.list_news_dl li', ctx); | ||
const items = []; | ||
|
||
for (const item of list) { | ||
items.push(await parseArticle(item, ctx)); | ||
// sleep randomly for anti rate limit on ccdi site | ||
await new Promise((r) => setTimeout(r, getRandomInt(1000, 2500))); | ||
} | ||
|
||
ctx.state.data = { | ||
title, | ||
link: currentUrl, | ||
item: items, | ||
}; | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
const cheerio = require('cheerio'); | ||
const { parseDate } = require('@/utils/parse-date'); | ||
const got = require('@/utils/got'); | ||
const timezone = require('@/utils/timezone'); | ||
|
||
const { CookieJar } = require('tough-cookie'); | ||
// Single cookie jar handed to every got() call made by parseNewsList and parseArticle. | ||
const cookieJar = new CookieJar(); | ||
|
||
// Site name: used as the fallback article author and as the replacement text in the feed title. | ||
const owner = '中央纪委国家监委网站'; | ||
// Origin of the CCDI website; exported so callers can build list-page URLs from it. | ||
const rootUrl = 'https://www.ccdi.gov.cn'; | ||
|
||
// Fetches a list page and extracts its entries.
//   url      - absolute URL of the list page; also the base for resolving relative entry links
//   selector - CSS selector matching one element per list entry
//   ctx      - request context; ctx.query.limit optionally caps the number of entries (default 8)
// Resolves to { list, title }. Each list element is { title, link, pubDate }; title is the text
// of the .other_Location element with the first match of /(.+)首页/ replaced by the owner name.
const parseNewsList = async (url, selector, ctx) => {
    const defaultLimit = 8;
    // Explicit radix, and a missing or non-numeric limit falls back to the default: a NaN
    // limit would make slice(0, NaN) return an empty list.
    const requestedLimit = Number.parseInt(ctx.query.limit, 10);
    const limit = Number.isNaN(requestedLimit) ? defaultLimit : requestedLimit;
    const response = await got(url, { cookieJar });
    const $ = cheerio.load(response.data);
    const list = $(selector)
        .slice(0, limit)
        .toArray()
        .map((element) => {
            const entry = $(element);
            // The first anchor of the entry supplies both the headline and the link.
            const anchor = entry.find('a').first();
            return {
                title: anchor.text().trim(),
                link: new URL(anchor.attr('href'), url).href,
                pubDate: parseDate(entry.find('span').text(), 'YYYY-MM-DD'),
            };
        });
    const title = $('.other_Location')
        .text()
        .replace(/(.+)首页/, owner);
    return { list, title };
};
|
||
// Completes a list entry with author, pubDate and description taken from its article page.
// The whole lookup goes through ctx.cache.tryGet keyed by item.link.
//   item - entry as produced by parseNewsList; it is mutated in place and returned
//   ctx  - request context providing ctx.cache
const parseArticle = async (item, ctx) =>
    await ctx.cache.tryGet(item.link, async () => {
        const response = await got(item.link, { cookieJar });
        const $ = cheerio.load(response.data);
        // Text of the .daty element; the regexes below pull the source and publish time out of it.
        const meta = $('.daty').text().trim();
        // Fall back to the site owner / the list-page date when the labels are not found.
        item.author = meta.match(/来源:(.*)发布时间/s)?.[1].trim() ?? owner;
        item.pubDate = timezone(parseDate(meta.match(/发布时间:(.*)分享/s)?.[1].trim() ?? item.pubDate), +8);
        const content = $('.content');
        // Change the img src from relative to absolute for a better compatibility
        content.find('img').each((_, el) => {
            const image = $(el);
            const src = image.attr('src');
            // An img without src must be left alone: new URL(undefined, base) would resolve
            // the literal string "undefined" and produce a bogus image address.
            if (src) {
                image.attr('src', new URL(src, item.link).href);
            }
            // oldsrc is causing freshrss imageproxy not to work correctly
            image.removeAttr('oldsrc').removeAttr('alt');
        });
        item.description = content.html();
        return item;
    });
|
||
// Public surface of this helper module: the site origin plus the list and article parsers. | ||
module.exports = { | ||
rootUrl, | ||
parseNewsList, | ||
parseArticle, | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters