From 67e7855413c533ef4cd86c82b8f0f17978f922d7 Mon Sep 17 00:00:00 2001 From: Rongrong Date: Sat, 20 Apr 2024 01:24:51 +0800 Subject: [PATCH 1/7] fix(core/utils/wechat-mp): empty description when request blocked by WAF Silently falling back to the fallback method would generate empty description. Fixed by throwing an error when failed to extract item_show_type from the page. Some misc warnings are also added to make bug reporting easier. Signed-off-by: Rongrong --- lib/setup.test.ts | 22 +++++++++++-- lib/utils/wechat-mp.test.ts | 13 +++++++- lib/utils/wechat-mp.ts | 66 +++++++++++++++++++++++++++---------- 3 files changed, 80 insertions(+), 21 deletions(-) diff --git a/lib/setup.test.ts b/lib/setup.test.ts index a06f67c51c2a30..dfcabe7fe7668b 100644 --- a/lib/setup.test.ts +++ b/lib/setup.test.ts @@ -154,8 +154,26 @@ var ct = "${1_636_626_300}"; ) ) ), - http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.text(genWeChatMpPage('', ''))), - http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.text(genWeChatMpPage('', ''))), + http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/rsshub_test/fallback`)), + http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/rsshub_test/fallback`)), + http.get(`https://mp.weixin.qq.com/mp/rsshub_test/waf`, () => + HttpResponse.text( + ` +
+
+

环境异常

+

当前环境异常,完成验证后即可继续访问。

+
+
+

+ 去验证 +

+
+
+` + ) + ), + http.get(`https://mp.weixin.qq.com/s/rsshub_test_hit_waf`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/mp/rsshub_test/waf`)), http.get(`http://rsshub.test/headers`, ({ request }) => HttpResponse.json({ ...Object.fromEntries(request.headers.entries()), diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts index 2343ee69c6b9e7..eed550f46cf672 100644 --- a/lib/utils/wechat-mp.test.ts +++ b/lib/utils/wechat-mp.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it, vi } from 'vitest'; import { load } from 'cheerio'; import Parser from 'rss-parser'; import InvalidParameterError from '@/errors/types/invalid-parameter'; -import { exportedForTestingOnly, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp'; +import { exportedForTestingOnly, WeChatMpError, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp'; const { ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly; vi.mock('@/utils/request-rewriter', () => ({ default: null })); @@ -405,6 +405,17 @@ describe('wechat-mp', () => { await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: true }); }); + it('hit_waf', async () => { + try { + await fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_hit_waf'); + expect.unreachable('Should throw an error'); + } catch (error) { + expect(error).toBeInstanceOf(WeChatMpError); + expect((error).message).toContain('/mp/rsshub_test/waf'); + expect((error).message).toContain('环境异常'); + } + }); + it('route_test', async () => { try { await app.request('/test/wechat-mp'); diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts index 268b41578697b7..b4742094dbdfe2 100644 --- a/lib/utils/wechat-mp.ts +++ b/lib/utils/wechat-mp.ts @@ -31,11 +31,21 @@ import { parseDate } from '@/utils/parse-date'; import cache from '@/utils/cache'; import logger from '@/utils/logger'; -const MAINTAINERS = ['Rongronggg9']; +class WeChatMpError 
extends Error { + constructor(message: string) { + super(message); + this.name = 'WeChatMpError'; + } +} -const warn = (reason: string, details: string) => - logger.warn(`wechat-mp: ${reason}: ${details}, -consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`); +const MAINTAINERS = ['@Rongronggg9']; + +const formatLog = (...params: string[]): string => `wechat-mp: ${params.join(': ')} +Consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`; +const warn = (...params: string[]) => logger.warn(formatLog(...params)); +const error = (...params: string[]): never => { + throw new WeChatMpError(formatLog(...params)); +}; const replaceReturnNewline = (() => { const returnRegExp = /\r|\\(r|x0d)/g; @@ -147,7 +157,7 @@ class ExtractMetadata { private static commonMetadataToBeExtracted = { showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }), realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }), - createTime: this.genExtractFunc('ct', { valuePattern: '\\d+' }), + createTime: this.genExtractFunc('ct', { valuePattern: '\\d+', allowNotFound: true }), sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }), }; @@ -376,18 +386,17 @@ const fixArticleContent = (html?: string | Cheerio, skipImg = false) => // abtest_cookie, wx_header // Known params (temporary link): // src, timestamp, ver, signature, new (unessential) -const normalizeUrl = (url, bypassHostCheck = false) => { - const oriUrl = url; +const normalizeUrl = (url: string, bypassHostCheck = false) => { const urlObj = new URL(url); if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') { - throw new Error('wechat-mp: URL host must be "mp.weixin.qq.com", but got ' + oriUrl); + error('URL host must be "mp.weixin.qq.com"', url); } urlObj.protocol = 'https:'; urlObj.hash = ''; // remove hash - if 
(/^\/s\/.+/.test(urlObj.pathname)) { + if (urlObj.pathname.startsWith('/s/')) { // a short link, just remove all the params urlObj.search = ''; - } else if (/^\/s$/.test(urlObj.pathname)) { + } else if (urlObj.pathname === '/s') { const biz = urlObj.searchParams.get('__biz'); const mid = urlObj.searchParams.get('mid') || urlObj.searchParams.get('appmsgid'); const idx = urlObj.searchParams.get('idx') || urlObj.searchParams.get('itemidx'); @@ -405,11 +414,11 @@ const normalizeUrl = (url, bypassHostCheck = false) => { // a temporary link, remove all unessential params urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`; } else { - // unknown link, just let it go + warn('unknown URL search parameters', url); } } } else { - // IDK what it is, just let it go + warn('unknown URL path', url); } return urlObj.href; }; @@ -479,9 +488,11 @@ class PageParsers { } return page; }; - static dispatch = async ($: CheerioAPI) => { + static dispatch = async (html: string, url: string) => { + const $ = load(html); const commonMetadata = ExtractMetadata.common($); let page: Record; + let pageText: string = ''; switch (commonMetadata.showType) { case 'APP_MSG_PAGE': page = await PageParsers.appMsg($, commonMetadata); @@ -495,8 +506,16 @@ class PageParsers { case 'VIDEO_SHARE_PAGE': page = PageParsers.fallback($, commonMetadata); break; + case undefined: + pageText = $('body').text().replaceAll(/\s+/g, ' '); + if (pageText.length >= 25) { + pageText = pageText.slice(0, 25); + pageText += '...'; + } + error('unknown page, probably due to WAF', pageText, url); + return {}; // just to make TypeScript happy, actually UNREACHABLE default: - warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`); + warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`, url); page = PageParsers.fallback($, commonMetadata); } const locationMetadata = ExtractMetadata.location($); @@ -528,9 +547,20 @@ class PageParsers { const 
fetchArticle = (url: string, bypassHostCheck: boolean = false) => { url = normalizeUrl(url, bypassHostCheck); return cache.tryGet(url, async () => { - const data = await ofetch(url); - const $ = load(data); - const page = await PageParsers.dispatch($); + let maxRedirects = 5; + let raw = await ofetch.raw(url); + while ([301, 302, 303, 307, 308].includes(raw.status) && maxRedirects-- > 0) { + if (!raw.headers.has('location')) { + error('redirect without location', url); + } + // eslint-disable-next-line no-await-in-loop + raw = await ofetch.raw(raw.headers.get('location')); + } + if ([301, 302, 303, 307, 308].includes(raw.status) && maxRedirects <= 0) { + error('too many redirects', url); + } + // pass the redirected URL to dispatcher for better error logging + const page = await PageParsers.dispatch(raw._data, raw.url); return { ...page, link: url }; }) as Promise<{ title: string; @@ -582,4 +612,4 @@ const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = fal }; const exportedForTestingOnly = { ExtractMetadata, showTypeMapReverse }; -export { exportedForTestingOnly, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl }; +export { exportedForTestingOnly, WeChatMpError, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl }; From b9b25ad7df5ba602dcc6aa6e84fff0f34e490642 Mon Sep 17 00:00:00 2001 From: Rongrong Date: Sat, 20 Apr 2024 21:44:33 +0800 Subject: [PATCH 2/7] fix(core/utils/wechat-mp): err msg for unknown page Signed-off-by: Rongrong --- lib/setup.test.ts | 13 ++++++++++++- lib/utils/wechat-mp.test.ts | 3 +++ lib/utils/wechat-mp.ts | 5 +++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lib/setup.test.ts b/lib/setup.test.ts index dfcabe7fe7668b..462eceb228a605 100644 --- a/lib/setup.test.ts +++ b/lib/setup.test.ts @@ -158,8 +158,19 @@ var ct = "${1_636_626_300}"; http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => 
HttpResponse.redirect(`https://mp.weixin.qq.com/rsshub_test/fallback`)), http.get(`https://mp.weixin.qq.com/mp/rsshub_test/waf`, () => HttpResponse.text( - ` + ` + +Title + + + + +
+ +
+ +

环境异常

当前环境异常,完成验证后即可继续访问。

diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts index eed550f46cf672..76418a2b7545ba 100644 --- a/lib/utils/wechat-mp.test.ts +++ b/lib/utils/wechat-mp.test.ts @@ -411,7 +411,10 @@ describe('wechat-mp', () => { expect.unreachable('Should throw an error'); } catch (error) { expect(error).toBeInstanceOf(WeChatMpError); + expect((error).message).not.toContain('console.log'); + expect((error).message).not.toContain('.style'); expect((error).message).toContain('/mp/rsshub_test/waf'); + expect((error).message).toContain('Title'); expect((error).message).toContain('环境异常'); } }); diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts index b4742094dbdfe2..746ce393997b10 100644 --- a/lib/utils/wechat-mp.ts +++ b/lib/utils/wechat-mp.ts @@ -507,8 +507,9 @@ class PageParsers { page = PageParsers.fallback($, commonMetadata); break; case undefined: - pageText = $('body').text().replaceAll(/\s+/g, ' '); - if (pageText.length >= 25) { + $('script, style').remove(); + pageText = $('title, body').text().replaceAll(/\s+/g, ' ').trim(); + if (pageText.length >= 25 + '...'.length) { pageText = pageText.slice(0, 25); pageText += '...'; } From 24b5a338c8591e7d0393bc613ed8657f4f31f8d9 Mon Sep 17 00:00:00 2001 From: Rongrong Date: Sat, 20 Apr 2024 22:23:47 +0800 Subject: [PATCH 3/7] test(core/utils/wechat-mp): fix coverage Signed-off-by: Rongrong --- lib/setup.test.ts | 2 ++ lib/utils/wechat-mp.test.ts | 26 +++++++++++++++++++++++--- lib/utils/wechat-mp.ts | 11 +++++++++-- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/lib/setup.test.ts b/lib/setup.test.ts index 462eceb228a605..994f5a3a67d355 100644 --- a/lib/setup.test.ts +++ b/lib/setup.test.ts @@ -185,6 +185,8 @@ var ct = "${1_636_626_300}"; ) ), http.get(`https://mp.weixin.qq.com/s/rsshub_test_hit_waf`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/mp/rsshub_test/waf`)), + http.get(`https://mp.weixin.qq.com/s/rsshub_test_redirect_no_location`, () => HttpResponse.text('', { status: 
302 })), + http.get(`https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect`)), http.get(`http://rsshub.test/headers`, ({ request }) => HttpResponse.json({ ...Object.fromEntries(request.headers.entries()), diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts index 76418a2b7545ba..bd4088b1aa83d6 100644 --- a/lib/utils/wechat-mp.test.ts +++ b/lib/utils/wechat-mp.test.ts @@ -1,14 +1,16 @@ -import { describe, expect, it, vi } from 'vitest'; +import { describe, expect, it, vi, afterEach } from 'vitest'; import { load } from 'cheerio'; import Parser from 'rss-parser'; import InvalidParameterError from '@/errors/types/invalid-parameter'; import { exportedForTestingOnly, WeChatMpError, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp'; -const { ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly; +const { toggleWerror, ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly; vi.mock('@/utils/request-rewriter', () => ({ default: null })); const { default: app } = await import('@/app'); const parser = new Parser(); +afterEach(() => toggleWerror(false)); + const expectedItem: { title: string; summary: string; @@ -324,8 +326,20 @@ describe('wechat-mp', () => { expect(normalizeUrl(somethingElseWithHash.replace('https://', 'http://'))).toBe(somethingElse); const notWechatMp = 'https://im.not.wechat.mp/and/an/error/is/expected'; - expect(() => normalizeUrl(notWechatMp)).toThrow(); + expect(() => normalizeUrl(notWechatMp)).toThrow('URL host must be "mp.weixin.qq.com"'); expect(normalizeUrl(notWechatMp, true)).toBe(notWechatMp); + + const unknownSearchParam = mpArticleRoot + '?unknown=param'; + toggleWerror(false); + expect(normalizeUrl(unknownSearchParam)).toBe(unknownSearchParam); + toggleWerror(true); + expect(() => normalizeUrl(unknownSearchParam)).toThrow('WarningAsError: unknown URL search parameters'); + + 
const unknownPath = mpRoot + '/unknown/path'; + toggleWerror(false); + expect(normalizeUrl(unknownPath)).toBe(unknownPath); + toggleWerror(true); + expect(() => normalizeUrl(unknownPath, true)).toThrow('WarningAsError: unknown URL path'); }); it('fetchArticle_&_finishArticleItem_appMsg', async () => { @@ -413,12 +427,18 @@ describe('wechat-mp', () => { expect(error).toBeInstanceOf(WeChatMpError); expect((error).message).not.toContain('console.log'); expect((error).message).not.toContain('.style'); + expect((error).message).toContain('unknown page'); expect((error).message).toContain('/mp/rsshub_test/waf'); expect((error).message).toContain('Title'); expect((error).message).toContain('环境异常'); } }); + it('redirect', () => { + expect(fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_redirect_no_location')).rejects.toThrow('redirect without location'); + expect(fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect')).rejects.toThrow('too many redirects'); + }); + it('route_test', async () => { try { await app.request('/test/wechat-mp'); diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts index 746ce393997b10..d9eb743e863a2a 100644 --- a/lib/utils/wechat-mp.ts +++ b/lib/utils/wechat-mp.ts @@ -42,10 +42,17 @@ const MAINTAINERS = ['@Rongronggg9']; const formatLog = (...params: string[]): string => `wechat-mp: ${params.join(': ')} Consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`; -const warn = (...params: string[]) => logger.warn(formatLog(...params)); +let warn = (...params: string[]) => logger.warn(formatLog(...params)); const error = (...params: string[]): never => { throw new WeChatMpError(formatLog(...params)); }; +const toggleWerror = (() => { + const onFunc = (...params: string[]) => error('WarningAsError', ...params); + const offFunc = warn; + return (on: boolean) => { + warn = on ? 
onFunc : offFunc; + }; +})(); const replaceReturnNewline = (() => { const returnRegExp = /\r|\\(r|x0d)/g; @@ -612,5 +619,5 @@ const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = fal return item; }; -const exportedForTestingOnly = { ExtractMetadata, showTypeMapReverse }; +const exportedForTestingOnly = { toggleWerror, ExtractMetadata, showTypeMapReverse }; export { exportedForTestingOnly, WeChatMpError, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl }; From 3472d81d3392611d2fc5ae08ac86711a03c10a7a Mon Sep 17 00:00:00 2001 From: Rongrong Date: Sat, 20 Apr 2024 22:38:19 +0800 Subject: [PATCH 4/7] fix(core/utils/wechat-mp): ESLint no-await-in-loop Signed-off-by: Rongrong --- lib/utils/wechat-mp.ts | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts index d9eb743e863a2a..e26c14cc58b157 100644 --- a/lib/utils/wechat-mp.ts +++ b/lib/utils/wechat-mp.ts @@ -544,6 +544,20 @@ class PageParsers { }; } +const redirectHelper = async (url: string, maxRedirects: number = 5) => { + maxRedirects--; + const raw = await ofetch.raw(url); + if ([301, 302, 303, 307, 308].includes(raw.status)) { + if (!raw.headers.has('location')) { + error('redirect without location', url); + } else if (maxRedirects <= 0) { + error('too many redirects', url); + } + return await redirectHelper(raw.headers.get('location'), maxRedirects); + } + return raw; +}; + /** * Fetch article and its metadata from WeChat MP (mp.weixin.qq.com). 
* @@ -555,20 +569,9 @@ class PageParsers { const fetchArticle = (url: string, bypassHostCheck: boolean = false) => { url = normalizeUrl(url, bypassHostCheck); return cache.tryGet(url, async () => { - let maxRedirects = 5; - let raw = await ofetch.raw(url); - while ([301, 302, 303, 307, 308].includes(raw.status) && maxRedirects-- > 0) { - if (!raw.headers.has('location')) { - error('redirect without location', url); - } - // eslint-disable-next-line no-await-in-loop - raw = await ofetch.raw(raw.headers.get('location')); - } - if ([301, 302, 303, 307, 308].includes(raw.status) && maxRedirects <= 0) { - error('too many redirects', url); - } + const raw = await redirectHelper(url); // pass the redirected URL to dispatcher for better error logging - const page = await PageParsers.dispatch(raw._data, raw.url); + const page = await PageParsers.dispatch(raw._data, raw.url); return { ...page, link: url }; }) as Promise<{ title: string; From 5abf3d89ce40fade1efc8c0647c6d22180bd9c6f Mon Sep 17 00:00:00 2001 From: Rongrong Date: Sun, 21 Apr 2024 08:54:25 +0800 Subject: [PATCH 5/7] fix(core/utils/wechat-mp): normalizeUrl: &amp; -> & Signed-off-by: Rongrong --- lib/utils/wechat-mp.test.ts | 3 +++ lib/utils/wechat-mp.ts | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts index bd4088b1aa83d6..a3578a7c2e8be4 100644 --- a/lib/utils/wechat-mp.test.ts +++ b/lib/utils/wechat-mp.test.ts @@ -340,6 +340,9 @@ describe('wechat-mp', () => { expect(normalizeUrl(unknownPath)).toBe(unknownPath); toggleWerror(true); expect(() => normalizeUrl(unknownPath, true)).toThrow('WarningAsError: unknown URL path'); + + const ampEscapedUrl = longUrl.replaceAll('&', '&amp;'); + expect(normalizeUrl(ampEscapedUrl)).toBe(longUrlShortened); }); it('fetchArticle_&_finishArticleItem_appMsg', async () => { diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts index e26c14cc58b157..b3e7fabd4aa5eb 100644 --- a/lib/utils/wechat-mp.ts +++ 
b/lib/utils/wechat-mp.ts @@ -394,6 +394,10 @@ const fixArticleContent = (html?: string | Cheerio, skipImg = false) => // Known params (temporary link): // src, timestamp, ver, signature, new (unessential) const normalizeUrl = (url: string, bypassHostCheck = false) => { + const oriUrl = url; + // already seen some weird urls with `&` escaped as `&amp;`, so fix it + // calling fixUrl should always be safe since having `&amp;` or `\x26` in a URL is meaningless + url = fixUrl(url); const urlObj = new URL(url); if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') { error('URL host must be "mp.weixin.qq.com"', url); @@ -421,11 +425,11 @@ const normalizeUrl = (url: string, bypassHostCheck = false) => { // a temporary link, remove all unessential params urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`; } else { - warn('unknown URL search parameters', url); + warn('unknown URL search parameters', oriUrl); } } } else { - warn('unknown URL path', url); + warn('unknown URL path', oriUrl); } return urlObj.href; }; From 17b87987ed175c26f9d4ed0b0e5c5b07bcbcd0a5 Mon Sep 17 00:00:00 2001 From: Rongrong Date: Sun, 21 Apr 2024 09:22:24 +0800 Subject: [PATCH 6/7] feat(core/utils/wechat-mp): do not prompt raising an issue when blocked by WAF Signed-off-by: Rongrong --- lib/setup.test.ts | 16 ++++++++++++++++ lib/utils/wechat-mp.test.ts | 19 ++++++++++++++++++- lib/utils/wechat-mp.ts | 18 +++++++++++++----- 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/lib/setup.test.ts b/lib/setup.test.ts index 994f5a3a67d355..cc04aab7837380 100644 --- a/lib/setup.test.ts +++ b/lib/setup.test.ts @@ -185,6 +185,22 @@ var ct = "${1_636_626_300}"; ) ), http.get(`https://mp.weixin.qq.com/s/rsshub_test_hit_waf`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/mp/rsshub_test/waf`)), + http.get(`https://mp.weixin.qq.com/s/unknown_page`, () => + HttpResponse.text( + ` + +Title + + + + + +

+Unknown paragraph +

+` + ) + ), http.get(`https://mp.weixin.qq.com/s/rsshub_test_redirect_no_location`, () => HttpResponse.text('', { status: 302 })), http.get(`https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect`)), http.get(`http://rsshub.test/headers`, ({ request }) => diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts index a3578a7c2e8be4..aa47072fb790b6 100644 --- a/lib/utils/wechat-mp.test.ts +++ b/lib/utils/wechat-mp.test.ts @@ -430,13 +430,30 @@ describe('wechat-mp', () => { expect(error).toBeInstanceOf(WeChatMpError); expect((error).message).not.toContain('console.log'); expect((error).message).not.toContain('.style'); - expect((error).message).toContain('unknown page'); + expect((error).message).not.toContain('Consider raise an issue'); + expect((error).message).toContain('request blocked by WAF:'); expect((error).message).toContain('/mp/rsshub_test/waf'); expect((error).message).toContain('Title'); expect((error).message).toContain('环境异常'); } }); + it('unknown_page', async () => { + const unknownPageUrl = 'https://mp.weixin.qq.com/s/unknown_page'; + try { + await fetchArticle(unknownPageUrl); + expect.unreachable('Should throw an error'); + } catch (error) { + expect(error).toBeInstanceOf(WeChatMpError); + expect((error).message).not.toContain('console.log'); + expect((error).message).not.toContain('.style'); + expect((error).message).toContain('Consider raise an issue'); + expect((error).message).toContain('unknown page,'); + expect((error).message).toContain('Title Unknown paragraph'); + expect((error).message).toContain(unknownPageUrl); + } + }); + it('redirect', () => { expect(fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_redirect_no_location')).rejects.toThrow('redirect without location'); expect(fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect')).rejects.toThrow('too many redirects'); diff --git a/lib/utils/wechat-mp.ts 
b/lib/utils/wechat-mp.ts index b3e7fabd4aa5eb..148c88e87d7c62 100644 --- a/lib/utils/wechat-mp.ts +++ b/lib/utils/wechat-mp.ts @@ -40,12 +40,16 @@ class WeChatMpError extends Error { const MAINTAINERS = ['@Rongronggg9']; -const formatLog = (...params: string[]): string => `wechat-mp: ${params.join(': ')} +const formatLogNoMention = (...params: string[]): string => `wechat-mp: ${params.join(': ')}`; +const formatLog = (...params: string[]): string => `${formatLogNoMention(...params)} Consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`; let warn = (...params: string[]) => logger.warn(formatLog(...params)); const error = (...params: string[]): never => { throw new WeChatMpError(formatLog(...params)); }; +const errorNoMention = (...params: string[]): never => { + throw new WeChatMpError(formatLogNoMention(...params)); +}; const toggleWerror = (() => { const onFunc = (...params: string[]) => error('WarningAsError', ...params); const offFunc = warn; @@ -503,7 +507,7 @@ class PageParsers { const $ = load(html); const commonMetadata = ExtractMetadata.common($); let page: Record; - let pageText: string = ''; + let pageText: string, pageTextShort: string; switch (commonMetadata.showType) { case 'APP_MSG_PAGE': page = await PageParsers.appMsg($, commonMetadata); @@ -520,11 +524,15 @@ class PageParsers { case undefined: $('script, style').remove(); pageText = $('title, body').text().replaceAll(/\s+/g, ' ').trim(); + pageTextShort = pageText.slice(0, 25); if (pageText.length >= 25 + '...'.length) { - pageText = pageText.slice(0, 25); - pageText += '...'; + pageTextShort = pageText.slice(0, 25); + pageTextShort += '...'; + } + if (new URL(url).pathname.includes('captcha') || pageText.includes('环境异常')) { + errorNoMention('request blocked by WAF', pageTextShort, url); } - error('unknown page, probably due to WAF', pageText, url); + error('unknown page, probably due to WAF', pageTextShort, url); return {}; // just to make 
TypeScript happy, actually UNREACHABLE default: warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`, url); From b0d02f3aa2b941199c1a205fb9add092a56ba093 Mon Sep 17 00:00:00 2001 From: Rongrong Date: Sun, 21 Apr 2024 10:16:43 +0800 Subject: [PATCH 7/7] feat(core/utils/wechat-mp): recognize deleted page Signed-off-by: Rongrong --- lib/setup.test.ts | 16 ++++++++++++++++ lib/utils/wechat-mp.test.ts | 17 +++++++++++++++++ lib/utils/wechat-mp.ts | 7 +++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/lib/setup.test.ts b/lib/setup.test.ts index cc04aab7837380..ab7436a3c19a55 100644 --- a/lib/setup.test.ts +++ b/lib/setup.test.ts @@ -198,6 +198,22 @@ var ct = "${1_636_626_300}";

Unknown paragraph

+` + ) + ), + http.get(`https://mp.weixin.qq.com/s/deleted_page`, () => + HttpResponse.text( + ` + +Title + + + + + +

+该内容已被发布者删除 +

` ) ), diff --git a/lib/utils/wechat-mp.test.ts b/lib/utils/wechat-mp.test.ts index aa47072fb790b6..2d24a2110f74e8 100644 --- a/lib/utils/wechat-mp.test.ts +++ b/lib/utils/wechat-mp.test.ts @@ -454,6 +454,23 @@ describe('wechat-mp', () => { } }); + it('deleted_page', async () => { + const deletedPageUrl = 'https://mp.weixin.qq.com/s/deleted_page'; + + try { + await fetchArticle(deletedPageUrl); + expect.unreachable('Should throw an error'); + } catch (error) { + expect(error).toBeInstanceOf(WeChatMpError); + expect((error).message).not.toContain('console.log'); + expect((error).message).not.toContain('.style'); + expect((error).message).not.toContain('Consider raise an issue'); + expect((error).message).toContain('deleted by author:'); + expect((error).message).toContain('Title 该内容已被发布者删除'); + expect((error).message).toContain(deletedPageUrl); + } + }); + it('redirect', () => { expect(fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_redirect_no_location')).rejects.toThrow('redirect without location'); expect(fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect')).rejects.toThrow('too many redirects'); diff --git a/lib/utils/wechat-mp.ts b/lib/utils/wechat-mp.ts index 148c88e87d7c62..7173b0f30a1c6e 100644 --- a/lib/utils/wechat-mp.ts +++ b/lib/utils/wechat-mp.ts @@ -529,10 +529,13 @@ class PageParsers { pageTextShort = pageText.slice(0, 25); pageTextShort += '...'; } - if (new URL(url).pathname.includes('captcha') || pageText.includes('环境异常')) { + if (pageText.includes('已被发布者删除')) { + errorNoMention('deleted by author', pageTextShort, url); + } else if (new URL(url).pathname.includes('captcha') || pageText.includes('环境异常')) { errorNoMention('request blocked by WAF', pageTextShort, url); + } else { + error('unknown page, probably due to WAF', pageTextShort, url); } - error('unknown page, probably due to WAF', pageTextShort, url); return {}; // just to make TypeScript happy, actually UNREACHABLE default: warn('new showType, trying fallback 
method', `showType=${commonMetadata.showType}`, url);