Skip to content

Commit

Permalink
fix(core/utils/wechat-mp): empty description when request blocked by WAF
Browse files Browse the repository at this point in the history
Silently falling back to the fallback method would generate an empty
description. Fixed by throwing an error when item_show_type cannot be
extracted from the page.

Some misc warnings are also added to make bug reporting easier.

Signed-off-by: Rongrong <[email protected]>
  • Loading branch information
Rongronggg9 committed Apr 19, 2024
1 parent 8eccccc commit 67e7855
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 21 deletions.
22 changes: 20 additions & 2 deletions lib/setup.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,26 @@ var ct = "${1_636_626_300}";
)
)
),
http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.text(genWeChatMpPage('', ''))),
http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.text(genWeChatMpPage('', ''))),
http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/rsshub_test/fallback`)),
http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/rsshub_test/fallback`)),
http.get(`https://mp.weixin.qq.com/mp/rsshub_test/waf`, () =>
HttpResponse.text(
`<html><body>
<div class="weui-msg">
<div class="weui-msg__text-area pc-area">
<h2 class="weui-msg__title">环境异常</h2>
<p class="weui-msg__desc">当前环境异常,完成验证后即可继续访问。</p>
</div>
<div class="weui-msg__opr-area">
<p class="weui-btn-area">
<a class="weui-btn weui-btn_primary" id="js_verify">去验证</a>
</p>
</div>
</div>
</body></html>`
)
),
http.get(`https://mp.weixin.qq.com/s/rsshub_test_hit_waf`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/mp/rsshub_test/waf`)),
http.get(`http://rsshub.test/headers`, ({ request }) =>
HttpResponse.json({
...Object.fromEntries(request.headers.entries()),
Expand Down
13 changes: 12 additions & 1 deletion lib/utils/wechat-mp.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { describe, expect, it, vi } from 'vitest';
import { load } from 'cheerio';
import Parser from 'rss-parser';
import InvalidParameterError from '@/errors/types/invalid-parameter';
import { exportedForTestingOnly, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp';
import { exportedForTestingOnly, WeChatMpError, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp';
const { ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly;

vi.mock('@/utils/request-rewriter', () => ({ default: null }));
Expand Down Expand Up @@ -405,6 +405,17 @@ describe('wechat-mp', () => {
await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: true });
});

// A request that gets redirected to the WAF challenge page must be rejected
// with a WeChatMpError whose message names the redirected URL and quotes the page text.
it('hit_waf', async () => {
    const wafUrl = 'https://mp.weixin.qq.com/s/rsshub_test_hit_waf';
    try {
        await fetchArticle(wafUrl);
        // Reaching this line means no error was thrown, which is a failure.
        expect.unreachable('Should throw an error');
    } catch (caught) {
        expect(caught).toBeInstanceOf(WeChatMpError);
        const { message } = caught as WeChatMpError;
        expect(message).toContain('/mp/rsshub_test/waf');
        expect(message).toContain('环境异常');
    }
});

it('route_test', async () => {
try {
await app.request('/test/wechat-mp');
Expand Down
66 changes: 48 additions & 18 deletions lib/utils/wechat-mp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,21 @@ import { parseDate } from '@/utils/parse-date';
import cache from '@/utils/cache';
import logger from '@/utils/logger';

const MAINTAINERS = ['Rongronggg9'];
/**
 * Dedicated error type for the wechat-mp utilities, so that these failures can
 * be recognised with `instanceof` or by their `name`.
 */
class WeChatMpError extends Error {
    // Set as an own property on each instance, shadowing `Error.prototype.name`.
    name = 'WeChatMpError';

    /**
     * @param message - Human-readable description of the failure.
     */
    constructor(message: string) {
        super(message);
    }
}

const warn = (reason: string, details: string) =>
logger.warn(`wechat-mp: ${reason}: ${details},
consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`);
// Handles mentioned in every log/error message produced by formatLog.
const MAINTAINERS = ['@Rongronggg9'];

/**
 * Builds the text shared by warnings and errors of this module.
 *
 * The first line is `wechat-mp: ` followed by all `params` joined with `': '`;
 * the second line asks the reader to raise an issue and mention the maintainers.
 *
 * @param params - Message fragments (reason, details, URL, ...), joined in order.
 * @returns The two-line message.
 */
const formatLog = (...params: string[]): string => {
    const summary = `wechat-mp: ${params.join(': ')}`;
    const hint = `Consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`;
    return `${summary}\n${hint}`;
};
/**
 * Emits a warning through the shared logger, with the text produced by `formatLog`.
 *
 * @param params - Message fragments forwarded unchanged to `formatLog`.
 */
const warn = (...params: string[]) => {
    const text = formatLog(...params);
    return logger.warn(text);
};
/**
 * Always throws a `WeChatMpError` whose message is produced by `formatLog`.
 *
 * @param params - Message fragments forwarded unchanged to `formatLog`.
 * @throws WeChatMpError on every call; the `never` return type reflects this.
 */
const error = (...params: string[]): never => {
    const text = formatLog(...params);
    throw new WeChatMpError(text);
};

const replaceReturnNewline = (() => {
const returnRegExp = /\r|\\(r|x0d)/g;
Expand Down Expand Up @@ -147,7 +157,7 @@ class ExtractMetadata {
private static commonMetadataToBeExtracted = {
showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }),
realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }),
createTime: this.genExtractFunc('ct', { valuePattern: '\\d+' }),
createTime: this.genExtractFunc('ct', { valuePattern: '\\d+', allowNotFound: true }),
sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }),
};

Expand Down Expand Up @@ -376,18 +386,17 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
// abtest_cookie, wx_header
// Known params (temporary link):
// src, timestamp, ver, signature, new (unessential)
const normalizeUrl = (url, bypassHostCheck = false) => {
const oriUrl = url;
const normalizeUrl = (url: string, bypassHostCheck = false) => {
const urlObj = new URL(url);
if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') {
throw new Error('wechat-mp: URL host must be "mp.weixin.qq.com", but got ' + oriUrl);
error('URL host must be "mp.weixin.qq.com"', url);
}
urlObj.protocol = 'https:';
urlObj.hash = ''; // remove hash
if (/^\/s\/.+/.test(urlObj.pathname)) {
if (urlObj.pathname.startsWith('/s/')) {
// a short link, just remove all the params
urlObj.search = '';
} else if (/^\/s$/.test(urlObj.pathname)) {
} else if (urlObj.pathname === '/s') {
const biz = urlObj.searchParams.get('__biz');
const mid = urlObj.searchParams.get('mid') || urlObj.searchParams.get('appmsgid');
const idx = urlObj.searchParams.get('idx') || urlObj.searchParams.get('itemidx');
Expand All @@ -405,11 +414,11 @@ const normalizeUrl = (url, bypassHostCheck = false) => {
// a temporary link, remove all unessential params
urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`;
} else {
// unknown link, just let it go
warn('unknown URL search parameters', url);
}
}
} else {
// IDK what it is, just let it go
warn('unknown URL path', url);
}
return urlObj.href;
};
Expand Down Expand Up @@ -479,9 +488,11 @@ class PageParsers {
}
return page;
};
static dispatch = async ($: CheerioAPI) => {
static dispatch = async (html: string, url: string) => {
const $ = load(html);
const commonMetadata = ExtractMetadata.common($);
let page: Record<string, any>;
let pageText: string = '';
switch (commonMetadata.showType) {
case 'APP_MSG_PAGE':
page = await PageParsers.appMsg($, commonMetadata);
Expand All @@ -495,8 +506,16 @@ class PageParsers {
case 'VIDEO_SHARE_PAGE':
page = PageParsers.fallback($, commonMetadata);
break;
case undefined:
pageText = $('body').text().replaceAll(/\s+/g, ' ');
if (pageText.length >= 25) {
pageText = pageText.slice(0, 25);
pageText += '...';
}
error('unknown page, probably due to WAF', pageText, url);
return {}; // just to make TypeScript happy, actually UNREACHABLE
default:
warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`);
warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`, url);
page = PageParsers.fallback($, commonMetadata);
}
const locationMetadata = ExtractMetadata.location($);
Expand Down Expand Up @@ -528,9 +547,20 @@ class PageParsers {
const fetchArticle = (url: string, bypassHostCheck: boolean = false) => {
url = normalizeUrl(url, bypassHostCheck);
return cache.tryGet(url, async () => {
const data = await ofetch(url);
const $ = load(data);
const page = await PageParsers.dispatch($);
let maxRedirects = 5;
let raw = await ofetch.raw(url);
while ([301, 302, 303, 307, 308].includes(raw.status) && maxRedirects-- > 0) {
if (!raw.headers.has('location')) {
error('redirect without location', url);
}
// eslint-disable-next-line no-await-in-loop
raw = await ofetch.raw(<string>raw.headers.get('location'));
}
if ([301, 302, 303, 307, 308].includes(raw.status) && maxRedirects <= 0) {
error('too many redirects', url);
}
// pass the redirected URL to dispatcher for better error logging
const page = await PageParsers.dispatch(<string>raw._data, raw.url);
return { ...page, link: url };
}) as Promise<{
title: string;
Expand Down Expand Up @@ -582,4 +612,4 @@ const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = fal
};

const exportedForTestingOnly = { ExtractMetadata, showTypeMapReverse };
export { exportedForTestingOnly, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };
export { exportedForTestingOnly, WeChatMpError, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };

0 comments on commit 67e7855

Please sign in to comment.