Skip to content

Commit

Permalink
Merge branch 'refs/heads/fix/wechat-mp/waf'
Browse files Browse the repository at this point in the history
  • Loading branch information
Rongronggg9 committed Apr 21, 2024
2 parents 5991ecb + b0d02f3 commit 428e65c
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 24 deletions.
67 changes: 65 additions & 2 deletions lib/setup.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,71 @@ var ct = "${1_636_626_300}";
)
)
),
http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.text(genWeChatMpPage('', ''))),
http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.text(genWeChatMpPage('', ''))),
http.get(`https://mp.weixin.qq.com/s/rsshub_test`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/rsshub_test/fallback`)),
http.get(`https://mp.weixin.qq.com/s?__biz=rsshub_test&mid=1&idx=1&sn=1`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/rsshub_test/fallback`)),
http.get(`https://mp.weixin.qq.com/mp/rsshub_test/waf`, () =>
HttpResponse.text(
`<html>
<head>
<title>Title</title>
<script>console.log</script>
</head>
<body class="zh_CN " ontouchstart="">
<script>console.log</script>
<style>.style{}</style>
<div class="weui-msg">
<div id="tips" style="display:none;" class="top_tips warning"></div>
<div class="weui-msg__icon-area">
<i class="weui-icon-info-circle weui-icon_msg"></i>
</div>
<div class="weui-msg__text-area pc-area">
<h2 class="weui-msg__title">环境异常</h2>
<p class="weui-msg__desc">当前环境异常,完成验证后即可继续访问。</p>
</div>
<div class="weui-msg__opr-area">
<p class="weui-btn-area">
<a class="weui-btn weui-btn_primary" id="js_verify">去验证</a>
</p>
</div>
</div>
</body></html>`
)
),
http.get(`https://mp.weixin.qq.com/s/rsshub_test_hit_waf`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/mp/rsshub_test/waf`)),
http.get(`https://mp.weixin.qq.com/s/unknown_page`, () =>
HttpResponse.text(
`<html>
<head>
<title>Title</title>
<script>console.log</script>
</head>
<body class="zh_CN " ontouchstart="">
<script>console.log</script>
<style>.style{}</style>
<p>
Unknown paragraph
</p>
</body></html>`
)
),
http.get(`https://mp.weixin.qq.com/s/deleted_page`, () =>
HttpResponse.text(
`<html>
<head>
<title>Title</title>
<script>console.log</script>
</head>
<body class="zh_CN " ontouchstart="">
<script>console.log</script>
<style>.style{}</style>
<p>
该内容已被发布者删除
</p>
</body></html>`
)
),
http.get(`https://mp.weixin.qq.com/s/rsshub_test_redirect_no_location`, () => HttpResponse.text('', { status: 302 })),
http.get(`https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect`, () => HttpResponse.redirect(`https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect`)),
http.get(`http://rsshub.test/headers`, ({ request }) =>
HttpResponse.json({
...Object.fromEntries(request.headers.entries()),
Expand Down
79 changes: 75 additions & 4 deletions lib/utils/wechat-mp.test.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import { describe, expect, it, vi } from 'vitest';
import { describe, expect, it, vi, afterEach } from 'vitest';
import { load } from 'cheerio';
import Parser from 'rss-parser';
import InvalidParameterError from '@/errors/types/invalid-parameter';
import { exportedForTestingOnly, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp';
const { ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly;
import { exportedForTestingOnly, WeChatMpError, fetchArticle, finishArticleItem, fixArticleContent, normalizeUrl } from '@/utils/wechat-mp';
const { toggleWerror, ExtractMetadata, showTypeMapReverse } = exportedForTestingOnly;

vi.mock('@/utils/request-rewriter', () => ({ default: null }));
const { default: app } = await import('@/app');
const parser = new Parser();

afterEach(() => toggleWerror(false));

const expectedItem: {
title: string;
summary: string;
Expand Down Expand Up @@ -324,8 +326,23 @@ describe('wechat-mp', () => {
expect(normalizeUrl(somethingElseWithHash.replace('https://', 'http://'))).toBe(somethingElse);

const notWechatMp = 'https://im.not.wechat.mp/and/an/error/is/expected';
expect(() => normalizeUrl(notWechatMp)).toThrow();
expect(() => normalizeUrl(notWechatMp)).toThrow('URL host must be "mp.weixin.qq.com"');
expect(normalizeUrl(notWechatMp, true)).toBe(notWechatMp);

const unknownSearchParam = mpArticleRoot + '?unknown=param';
toggleWerror(false);
expect(normalizeUrl(unknownSearchParam)).toBe(unknownSearchParam);
toggleWerror(true);
expect(() => normalizeUrl(unknownSearchParam)).toThrow('WarningAsError: unknown URL search parameters');

const unknownPath = mpRoot + '/unknown/path';
toggleWerror(false);
expect(normalizeUrl(unknownPath)).toBe(unknownPath);
toggleWerror(true);
expect(() => normalizeUrl(unknownPath, true)).toThrow('WarningAsError: unknown URL path');

const ampEscapedUrl = longUrl.replaceAll('&', '&amp;');
expect(normalizeUrl(ampEscapedUrl)).toBe(longUrlShortened);
});

it('fetchArticle_&_finishArticleItem_appMsg', async () => {
Expand Down Expand Up @@ -405,6 +422,60 @@ describe('wechat-mp', () => {
await testFetchArticleFinishArticleItem('/fallback', { setMpNameAsAuthor: true, skipLink: true });
});

// A short link that redirects to the mocked WAF page must be reported as a WAF block,
// without leaking <script>/<style> content and without the "raise an issue" hint.
it('hit_waf', async () => {
    try {
        await fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_hit_waf');
        expect.unreachable('Should throw an error');
    } catch (error) {
        expect(error).toBeInstanceOf(WeChatMpError);
        const { message } = error as WeChatMpError;
        for (const unexpected of ['console.log', '.style', 'Consider raise an issue']) {
            expect(message).not.toContain(unexpected);
        }
        for (const expected of ['request blocked by WAF:', '/mp/rsshub_test/waf', 'Title', '环境异常']) {
            expect(message).toContain(expected);
        }
    }
});

// A page without a recognizable showType and without known markers must be reported as
// an unknown page, including the "raise an issue" hint and the requested URL.
it('unknown_page', async () => {
    const unknownPageUrl = 'https://mp.weixin.qq.com/s/unknown_page';
    try {
        await fetchArticle(unknownPageUrl);
        expect.unreachable('Should throw an error');
    } catch (error) {
        expect(error).toBeInstanceOf(WeChatMpError);
        const { message } = error as WeChatMpError;
        for (const unexpected of ['console.log', '.style']) {
            expect(message).not.toContain(unexpected);
        }
        for (const expected of ['Consider raise an issue', 'unknown page,', 'Title Unknown paragraph', unknownPageUrl]) {
            expect(message).toContain(expected);
        }
    }
});

// A page stating that the author deleted the content must be reported as such,
// without the "raise an issue" hint and with the requested URL in the message.
it('deleted_page', async () => {
    const deletedPageUrl = 'https://mp.weixin.qq.com/s/deleted_page';

    try {
        await fetchArticle(deletedPageUrl);
        expect.unreachable('Should throw an error');
    } catch (error) {
        expect(error).toBeInstanceOf(WeChatMpError);
        const { message } = error as WeChatMpError;
        for (const unexpected of ['console.log', '.style', 'Consider raise an issue']) {
            expect(message).not.toContain(unexpected);
        }
        for (const expected of ['deleted by author:', 'Title 该内容已被发布者删除', deletedPageUrl]) {
            expect(message).toContain(expected);
        }
    }
});

// Both `rejects` expectations are awaited: an un-awaited `expect(promise).rejects...`
// settles only after the synchronous test callback has already returned, so the test
// would pass regardless of whether fetchArticle rejects with the expected message.
it('redirect', async () => {
    // 302 response that carries no Location header
    await expect(fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_redirect_no_location')).rejects.toThrow('redirect without location');
    // URL that redirects to itself
    await expect(fetchArticle('https://mp.weixin.qq.com/s/rsshub_test_recursive_redirect')).rejects.toThrow('too many redirects');
});

it('route_test', async () => {
try {
await app.request('/test/wechat-mp');
Expand Down
92 changes: 74 additions & 18 deletions lib/utils/wechat-mp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,32 @@ import { parseDate } from '@/utils/parse-date';
import cache from '@/utils/cache';
import logger from '@/utils/logger';

const MAINTAINERS = ['Rongronggg9'];
/**
 * Error type thrown by the WeChat MP helpers in this module, so that callers
 * (and tests) can tell these failures apart from other errors via `instanceof`.
 */
class WeChatMpError extends Error {
    // Replaces the default 'Error' name on every instance.
    override name = 'WeChatMpError';

    constructor(message: string) {
        super(message);
    }
}

const warn = (reason: string, details: string) =>
logger.warn(`wechat-mp: ${reason}: ${details},
consider raise an issue (mentioning ${MAINTAINERS.join(', ')}) with the article URL for further investigation`);
// Handles listed in the "raise an issue" hint that formatLog appends to a message.
const MAINTAINERS = ['@Rongronggg9'];

/** Builds `wechat-mp: <part>: <part>: ...` from the given parts, without any hint. */
const formatLogNoMention = (...params: string[]): string => 'wechat-mp: '.concat(params.join(': '));

/**
 * Builds the same message as formatLogNoMention and appends, on a second line,
 * a hint asking the reader to raise an issue mentioning the maintainers.
 */
const formatLog = (...params: string[]): string => {
    const mentions = MAINTAINERS.join(', ');
    return `${formatLogNoMention(...params)}\nConsider raise an issue (mentioning ${mentions}) with the article URL for further investigation`;
};
// Logs a warning that includes the maintainer hint. Declared with `let` because
// toggleWerror reassigns it to switch between logging and throwing.
let warn = (...params: string[]) => logger.warn(formatLog(...params));

/** Always throws a WeChatMpError whose message includes the maintainer hint. */
const error = (...params: string[]): never => {
    const message = formatLog(...params);
    throw new WeChatMpError(message);
};

/** Always throws a WeChatMpError whose message omits the maintainer hint. */
const errorNoMention = (...params: string[]): never => {
    const message = formatLogNoMention(...params);
    throw new WeChatMpError(message);
};
/**
 * Switches `warn` between its initial logging implementation and one that throws
 * via `error` with a 'WarningAsError' prefix.
 *
 * The logging implementation is captured once, when this module-level IIFE runs,
 * so that it can be restored after warnings-as-errors has been switched on.
 */
const toggleWerror = (() => {
    const logOnly = warn;
    const escalate = (...params: string[]) => error('WarningAsError', ...params);
    return (on: boolean) => {
        if (on) {
            warn = escalate;
        } else {
            warn = logOnly;
        }
    };
})();

const replaceReturnNewline = (() => {
const returnRegExp = /\r|\\(r|x0d)/g;
Expand Down Expand Up @@ -147,7 +168,7 @@ class ExtractMetadata {
private static commonMetadataToBeExtracted = {
showType: this.genExtractFunc('item_show_type', { valuePattern: '\\d+' }),
realShowType: this.genExtractFunc('real_item_show_type', { valuePattern: '\\d+' }),
createTime: this.genExtractFunc('ct', { valuePattern: '\\d+' }),
createTime: this.genExtractFunc('ct', { valuePattern: '\\d+', allowNotFound: true }),
sourceUrl: this.genExtractFunc('msg_source_url', { valuePattern: `https?://[^'"]*`, allowNotFound: true }),
};

Expand Down Expand Up @@ -376,18 +397,21 @@ const fixArticleContent = (html?: string | Cheerio<Element>, skipImg = false) =>
// abtest_cookie, wx_header
// Known params (temporary link):
// src, timestamp, ver, signature, new (unessential)
const normalizeUrl = (url, bypassHostCheck = false) => {
const normalizeUrl = (url: string, bypassHostCheck = false) => {
const oriUrl = url;
// already seen some weird urls with `&` escaped as `&amp;`, so fix it
// calling fixUrl should always be safe since having `&amp;` or `\x26` in a URL is meaningless
url = fixUrl(url);
const urlObj = new URL(url);
if (!bypassHostCheck && urlObj.host !== 'mp.weixin.qq.com') {
throw new Error('wechat-mp: URL host must be "mp.weixin.qq.com", but got ' + oriUrl);
error('URL host must be "mp.weixin.qq.com"', url);
}
urlObj.protocol = 'https:';
urlObj.hash = ''; // remove hash
if (/^\/s\/.+/.test(urlObj.pathname)) {
if (urlObj.pathname.startsWith('/s/')) {
// a short link, just remove all the params
urlObj.search = '';
} else if (/^\/s$/.test(urlObj.pathname)) {
} else if (urlObj.pathname === '/s') {
const biz = urlObj.searchParams.get('__biz');
const mid = urlObj.searchParams.get('mid') || urlObj.searchParams.get('appmsgid');
const idx = urlObj.searchParams.get('idx') || urlObj.searchParams.get('itemidx');
Expand All @@ -405,11 +429,11 @@ const normalizeUrl = (url, bypassHostCheck = false) => {
// a temporary link, remove all unessential params
urlObj.search = `?src=${src}&timestamp=${timestamp}&ver=${ver}&signature=${signature}`;
} else {
// unknown link, just let it go
warn('unknown URL search parameters', oriUrl);
}
}
} else {
// IDK what it is, just let it go
warn('unknown URL path', oriUrl);
}
return urlObj.href;
};
Expand Down Expand Up @@ -479,9 +503,11 @@ class PageParsers {
}
return page;
};
static dispatch = async ($: CheerioAPI) => {
static dispatch = async (html: string, url: string) => {
const $ = load(html);
const commonMetadata = ExtractMetadata.common($);
let page: Record<string, any>;
let pageText: string, pageTextShort: string;
switch (commonMetadata.showType) {
case 'APP_MSG_PAGE':
page = await PageParsers.appMsg($, commonMetadata);
Expand All @@ -495,8 +521,24 @@ class PageParsers {
case 'VIDEO_SHARE_PAGE':
page = PageParsers.fallback($, commonMetadata);
break;
case undefined:
$('script, style').remove();
pageText = $('title, body').text().replaceAll(/\s+/g, ' ').trim();
pageTextShort = pageText.slice(0, 25);
if (pageText.length >= 25 + '...'.length) {
pageTextShort = pageText.slice(0, 25);
pageTextShort += '...';
}
if (pageText.includes('已被发布者删除')) {
errorNoMention('deleted by author', pageTextShort, url);
} else if (new URL(url).pathname.includes('captcha') || pageText.includes('环境异常')) {
errorNoMention('request blocked by WAF', pageTextShort, url);
} else {
error('unknown page, probably due to WAF', pageTextShort, url);
}
return {}; // just to make TypeScript happy, actually UNREACHABLE
default:
warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`);
warn('new showType, trying fallback method', `showType=${commonMetadata.showType}`, url);
page = PageParsers.fallback($, commonMetadata);
}
const locationMetadata = ExtractMetadata.location($);
Expand All @@ -517,6 +559,20 @@ class PageParsers {
};
}

/**
 * Requests `url` with `ofetch.raw` and follows 301/302/303/307/308 responses manually.
 *
 * @param url - URL to request.
 * @param maxRedirects - Request budget. It is decremented before each request, and a
 *   redirect response received once the budget has reached zero raises an error.
 * @returns The first raw response whose status is not one of the redirect codes above.
 * @throws WeChatMpError (via `error`) when a redirect response has no `Location`
 *   header, or when the budget is exhausted.
 */
const redirectHelper = async (url: string, maxRedirects: number = 5) => {
    maxRedirects--;
    const raw = await ofetch.raw(url);
    if ([301, 302, 303, 307, 308].includes(raw.status)) {
        const location = raw.headers.get('location');
        if (location === null) {
            return error('redirect without location', url);
        }
        if (maxRedirects <= 0) {
            return error('too many redirects', url);
        }
        // `Location` may be a relative reference; resolve it against the URL that was just
        // requested so the next hop is always an absolute URL.
        const nextUrl = new URL(location, url).href;
        return await redirectHelper(nextUrl, maxRedirects);
    }
    return raw;
};

/**
* Fetch article and its metadata from WeChat MP (mp.weixin.qq.com).
*
Expand All @@ -528,9 +584,9 @@ class PageParsers {
const fetchArticle = (url: string, bypassHostCheck: boolean = false) => {
url = normalizeUrl(url, bypassHostCheck);
return cache.tryGet(url, async () => {
const data = await ofetch(url);
const $ = load(data);
const page = await PageParsers.dispatch($);
const raw = await redirectHelper(url);
// pass the redirected URL to dispatcher for better error logging
const page = await PageParsers.dispatch(raw._data, raw.url);
return { ...page, link: url };
}) as Promise<{
title: string;
Expand Down Expand Up @@ -581,5 +637,5 @@ const finishArticleItem = async (item, setMpNameAsAuthor = false, skipLink = fal
return item;
};

const exportedForTestingOnly = { ExtractMetadata, showTypeMapReverse };
export { exportedForTestingOnly, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };
const exportedForTestingOnly = { toggleWerror, ExtractMetadata, showTypeMapReverse };
export { exportedForTestingOnly, WeChatMpError, fixArticleContent, fetchArticle, finishArticleItem, normalizeUrl };

0 comments on commit 428e65c

Please sign in to comment.