-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnhk_easy_news_downloader.py
executable file
·90 lines (77 loc) · 3.26 KB
/
nhk_easy_news_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/python3
# coding=utf-8
import json
import os
import sys
import subprocess
import requests
from bs4 import BeautifulSoup
def news_download(out_folder: str):
"""
通过 NHK 的 API 获取新闻列表
:param out_folder: 储存目录
"""
r = requests.get('http://www3.nhk.or.jp/news/easy/news-list.json')
r.encoding = 'utf-8-sig'
o = json.loads(r.text)
for k, v in o[0].items():
for d in v:
parse_news(d, out_folder)
def parse_news(news: dict, out_folder: str):
"""
下载新闻
:param news: api 里新闻的明细信息
:param out_folder: 储存目录
"""
news_id = news['news_id']
news_time = news['news_prearranged_time'].replace(':', '-')
title = news['title']
news_uri = str.format('http://www3.nhk.or.jp/news/easy/{0}/{1}.html', str(news_id), str(news_id))
news_file = os.path.join(out_folder, '_'.join([news_time, news_id, title]),
str.format("{0}.html", news_id)).replace(' ', '_')
has_news_web_image = news['has_news_web_image']
has_news_easy_image = news['has_news_easy_image']
has_news_easy_voice = news['has_news_easy_voice']
# 以文件名送判重
if not os.path.exists(news_file):
os.makedirs(os.path.dirname(news_file))
r = requests.get(news_uri)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'html.parser')
title = soup.find('h1', attrs={'class': 'article-main__title'})
article = soup.find('div', attrs={'id': 'js-article-body'})
for a in article.findAll('a'):
a.unwrap()
# 音频
voice_file = None
if has_news_easy_voice:
voice_file = news['news_easy_voice_uri']
voice_uri = str.format('https://nhks-vh.akamaihd.net/i/news/easy/{0}/master.m3u8', voice_file)
voice_path = os.path.join(os.path.dirname(news_file), voice_file)
subprocess.call(['ffmpeg', '-i', voice_uri, '-codec', 'copy', voice_path, '-loglevel', 'warning'])
# 图片
img_uri = None
if has_news_easy_image:
img_uri = str.format('http://www3.nhk.or.jp/news/easy/{0}/{1}', news_id, news['news_easy_image_uri'])
elif has_news_web_image:
img_uri = news['news_web_image_uri']
if img_uri is not None:
with open(os.path.join(os.path.dirname(news_file), str.format('{0}.jpg', news_id)), 'wb') as f:
f.write(requests.get(img_uri).content)
# 文本
with open(news_file, "w", encoding="utf-8") as f:
print("<!DOCTYPE html>", file=f)
print("<html lang='ja'>", file=f)
print("<head><meta charset='utf-8'></head>", file=f)
print("<style>p { font-size: 100%; line-height: 3.2; padding-bottom: 20px; }</style>", file=f)
print("<body>", file=f)
print(title, file=f)
if img_uri is not None:
print("<img src='" + news_id + ".jpg'><br>", file=f)
if voice_file is not None:
print("<audio controls><source src='" + voice_file + "' type='audio/mpeg'></audio>", file=f)
print(article, file=f)
print("</body>", file=f)
print("</html>", file=f)
if __name__ == "__main__":
news_download(sys.argv[1])