-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmanilatimes_scrape.py
68 lines (62 loc) · 1.65 KB
/
manilatimes_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from bs4 import BeautifulSoup as soup
import requests as req
def lists(timeout=10):
    """Fetch the Manila Times news index and return its article links.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts, one per listed article, each with the keys
        ``"link"`` (the anchor's ``href``) and ``"title"`` (the anchor's
        ``title`` attribute, or its visible text when that is missing).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = req.get("https://www.manilatimes.net/news", timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty result.
    response.raise_for_status()
    html = soup(response.text, "lxml")

    entries = []
    for row in html.find_all("div", class_="item-row"):
        anchor = row.find("a", class_="item-info-abs-href")
        # Rows without the expected anchor, or anchors without a target,
        # carry no usable link.
        if anchor is None or not anchor.get("href"):
            continue
        entries.append({
            "link": anchor["href"],
            "title": anchor.get("title") or anchor.text.strip(),
        })
    return entries
def _collapse_whitespace(tag):
    """Return the text of *tag* with whitespace runs collapsed, or "" if *tag* is None."""
    if tag is None:
        return ""
    # split() with no argument breaks on any whitespace run (including
    # newlines), so joining with one space keeps words separated.
    return " ".join(tag.text.split())


def article(link, timeout=10):
    """Download one article page and extract its main fields.

    Args:
        link: URL of the article page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``"title"``, ``"author"``, ``"time"`` and
        ``"imgCaption"`` (strings, empty when the element is not on the
        page) and ``"article"`` (a list of paragraph texts, empty when the
        body container is not on the page).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = req.get(link, timeout=timeout)
    response.raise_for_status()
    html = soup(response.text, "lxml")

    title = _collapse_whitespace(html.find("h1", class_="article-title"))

    # Author and publish time live inside the info container; when the
    # container itself is absent both fall back to "".
    author = date = ""
    info_container = html.find("div", class_="article-info-container")
    if info_container is not None:
        author = _collapse_whitespace(
            info_container.find("a", class_="article-author-name")
        )
        date = _collapse_whitespace(
            info_container.find("div", class_="article-publish-time")
        )

    caption_tag = html.find("div", class_="article-image-caption")
    caption = caption_tag.text if caption_tag is not None else ""

    article_body = html.find("div", class_="article-body-content")
    if article_body is None:
        paragraphs = []
    else:
        # Drop paragraphs that contain nothing but whitespace.
        paragraphs = [
            p.text for p in article_body.find_all("p") if p.text.strip()
        ]

    return {
        "title": title,
        "author": author,
        "time": date,
        "imgCaption": caption,
        "article": paragraphs,
    }
def main():
    """Show the headline list, ask the user to pick one, and print that article.

    Exits with an error message (non-zero status) when no headlines were
    found or when the entered value is not a number from the printed list.
    """
    headlines = lists()
    if not headlines:
        raise SystemExit("No articles found.")

    for number, item in enumerate(headlines, start=1):
        print(str(number) + ": " + item["title"])
    print()

    try:
        choice = int(input("Enter the number of article: "))
    except ValueError:
        raise SystemExit("Please enter a whole number.")
    # Reject 0 and negatives explicitly: choice - 1 would otherwise index
    # from the end of the list and silently open the wrong article.
    if not 1 <= choice <= len(headlines):
        raise SystemExit(
            "Please enter a number between 1 and " + str(len(headlines)) + "."
        )
    print()

    story = article(headlines[choice - 1]["link"])
    print(story["title"])
    print(story["author"], "-", story["time"])
    print(story["imgCaption"])
    for paragraph in story["article"]:
        print(paragraph)


# Only run the interactive flow when executed as a script, so importing this
# module for lists()/article() does not trigger network calls or a prompt.
if __name__ == "__main__":
    main()