// Stratton.go
package main
import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)
// Track the number of articles we parse
var processedCount = 0

// The collectors run asynchronously, so shared state (processedCount and the
// articles slice in main) is guarded with this mutex
var mu sync.Mutex
// Is there any way to make this faster? (See the map-based sketch after this function.)
// Helper function to check if a keyword is part of the article keywords
func Find(slice []string, val string) (int, bool) {
	for i, item := range slice {
		if item == val {
			return i, true
		}
	}
	return -1, false
}
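// One possible answer to the "faster?" question above, sketched under the
// assumption that the same keyword list is checked more than once: build a
// set (a map with empty-struct values) and do O(1) lookups instead of a
// linear scan. For the handful of keywords on a single article the scan in
// Find is already cheap, so this is optional; toSet and hasKeyword are
// illustrative names, not part of the original program.
func toSet(slice []string) map[string]struct{} {
	set := make(map[string]struct{}, len(slice))
	for _, item := range slice {
		set[item] = struct{}{}
	}
	return set
}

func hasKeyword(set map[string]struct{}, val string) bool {
	_, ok := set[val]
	return ok
}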
// Article stores information about a news article
type Article struct {
	Title       string
	Content     string
	URL         string
	Keywords    []string
	PublishDate *time.Time
}
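// Side note on the JSON written at the end of main: encoding/json uses the
// exported field names above as object keys ("Title", "Content", ...). If
// lower-case keys were wanted in Acquisitions.json, struct tags would do it.
// taggedArticle below is only an illustration and is never used.
type taggedArticle struct {
	Title       string     `json:"title"`
	Content     string     `json:"content"`
	URL         string     `json:"url"`
	Keywords    []string   `json:"keywords"`
	PublishDate *time.Time `json:"publish_date"`
}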
func main() {
	// Instantiate the default collector
	c := colly.NewCollector(
		// Visit only these domains: reuters.com, www.reuters.com
		colly.AllowedDomains("reuters.com", "www.reuters.com"),

		// Cache responses to prevent multiple downloads of pages
		// even if the collector is restarted
		// colly.CacheDir("./reuters_cache"),

		colly.Async(true),

		// Set a max depth to prevent too much scraping right now
		colly.MaxDepth(25),
	)

	// Don't revisit the same URL twice
	c.AllowURLRevisit = false

	// Parallelism
	c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 300})
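	// Politeness options (sketch only, not enabled): the same LimitRule can
	// carry a delay to spread requests out, and the collector can be given a
	// request timeout. The values below are illustrative assumptions.
	//
	// c.Limit(&colly.LimitRule{
	// 	DomainGlob:  "*",
	// 	Parallelism: 8,
	// 	RandomDelay: 500 * time.Millisecond,
	// })
	// c.SetRequestTimeout(30 * time.Second)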
	// Create another collector to scrape the article detail pages
	detailCollector := c.Clone()

	// Create an empty slice that has a capacity of 5,000 Articles
	articles := make([]Article, 0, 5000)

	// On every link with an href attribute, call this callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")

		// If it has the '/article' prefix it's an article, so have the detailCollector
		// collect the information
		if strings.HasPrefix(link, "/article") {
			// Need to provide the full URL or we get a scheme error
			detailCollector.Visit("https://reuters.com" + link)
		}

		// If the link is a next button, the original collector handles the paging
		if e.Attr("class") == "control-nav-next" {
			e.Request.Visit(link)
		}
	})
	// Before making a request, print "Visiting {URL}"
	c.OnRequest(func(r *colly.Request) {
		// log.Println("Visiting List Page: ", r.URL.String())
	})

	// Report errors back to the console
	c.OnError(func(_ *colly.Response, err error) {
		log.Println("PAGE ERROR:", err)
	})

	detailCollector.OnError(func(_ *colly.Response, err error) {
		log.Println("DETAILS ERROR:", err)
	})
	// Pass all of the HTML to the detailCollector
	detailCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
		// Increment the number of articles we've encountered.
		// The collectors are async, so guard shared state with the mutex.
		mu.Lock()
		processedCount++
		mu.Unlock()

		// Declare a slice of strings to fill up with keywords
		var keywords []string

		// Define the Reuters article date format using Go's reference time
		// (after the split/trim below, a header like "April 3, 2020 / 5:04 PM"
		// becomes "April 3, 2020 5:04 PM", which matches this layout)
		const ReutersDateFormat = "January _2, 2006 3:04 PM"

		// Split the date string on the '/' character
		var dateSplit = strings.Split(e.ChildText("div.ArticleHeader_date"), "/")

		// Encountered an error here before, so if the date is oddly formatted just skip it
		// to avoid an out-of-range index error
		if len(dateSplit) < 2 {
			return
		}

		// Remove leading and trailing spaces
		var date = strings.TrimSpace(dateSplit[0])
		var timestamp = strings.TrimSpace(dateSplit[1])
		var cleaned = date + " " + timestamp

		// Convert the date/time string into a time object
		var publishedAt, _ = time.Parse(ReutersDateFormat, cleaned)

		// Grab the title and body (that was easy!)
		title := e.ChildText("h1.ArticleHeader_headline")
		body := e.ChildText("div.StandardArticleBody_body")

		// Get the keywords for this article
		// This will only run once
		e.ForEach("meta[name=keywords]", func(_ int, el *colly.HTMLElement) {
			keywords = strings.Split(el.Attr("content"), ",")
		})

		// If the article isn't about an acquisition, merger, or takeover, just return from the callback
		_, found := Find(keywords, "Mergers / Acquisitions / Takeovers")
		if !found {
			return
		}

		// Construct the article
		a := Article{
			Title:       title,
			Content:     body,
			URL:         e.Request.URL.String(),
			Keywords:    keywords,
			PublishDate: &publishedAt,
		}

		// Append the article to the list of articles
		mu.Lock()
		articles = append(articles, a)
		mu.Unlock()
	})
	// Start the timer
	start := time.Now()

	// Start scraping on the business news section
	c.Visit("https://www.reuters.com/news/archive/businessNews")
	c.Wait()
	detailCollector.Wait()
	elapsed := time.Since(start)

	fmt.Println()
	fmt.Println("If anyone over here thinks I’m superficial or materialistic, go get a job at McDonald’s because that’s where you belong.")
	fmt.Println()
	log.Printf("%d Articles Processed", processedCount)
	log.Printf("Job took %s", elapsed)
	fmt.Println()

	// WRITE TO A FILE
	file, err := json.MarshalIndent(articles, "", " ")
	if err != nil {
		log.Fatal("MARSHAL ERROR:", err)
	}
	if err := ioutil.WriteFile("Acquisitions.json", file, 0644); err != nil {
		log.Fatal("WRITE ERROR:", err)
	}

	// WRITE TO Stdout
	// enc := json.NewEncoder(os.Stdout)
	// enc.SetIndent("", " ")
	// enc.Encode(articles)
}
// Anna Kalik approves this project!
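// A small round-trip sketch, kept separate from main and never called by it:
// read the Acquisitions.json file written above back into a slice of Article.
// The name loadArticles is illustrative, not part of the original program.
func loadArticles(path string) ([]Article, error) {
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var articles []Article
	if err := json.Unmarshal(data, &articles); err != nil {
		return nil, err
	}
	return articles, nil
}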