better debugging files/logs + allow LIST of interactions #298

Merged: 1 commit, May 19, 2024
5 changes: 5 additions & 0 deletions config/config.go
@@ -0,0 +1,5 @@
+package config
+
+var (
+	Debug bool
+)
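Note: this new package-level flag replaces the per-scraper Debug field removed in scraper/scraper.go below, so any package can check debug mode without threading it through every struct. A minimal sketch of the intended wiring (the flag name and output here are illustrative; the real wiring appears in the main.go diff below):

package main

import (
	"flag"
	"fmt"

	"github.com/jakopako/goskyr/config"
)

func main() {
	// hypothetical CLI flag; main.go sets config.Debug from its own debugFlag
	debug := flag.Bool("debug", false, "enable debug artifacts (screenshots, html dumps)")
	flag.Parse()
	config.Debug = *debug // set once at startup, read by the fetch and scraper packages

	if config.Debug {
		fmt.Println("debug mode enabled")
	}
}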
80 changes: 51 additions & 29 deletions fetch/fetcher.go
@@ -6,16 +6,20 @@ import (
 	"io"
 	"log/slog"
 	"net/http"
+	"net/url"
+	"os"
 	"time"

 	"github.com/chromedp/cdproto/cdp"
 	"github.com/chromedp/cdproto/dom"
 	"github.com/chromedp/chromedp"
+	"github.com/jakopako/goskyr/config"
 	"github.com/jakopako/goskyr/types"
+	"github.com/jakopako/goskyr/utils"
 )

 type FetchOpts struct {
-	Interaction types.Interaction
+	Interaction []*types.Interaction
 }
@@ -90,8 +94,8 @@ func (d *DynamicFetcher) Cancel() {
 	d.cancelAlloc()
 }

-func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
-	logger := slog.With(slog.String("fetcher", "dynamic"), slog.String("url", url))
+func (d *DynamicFetcher) Fetch(urlStr string, opts FetchOpts) (string, error) {
+	logger := slog.With(slog.String("fetcher", "dynamic"), slog.String("url", urlStr))
 	logger.Debug("fetching page", slog.String("user-agent", d.UserAgent))
 	// start := time.Now()
 	ctx, cancel := chromedp.NewContext(d.allocContext)
@@ -104,36 +108,37 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
 	var body string
 	sleepTime := time.Duration(d.WaitMilliseconds) * time.Millisecond
 	actions := []chromedp.Action{
-		chromedp.Navigate(url),
+		chromedp.Navigate(urlStr),
 		chromedp.Sleep(sleepTime),
 	}
 	logger.Debug(fmt.Sprintf("appended chrome actions: Navigate, Sleep(%v)", sleepTime))
-	delay := 500 * time.Millisecond // default is .5 seconds
-	if opts.Interaction.Delay > 0 {
-		delay = time.Duration(opts.Interaction.Delay) * time.Millisecond
-	}
-	if opts.Interaction.Type == types.InteractionTypeClick {
-		count := 1 // default is 1
-		if opts.Interaction.Count > 0 {
-			count = opts.Interaction.Count
-		}
-		for i := 0; i < count; i++ {
-			// we only click the button if it exists. Do we really need this check here?
-			// TODO: should we click as many times as possible if count == 0? How would we implement this?
-			// actions = append(actions, chromedp.Click(d.Interaction.Selector, chromedp.ByQuery))
-			actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
-				var nodes []*cdp.Node
-				if err := chromedp.Nodes(opts.Interaction.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
-					return err
-				}
-				if len(nodes) == 0 {
-					return nil
-				} // nothing to do
-				logger.Debug(fmt.Sprintf("clicking on node with selector: %s", opts.Interaction.Selector))
-				return chromedp.MouseClickNode(nodes[0]).Do(ctx)
-			}))
-			actions = append(actions, chromedp.Sleep(delay))
-			logger.Debug(fmt.Sprintf("appended chrome actions: ActionFunc, Sleep(%v)", delay))
-		}
-	}
+	for j, ia := range opts.Interaction {
+		logger.Debug(fmt.Sprintf("processing interaction nr %d, type %s", j, ia.Type))
+		delay := 500 * time.Millisecond // default is .5 seconds
+		if ia.Delay > 0 {
+			delay = time.Duration(ia.Delay) * time.Millisecond
+		}
+		if ia.Type == types.InteractionTypeClick {
+			count := 1 // default is 1
+			if ia.Count > 0 {
+				count = ia.Count
+			}
+			for i := 0; i < count; i++ {
+				// we only click the button if it exists. Do we really need this check here?
+				actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
+					var nodes []*cdp.Node
+					if err := chromedp.Nodes(ia.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
+						return err
+					}
+					if len(nodes) == 0 {
+						return nil
+					} // nothing to do
+					logger.Debug(fmt.Sprintf("clicking on node with selector: %s", ia.Selector))
+					return chromedp.MouseClickNode(nodes[0]).Do(ctx)
+				}))
+				actions = append(actions, chromedp.Sleep(delay))
+				logger.Debug(fmt.Sprintf("appended chrome actions: ActionFunc (mouse click), Sleep(%v)", delay))
+			}
+		}
+	}
 	actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
@@ -145,6 +150,23 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
 		return err
 	}))

+	if config.Debug {
+		u, _ := url.Parse(urlStr)
+		var buf []byte
+		r, err := utils.RandomString(u.Host)
+		if err != nil {
+			return "", err
+		}
+		filename := fmt.Sprintf("%s.png", r)
+		actions = append(actions, chromedp.CaptureScreenshot(&buf))
+		actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
+			// log.Printf("Write %v", fileName)
+			logger.Debug(fmt.Sprintf("writing screenshot to file %s", filename))
+			return os.WriteFile(filename, buf, 0644)
+		}))
+		logger.Debug("appended chrome actions: CaptureScreenshot, ActionFunc (save screenshot)")
+	}
+
 	// run task list
 	err := chromedp.Run(ctx,
 		actions...,
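Note: since FetchOpts.Interaction is now a slice, several interactions can be chained in a single fetch; the loop above processes them in order. A minimal usage sketch, assuming hypothetical selectors (the types.Interaction field names match those used in the diff):

package sketch

import (
	"github.com/jakopako/goskyr/fetch"
	"github.com/jakopako/goskyr/types"
)

// FetchWithClicks first clicks a (hypothetical) cookie-consent button once,
// then clicks a "load more" button three times, waiting 1000ms after each click.
// f would typically be a dynamic, browser-based fetcher, since clicks need JS.
func FetchWithClicks(f fetch.Fetcher, url string) (string, error) {
	opts := fetch.FetchOpts{
		Interaction: []*types.Interaction{
			{Type: types.InteractionTypeClick, Selector: "#accept-cookies", Count: 1},
			{Type: types.InteractionTypeClick, Selector: ".load-more", Count: 3, Delay: 1000},
		},
	}
	return f.Fetch(url, opts)
}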
4 changes: 3 additions & 1 deletion main.go
@@ -10,6 +10,7 @@ import (
 	"sync"

 	"github.com/jakopako/goskyr/autoconfig"
+	"github.com/jakopako/goskyr/config"
 	"github.com/jakopako/goskyr/ml"
 	"github.com/jakopako/goskyr/output"
 	"github.com/jakopako/goskyr/scraper"
@@ -65,6 +66,7 @@ func main() {
 		return
 	}

+	config.Debug = *debugFlag
 	var logLevel slog.Level
 	if *debugFlag {
 		logLevel = slog.LevelDebug
@@ -170,7 +172,7 @@ func main() {
 	go func() {
 		for _, s := range config.Scrapers {
 			if *singleScraper == "" || *singleScraper == s.Name {
-				s.Debug = *debugFlag
+				// s.Debug = *debugFlag
 				sc <- s
 			}
 		}
53 changes: 27 additions & 26 deletions scraper/scraper.go
@@ -2,7 +2,6 @@ package scraper

 import (
 	"bytes"
-	"crypto/rand"
 	"errors"
 	"fmt"
 	"io/fs"
@@ -19,6 +18,7 @@ import (
 	"github.com/antchfx/jsonquery"
 	"github.com/goodsign/monday"
 	"github.com/ilyakaznacheev/cleanenv"
+	"github.com/jakopako/goskyr/config"
 	"github.com/jakopako/goskyr/date"
 	"github.com/jakopako/goskyr/fetch"
 	"github.com/jakopako/goskyr/output"
@@ -236,17 +236,16 @@ type Paginator struct {
 // A Scraper contains all the necessary config parameters and structs needed
 // to extract the desired information from a website
 type Scraper struct {
-	Name         string    `yaml:"name"`
-	URL          string    `yaml:"url"`
-	Item         string    `yaml:"item"`
-	Fields       []Field   `yaml:"fields,omitempty"`
-	Filters      []*Filter `yaml:"filters,omitempty"`
-	Paginator    Paginator `yaml:"paginator,omitempty"`
-	RenderJs     bool      `yaml:"render_js,omitempty"`
-	PageLoadWait int       `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
-	Interaction  types.Interaction `yaml:"interaction,omitempty"`
+	Name         string               `yaml:"name"`
+	URL          string               `yaml:"url"`
+	Item         string               `yaml:"item"`
+	Fields       []Field              `yaml:"fields,omitempty"`
+	Filters      []*Filter            `yaml:"filters,omitempty"`
+	Paginator    Paginator            `yaml:"paginator,omitempty"`
+	RenderJs     bool                 `yaml:"render_js,omitempty"`
+	PageLoadWait int                  `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
+	Interaction  []*types.Interaction `yaml:"interaction,omitempty"`
 	fetcher      fetch.Fetcher
-	Debug        bool `yaml:"debug,omitempty"`
 }

 // GetItems fetches and returns all items from a website according to the
@@ -280,7 +279,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
 	currentPage := 0
 	var doc *goquery.Document

-	hasNextPage, pageURL, doc, err := c.fetchPage(nil, currentPage, c.URL, globalConfig.UserAgent, &c.Interaction)
+	hasNextPage, pageURL, doc, err := c.fetchPage(nil, currentPage, c.URL, globalConfig.UserAgent, c.Interaction)
 	if err != nil {
 		return items, err
 	}
@@ -477,10 +476,10 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int
 	return item
 }

-func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string, i *types.Interaction) (bool, string, *goquery.Document, error) {
+func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string, i []*types.Interaction) (bool, string, *goquery.Document, error) {

 	if nextPageI == 0 {
-		newDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: *i})
+		newDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: i})
 		if err != nil {
 			return false, "", nil, err
 		}
@@ -492,10 +491,12 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
 	pagSelector := doc.Find(c.Paginator.Location.Selector)
 	if len(pagSelector.Nodes) > 0 {
 		if nextPageI < c.Paginator.MaxPages || c.Paginator.MaxPages == 0 {
-			ia := types.Interaction{
-				Selector: c.Paginator.Location.Selector,
-				Type:     types.InteractionTypeClick,
-				Count:    nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
+			ia := []*types.Interaction{
+				{
+					Selector: c.Paginator.Location.Selector,
+					Type:     types.InteractionTypeClick,
+					Count:    nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
+				},
 			}
 			nextPageDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: ia})
 			if err != nil {
@@ -525,8 +526,8 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
 	}
 }

-func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Document, error) {
-	res, err := c.fetcher.Fetch(url, opts)
+func (c *Scraper) fetchToDoc(urlStr string, opts fetch.FetchOpts) (*goquery.Document, error) {
+	res, err := c.fetcher.Fetch(urlStr, opts)
 	if err != nil {
 		return nil, err
 	}
@@ -537,14 +538,14 @@ func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Documen
 	}

 	// in debug mode we want to write all the html's to files
-	if c.Debug {
-		bs := make([]byte, 8)
-		_, err := rand.Read(bs)
+	if config.Debug {
+		u, _ := url.Parse(urlStr)
+		r, err := utils.RandomString(u.Host)
 		if err != nil {
-			return nil, fmt.Errorf("failed to generate random bytes for html file name")
+			return nil, err
 		}
-		filename := fmt.Sprintf("%s-%x.html", c.Name, bs[:8])
-		slog.Debug(fmt.Sprintf("writing html to file %s", filename), slog.String("url", url))
+		filename := fmt.Sprintf("%s.html", r)
+		slog.Debug(fmt.Sprintf("writing html to file %s", filename), slog.String("url", urlStr))
 		htmlStr, err := goquery.OuterHtml(doc.Children())
 		if err != nil {
 			return nil, fmt.Errorf("failed to write html file: %v", err)
10 changes: 10 additions & 0 deletions utils/utils.go
@@ -1,6 +1,7 @@
 package utils

 import (
+	"crypto/rand"
 	"fmt"
 	"math"
 	"sort"
@@ -142,3 +143,12 @@ func ReverseSlice[T any](s []T) {
 		s[i], s[j] = s[j], s[i]
 	}
 }
+
+func RandomString(base string) (string, error) {
+	bs := make([]byte, 8)
+	_, err := rand.Read(bs)
+	if err != nil {
+		return "", fmt.Errorf("failed to generate random bytes: %v", err)
+	}
+	return fmt.Sprintf("%s-%x", base, bs[:8]), nil
+}
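Note: RandomString is the shared helper behind both debug filenames above; it suffixes a base string (here, the page's hostname) with eight random hex-encoded bytes so successive dumps of the same page don't overwrite each other. A short usage sketch (the wrapper function and its parameters are illustrative, not part of this PR):

package sketch

import (
	"fmt"
	"log"

	"github.com/jakopako/goskyr/utils"
)

// DebugFilename mirrors the naming used for the screenshot and html dumps:
// <host>-<16 hex chars>.<ext>, e.g. example.com-1a2b3c4d5e6f7a8b.html
func DebugFilename(host, ext string) string {
	r, err := utils.RandomString(host)
	if err != nil {
		log.Fatal(err) // crypto/rand should practically never fail
	}
	return fmt.Sprintf("%s.%s", r, ext)
}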