Skip to content

Commit

Permalink
writing html file in debug mode
Browse files Browse the repository at this point in the history
  • Loading branch information
jakopako committed Apr 20, 2024
1 parent b93ac50 commit b0c1012
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 7 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,19 @@ jobs:
fail-fast: false
matrix:
language: ["go"]
go-version: ["1.22"]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
# Learn more about CodeQL language support at https://git.io/codeql-language-support

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go-version }}

# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v3
Expand All @@ -66,5 +72,5 @@ jobs:
# make bootstrap
# make release

- name: Perform CodeQL Analysi2
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v3
1 change: 1 addition & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ func main() {
go func() {
for _, s := range config.Scrapers {
if *singleScraper == "" || *singleScraper == s.Name {
s.Debug = *debug
sc <- s
}
}
Expand Down
42 changes: 36 additions & 6 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package scraper

import (
"bytes"
"crypto/rand"
"errors"
"fmt"
"io/fs"
Expand Down Expand Up @@ -247,6 +248,7 @@ type Scraper struct {
PageLoadWait int `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
Interaction types.Interaction `yaml:"interaction,omitempty"`
fetcher fetch.Fetcher
Debug bool
}

// GetItems fetches and returns all items from a website according to the
Expand Down Expand Up @@ -476,7 +478,7 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int
func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string, i *types.Interaction) (bool, string, *goquery.Document, error) {

if nextPageI == 0 {
newDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{Interaction: *i})
newDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: *i})
if err != nil {
return false, "", nil, err
}
Expand All @@ -493,7 +495,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
Type: types.InteractionTypeClick,
Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
}
nextPageDoc, err := fetchToDoc(currentPageUrl, c.fetcher, fetch.FetchOpts{Interaction: ia})
nextPageDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: ia})
if err != nil {
return false, "", nil, err
}
Expand All @@ -507,7 +509,7 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
return false, "", nil, err
}
if nextPageUrl != "" {
nextPageDoc, err := fetchToDoc(nextPageUrl, c.fetcher, fetch.FetchOpts{})
nextPageDoc, err := c.fetchToDoc(nextPageUrl, fetch.FetchOpts{})
if err != nil {
return false, "", nil, err
}
Expand All @@ -521,13 +523,41 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
}
}

func fetchToDoc(url string, fetcher fetch.Fetcher, opts fetch.FetchOpts) (*goquery.Document, error) {
res, err := fetcher.Fetch(url, opts)
// fetchToDoc fetches url through the scraper's fetcher and parses the response
// body into a goquery document.
//
// When c.Debug is set, the parsed document is additionally serialized to a file
// named "<c.Name>-<16 hex chars>.html", relative to the current working
// directory. The random suffix keeps dumps from successive fetches from
// overwriting each other. Any failure while producing the dump is returned
// as an error (wrapping the underlying cause) and no document is returned.
func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Document, error) {
	res, err := c.fetcher.Fetch(url, opts)
	if err != nil {
		return nil, err
	}

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(res))
	if err != nil {
		return nil, err
	}

	if !c.Debug {
		return doc, nil
	}

	// 8 random bytes -> 16 hex characters appended to the scraper name.
	suffix := make([]byte, 8)
	if _, err := rand.Read(suffix); err != nil {
		return nil, fmt.Errorf("failed to generate random bytes for html file name: %w", err)
	}
	filename := fmt.Sprintf("%s-%x.html", c.Name, suffix)
	slog.Debug(fmt.Sprintf("writing html to file %s", filename), slog.String("url", url))

	// Dump the parsed document rather than the raw response, so the file
	// shows the markup the selectors are actually evaluated against.
	htmlStr, err := goquery.OuterHtml(doc.Children())
	if err != nil {
		return nil, fmt.Errorf("failed to write html file: %w", err)
	}

	// os.WriteFile creates/truncates, writes and closes in one call and
	// reports a failure from any of those steps, including the close.
	if err := os.WriteFile(filename, []byte(htmlStr), 0o666); err != nil {
		return nil, fmt.Errorf("failed to write html file: %w", err)
	}
	return doc, nil
}

func extractField(field *Field, event map[string]interface{}, s *goquery.Selection, baseURL string) error {
Expand Down

0 comments on commit b0c1012

Please sign in to comment.