Skip to content

Commit

Permalink
feat(web): add users behavior with chromedp by scrolling down
Browse files Browse the repository at this point in the history
  • Loading branch information
chuang8511 committed Oct 4, 2024
1 parent 4ec3fff commit 994d545
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 8 deletions.
14 changes: 14 additions & 0 deletions pkg/component/operator/web/v0/.compogen/scrape_webpage.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@


#### Scraping Support for Dynamic Content
This function provides scraping support to fetch dynamic content from web pages while simulating user behaviors, such as scrolling down. The initial implementation includes the following capabilities:

Scrolling:
- Mimics user scrolling down the page to load additional content dynamically.

Future enhancements will include additional user interactions, such as:
- Clicking: Simulate mouse clicks on specified elements.
- Taking Screenshots: Capture screenshots of the current view.
- Keyboard Actions: Simulate key presses and other keyboard interactions.

This function aims to provide a robust framework for interacting with web pages and extracting dynamic content effectively.
16 changes: 15 additions & 1 deletion pkg/component/operator/web/v0/README.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ Scrape the webpage contents and manipulate html with jquery command. The sequenc
| Only Main Content | `only-main-content` | boolean | Only return the main content of the page excluding header, nav, footer. |
| Remove Tags | `remove-tags` | array[string] | A list of tags, classes, and ids to remove from the output. If empty, no tags will be removed. Example: 'script, .ad, #footer' |
| Only Include Tags | `only-include-tags` | array[string] | A list of tags, classes, and ids to include in the output. If empty, all tags will be included. Example: 'script, .ad, #footer' |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. Please set it as 0 if you only want to collect static content. |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. Set it to 0 if you only want to collect static content. Please note that if the timeout is set to a small value, the page may not be fully loaded. |
</div>


Expand Down Expand Up @@ -143,6 +143,20 @@ Scrape the webpage contents and manipulate html with jquery command. The sequenc
| Title | `title` | string | The title of the webpage |
</div>
</details>


#### Scraping Support for Dynamic Content
This function provides scraping support to fetch dynamic content from web pages while simulating user behaviors, such as scrolling down. The initial implementation includes the following capabilities:

Scrolling:
- Mimics user scrolling down the page to load additional content dynamically.

Future enhancements will include additional user interactions, such as:
- Clicking: Simulate mouse clicks on specified elements.
- Taking Screenshots: Capture screenshots of the current view.
- Keyboard Actions: Simulate key presses and other keyboard interactions.

This function aims to provide a robust framework for interacting with web pages and extracting dynamic content effectively.
## Example Recipes

Recipe for the [Web scraper](https://instill.tech/instill-ai/pipelines/web-scraper/playground) pipeline.
Expand Down
2 changes: 1 addition & 1 deletion pkg/component/operator/web/v0/config/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@
},
"timeout": {
"default": 1000,
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000. Please set it as 0 if you only want to collect static content.",
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000. Set it to 0 if you only want to collect static content. Please note that if the timeout is set to a small value, the page may not be fully loaded.",
"instillAcceptFormats": [
"integer"
],
Expand Down
2 changes: 1 addition & 1 deletion pkg/component/operator/web/v0/main.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//go:generate compogen readme ./config ./README.mdx --extraContents bottom=.compogen/bottom.mdx
//go:generate compogen readme ./config ./README.mdx --extraContents TASK_SCRAPE_WEBPAGE=.compogen/scrape_webpage.mdx --extraContents bottom=.compogen/bottom.mdx
package web

import (
Expand Down
43 changes: 38 additions & 5 deletions pkg/component/operator/web/v0/scrape_webpage.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,22 +141,27 @@ func curlRequest(url string) (*goquery.Document, error) {

func requestToWebpage(url string, timeout int) (*goquery.Document, error) {

ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond)
defer cancel()

ctx, cancelBrowser := chromedp.NewContext(ctx)
ctx, cancelBrowser := chromedp.NewContext(context.Background())
defer cancelBrowser()

var htmlContent string

err := chromedp.Run(ctx,
chromedp.Navigate(url),
chromedp.WaitReady("body"),
// Temporary solution for dynamic content.
// There are different ways to get the dynamic content.
// Now, we default it to scroll down the page.
scrollDown(ctx, timeout),
chromedp.OuterHTML("html", &htmlContent),
)

if err != nil {
log.Println("Cannot get dynamic content, so scrape the static content only", err)
return getStaticContent(url)
log.Println("htmlContent: ", htmlContent)
if htmlContent == "" {
return getStaticContent(url)
}
}

htmlReader := strings.NewReader(htmlContent)
Expand All @@ -169,6 +174,34 @@ func requestToWebpage(url string, timeout int) (*goquery.Document, error) {
return doc, nil
}

// scrollDown returns a chromedp.Action that simulates a user scrolling the
// page, one viewport height at a time, so that dynamically loaded content
// (e.g. infinite scroll) gets a chance to render before the HTML is captured.
//
// The first parameter is intentionally unused: chromedp supplies the run-time
// context to the ActionFunc itself, which shadowed the outer ctx in the
// original code. It is kept so existing callers keep compiling.
func scrollDown(_ context.Context, timeout int) chromedp.Action {
	return chromedp.ActionFunc(func(ctx context.Context) error {
		// Delay between scrolls. 500–1000ms is a typical window for lazy
		// content to load; we default to 500ms in this first version.
		const scrollDelay = 500 * time.Millisecond

		// chromedp gives us no direct way to cancel the action early, so
		// bound the number of scrolls by the user-supplied timeout.
		maxScrolls := timeout / int(scrollDelay.Milliseconds())

		for i := 0; i < maxScrolls; i++ {
			log.Println("Scrolling down...")

			if err := chromedp.Evaluate(`window.scrollBy(0, window.innerHeight);`, nil).Do(ctx); err != nil {
				return err
			}

			// Wait for the next scroll, but react to cancellation
			// immediately. The original time.Sleep ignored ctx and only
			// checked ctx.Err() after the full delay had elapsed, delaying
			// shutdown by up to scrollDelay per iteration.
			select {
			case <-ctx.Done():
				// Return nil (not ctx.Err()) to preserve the original
				// behavior: the caller still uses the HTML captured so far.
				return nil
			case <-time.After(scrollDelay):
			}
		}
		return nil
	})
}

func getRemovedTagsHTML(doc *goquery.Document, input ScrapeWebpageInput) string {
if input.OnlyMainContent {
removeSelectors := []string{"header", "nav", "footer"}
Expand Down

0 comments on commit 994d545

Please sign in to comment.