Skip to content

Commit

Permalink
feat(web): add users behavior with chromedp by scrolling down
Browse files Browse the repository at this point in the history
  • Loading branch information
chuang8511 committed Oct 4, 2024
1 parent 4ec3fff commit 994d545
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 8 deletions.
14 changes: 14 additions & 0 deletions pkg/component/operator/web/v0/.compogen/scrape_webpage.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@


#### Scraping Support for Dynamic Content
This function provides scraping support to fetch dynamic content from web pages while simulating user behaviors, such as scrolling down. The initial implementation includes the following capabilities:

Scrolling:
- Mimics user scrolling down the page to load additional content dynamically.

Future enhancements will include additional user interactions, such as:
- Clicking: Simulate mouse clicks on specified elements.
- Taking Screenshots: Capture screenshots of the current view.
- Keyboard Actions: Simulate key presses and other keyboard interactions.

This function aims to provide a robust framework for interacting with web pages and extracting dynamic content effectively.
16 changes: 15 additions & 1 deletion pkg/component/operator/web/v0/README.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ Scrape the webpage contents and manipulate html with jquery command. The sequenc
| Only Main Content | `only-main-content` | boolean | Only return the main content of the page excluding header, nav, footer. |
| Remove Tags | `remove-tags` | array[string] | A list of tags, classes, and ids to remove from the output. If empty, no tags will be removed. Example: 'script, .ad, #footer' |
| Only Include Tags | `only-include-tags` | array[string] | A list of tags, classes, and ids to include in the output. If empty, all tags will be included. Example: 'script, .ad, #footer' |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. Please set it as 0 if you only want to collect static content. |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. Set it to 0 if you only want to collect static content. Please note that if the timeout is set to a small value, the page may not be fully loaded. |
</div>


Expand Down Expand Up @@ -143,6 +143,20 @@ Scrape the webpage contents and manipulate html with jquery command. The sequenc
| Title | `title` | string | The title of the webpage |
</div>
</details>


#### Scraping Support for Dynamic Content
This function provides scraping support to fetch dynamic content from web pages while simulating user behaviors, such as scrolling down. The initial implementation includes the following capabilities:

Scrolling:
- Mimics user scrolling down the page to load additional content dynamically.

Future enhancements will include additional user interactions, such as:
- Clicking: Simulate mouse clicks on specified elements.
- Taking Screenshots: Capture screenshots of the current view.
- Keyboard Actions: Simulate key presses and other keyboard interactions.

This function aims to provide a robust framework for interacting with web pages and extracting dynamic content effectively.
## Example Recipes

Recipe for the [Web scraper](https://instill.tech/instill-ai/pipelines/web-scraper/playground) pipeline.
Expand Down
2 changes: 1 addition & 1 deletion pkg/component/operator/web/v0/config/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@
},
"timeout": {
"default": 1000,
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000. Please set it as 0 if you only want to collect static content.",
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000. Set it to 0 if you only want to collect static content. Please note that if the timeout is set to a small value, the page may not be fully loaded.",
"instillAcceptFormats": [
"integer"
],
Expand Down
2 changes: 1 addition & 1 deletion pkg/component/operator/web/v0/main.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//go:generate compogen readme ./config ./README.mdx --extraContents bottom=.compogen/bottom.mdx
//go:generate compogen readme ./config ./README.mdx --extraContents TASK_SCRAPE_WEBPAGE=.compogen/scrape_webpage.mdx --extraContents bottom=.compogen/bottom.mdx
package web

import (
Expand Down
43 changes: 38 additions & 5 deletions pkg/component/operator/web/v0/scrape_webpage.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,22 +141,27 @@ func curlRequest(url string) (*goquery.Document, error) {

func requestToWebpage(url string, timeout int) (*goquery.Document, error) {

ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond)
defer cancel()

ctx, cancelBrowser := chromedp.NewContext(ctx)
ctx, cancelBrowser := chromedp.NewContext(context.Background())
defer cancelBrowser()

var htmlContent string

err := chromedp.Run(ctx,
chromedp.Navigate(url),
chromedp.WaitReady("body"),
// Temporary solution for dynamic content.
// There are different ways to get the dynamic content.
// Now, we default it to scroll down the page.
scrollDown(ctx, timeout),
chromedp.OuterHTML("html", &htmlContent),
)

if err != nil {
log.Println("Cannot get dynamic content, so scrape the static content only", err)
return getStaticContent(url)
log.Println("htmlContent: ", htmlContent)
if htmlContent == "" {
return getStaticContent(url)
}
}

htmlReader := strings.NewReader(htmlContent)
Expand All @@ -169,6 +174,34 @@ func requestToWebpage(url string, timeout int) (*goquery.Document, error) {
return doc, nil
}

// scrollDown returns a chromedp.Action that simulates a user scrolling the
// page, one viewport height at a time, so that dynamically loaded content
// (e.g. infinite scroll) gets a chance to render before the HTML is captured.
//
// The first parameter is intentionally unused: chromedp supplies the run-time
// context to the ActionFunc itself, which shadowed the outer ctx in the
// original code. It is kept so existing callers keep compiling.
func scrollDown(_ context.Context, timeout int) chromedp.Action {
	return chromedp.ActionFunc(func(ctx context.Context) error {
		// Delay between scrolls. 500–1000ms is a typical window for lazy
		// content to load; we default to 500ms in this first version.
		const scrollDelay = 500 * time.Millisecond

		// chromedp gives us no direct way to cancel the action early, so
		// bound the number of scrolls by the user-supplied timeout.
		maxScrolls := timeout / int(scrollDelay.Milliseconds())

		for i := 0; i < maxScrolls; i++ {
			log.Println("Scrolling down...")

			if err := chromedp.Evaluate(`window.scrollBy(0, window.innerHeight);`, nil).Do(ctx); err != nil {
				return err
			}

			// Wait for the next scroll, but react to cancellation
			// immediately. The original time.Sleep ignored ctx and only
			// checked ctx.Err() after the full delay had elapsed, delaying
			// shutdown by up to scrollDelay per iteration.
			select {
			case <-ctx.Done():
				// Return nil (not ctx.Err()) to preserve the original
				// behavior: the caller still uses the HTML captured so far.
				return nil
			case <-time.After(scrollDelay):
			}
		}
		return nil
	})
}

func getRemovedTagsHTML(doc *goquery.Document, input ScrapeWebpageInput) string {
if input.OnlyMainContent {
removeSelectors := []string{"header", "nav", "footer"}
Expand Down

0 comments on commit 994d545

Please sign in to comment.