feat(web): refactor web operator #753

Merged (2 commits) on Oct 18, 2024
7 changes: 4 additions & 3 deletions pkg/component/operator/web/v0/README.mdx
@@ -34,8 +34,8 @@ This task involves systematically navigating through a website, starting from a
| Task ID (required) | `task` | string | `TASK_CRAWL_SITE` |
| URL (required) | `url` | string | The root URL to scrape. All links on this page will be scraped, and all links on those pages, and so on. |
| Allowed Domains | `allowed-domains` | array[string] | A list of domains that are allowed to be scraped. If empty, all domains are allowed. |
| Max Number of Pages (required) | `max-k` | integer | Max-K specifies the maximum number of pages to return. If max-k is set to 0, all available pages will be returned, up to a maximum of 100. If max-k is set to a positive number, the result will include up to max-k pages, but no more than that. |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. |
| Max Number of Pages (required) | `max-k` | integer | Max-K sets a limit on the number of pages to fetch. If Max-K is set to 0, all available pages will be fetched within the time limit of 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more. |
| Timeout | `timeout` | integer | The time to wait for a page to load in milliseconds. Min 0, Max 60000. Note that this timeout applies to each page rather than to the whole crawl task. |
| Max Depth | `max-depth` | integer | Max Depth specifies how deep the crawler will navigate from the root URL. If max depth is set to 1, the crawler will only scrape the root URL and will not follow any links to other pages. If max depth is set to 0, the crawler will scrape all reachable pages until the total number of scraped pages reaches max-k. If both max-k and max depth are defined, the crawler will prioritize the max-k setting when determining how many pages to scrape. |
</div>
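For illustration, here is a minimal Go sketch of a crawl input that matches the fields above. The struct name `crawlInput`, the JSON tags, and all concrete values are assumptions made for this example; only the field semantics come from the table.

```go
package main

import "fmt"

// crawlInput mirrors the parameters documented above; names follow the JSON keys.
type crawlInput struct {
	URL            string   `json:"url"`
	AllowedDomains []string `json:"allowed-domains"`
	MaxK           int      `json:"max-k"`
	Timeout        int      `json:"timeout"`
	MaxDepth       int      `json:"max-depth"`
}

func main() {
	in := crawlInput{
		URL:            "https://example.com",
		AllowedDomains: []string{"example.com"},
		MaxK:           50,   // return at most 50 pages; 0 would mean "all pages" within the 120-second task limit
		Timeout:        1000, // wait up to 1s for each page, not for the whole crawl
		MaxDepth:       0,    // follow links until max-k pages have been collected
	}
	fmt.Printf("%+v\n", in)
}
```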

@@ -75,11 +75,12 @@ This task focuses on extracting specific data from a single targeted webpage by
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_SCRAPE_PAGE` |
| URL (required) | `url` | string | The URL to scrape the webpage contents. |
| Scrape Method (required) | `scrape-method` | string | Defines the method used for web scraping. Available options include 'http' for standard HTTP-based scraping and 'chrome-simulator' for scraping through a simulated Chrome browser environment. |
| Include HTML | `include-html` | boolean | Indicate whether to include the raw HTML of the webpage in the output. If you want to include the raw HTML, set this to true. |
| Only Main Content | `only-main-content` | boolean | Only return the main content of the page by excluding the content of the header, nav, and footer tags. |
| Remove Tags | `remove-tags` | array[string] | A list of tags, classes, and ids to remove from the output. You can use [jQuery](https://www.w3schools.com/jquery/jquery_syntax.asp) to remove data. If empty, no tags will be removed. Example: 'script, .ad, #footer'. Please check the [jQuery Syntax Examples](#jquery-syntax-examples). |
| Only Include Tags | `only-include-tags` | array[string] | A list of tags, classes, and ids to include in the output. You can use [jQuery](https://www.w3schools.com/jquery/jquery_syntax.asp) to include data. If empty, all tags will be included. Example: 'script, .ad, #footer'. Please check the [jQuery Syntax Examples](#jquery-syntax-examples). |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. Please set it as 0 if you only want to collect static content. Please notice that if the timeout is set a small value, the page may not be fully loaded. |
| Timeout | `timeout` | integer | This parameter specifies the time to wait for a page to load, measured in milliseconds. The minimum value is 0, and the maximum value is 60,000. Please note that if you set a short timeout, the page may not fully load. Conversely, setting a long timeout could significantly increase the time it takes for the task to complete. This timeout setting applies only to the Chrome simulator. |
</div>
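To make the `remove-tags` behavior concrete, here is a small, self-contained sketch that applies jQuery-style selectors with goquery (the HTML parser this component already uses). The `removeTags` helper and the sample HTML are hypothetical; the selector strings follow the 'script, .ad, #footer' example from the table.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// removeTags deletes every element matching the given jQuery-style selectors,
// e.g. "script", ".ad", "#footer".
func removeTags(doc *goquery.Document, selectors []string) {
	for _, sel := range selectors {
		doc.Find(sel).Remove()
	}
}

func main() {
	html := `<html><body><div id="main">Hello</div><script>x()</script><div class="ad">Buy!</div></body></html>`
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}
	removeTags(doc, []string{"script", ".ad"})
	out, _ := doc.Html()
	fmt.Println(out) // the script and .ad elements are gone
}
```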


39 changes: 29 additions & 10 deletions pkg/component/operator/web/v0/config/tasks.json
@@ -63,7 +63,7 @@
},
"max-k": {
"default": 10,
"description": "Max-K specifies the maximum number of pages to return. If max-k is set to 0, all available pages will be returned, up to a maximum of 100. If max-k is set to a positive number, the result will include up to max-k pages, but no more than that.",
"description": "Max-K sets a limit on the number of pages to fetch. If Max-K is set to 0, all available pages will be fetched within the time limit of 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more.",
Collaborator: I think we should document the 120s limitation on the task description, not the max-k field. Though I value having a note here saying that when the task timeout is reached, the available results are returned.

"instillAcceptFormats": [
"integer"
],
@@ -72,14 +72,13 @@
"value",
"reference"
],
"maximum": 100,
"minimum": 0,
"title": "Max Number of Pages",
"type": "integer"
},
"timeout": {
"default": 1000,
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000.",
"description": "The time to wait for a page to load in milliseconds. Min 0, Max 60000. Please notice the timeout here is set for each page rather than the whole crawl task.",
"instillAcceptFormats": [
"integer"
],
@@ -238,12 +237,31 @@
"title": "URL",
"type": "string"
},
"scrape-method": {
"description": "Defines the method used for web scraping. Available options include 'http' for standard HTTP-based scraping and 'chrome-simulator' for scraping through a simulated Chrome browser environment.",
"instillAcceptFormats": [
"string"
],
"enum": [
"http",
"chrome-simulator"
],
"instillUIOrder": 1,
"instillUpstreamTypes": [
"value",
"reference",
"template"
],
"default": "http",
"title": "Scrape Method",
"type": "string"
},
"include-html": {
"description": "Indicate whether to include the raw HTML of the webpage in the output. If you want to include the raw HTML, set this to true.",
"instillAcceptFormats": [
"boolean"
],
"instillUIOrder": 1,
"instillUIOrder": 2,
"instillUpstreamTypes": [
"value",
"reference"
@@ -256,7 +274,7 @@
"instillAcceptFormats": [
"boolean"
],
"instillUIOrder": 2,
"instillUIOrder": 3,
"instillUpstreamTypes": [
"value",
"reference"
@@ -269,7 +287,7 @@
"instillAcceptFormats": [
"array:string"
],
"instillUIOrder": 3,
"instillUIOrder": 4,
"instillUpstreamTypes": [
"value",
"reference"
@@ -285,7 +303,7 @@
"instillAcceptFormats": [
"array:string"
],
"instillUIOrder": 4,
"instillUIOrder": 5,
"instillUpstreamTypes": [
"value",
"reference"
@@ -298,11 +316,11 @@
},
"timeout": {
"default": 1000,
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000. Please set it as 0 if you only want to collect static content. Please notice that if the timeout is set a small value, the page may not be fully loaded.",
"description": "This parameter specifies the time to wait for a page to load, measured in milliseconds. The minimum value is 0, and the maximum value is 60,000. Please note that if you set a short timeout, the page may not fully load. Conversely, setting a long timeout could significantly increase the time it takes for the task to complete. This timeout setting applies only to the Chrome simulator.",
"instillAcceptFormats": [
"integer"
],
"instillUIOrder": 5,
"instillUIOrder": 6,
"instillUpstreamTypes": [
"value",
"reference"
@@ -314,7 +332,8 @@
}
},
"required": [
"url"
"url",
"scrape-method"
],
"title": "Input",
"type": "object"
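As a quick sketch of what a TASK_SCRAPE_PAGE input could look like under the updated schema (with `scrape-method` now required), here is one way to build it as a `structpb.Struct`. All concrete values are examples, not defaults taken from the component.

```go
package main

import (
	"fmt"

	"google.golang.org/protobuf/types/known/structpb"
)

func main() {
	// Example input matching the updated TASK_SCRAPE_PAGE schema above.
	input, err := structpb.NewStruct(map[string]interface{}{
		"url":               "https://example.com/article",
		"scrape-method":     "http", // required since this PR; "chrome-simulator" is the other option
		"include-html":      false,
		"only-main-content": true,
		"remove-tags":       []interface{}{"script", ".ad", "#footer"},
		"timeout":           0, // 0: only collect static content
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(input.Fields["scrape-method"].GetStringValue())
}
```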
85 changes: 70 additions & 15 deletions pkg/component/operator/web/v0/crawl_website.go
@@ -1,8 +1,8 @@
package web

import (
"context"
"fmt"
"log"
"net/url"
"strings"
"sync"
@@ -19,8 +19,11 @@ import (
"github.com/instill-ai/pipeline-backend/pkg/component/internal/util"
)

// PageInfo defines the information of a page
type PageInfo struct {
Link string `json:"link"`
// Link: The URL of the page.
Link string `json:"link"`
// Title: The title of the page.
Title string `json:"title"`
}

@@ -38,9 +41,12 @@ type CrawlWebsiteInput struct {
MaxDepth int `json:"max-depth"`
}

func (i *CrawlWebsiteInput) Preset() {
if i.MaxK < 0 {
i.MaxK = 0
func (i *CrawlWebsiteInput) preset() {
if i.MaxK <= 0 {
// When users set it to 0, it means infinite.
// However, crawling without a limit causes performance issues,
// so we set a default cap instead.
i.MaxK = 8000
Collaborator: This limitation should be documented in the public readme.

Contributor (author): Do you mean in README.mdx? This setting is just to fix the performance issue, so users don't need to know about it; it is mainly for developers. I will add clearer comments here.

Contributor (author): I updated it here.

}
}

@@ -61,7 +67,7 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
return nil, fmt.Errorf("error converting input to struct: %v", err)
}

inputStruct.Preset()
inputStruct.preset()

output := ScrapeWebsiteOutput{}

@@ -70,45 +76,67 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
var mu sync.Mutex
pageLinks := []string{}

// We will have the component timeout feature in the future.
// Before that, we initialize the context here.
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
defer cancel()

// On every <a> element that has an href attribute, call the callback.
// Won't be called if an error occurs.
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
mu.Lock()
Member (suggested change: follow `mu.Lock()` with `defer mu.Unlock()`): The mutex needs to be unlocked no matter what. We can use defer to guard this. I assume the reason you didn't use defer is that you didn't want to scope `_ = e.Request.Visit(link)` within the mutex. Perhaps we can rearrange the code to avoid this issue while still utilizing defer.
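A possible rearrangement along the lines of this suggestion (a sketch only, not the code merged in this PR): keep the shared-state bookkeeping in a small closure that locks and defers the unlock, and call `e.Request.Visit(link)` outside of it. `recordLink` is a hypothetical name, and the fragment reuses `mu`, `ctx`, `pageLinks`, `inputStruct`, and `c` from the surrounding function shown in this diff.

```go
// Hypothetical rearrangement: all shared-state bookkeeping happens under the
// lock (released via defer), while the network call stays outside of it.
recordLink := func(link string) bool {
	mu.Lock()
	defer mu.Unlock()
	if ctx.Err() != nil {
		return false
	}
	if len(pageLinks) >= inputStruct.MaxK*getPageTimes(inputStruct.MaxK) {
		return false
	}
	if util.InSlice(pageLinks, link) {
		return false
	}
	pageLinks = append(pageLinks, link)
	return true
}

c.OnHTML("a[href]", func(e *colly.HTMLElement) {
	if link := e.Attr("href"); recordLink(link) {
		_ = e.Request.Visit(link)
	}
})
```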


if ctx.Err() != nil {
mu.Unlock()
return
}

// If we appended PageInfo structs directly to output.Pages, it would take longer when the first HTML page has a lot of links.
// To improve execution time for small Max-K values, we use a separate slice to store the links.
// However, when K is big, the output length could be less than K,
// so we keep collecting links up to a multiple of MaxK (see getPageTimes) before stopping.
if inputStruct.MaxK > 0 && len(pageLinks) >= inputStruct.MaxK*2 {
if len(pageLinks) >= inputStruct.MaxK*getPageTimes(inputStruct.MaxK) {
mu.Unlock()
return
}

link := e.Attr("href")

if util.InSlice(pageLinks, link) {
mu.Unlock()
return
}

pageLinks = append(pageLinks, link)
mu.Unlock()

_ = e.Request.Visit(link)
})

// Set error handler
c.OnError(func(r *colly.Response, err error) {
log.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
// In the future, we can design the error handling logic.
})

c.OnRequest(func(r *colly.Request) {
mu.Lock()

// Keep sending requests until the output reaches the page limit.
if inputStruct.MaxK > 0 && len(output.Pages) >= inputStruct.MaxK {
if (len(output.Pages) >= inputStruct.MaxK) || ctx.Err() != nil {
r.Abort()
mu.Unlock()
return
}

mu.Unlock()
// Set a random user agent to avoid being blocked by websites
r.Headers.Set("User-Agent", randomString())
})

c.OnResponse(func(r *colly.Response) {
if ctx.Err() != nil {
return
}

strippedURL := stripQueryAndTrailingSlash(r.Request.URL)

@@ -128,20 +156,36 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
title := util.ScrapeWebpageTitle(doc)
page.Title = title

defer mu.Unlock()
Member: Any reason why we don't use defer here?

Contributor (author): Colly is hard to understand, so I tried a lot of workarounds to confirm that it works. I will clean up the code and make sure the performance stays the same.

mu.Lock()
// If we do not set this condition, the length of output.Pages could be over the limit.
if len(output.Pages) < inputStruct.MaxK {
output.Pages = append(output.Pages, page)

// If the length of output.Pages is equal to MaxK, we should stop the scraping.
if len(output.Pages) == inputStruct.MaxK {
mu.Unlock()
cancel()
return
}
mu.Unlock()
return
}
mu.Unlock()
cancel()

})

// Start scraping
if !strings.HasPrefix(inputStruct.URL, "http://") && !strings.HasPrefix(inputStruct.URL, "https://") {
inputStruct.URL = "https://" + inputStruct.URL
}
_ = c.Visit(inputStruct.URL)
c.Wait()

go func() {
_ = c.Visit(inputStruct.URL)
Member: Since it's in a goroutine now, do we still need c.Wait() here?

Contributor (author): Yes, we need it. c.Wait() is for async crawling in the colly package.

c.Wait()
}()

<-ctx.Done()

outputStruct, err := base.ConvertToStructpb(output)
if err != nil {
@@ -179,19 +223,21 @@ func initColly(inputStruct CrawlWebsiteInput) *colly.Collector {
)

// Limit the number of requests to avoid being blocked.
// Set it to 10 first in case sending too many requests at once.
var parallel int
if inputStruct.MaxK < 10 {
if inputStruct.MaxK < 30 {
parallel = inputStruct.MaxK
} else {
parallel = 10
parallel = 30
}

_ = c.Limit(&colly.LimitRule{
DomainGlob: "*",
Parallelism: parallel,
// We set the delay to avoid being blocked.
Delay: 100 * time.Millisecond,
})

// The timeout here is set for each page rather than for the whole colly instance.
c.SetRequestTimeout(time.Duration(inputStruct.Timeout) * time.Millisecond)

if len(inputStruct.AllowedDomains) > 0 {
@@ -201,3 +247,12 @@

return c
}

// getPageTimes returns a multiplier that ensures we collect enough links to yield the required number of pages.
func getPageTimes(maxK int) int {
if maxK < 10 {
return 10
} else {
return 2
}
}
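To make the effect of getPageTimes concrete, here is a small, runnable illustration of how many candidate links the crawler is willing to buffer for a given max-k. The helper is copied from above (with the redundant else dropped); the sample values are arbitrary.

```go
package main

import "fmt"

// getPageTimes mirrors the helper above: small Max-K values over-collect more
// aggressively, since many collected links never yield a successful page.
func getPageTimes(maxK int) int {
	if maxK < 10 {
		return 10
	}
	return 2
}

func main() {
	for _, maxK := range []int{1, 5, 10, 100, 8000} {
		fmt.Printf("max-k=%d -> buffer up to %d links\n", maxK, maxK*getPageTimes(maxK))
	}
	// Output:
	// max-k=1 -> buffer up to 10 links
	// max-k=5 -> buffer up to 50 links
	// max-k=10 -> buffer up to 20 links
	// max-k=100 -> buffer up to 200 links
	// max-k=8000 -> buffer up to 16000 links
}
```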
2 changes: 1 addition & 1 deletion pkg/component/operator/web/v0/main.go
@@ -39,7 +39,7 @@ type execution struct {
base.ComponentExecution
execute func(*structpb.Struct) (*structpb.Struct, error)
externalCaller func(url string) (ioCloser io.ReadCloser, err error)
getDocAfterRequestURL func(url string, timeout int) (*goquery.Document, error)
getDocAfterRequestURL func(url string, timeout int, scrapeMethod string) (*goquery.Document, error)
}

func Init(bc base.Component) *component {
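For context on the widened getDocAfterRequestURL signature, here is a rough sketch of how a dispatcher over scrape-method could look. This is not the component's actual implementation: `getDoc` and `getDocViaHTTP` are made-up names, the "http" branch simply uses net/http plus goquery, and the "chrome-simulator" branch is stubbed out because its real headless-browser backing is not part of this diff.

```go
package main

import (
	"fmt"
	"net/http"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// getDocViaHTTP is an illustrative stand-in for the "http" scrape method:
// fetch the page with a plain HTTP client and parse it with goquery.
func getDocViaHTTP(url string, timeout int) (*goquery.Document, error) {
	client := &http.Client{Timeout: time.Duration(timeout) * time.Millisecond}
	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return goquery.NewDocumentFromReader(resp.Body)
}

// getDoc sketches a dispatch on scrapeMethod matching the refactored signature.
func getDoc(url string, timeout int, scrapeMethod string) (*goquery.Document, error) {
	switch scrapeMethod {
	case "http":
		return getDocViaHTTP(url, timeout)
	case "chrome-simulator":
		// The real component drives a simulated Chrome environment here;
		// that dependency is out of scope for this sketch.
		return nil, fmt.Errorf("chrome-simulator not implemented in this sketch")
	default:
		return nil, fmt.Errorf("unknown scrape method: %q", scrapeMethod)
	}
}

func main() {
	doc, err := getDoc("https://example.com", 5000, "http")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println("page title:", doc.Find("title").Text())
}
```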
2 changes: 1 addition & 1 deletion pkg/component/operator/web/v0/main_test.go
@@ -96,7 +96,7 @@ func TestScrapeWebpage(t *testing.T) {
})
}

func fakeHTTPRequest(url string, timeout int) (*goquery.Document, error) {
func fakeHTTPRequest(url string, timeout int, scrapeMethod string) (*goquery.Document, error) {
html := `
<!DOCTYPE html>
<html>