From 0672ca84117307c7cf3f9834c3bdb4aa0074a0c9 Mon Sep 17 00:00:00 2001 From: chuang8511 Date: Thu, 17 Oct 2024 18:26:25 +0100 Subject: [PATCH 1/2] feat(web): refactor web operator --- pkg/component/operator/web/v0/README.mdx | 7 +- .../operator/web/v0/config/tasks.json | 45 +++++++--- .../operator/web/v0/crawl_website.go | 84 +++++++++++++++---- pkg/component/operator/web/v0/main.go | 2 +- pkg/component/operator/web/v0/main_test.go | 2 +- .../operator/web/v0/scrape_webpage.go | 41 ++++++--- 6 files changed, 137 insertions(+), 44 deletions(-) diff --git a/pkg/component/operator/web/v0/README.mdx b/pkg/component/operator/web/v0/README.mdx index 5ad402ab0..d114c972c 100644 --- a/pkg/component/operator/web/v0/README.mdx +++ b/pkg/component/operator/web/v0/README.mdx @@ -34,8 +34,8 @@ This task involves systematically navigating through a website, starting from a | Task ID (required) | `task` | string | `TASK_CRAWL_SITE` | | URL (required) | `url` | string | The root URL to scrape. All links on this page will be scraped, and all links on those pages, and so on. | | Allowed Domains | `allowed-domains` | array[string] | A list of domains that are allowed to be scraped. If empty, all domains are allowed. | -| Max Number of Pages (required) | `max-k` | integer | Max-K specifies the maximum number of pages to return. If max-k is set to 0, all available pages will be returned, up to a maximum of 100. If max-k is set to a positive number, the result will include up to max-k pages, but no more than that. | -| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. | +| Max Number of Pages (required) | `max-k` | integer | Max-K sets a limit on the number of pages to fetch. If Max-K is large and all pages cannot be fetched within the time limit, the fetching process will continue for up to 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more. | +| Timeout | `timeout` | integer | The time to wait for a page to load in milliseconds. Min 0, Max 60000. Please notice the timeout here is set for each page rather than the whole crawl task. | | Max Depth | `max-depth` | integer | Max Depth specifies how deep the crawler will navigate from the root URL. If max depth is set to 1, the crawler will only scrape the root URL and will not follow any links to other pages. If max depth is set to 0, the crawler will scrape all reachable pages until the total number of scraped pages reaches max-k. If both max-k and max depth are defined, the crawler will prioritize the max-k setting when determining how many pages to scrape. | @@ -75,11 +75,12 @@ This task focuses on extracting specific data from a single targeted webpage by | :--- | :--- | :--- | :--- | | Task ID (required) | `task` | string | `TASK_SCRAPE_PAGE` | | URL (required) | `url` | string | The URL to scrape the webpage contents. | +| Scrape Method (required) | `scrape-method` | string | Defines the method used for web scraping. Available options include 'http' for standard HTTP-based scraping and 'chrome-simulator' for scraping through a simulated Chrome browser environment. | | Include HTML | `include-html` | boolean | Indicate whether to include the raw HTML of the webpage in the output. If you want to include the raw HTML, set this to true. | | Only Main Content | `only-main-content` | boolean | Only return the main content of the page by excluding the content of the tag of header, nav, footer. 
| | Remove Tags | `remove-tags` | array[string] | A list of tags, classes, and ids to remove from the output. You can use [jQuery](https://www.w3schools.com/jquery/jquery_syntax.asp) to remove data. If empty, no tags will be removed. Example: 'script, .ad, #footer'. Please check the [jQuery Syntax Examples](#jquery-syntax-examples). | | Only Include Tags | `only-include-tags` | array[string] | A list of tags, classes, and ids to include in the output. You can use [jQuery](https://www.w3schools.com/jquery/jquery_syntax.asp) to include data. If empty, all tags will be included. Example: 'script, .ad, #footer'. Please check the [jQuery Syntax Examples](#jquery-syntax-examples). | -| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. Please set it as 0 if you only want to collect static content. Please notice that if the timeout is set a small value, the page may not be fully loaded. | +| Timeout | `timeout` | integer | This parameter specifies the time to wait for a page to load, measured in milliseconds. The minimum value is 0, and the maximum value is 60,000. Please note that if you set a short timeout, the page may not fully load. Conversely, setting a long timeout could significantly increase the time it takes for the task to complete. This timeout setting applies only to the Chrome simulator. | diff --git a/pkg/component/operator/web/v0/config/tasks.json b/pkg/component/operator/web/v0/config/tasks.json index e43978cb7..97a38bde0 100644 --- a/pkg/component/operator/web/v0/config/tasks.json +++ b/pkg/component/operator/web/v0/config/tasks.json @@ -62,8 +62,8 @@ "type": "array" }, "max-k": { - "default": 10, - "description": "Max-K specifies the maximum number of pages to return. If max-k is set to 0, all available pages will be returned, up to a maximum of 100. If max-k is set to a positive number, the result will include up to max-k pages, but no more than that.", + "default": 1, + "description": "Max-K sets a limit on the number of pages to fetch. If Max-K is large and all pages cannot be fetched within the time limit, the fetching process will continue for up to 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more.", "instillAcceptFormats": [ "integer" ], @@ -72,14 +72,13 @@ "value", "reference" ], - "maximum": 100, - "minimum": 0, + "minimum": 1, "title": "Max Number of Pages", "type": "integer" }, "timeout": { "default": 1000, - "description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000.", + "description": "The time to wait for a page to load in milliseconds. Min 0, Max 60000. Please notice the timeout here is set for each page rather than the whole crawl task.", "instillAcceptFormats": [ "integer" ], @@ -238,12 +237,31 @@ "title": "URL", "type": "string" }, + "scrape-method": { + "description": "Defines the method used for web scraping. Available options include 'http' for standard HTTP-based scraping and 'chrome-simulator' for scraping through a simulated Chrome browser environment.", + "instillAcceptFormats": [ + "string" + ], + "enum": [ + "http", + "chrome-simulator" + ], + "instillUIOrder": 1, + "instillUpstreamTypes": [ + "value", + "reference", + "template" + ], + "default": "http", + "title": "Scrape Method", + "type": "string" + }, "include-html": { "description": "Indicate whether to include the raw HTML of the webpage in the output. 
If you want to include the raw HTML, set this to true.", "instillAcceptFormats": [ "boolean" ], - "instillUIOrder": 1, + "instillUIOrder": 2, "instillUpstreamTypes": [ "value", "reference" @@ -256,7 +274,7 @@ "instillAcceptFormats": [ "boolean" ], - "instillUIOrder": 2, + "instillUIOrder": 3, "instillUpstreamTypes": [ "value", "reference" @@ -269,7 +287,7 @@ "instillAcceptFormats": [ "array:string" ], - "instillUIOrder": 3, + "instillUIOrder": 4, "instillUpstreamTypes": [ "value", "reference" @@ -285,7 +303,7 @@ "instillAcceptFormats": [ "array:string" ], - "instillUIOrder": 4, + "instillUIOrder": 5, "instillUpstreamTypes": [ "value", "reference" @@ -298,11 +316,11 @@ }, "timeout": { "default": 1000, - "description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000. Please set it as 0 if you only want to collect static content. Please notice that if the timeout is set a small value, the page may not be fully loaded.", + "description": "This parameter specifies the time to wait for a page to load, measured in milliseconds. The minimum value is 0, and the maximum value is 60,000. Please note that if you set a short timeout, the page may not fully load. Conversely, setting a long timeout could significantly increase the time it takes for the task to complete. This timeout setting applies only to the Chrome simulator.", "instillAcceptFormats": [ "integer" ], - "instillUIOrder": 5, + "instillUIOrder": 6, "instillUpstreamTypes": [ "value", "reference" @@ -314,7 +332,8 @@ } }, "required": [ - "url" + "url", + "scrape-method" ], "title": "Input", "type": "object" @@ -397,4 +416,4 @@ "type": "object" } } -} +} \ No newline at end of file diff --git a/pkg/component/operator/web/v0/crawl_website.go b/pkg/component/operator/web/v0/crawl_website.go index 4dbc671bf..943a507ff 100644 --- a/pkg/component/operator/web/v0/crawl_website.go +++ b/pkg/component/operator/web/v0/crawl_website.go @@ -1,8 +1,8 @@ package web import ( + "context" "fmt" - "log" "net/url" "strings" "sync" @@ -19,8 +19,11 @@ import ( "github.com/instill-ai/pipeline-backend/pkg/component/internal/util" ) +// PageInfo defines the information of a page type PageInfo struct { - Link string `json:"link"` + // Link: The URL of the page. + Link string `json:"link"` + // Title: The title of the page. Title string `json:"title"` } @@ -38,9 +41,9 @@ type CrawlWebsiteInput struct { MaxDepth int `json:"max-depth"` } -func (i *CrawlWebsiteInput) Preset() { - if i.MaxK < 0 { - i.MaxK = 0 +func (i *CrawlWebsiteInput) preset() { + if i.MaxK <= 0 { + i.MaxK = 1 } } @@ -61,7 +64,7 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro return nil, fmt.Errorf("error converting input to struct: %v", err) } - inputStruct.Preset() + inputStruct.preset() output := ScrapeWebsiteOutput{} @@ -70,45 +73,67 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro var mu sync.Mutex pageLinks := []string{} + // We will have the component timeout feature in the future. + // Before that, we initialize the context here. + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + // On every a element which has href attribute call callback // Wont be called if error occurs c.OnHTML("a[href]", func(e *colly.HTMLElement) { + mu.Lock() + + if ctx.Err() != nil { + mu.Unlock() + return + } + // If we set output.Pages to the slice of PageInfo, it will take a longer time if the first html page has a lot of links. 
// To improve the small Max-K execution time, we will use a separate slice to store the links. // However, when K is big, the output length could be less than K. // So, I set twice the MaxK to stop the scraping. - if inputStruct.MaxK > 0 && len(pageLinks) >= inputStruct.MaxK*2 { + if len(pageLinks) >= inputStruct.MaxK*getPageTimes(inputStruct.MaxK) { + mu.Unlock() return } link := e.Attr("href") if util.InSlice(pageLinks, link) { + mu.Unlock() return } pageLinks = append(pageLinks, link) + mu.Unlock() _ = e.Request.Visit(link) }) // Set error handler c.OnError(func(r *colly.Response, err error) { - log.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err) + // In the future, we can design the error handling logic. }) c.OnRequest(func(r *colly.Request) { + mu.Lock() + // Before length of output page is over, we should always send request. - if inputStruct.MaxK > 0 && len(output.Pages) >= inputStruct.MaxK { + if (len(output.Pages) >= inputStruct.MaxK) || ctx.Err() != nil { r.Abort() + mu.Unlock() return } + mu.Unlock() // Set a random user agent to avoid being blocked by websites r.Headers.Set("User-Agent", randomString()) }) c.OnResponse(func(r *colly.Response) { + if ctx.Err() != nil { + return + } strippedURL := stripQueryAndTrailingSlash(r.Request.URL) @@ -128,20 +153,38 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro title := util.ScrapeWebpageTitle(doc) page.Title = title - defer mu.Unlock() mu.Lock() // If we do not set this condition, the length of output.Pages could be over the limit. if len(output.Pages) < inputStruct.MaxK { output.Pages = append(output.Pages, page) + + // If the length of output.Pages is equal to MaxK, we should stop the scraping. + if len(output.Pages) == inputStruct.MaxK { + mu.Unlock() + cancel() + return + } + mu.Unlock() + return } + mu.Unlock() + cancel() + return + }) // Start scraping if !strings.HasPrefix(inputStruct.URL, "http://") && !strings.HasPrefix(inputStruct.URL, "https://") { inputStruct.URL = "https://" + inputStruct.URL } - _ = c.Visit(inputStruct.URL) - c.Wait() + + go func() { + c.Visit(inputStruct.URL) + c.Wait() + return + }() + + <-ctx.Done() outputStruct, err := base.ConvertToStructpb(output) if err != nil { @@ -179,19 +222,21 @@ func initColly(inputStruct CrawlWebsiteInput) *colly.Collector { ) // Limit the number of requests to avoid being blocked. - // Set it to 10 first in case sending too many requests at once. var parallel int - if inputStruct.MaxK < 10 { + if inputStruct.MaxK < 30 { parallel = inputStruct.MaxK } else { - parallel = 10 + parallel = 30 } _ = c.Limit(&colly.LimitRule{ DomainGlob: "*", Parallelism: parallel, + // We set the delay to avoid being blocked. + Delay: 100 * time.Millisecond, }) + // Timeout here is set of each page rather than whole colly instance. c.SetRequestTimeout(time.Duration(inputStruct.Timeout) * time.Millisecond) if len(inputStruct.AllowedDomains) > 0 { @@ -201,3 +246,12 @@ func initColly(inputStruct CrawlWebsiteInput) *colly.Collector { return c } + +// It ensures that we fetch enough pages to get the required number of pages. 
+func getPageTimes(maxK int) int { + if maxK < 10 { + return 10 + } else { + return 2 + } +} diff --git a/pkg/component/operator/web/v0/main.go b/pkg/component/operator/web/v0/main.go index 7c6bcf66c..a72837d5e 100644 --- a/pkg/component/operator/web/v0/main.go +++ b/pkg/component/operator/web/v0/main.go @@ -39,7 +39,7 @@ type execution struct { base.ComponentExecution execute func(*structpb.Struct) (*structpb.Struct, error) externalCaller func(url string) (ioCloser io.ReadCloser, err error) - getDocAfterRequestURL func(url string, timeout int) (*goquery.Document, error) + getDocAfterRequestURL func(url string, timeout int, scrapeMethod string) (*goquery.Document, error) } func Init(bc base.Component) *component { diff --git a/pkg/component/operator/web/v0/main_test.go b/pkg/component/operator/web/v0/main_test.go index 925aed3fa..4024a2b81 100644 --- a/pkg/component/operator/web/v0/main_test.go +++ b/pkg/component/operator/web/v0/main_test.go @@ -96,7 +96,7 @@ func TestScrapeWebpage(t *testing.T) { }) } -func fakeHTTPRequest(url string, timeout int) (*goquery.Document, error) { +func fakeHTTPRequest(url string, timeout int, scrapeMethod string) (*goquery.Document, error) { html := ` diff --git a/pkg/component/operator/web/v0/scrape_webpage.go b/pkg/component/operator/web/v0/scrape_webpage.go index f7ff3bc89..940d52846 100644 --- a/pkg/component/operator/web/v0/scrape_webpage.go +++ b/pkg/component/operator/web/v0/scrape_webpage.go @@ -17,29 +17,50 @@ import ( "github.com/instill-ai/pipeline-backend/pkg/component/internal/util" ) +// ScrapeWebpageInput defines the input of the scrape webpage task type ScrapeWebpageInput struct { - URL string `json:"url"` - IncludeHTML bool `json:"include-html"` - OnlyMainContent bool `json:"only-main-content"` - RemoveTags []string `json:"remove-tags,omitempty"` + // URL: The URL of the webpage to scrape. + URL string `json:"url"` + // ScrapeMethod: The method to scrape the webpage. It can be "http" or "chromedp". + ScrapeMethod string `json:"scrape-method"` + // IncludeHTML: Whether to include the HTML content of the webpage. + IncludeHTML bool `json:"include-html"` + // OnlyMainContent: Whether to scrape only the main content of the webpage. + OnlyMainContent bool `json:"only-main-content"` + // RemoveTags: The list of tags to remove from the HTML content. + RemoveTags []string `json:"remove-tags,omitempty"` + // OnlyIncludeTags: The list of tags to include in the HTML content. OnlyIncludeTags []string `json:"only-include-tags,omitempty"` - Timeout int `json:"timeout,omitempty"` + // Timeout: The number of milliseconds to wait before scraping the web page. Min 0, Max 60000. + Timeout int `json:"timeout,omitempty"` } + +// ScrapeWebpageOutput defines the output of the scrape webpage task type ScrapeWebpageOutput struct { + // Content: The plain text content of the webpage. Content string `json:"content"` + // Markdown: The markdown content of the webpage. Markdown string `json:"markdown"` + // HTML: The HTML content of the webpage. HTML string `json:"html"` + // Metadata: The metadata of the webpage. Metadata Metadata `json:"metadata"` + // LinksOnPage: The list of links on the webpage. LinksOnPage []string `json:"links-on-page"` } +// Metadata defines the metadata of the webpage type Metadata struct { + // Title: The title of the webpage. Title string `json:"title"` + // Description: The description of the webpage. Description string `json:"description,omitempty"` + // SourceURL: The source URL of the webpage. 
SourceURL string `json:"source-url"` } +// ScrapeWebpage scrapes the content of a webpage func (e *execution) ScrapeWebpage(input *structpb.Struct) (*structpb.Struct, error) { inputStruct := ScrapeWebpageInput{} @@ -52,7 +73,7 @@ func (e *execution) ScrapeWebpage(input *structpb.Struct) (*structpb.Struct, err output := ScrapeWebpageOutput{} - doc, err := e.getDocAfterRequestURL(inputStruct.URL, inputStruct.Timeout) + doc, err := e.getDocAfterRequestURL(inputStruct.URL, inputStruct.Timeout, inputStruct.ScrapeMethod) if err != nil { return nil, fmt.Errorf("error getting HTML page doc: %v", err) @@ -70,14 +91,12 @@ func (e *execution) ScrapeWebpage(input *structpb.Struct) (*structpb.Struct, err } -func getDocAfterRequestURL(url string, timeout int) (*goquery.Document, error) { +func getDocAfterRequestURL(url string, timeout int, scrapeMethod string) (*goquery.Document, error) { - if timeout > 0 { - return requestToWebpage(url, timeout) - } else { + if scrapeMethod == "http" { return httpRequest(url) } - + return requestToWebpage(url, timeout) } func httpRequest(url string) (*goquery.Document, error) { From ce215486de33d09fd640b9080f00e4ebe996eaa1 Mon Sep 17 00:00:00 2001 From: chuang8511 Date: Thu, 17 Oct 2024 18:42:08 +0100 Subject: [PATCH 2/2] fix(web): fix max k --- pkg/component/operator/web/v0/README.mdx | 2 +- pkg/component/operator/web/v0/config/tasks.json | 8 ++++---- pkg/component/operator/web/v0/crawl_website.go | 9 +++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pkg/component/operator/web/v0/README.mdx b/pkg/component/operator/web/v0/README.mdx index d114c972c..757280c88 100644 --- a/pkg/component/operator/web/v0/README.mdx +++ b/pkg/component/operator/web/v0/README.mdx @@ -34,7 +34,7 @@ This task involves systematically navigating through a website, starting from a | Task ID (required) | `task` | string | `TASK_CRAWL_SITE` | | URL (required) | `url` | string | The root URL to scrape. All links on this page will be scraped, and all links on those pages, and so on. | | Allowed Domains | `allowed-domains` | array[string] | A list of domains that are allowed to be scraped. If empty, all domains are allowed. | -| Max Number of Pages (required) | `max-k` | integer | Max-K sets a limit on the number of pages to fetch. If Max-K is large and all pages cannot be fetched within the time limit, the fetching process will continue for up to 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more. | +| Max Number of Pages (required) | `max-k` | integer | Max-K sets a limit on the number of pages to fetch. If Max-K is set to 0, all available pages will be fetched within the time limit of 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more. | | Timeout | `timeout` | integer | The time to wait for a page to load in milliseconds. Min 0, Max 60000. Please notice the timeout here is set for each page rather than the whole crawl task. | | Max Depth | `max-depth` | integer | Max Depth specifies how deep the crawler will navigate from the root URL. If max depth is set to 1, the crawler will only scrape the root URL and will not follow any links to other pages. If max depth is set to 0, the crawler will scrape all reachable pages until the total number of scraped pages reaches max-k. If both max-k and max depth are defined, the crawler will prioritize the max-k setting when determining how many pages to scrape. 
|
diff --git a/pkg/component/operator/web/v0/config/tasks.json b/pkg/component/operator/web/v0/config/tasks.json
index 97a38bde0..e7bb14bf5 100644
--- a/pkg/component/operator/web/v0/config/tasks.json
+++ b/pkg/component/operator/web/v0/config/tasks.json
@@ -62,8 +62,8 @@
       "type": "array"
     },
     "max-k": {
-      "default": 1,
-      "description": "Max-K sets a limit on the number of pages to fetch. If Max-K is large and all pages cannot be fetched within the time limit, the fetching process will continue for up to 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more.",
+      "default": 10,
+      "description": "Max-K sets a limit on the number of pages to fetch. If Max-K is set to 0, all available pages will be fetched within the time limit of 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more.",
       "instillAcceptFormats": [
         "integer"
       ],
@@ -72,7 +72,7 @@
         "value",
         "reference"
       ],
-      "minimum": 1,
+      "minimum": 0,
       "title": "Max Number of Pages",
       "type": "integer"
     },
@@ -416,4 +416,4 @@
     "type": "object"
   }
 }
-}
\ No newline at end of file
+}
diff --git a/pkg/component/operator/web/v0/crawl_website.go b/pkg/component/operator/web/v0/crawl_website.go
index 943a507ff..be59ab4d5 100644
--- a/pkg/component/operator/web/v0/crawl_website.go
+++ b/pkg/component/operator/web/v0/crawl_website.go
@@ -43,7 +43,10 @@ type CrawlWebsiteInput struct {
 
 func (i *CrawlWebsiteInput) preset() {
 	if i.MaxK <= 0 {
-		i.MaxK = 1
+		// A Max-K of 0 means "crawl everything".
+		// A truly unbounded crawl causes performance problems,
+		// so we substitute a large finite default instead.
+		i.MaxK = 8000
 	}
 }
 
@@ -169,7 +172,6 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
 		}
 		mu.Unlock()
 		cancel()
-		return
 
 	})
 
@@ -179,9 +181,8 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
 	}
 
 	go func() {
-		c.Visit(inputStruct.URL)
+		_ = c.Visit(inputStruct.URL)
 		c.Wait()
-		return
 	}()
 
 	<-ctx.Done()
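
For reviewers, here is a condensed sketch of the crawl coordination this patch introduces in `CrawlWebsite`: colly in async mode, a shared mutex, a 120-second context, and `cancel()` as soon as Max-K pages are collected so `<-ctx.Done()` returns without waiting out the full timeout. Names such as `crawl`, `pages`, and the parallelism/depth values are illustrative placeholders, not the component's actual API.

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)

// crawl collects up to maxK page URLs starting from root, stopping early via
// cancel() once the budget is met, or after the 120-second deadline otherwise.
func crawl(root string, maxK int) []string {
	ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
	defer cancel()

	c := colly.NewCollector(colly.Async(true), colly.MaxDepth(3))
	_ = c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 4, Delay: 100 * time.Millisecond})

	var mu sync.Mutex
	var pages []string

	c.OnRequest(func(r *colly.Request) {
		mu.Lock()
		defer mu.Unlock()
		// Stop issuing requests once the page budget or the deadline is hit.
		if len(pages) >= maxK || ctx.Err() != nil {
			r.Abort()
		}
	})

	c.OnResponse(func(r *colly.Response) {
		mu.Lock()
		defer mu.Unlock()
		if len(pages) < maxK {
			pages = append(pages, r.Request.URL.String())
			if len(pages) == maxK {
				// Unblocks <-ctx.Done() below without waiting for the timeout.
				cancel()
			}
		}
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		_ = e.Request.Visit(e.Attr("href"))
	})

	// Visit/Wait run in the background so the deadline, not colly, decides
	// when this function returns.
	go func() {
		_ = c.Visit(root)
		c.Wait()
	}()

	<-ctx.Done()

	mu.Lock()
	defer mu.Unlock()
	return append([]string(nil), pages...)
}

func main() {
	fmt.Println(crawl("https://example.com", 5))
}
```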
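The patch also routes single-page scraping through `getDocAfterRequestURL(url, timeout, scrapeMethod)`, where `http` performs a plain GET and `chrome-simulator` goes through the simulated browser, with the timeout applying only to the latter. Below is a minimal illustration of that dispatch under the assumption of `net/http` plus goquery for the HTTP path; `fetchDoc` and `httpFetch` are hypothetical stand-ins, and the chrome-simulator branch is omitted because its implementation is not shown in this diff.

```go
package main

import (
	"fmt"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

// fetchDoc mirrors the shape of getDocAfterRequestURL(url, timeout, scrapeMethod):
// "http" does a plain GET; anything else would go through the simulated browser,
// which is also the only path where the timeout matters.
func fetchDoc(url string, timeout int, scrapeMethod string) (*goquery.Document, error) {
	if scrapeMethod == "http" {
		return httpFetch(url)
	}
	// The chrome-simulator branch is omitted here; it is not part of this diff.
	return nil, fmt.Errorf("scrape method %q not covered in this sketch", scrapeMethod)
}

func httpFetch(url string) (*goquery.Document, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status: %s", resp.Status)
	}
	// goquery parses the body into a DOM that the scrape task can query.
	return goquery.NewDocumentFromReader(resp.Body)
}

func main() {
	doc, err := fetchDoc("https://example.com", 0, "http")
	if err != nil {
		panic(err)
	}
	fmt.Println(doc.Find("title").Text())
}
```

Dispatching on `scrape-method` rather than on `timeout > 0` (the previous behaviour) makes the static-HTTP path an explicit user choice instead of an implicit side effect of the timeout value.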