diff --git a/pkg/component/operator/web/v0/README.mdx b/pkg/component/operator/web/v0/README.mdx index 61ef716f2..3064db0f1 100644 --- a/pkg/component/operator/web/v0/README.mdx +++ b/pkg/component/operator/web/v0/README.mdx @@ -44,7 +44,7 @@ This task involves systematically navigating through a website, starting from a | Max Number of Pages (required) | `max-k` | integer | Max-K sets a limit on the number of pages to fetch. If Max-K is set to 0, all available pages will be fetched within the time limit of 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more. | | Timeout | `timeout` | integer | The time to wait for a page to load in milliseconds. Min 0, Max 60000. Please notice the timeout here is set for each page rather than the whole crawl task. | | Max Depth | `max-depth` | integer | Max Depth specifies how deep the crawler will navigate from the root URL. If max depth is set to 1, the crawler will only scrape the root URL and will not follow any links to other pages. If max depth is set to 0, the crawler will scrape all reachable pages until the total number of scraped pages reaches max-k. If both max-k and max depth are defined, the crawler will prioritize the max-k setting when determining how many pages to scrape. | -| [Filter](#crawl-site-filter) | `filter` | object | Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled. 
| +| [Filter](#crawl-site-filter) | `filter` | object | Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). A URL is crawled only if it does not match exclude-pattern and matches include-pattern; an empty pattern imposes no restriction. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. When include-pattern is not empty, only URLs that match include-pattern will be crawled. | @@ -53,14 +53,14 @@ This task involves systematically navigating through a website, starting from a

Filter

-Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled. +Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). A URL is crawled only if it does not match exclude-pattern and matches include-pattern; an empty pattern imposes no restriction. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. When include-pattern is not empty, only URLs that match include-pattern will be crawled.
| Field | Field ID | Type | Note | | :--- | :--- | :--- | :--- | -| Exclude Patterns | `exclude-patterns` | array | When the URL is matched, the URL will not be crawled. | -| Include Patterns | `include-patterns` | array | When the URL is matched, the URL will be crawled. | +| Exclude Pattern | `exclude-pattern` | string | When the URL is matched, the URL will not be crawled. | +| Include Pattern | `include-pattern` | string | When the URL is matched, the URL will be crawled. |
diff --git a/pkg/component/operator/web/v0/config/tasks.json b/pkg/component/operator/web/v0/config/tasks.json index c79211556..23a89ce24 100644 --- a/pkg/component/operator/web/v0/config/tasks.json +++ b/pkg/component/operator/web/v0/config/tasks.json @@ -108,34 +108,28 @@ "type": "integer" }, "filter": { - "description": "Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled.", + "description": "Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). A URL is crawled only if it does not match exclude-pattern and matches include-pattern; an empty pattern imposes no restriction. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. 
When include-pattern is not empty, only URLs that match include-pattern will be crawled.", "instillUIOrder": 5, "type": "object", "title": "Filter", "properties": { - "exclude-patterns": { + "exclude-pattern": { "description": "When the URL is matched, the URL will not be crawled.", "instillAcceptFormats": [ - "array:string" + "string" ], - "items": { - "type": "string" - }, "instillUIOrder": 1, - "type": "array", - "title": "Exclude Patterns" + "type": "string", + "title": "Exclude Pattern" }, - "include-patterns": { + "include-pattern": { "description": "When the URL is matched, the URL will be crawled.", "instillAcceptFormats": [ - "array:string" + "string" ], - "items": { - "type": "string" - }, "instillUIOrder": 2, - "type": "array", - "title": "Include Patterns" + "title": "Include Pattern", + "type": "string" } }, "required": [] diff --git a/pkg/component/operator/web/v0/crawl_website.go b/pkg/component/operator/web/v0/crawl_website.go index 8a9f3988c..e6894f9be 100644 --- a/pkg/component/operator/web/v0/crawl_website.go +++ b/pkg/component/operator/web/v0/crawl_website.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "net/url" + "regexp" "strings" "sync" "time" @@ -40,17 +41,37 @@ type CrawlWebsiteInput struct { // MaxDepth: The maximum depth of the pages to scrape. MaxDepth int `json:"max-depth"` // Filter: The filter to filter the URLs to crawl. - Filter Filter `json:"filter"` + Filter filter `json:"filter"` } -// Filter defines the filter of the crawl website task -type Filter struct { - // ExcludePatterns: The patterns to exclude the URLs to crawl. - ExcludePatterns []string `json:"exclude-patterns"` - // IncludePatterns: The patterns to include the URLs to crawl. - IncludePatterns []string `json:"include-patterns"` - // ExcludeSubstrings: The substrings to exclude the URLs to crawl. 
- ExcludeSubstrings []string `json:"exclude-substrings"` +// filter defines the filter of the crawl website task +type filter struct { + // ExcludePattern: The pattern to exclude the URLs to crawl. + ExcludePattern string `json:"exclude-pattern"` + // IncludePattern: The pattern to include the URLs to crawl. + IncludePattern string `json:"include-pattern"` + + // excludeRegex: The compiled exclude pattern. + excludeRegex *regexp.Regexp + // includeRegex: The compiled include pattern. + includeRegex *regexp.Regexp +} + +func (f *filter) compile() error { + var err error + if f.ExcludePattern != "" { + f.excludeRegex, err = regexp.Compile(f.ExcludePattern) + if err != nil { + return fmt.Errorf("compiling exclude pattern: %v", err) + } + } + if f.IncludePattern != "" { + f.includeRegex, err = regexp.Compile(f.IncludePattern) + if err != nil { + return fmt.Errorf("compiling include pattern: %v", err) + } + } + return nil } func (i *CrawlWebsiteInput) preset() { @@ -85,6 +106,11 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro inputStruct.preset() + err = inputStruct.Filter.compile() + if err != nil { + return nil, fmt.Errorf("compiling filter: %v", err) + } + output := ScrapeWebsiteOutput{ Pages: []PageInfo{}, } @@ -193,10 +219,7 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro output.Pages = append(output.Pages, page) // Signal that we've added a new page - select { - case pageUpdateCh <- struct{}{}: - default: - } + pageUpdateCh <- struct{}{} // If the length of output.Pages is equal to MaxK, we should stop the scraping. 
if len(output.Pages) == inputStruct.MaxK { @@ -228,24 +251,17 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro for { select { case <-pageUpdateCh: - // Reset the timer whenever we get a new page - if !inactivityTimer.Stop() { - select { - case <-inactivityTimer.C: - default: - } - } inactivityTimer.Reset(2 * time.Second) + // If no new pages for 2 seconds, cancel the context case <-inactivityTimer.C: - // If no new pages for 2 seconds, cancel the context cancel() return + // If the context is done, we should return case <-ctx.Done(): return } } }() - <-ctx.Done() outputStruct, err := base.ConvertToStructpb(output) diff --git a/pkg/component/operator/web/v0/helper.go b/pkg/component/operator/web/v0/helper.go index de90a489d..c37a84a5e 100644 --- a/pkg/component/operator/web/v0/helper.go +++ b/pkg/component/operator/web/v0/helper.go @@ -3,7 +3,6 @@ package web import ( "fmt" "log" - "regexp" "github.com/PuerkitoBio/goquery" ) @@ -68,27 +67,12 @@ func getRemovedTagsHTML[T scrapeInput](doc *goquery.Document, input T) string { } // targetLink filters the URL based on the filter -func targetLink(link string, filter Filter) bool { - - if len(filter.ExcludePatterns) == 0 && len(filter.IncludePatterns) == 0 { - return true +func targetLink(link string, f filter) bool { + if f.excludeRegex != nil && f.excludeRegex.MatchString(link) { + return false } - - for _, pattern := range filter.ExcludePatterns { - if match, _ := regexp.MatchString(pattern, link); match { - return false - } - } - - if len(filter.IncludePatterns) == 0 { - return true + if f.includeRegex != nil && !f.includeRegex.MatchString(link) { + return false } - - for _, pattern := range filter.IncludePatterns { - if match, _ := regexp.MatchString(pattern, link); match { - return true - } - } - - return false + return true } diff --git a/pkg/component/operator/web/v0/helper_test.go b/pkg/component/operator/web/v0/helper_test.go index fc181effe..6e97e03ba 100644 --- 
a/pkg/component/operator/web/v0/helper_test.go +++ b/pkg/component/operator/web/v0/helper_test.go @@ -14,81 +14,81 @@ func TestTargetLink(t *testing.T) { name string link string - filter Filter + filter filter expected bool }{ // Test case for filter combination { name: "no filter", link: "https://www.example.com", - filter: Filter{}, + filter: filter{}, expected: true, }, { name: "include pattern match", link: "https://www.example.com", - filter: Filter{ - IncludePatterns: []string{"example.com"}, + filter: filter{ + IncludePattern: "example.com", }, expected: true, }, { name: "include pattern not match", link: "https://www.example.co", - filter: Filter{ - IncludePatterns: []string{"example.com"}, + filter: filter{ + IncludePattern: "example.com", }, expected: false, }, { name: "exclude pattern match", link: "https://www.example.com", - filter: Filter{ - ExcludePatterns: []string{"example.com"}, + filter: filter{ + ExcludePattern: "example.com", }, expected: false, }, { name: "exclude pattern not match", link: "https://www.example.com", - filter: Filter{ - ExcludePatterns: []string{"example.cos"}, + filter: filter{ + ExcludePattern: "example.cos", }, expected: true, }, { name: "include pattern match and exclude pattern not match", link: "https://www.example.com", - filter: Filter{ - IncludePatterns: []string{"example.com"}, - ExcludePatterns: []string{"example.cos"}, + filter: filter{ + IncludePattern: "example.com", + ExcludePattern: "example.cos", }, expected: true, }, { name: "include pattern not match and exclude pattern match", link: "https://www.example.co", - filter: Filter{ - IncludePatterns: []string{"example.com"}, - ExcludePatterns: []string{"example.co"}, + filter: filter{ + IncludePattern: "example.com", + ExcludePattern: "example.co", }, expected: false, }, { name: "include and exclude pattern both not match", link: "https://www.example.c", - filter: Filter{ - IncludePatterns: []string{"example.com"}, - ExcludePatterns: []string{"example.co"}, + 
filter: filter{ + IncludePattern: "example.com", + ExcludePattern: "example.co", }, expected: false, }, { name: "include and exclude pattern both match", link: "https://www.example.com", - filter: Filter{ - IncludePatterns: []string{"example.com"}, - ExcludePatterns: []string{"example.co"}, + filter: filter{ + IncludePattern: "example.com", + ExcludePattern: "example.co", }, expected: false, }, @@ -96,96 +96,96 @@ func TestTargetLink(t *testing.T) { { name: "digit match", link: "https://example1.com", - filter: Filter{ - IncludePatterns: []string{"example[\\d].com"}, + filter: filter{ + IncludePattern: "example[\\d].com", }, expected: true, }, { name: "disjunction match", link: "https://exampleA.com", - filter: Filter{ - IncludePatterns: []string{"example[A|B|C].com"}, + filter: filter{ + IncludePattern: "example[A|B|C].com", }, expected: true, }, { name: "match all subdomains of example.com", link: "https://blog.example.com", - filter: Filter{ - IncludePatterns: []string{".*\\.example\\.com"}, + filter: filter{ + IncludePattern: ".*\\.example\\.com", }, expected: true, }, { name: "match specific file extensions", link: "https://example.com/document.pdf", - filter: Filter{ - IncludePatterns: []string{".*\\.(pdf|doc|docx)$"}, + filter: filter{ + IncludePattern: ".*\\.(pdf|doc|docx)$", }, expected: true, }, { name: "match specific keywords in path", link: "https://example.com/blog/post-1", - filter: Filter{ - IncludePatterns: []string{".*(blog|news|article).*"}, + filter: filter{ + IncludePattern: ".*(blog|news|article).*", }, expected: true, }, { name: "match specific ports", link: "https://example.com:8080/api", - filter: Filter{ - IncludePatterns: []string{".*:(8080|8443)($|/.*)"}, + filter: filter{ + IncludePattern: ".*:(8080|8443)($|/.*)", }, expected: true, }, { name: "match https only", link: "https://example.com", - filter: Filter{ - IncludePatterns: []string{"^https://.*"}, + filter: filter{ + IncludePattern: "^https://.*", }, expected: true, }, { name: 
"exclude http protocol", link: "http://example.com", - filter: Filter{ - IncludePatterns: []string{"^https://.*"}, + filter: filter{ + IncludePattern: "^https://.*", }, expected: false, }, { name: "match specific country TLDs", link: "https://example.uk", - filter: Filter{ - IncludePatterns: []string{".*\\.(uk|fr|de)$"}, + filter: filter{ + IncludePattern: ".*\\.(uk|fr|de)$", }, expected: true, }, { name: "match URLs without query parameters", link: "https://example.com/path", - filter: Filter{ - IncludePatterns: []string{"^[^?]*$"}, + filter: filter{ + IncludePattern: "^[^?]*$", }, expected: true, }, { name: "not match URLs without query parameters", link: "https://example.com/path?id=123", - filter: Filter{ - IncludePatterns: []string{"^[^?]*$"}, + filter: filter{ + IncludePattern: "^[^?]*$", }, expected: false, }, { name: "match specific query parameters", link: "https://example.com/path?id=123", - filter: Filter{ - IncludePatterns: []string{".*[?&]id=[0-9]+.*"}, + filter: filter{ + IncludePattern: ".*[?&]id=[0-9]+.*", }, expected: true, }, @@ -193,16 +193,16 @@ func TestTargetLink(t *testing.T) { { name: "non-matching file extension", link: "https://example.com/document.txt", - filter: Filter{ - IncludePatterns: []string{".*\\.(pdf|doc|docx)$"}, + filter: filter{ + IncludePattern: ".*\\.(pdf|doc|docx)$", }, expected: false, }, { name: "non-matching country TLD", link: "https://example.us", - filter: Filter{ - IncludePatterns: []string{".*\\.(uk|fr|de)$"}, + filter: filter{ + IncludePattern: ".*\\.(uk|fr|de)$", }, expected: false, }, @@ -210,6 +210,8 @@ func TestTargetLink(t *testing.T) { for _, testCase := range testCases { c.Run(testCase.name, func(c *quicktest.C) { + err := testCase.filter.compile() + c.Assert(err, quicktest.IsNil) c.Assert(targetLink(testCase.link, testCase.filter), quicktest.Equals, testCase.expected) }) }