Commit b07a167

fix: clean the code and modify the input schema

chuang8511 committed Nov 11, 2024
1 parent f6e6069 commit b07a167
Showing 5 changed files with 109 additions and 113 deletions.
8 changes: 4 additions & 4 deletions pkg/component/operator/web/v0/README.mdx
@@ -44,7 +44,7 @@ This task involves systematically navigating through a website, starting from a
 | Max Number of Pages (required) | `max-k` | integer | Max-K sets a limit on the number of pages to fetch. If Max-K is set to 0, all available pages will be fetched within the time limit of 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more. |
 | Timeout | `timeout` | integer | The time to wait for a page to load in milliseconds. Min 0, Max 60000. Please notice the timeout here is set for each page rather than the whole crawl task. |
 | Max Depth | `max-depth` | integer | Max Depth specifies how deep the crawler will navigate from the root URL. If max depth is set to 1, the crawler will only scrape the root URL and will not follow any links to other pages. If max depth is set to 0, the crawler will scrape all reachable pages until the total number of scraped pages reaches max-k. If both max-k and max depth are defined, the crawler will prioritize the max-k setting when determining how many pages to scrape. |
-| [Filter](#crawl-site-filter) | `filter` | object | Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled. |
+| [Filter](#crawl-site-filter) | `filter` | object | Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-pattern or not match exclude-pattern. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. When include-pattern is not empty, only URLs that match include-pattern will be crawled. |
 </div>

@@ -53,14 +53,14 @@ This task involves systematically navigating through a website, starting from a

 <h4 id="crawl-site-filter">Filter</h4>

-Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled.
+Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-pattern or not match exclude-pattern. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. When include-pattern is not empty, only URLs that match include-pattern will be crawled.

 <div class="markdown-col-no-wrap" data-col-1 data-col-2>

 | Field | Field ID | Type | Note |
 | :--- | :--- | :--- | :--- |
-| Exclude Patterns | `exclude-patterns` | array | When the URL is matched, the URL will not be crawled. |
-| Include Patterns | `include-patterns` | array | When the URL is matched, the URL will be crawled. |
+| Exclude Pattern | `exclude-pattern` | string | When the URL is matched, the URL will not be crawled. |
+| Include Pattern | `include-pattern` | string | When the URL is matched, the URL will be crawled. |
 </div>
 </details>
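Editor's note: the filter precedence in the updated description — exclude-pattern is checked first and vetoes a URL; include-pattern, when set, must then match; empty patterns match everything — can be sketched in a few lines of Go. This is an illustrative, self-contained sketch; the sample patterns and URLs are hypothetical, and MustCompile is used only for brevity (the component itself compiles once and returns the error, as shown in crawl_website.go below).

package main

import (
	"fmt"
	"regexp"
)

// shouldCrawl mirrors the documented precedence: a non-empty exclude-pattern
// rejects matching URLs first; a non-empty include-pattern then requires a match.
func shouldCrawl(link, excludePattern, includePattern string) bool {
	if excludePattern != "" && regexp.MustCompile(excludePattern).MatchString(link) {
		return false
	}
	if includePattern != "" && !regexp.MustCompile(includePattern).MatchString(link) {
		return false
	}
	return true
}

func main() {
	// Hypothetical patterns: crawl only /blog/ pages, but skip drafts.
	fmt.Println(shouldCrawl("https://example.com/blog/post-1", `/draft/`, `/blog/`))  // true
	fmt.Println(shouldCrawl("https://example.com/blog/draft/x", `/draft/`, `/blog/`)) // false
	fmt.Println(shouldCrawl("https://example.com/about", `/draft/`, `/blog/`))        // false
}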
24 changes: 9 additions & 15 deletions pkg/component/operator/web/v0/config/tasks.json
@@ -108,34 +108,28 @@
         "type": "integer"
       },
       "filter": {
-        "description": "Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled.",
+        "description": "Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-pattern or not match exclude-pattern. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. When include-pattern is not empty, only URLs that match include-pattern will be crawled.",
         "instillUIOrder": 5,
         "type": "object",
         "title": "Filter",
         "properties": {
-          "exclude-patterns": {
+          "exclude-pattern": {
             "description": "When the URL is matched, the URL will not be crawled.",
             "instillAcceptFormats": [
-              "array:string"
+              "string"
             ],
-            "items": {
-              "type": "string"
-            },
             "instillUIOrder": 1,
-            "type": "array",
-            "title": "Exclude Patterns"
+            "type": "string",
+            "title": "Exclude Pattern"
           },
-          "include-patterns": {
+          "include-pattern": {
             "description": "When the URL is matched, the URL will be crawled.",
             "instillAcceptFormats": [
-              "array:string"
+              "string"
             ],
-            "items": {
-              "type": "string"
-            },
             "instillUIOrder": 2,
-            "type": "array",
-            "title": "Include Patterns"
+            "title": "Include Pattern",
+            "type": "string"
           }
         },
         "required": []
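Editor's note: to make the schema change concrete, each filter field now carries a single regex string instead of an array of strings. A rough sketch of consuming the new shape, assuming a recipe passes the fields exactly as named in the schema above (the example values are hypothetical):

package main

import (
	"encoding/json"
	"fmt"
)

// filterInput mirrors the new single-string fields from tasks.json.
type filterInput struct {
	ExcludePattern string `json:"exclude-pattern"`
	IncludePattern string `json:"include-pattern"`
}

func main() {
	// After this commit: one regex per field, rather than the old
	// "exclude-patterns"/"include-patterns" string arrays.
	raw := `{"exclude-pattern": "\\?page=", "include-pattern": "^https://example\\.com/docs/"}`

	var f filterInput
	if err := json.Unmarshal([]byte(raw), &f); err != nil {
		panic(err)
	}
	fmt.Printf("exclude=%q include=%q\n", f.ExcludePattern, f.IncludePattern)
}

Multiple alternatives that previously went into one array can still be expressed in a single pattern with alternation, e.g. "/draft/|/tag/".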
60 changes: 38 additions & 22 deletions pkg/component/operator/web/v0/crawl_website.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"net/url"
+	"regexp"
 	"strings"
 	"sync"
 	"time"

@@ -40,17 +41,37 @@ type CrawlWebsiteInput struct {
 	// MaxDepth: The maximum depth of the pages to scrape.
 	MaxDepth int `json:"max-depth"`
 	// Filter: The filter to filter the URLs to crawl.
-	Filter Filter `json:"filter"`
+	Filter filter `json:"filter"`
 }

-// Filter defines the filter of the crawl website task
-type Filter struct {
-	// ExcludePatterns: The patterns to exclude the URLs to crawl.
-	ExcludePatterns []string `json:"exclude-patterns"`
-	// IncludePatterns: The patterns to include the URLs to crawl.
-	IncludePatterns []string `json:"include-patterns"`
-	// ExcludeSubstrings: The substrings to exclude the URLs to crawl.
-	ExcludeSubstrings []string `json:"exclude-substrings"`
+// filter defines the filter of the crawl website task
+type filter struct {
+	// ExcludePattern: The pattern to exclude the URLs to crawl.
+	ExcludePattern string `json:"exclude-pattern"`
+	// IncludePattern: The pattern to include the URLs to crawl.
+	IncludePattern string `json:"include-pattern"`
+
+	// excludeRegex: The compiled exclude pattern.
+	excludeRegex *regexp.Regexp
+	// includeRegex: The compiled include pattern.
+	includeRegex *regexp.Regexp
+}
+
+func (f *filter) compile() error {
+	var err error
+	if f.ExcludePattern != "" {
+		f.excludeRegex, err = regexp.Compile(f.ExcludePattern)
+		if err != nil {
+			return fmt.Errorf("compiling exclude pattern: %v", err)
+		}
+	}
+	if f.IncludePattern != "" {
+		f.includeRegex, err = regexp.Compile(f.IncludePattern)
+		if err != nil {
+			return fmt.Errorf("compiling include pattern: %v", err)
+		}
+	}
+	return nil
 }

 func (i *CrawlWebsiteInput) preset() {
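Editor's note: compiling the patterns once in compile() also surfaces an invalid regex as a task error up front, whereas the per-link regexp.MatchString calls removed from helper.go (see below) discarded the error, so a malformed pattern silently never matched. A minimal sketch of that difference, using a hypothetical broken pattern:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Invalid regex: unmatched opening parenthesis.
	pattern := "(/blog"

	// Old style: the error from regexp.MatchString was discarded at the call
	// site, so a bad pattern simply never matched and was silently ignored.
	match, err := regexp.MatchString(pattern, "https://example.com/blog")
	fmt.Println(match, err) // false error parsing regexp: missing closing ): `(/blog`

	// New style: compile once and return the error to the caller.
	if _, err := regexp.Compile(pattern); err != nil {
		fmt.Println("compiling exclude pattern:", err)
	}
}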
@@ -85,6 +106,11 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro

 	inputStruct.preset()

+	err = inputStruct.Filter.compile()
+	if err != nil {
+		return nil, fmt.Errorf("compiling filter: %v", err)
+	}
+
 	output := ScrapeWebsiteOutput{
 		Pages: []PageInfo{},
 	}
@@ -193,10 +219,7 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
 		output.Pages = append(output.Pages, page)

 		// Signal that we've added a new page
-		select {
-		case pageUpdateCh <- struct{}{}:
-		default:
-		}
+		pageUpdateCh <- struct{}{}

 		// If the length of output.Pages is equal to MaxK, we should stop the scraping.
 		if len(output.Pages) == inputStruct.MaxK {

@@ -228,24 +251,17 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
 		for {
 			select {
 			case <-pageUpdateCh:
-				// Reset the timer whenever we get a new page
-				if !inactivityTimer.Stop() {
-					select {
-					case <-inactivityTimer.C:
-					default:
-					}
-				}
 				inactivityTimer.Reset(2 * time.Second)
+			// If no new pages for 2 seconds, cancel the context
 			case <-inactivityTimer.C:
-				// If no new pages for 2 seconds, cancel the context
 				cancel()
 				return
+			// If the context is done, we should return
 			case <-ctx.Done():
 				return
 			}
 		}
 	}()

 	<-ctx.Done()

 	outputStruct, err := base.ConvertToStructpb(output)
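Editor's note: two coupled changes in this file — the scraper now blocks on pageUpdateCh instead of dropping signals via select/default, and the watchdog resets the inactivity timer without the Stop-and-drain dance. The self-contained sketch below shows that watchdog shape; the producer loop is hypothetical, and unlike the diff it guards its send with ctx.Done() so the example always terminates cleanly.

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	pageUpdateCh := make(chan struct{})

	// Hypothetical producer: emits a burst of "page scraped" signals, then goes quiet.
	go func() {
		for i := 0; i < 3; i++ {
			select {
			case pageUpdateCh <- struct{}{}:
			case <-ctx.Done():
				return
			}
			time.Sleep(500 * time.Millisecond)
		}
	}()

	// Watchdog: cancel the crawl once 2 seconds pass without a new page.
	go func() {
		inactivityTimer := time.NewTimer(2 * time.Second)
		defer inactivityTimer.Stop()
		for {
			select {
			case <-pageUpdateCh:
				inactivityTimer.Reset(2 * time.Second)
			case <-inactivityTimer.C:
				cancel()
				return
			case <-ctx.Done():
				return
			}
		}
	}()

	<-ctx.Done()
	fmt.Println("crawl finished: no new pages for 2 seconds")
}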
28 changes: 6 additions & 22 deletions pkg/component/operator/web/v0/helper.go
@@ -3,7 +3,6 @@ package web
 import (
 	"fmt"
 	"log"
-	"regexp"

 	"github.com/PuerkitoBio/goquery"
 )

@@ -68,27 +67,12 @@ func getRemovedTagsHTML[T scrapeInput](doc *goquery.Document, input T) string {
 }

 // targetLink filters the URL based on the filter
-func targetLink(link string, filter Filter) bool {
-
-	if len(filter.ExcludePatterns) == 0 && len(filter.IncludePatterns) == 0 {
-		return true
-	}
-
-	for _, pattern := range filter.ExcludePatterns {
-		if match, _ := regexp.MatchString(pattern, link); match {
-			return false
-		}
-	}
-
-	if len(filter.IncludePatterns) == 0 {
-		return true
-	}
-
-	for _, pattern := range filter.IncludePatterns {
-		if match, _ := regexp.MatchString(pattern, link); match {
-			return true
-		}
-	}
-
-	return false
+func targetLink(link string, f filter) bool {
+	if f.excludeRegex != nil && f.excludeRegex.MatchString(link) {
+		return false
+	}
+	if f.includeRegex != nil && !f.includeRegex.MatchString(link) {
+		return false
+	}
+	return true
 }
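Editor's note: the rewritten targetLink reduces to two guard clauses over the pre-compiled regexes — exclude vetoes first, then include must match when set — and a nil (unset) pattern never vetoes, preserving the old "empty filter crawls everything" behavior. A standalone sketch of the same logic (crawlFilter stands in for the package's unexported filter type; the patterns are hypothetical):

package main

import (
	"fmt"
	"regexp"
)

// crawlFilter is a stand-in for the component's unexported filter struct.
type crawlFilter struct {
	excludeRegex *regexp.Regexp
	includeRegex *regexp.Regexp
}

// targetLink mirrors the rewritten helper: exclude wins first, then include
// must match if set; a nil (unset) pattern never vetoes a link.
func targetLink(link string, f crawlFilter) bool {
	if f.excludeRegex != nil && f.excludeRegex.MatchString(link) {
		return false
	}
	if f.includeRegex != nil && !f.includeRegex.MatchString(link) {
		return false
	}
	return true
}

func main() {
	f := crawlFilter{
		excludeRegex: regexp.MustCompile(`\?replytocom=`), // hypothetical: skip comment permalinks
		includeRegex: regexp.MustCompile(`^https://example\.com/`),
	}
	for _, link := range []string{
		"https://example.com/docs/intro",         // true: included, not excluded
		"https://example.com/post?replytocom=42", // false: exclude matches
		"https://other.com/docs/intro",           // false: include does not match
	} {
		fmt.Println(link, targetLink(link, f))
	}
}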