Commit b07a167

fix: clean the code and modify the input schema

chuang8511 committed Nov 11, 2024
1 parent f6e6069 commit b07a167
Showing 5 changed files with 109 additions and 113 deletions.
8 changes: 4 additions & 4 deletions pkg/component/operator/web/v0/README.mdx
@@ -44,7 +44,7 @@ This task involves systematically navigating through a website, starting from a
 | Max Number of Pages (required) | `max-k` | integer | Max-K sets a limit on the number of pages to fetch. If Max-K is set to 0, all available pages will be fetched within the time limit of 120 seconds. If Max-K is a positive number, the fetch will return up to that many pages, but no more. |
 | Timeout | `timeout` | integer | The time to wait for a page to load in milliseconds. Min 0, Max 60000. Please notice the timeout here is set for each page rather than the whole crawl task. |
 | Max Depth | `max-depth` | integer | Max Depth specifies how deep the crawler will navigate from the root URL. If max depth is set to 1, the crawler will only scrape the root URL and will not follow any links to other pages. If max depth is set to 0, the crawler will scrape all reachable pages until the total number of scraped pages reaches max-k. If both max-k and max depth are defined, the crawler will prioritize the max-k setting when determining how many pages to scrape. |
-| [Filter](#crawl-site-filter) | `filter` | object | Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled. |
+| [Filter](#crawl-site-filter) | `filter` | object | Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-pattern or not match exclude-pattern. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. When include-pattern is not empty, only URLs that match include-pattern will be crawled. |
 </div>

@@ -53,14 +53,14 @@ This task involves systematically navigating through a website, starting from a

 <h4 id="crawl-site-filter">Filter</h4>

-Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled.
+Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-pattern or not match exclude-pattern. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. When include-pattern is not empty, only URLs that match include-pattern will be crawled.

 <div class="markdown-col-no-wrap" data-col-1 data-col-2>

 | Field | Field ID | Type | Note |
 | :--- | :--- | :--- | :--- |
-| Exclude Patterns | `exclude-patterns` | array | When the URL is matched, the URL will not be crawled. |
-| Include Patterns | `include-patterns` | array | When the URL is matched, the URL will be crawled. |
+| Exclude Pattern | `exclude-pattern` | string | When the URL is matched, the URL will not be crawled. |
+| Include Pattern | `include-pattern` | string | When the URL is matched, the URL will be crawled. |
 </div>
 </details>
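Editor's note: the filter precedence in the updated description — exclude-pattern is checked first and vetoes a URL; include-pattern, when set, must then match; empty patterns match everything — can be sketched in a few lines of Go. This is an illustrative, self-contained sketch; the sample patterns and URLs are hypothetical, and MustCompile is used only for brevity (the component itself compiles once and returns the error, as shown in crawl_website.go below).

package main

import (
	"fmt"
	"regexp"
)

// shouldCrawl mirrors the documented precedence: a non-empty exclude-pattern
// rejects matching URLs first; a non-empty include-pattern then requires a match.
func shouldCrawl(link, excludePattern, includePattern string) bool {
	if excludePattern != "" && regexp.MustCompile(excludePattern).MatchString(link) {
		return false
	}
	if includePattern != "" && !regexp.MustCompile(includePattern).MatchString(link) {
		return false
	}
	return true
}

func main() {
	// Hypothetical patterns: crawl only /blog/ pages, but skip drafts.
	fmt.Println(shouldCrawl("https://example.com/blog/post-1", `/draft/`, `/blog/`))  // true
	fmt.Println(shouldCrawl("https://example.com/blog/draft/x", `/draft/`, `/blog/`)) // false
	fmt.Println(shouldCrawl("https://example.com/about", `/draft/`, `/blog/`))        // false
}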
24 changes: 9 additions & 15 deletions pkg/component/operator/web/v0/config/tasks.json
@@ -108,34 +108,28 @@
         "type": "integer"
       },
       "filter": {
-        "description": "Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-patterns or not match exclude-patterns. When both include-patterns and exclude-patterns are empty, all URLs will be crawled. It will process exclude-patterns first, then include-patterns. When exclude-patterns is not empty, only URLs that do not match exclude-patterns will be crawled. When include-patterns is not empty, only URLs that match include-patterns will be crawled.",
+        "description": "Filtering based on [regular expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions). The URL will be crawled if it matches either include-pattern or not match exclude-pattern. When both include-pattern and exclude-pattern are empty, all URLs will be crawled. It will process exclude-pattern first, then include-pattern. When exclude-pattern is not empty, only URLs that do not match exclude-pattern will be crawled. When include-pattern is not empty, only URLs that match include-pattern will be crawled.",
         "instillUIOrder": 5,
         "type": "object",
         "title": "Filter",
         "properties": {
-          "exclude-patterns": {
+          "exclude-pattern": {
             "description": "When the URL is matched, the URL will not be crawled.",
             "instillAcceptFormats": [
-              "array:string"
+              "string"
             ],
-            "items": {
-              "type": "string"
-            },
             "instillUIOrder": 1,
-            "type": "array",
-            "title": "Exclude Patterns"
+            "type": "string",
+            "title": "Exclude Pattern"
           },
-          "include-patterns": {
+          "include-pattern": {
             "description": "When the URL is matched, the URL will be crawled.",
             "instillAcceptFormats": [
-              "array:string"
+              "string"
             ],
-            "items": {
-              "type": "string"
-            },
             "instillUIOrder": 2,
-            "type": "array",
-            "title": "Include Patterns"
+            "title": "Include Pattern",
+            "type": "string"
           }
         },
         "required": []
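Editor's note: to make the schema change concrete, each filter field now carries a single regex string instead of an array of strings. A rough sketch of consuming the new shape, assuming a recipe passes the fields exactly as named in the schema above (the example values are hypothetical):

package main

import (
	"encoding/json"
	"fmt"
)

// filterInput mirrors the new single-string fields from tasks.json.
type filterInput struct {
	ExcludePattern string `json:"exclude-pattern"`
	IncludePattern string `json:"include-pattern"`
}

func main() {
	// After this commit: one regex per field, rather than the old
	// "exclude-patterns"/"include-patterns" string arrays.
	raw := `{"exclude-pattern": "\\?page=", "include-pattern": "^https://example\\.com/docs/"}`

	var f filterInput
	if err := json.Unmarshal([]byte(raw), &f); err != nil {
		panic(err)
	}
	fmt.Printf("exclude=%q include=%q\n", f.ExcludePattern, f.IncludePattern)
}

Multiple alternatives that previously went into one array can still be expressed in a single pattern with alternation, e.g. "/draft/|/tag/".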
60 changes: 38 additions & 22 deletions pkg/component/operator/web/v0/crawl_website.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"net/url"
+	"regexp"
 	"strings"
 	"sync"
 	"time"

@@ -40,17 +41,37 @@ type CrawlWebsiteInput struct {
 	// MaxDepth: The maximum depth of the pages to scrape.
 	MaxDepth int `json:"max-depth"`
 	// Filter: The filter to filter the URLs to crawl.
-	Filter Filter `json:"filter"`
+	Filter filter `json:"filter"`
 }

-// Filter defines the filter of the crawl website task
-type Filter struct {
-	// ExcludePatterns: The patterns to exclude the URLs to crawl.
-	ExcludePatterns []string `json:"exclude-patterns"`
-	// IncludePatterns: The patterns to include the URLs to crawl.
-	IncludePatterns []string `json:"include-patterns"`
-	// ExcludeSubstrings: The substrings to exclude the URLs to crawl.
-	ExcludeSubstrings []string `json:"exclude-substrings"`
+// filter defines the filter of the crawl website task
+type filter struct {
+	// ExcludePattern: The pattern to exclude the URLs to crawl.
+	ExcludePattern string `json:"exclude-pattern"`
+	// IncludePattern: The pattern to include the URLs to crawl.
+	IncludePattern string `json:"include-pattern"`
+
+	// excludeRegex: The compiled exclude pattern.
+	excludeRegex *regexp.Regexp
+	// includeRegex: The compiled include pattern.
+	includeRegex *regexp.Regexp
+}
+
+func (f *filter) compile() error {
+	var err error
+	if f.ExcludePattern != "" {
+		f.excludeRegex, err = regexp.Compile(f.ExcludePattern)
+		if err != nil {
+			return fmt.Errorf("compiling exclude pattern: %v", err)
+		}
+	}
+	if f.IncludePattern != "" {
+		f.includeRegex, err = regexp.Compile(f.IncludePattern)
+		if err != nil {
+			return fmt.Errorf("compiling include pattern: %v", err)
+		}
+	}
+	return nil
 }

 func (i *CrawlWebsiteInput) preset() {
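Editor's note: compiling the patterns once in compile() also surfaces an invalid regex as a task error up front, whereas the per-link regexp.MatchString calls removed from helper.go (see below) discarded the error, so a malformed pattern silently never matched. A minimal sketch of that difference, using a hypothetical broken pattern:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Invalid regex: unmatched opening parenthesis.
	pattern := "(/blog"

	// Old style: the error from regexp.MatchString was discarded at the call
	// site, so a bad pattern simply never matched and was silently ignored.
	match, err := regexp.MatchString(pattern, "https://example.com/blog")
	fmt.Println(match, err) // false error parsing regexp: missing closing ): `(/blog`

	// New style: compile once and return the error to the caller.
	if _, err := regexp.Compile(pattern); err != nil {
		fmt.Println("compiling exclude pattern:", err)
	}
}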
@@ -85,6 +106,11 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro

 	inputStruct.preset()

+	err = inputStruct.Filter.compile()
+	if err != nil {
+		return nil, fmt.Errorf("compiling filter: %v", err)
+	}
+
 	output := ScrapeWebsiteOutput{
 		Pages: []PageInfo{},
 	}
@@ -193,10 +219,7 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
 		output.Pages = append(output.Pages, page)

 		// Signal that we've added a new page
-		select {
-		case pageUpdateCh <- struct{}{}:
-		default:
-		}
+		pageUpdateCh <- struct{}{}

 		// If the length of output.Pages is equal to MaxK, we should stop the scraping.
 		if len(output.Pages) == inputStruct.MaxK {

@@ -228,24 +251,17 @@ func (e *execution) CrawlWebsite(input *structpb.Struct) (*structpb.Struct, erro
 		for {
 			select {
 			case <-pageUpdateCh:
-				// Reset the timer whenever we get a new page
-				if !inactivityTimer.Stop() {
-					select {
-					case <-inactivityTimer.C:
-					default:
-					}
-				}
 				inactivityTimer.Reset(2 * time.Second)
+			// If no new pages for 2 seconds, cancel the context
 			case <-inactivityTimer.C:
-				// If no new pages for 2 seconds, cancel the context
 				cancel()
 				return
+			// If the context is done, we should return
 			case <-ctx.Done():
 				return
 			}
 		}
 	}()

 	<-ctx.Done()

 	outputStruct, err := base.ConvertToStructpb(output)
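Editor's note: two coupled changes in this file — the scraper now blocks on pageUpdateCh instead of dropping signals via select/default, and the watchdog resets the inactivity timer without the Stop-and-drain dance. The self-contained sketch below shows that watchdog shape; the producer loop is hypothetical, and unlike the diff it guards its send with ctx.Done() so the example always terminates cleanly.

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	pageUpdateCh := make(chan struct{})

	// Hypothetical producer: emits a burst of "page scraped" signals, then goes quiet.
	go func() {
		for i := 0; i < 3; i++ {
			select {
			case pageUpdateCh <- struct{}{}:
			case <-ctx.Done():
				return
			}
			time.Sleep(500 * time.Millisecond)
		}
	}()

	// Watchdog: cancel the crawl once 2 seconds pass without a new page.
	go func() {
		inactivityTimer := time.NewTimer(2 * time.Second)
		defer inactivityTimer.Stop()
		for {
			select {
			case <-pageUpdateCh:
				inactivityTimer.Reset(2 * time.Second)
			case <-inactivityTimer.C:
				cancel()
				return
			case <-ctx.Done():
				return
			}
		}
	}()

	<-ctx.Done()
	fmt.Println("crawl finished: no new pages for 2 seconds")
}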
28 changes: 6 additions & 22 deletions pkg/component/operator/web/v0/helper.go
@@ -3,7 +3,6 @@ package web
 import (
 	"fmt"
 	"log"
-	"regexp"

 	"github.com/PuerkitoBio/goquery"
 )

@@ -68,27 +67,12 @@ func getRemovedTagsHTML[T scrapeInput](doc *goquery.Document, input T) string {
 }

 // targetLink filters the URL based on the filter
-func targetLink(link string, filter Filter) bool {
-
-	if len(filter.ExcludePatterns) == 0 && len(filter.IncludePatterns) == 0 {
-		return true
-	}
-
-	for _, pattern := range filter.ExcludePatterns {
-		if match, _ := regexp.MatchString(pattern, link); match {
-			return false
-		}
-	}
-
-	if len(filter.IncludePatterns) == 0 {
-		return true
-	}
-
-	for _, pattern := range filter.IncludePatterns {
-		if match, _ := regexp.MatchString(pattern, link); match {
-			return true
-		}
-	}
-
-	return false
+func targetLink(link string, f filter) bool {
+	if f.excludeRegex != nil && f.excludeRegex.MatchString(link) {
+		return false
+	}
+	if f.includeRegex != nil && !f.includeRegex.MatchString(link) {
+		return false
+	}
+	return true
 }
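Editor's note: the rewritten targetLink reduces to two guard clauses over the pre-compiled regexes — exclude vetoes first, then include must match when set — and a nil (unset) pattern never vetoes, preserving the old "empty filter crawls everything" behavior. A standalone sketch of the same logic (crawlFilter stands in for the package's unexported filter type; the patterns are hypothetical):

package main

import (
	"fmt"
	"regexp"
)

// crawlFilter is a stand-in for the component's unexported filter struct.
type crawlFilter struct {
	excludeRegex *regexp.Regexp
	includeRegex *regexp.Regexp
}

// targetLink mirrors the rewritten helper: exclude wins first, then include
// must match if set; a nil (unset) pattern never vetoes a link.
func targetLink(link string, f crawlFilter) bool {
	if f.excludeRegex != nil && f.excludeRegex.MatchString(link) {
		return false
	}
	if f.includeRegex != nil && !f.includeRegex.MatchString(link) {
		return false
	}
	return true
}

func main() {
	f := crawlFilter{
		excludeRegex: regexp.MustCompile(`\?replytocom=`), // hypothetical: skip comment permalinks
		includeRegex: regexp.MustCompile(`^https://example\.com/`),
	}
	for _, link := range []string{
		"https://example.com/docs/intro",         // true: included, not excluded
		"https://example.com/post?replytocom=42", // false: exclude matches
		"https://other.com/docs/intro",           // false: include does not match
	} {
		fmt.Println(link, targetLink(link, f))
	}
}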