Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"Dynamic Scope" #915

Open
wants to merge 9 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 55 additions & 46 deletions cmd/katana/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,85 +15,67 @@
"github.com/projectdiscovery/katana/internal/runner"
"github.com/projectdiscovery/katana/pkg/output"
"github.com/projectdiscovery/katana/pkg/types"
"github.com/projectdiscovery/katana/pkg/tfidf"
errorutil "github.com/projectdiscovery/utils/errors"
fileutil "github.com/projectdiscovery/utils/file"
folderutil "github.com/projectdiscovery/utils/folder"
"github.com/rs/xid"
)

var (
cfgFile string
options = &types.Options{}
cfgFile string
options = &types.Options{}
useDynamicScope bool
tfidfModel *tfidf.TfIdf

Check failure on line 29 in cmd/katana/main.go

View workflow job for this annotation

GitHub Actions / Lint Test

var `tfidfModel` is unused (unused)
similarityThreshold float64 = 0.7

Check failure on line 30 in cmd/katana/main.go

View workflow job for this annotation

GitHub Actions / Lint Test

var `similarityThreshold` is unused (unused)
)

func main() {
flagSet, err := readFlags()
if err != nil {
gologger.Fatal().Msgf("Could not read flags: %s\n", err)
}
handleError("Could not read flags", err)

if options.HealthCheck {
gologger.Print().Msgf("%s\n", runner.DoHealthCheck(options, flagSet))
os.Exit(0)
}

katanaRunner, err := runner.New(options)
if err != nil || katanaRunner == nil {
if options.Version {
return
}
gologger.Fatal().Msgf("could not create runner: %s\n", err)
// Initialize the TF-IDF model if dynamic scoping is enabled
if useDynamicScope {
tfidfModel = tfidf.New()
}

katanaRunner, err := runner.New(options)
handleError("could not create runner", err)
defer katanaRunner.Close()

// close handler
resumeFilename := defaultResumeFilename()
go func() {
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt, syscall.SIGTERM)
for range c {
gologger.DefaultLogger.Info().Msg("- Ctrl+C pressed in Terminal")
katanaRunner.Close()

gologger.Info().Msgf("Creating resume file: %s\n", resumeFilename)
err := katanaRunner.SaveState(resumeFilename)
if err != nil {
gologger.Error().Msgf("Couldn't create resume file: %s\n", err)
}

os.Exit(0)
}
}()

if err := katanaRunner.ExecuteCrawling(); err != nil {
gologger.Fatal().Msgf("could not execute crawling: %s", err)
}
setupCloseHandler(katanaRunner, resumeFilename)

// on successful execution:
err = katanaRunner.ExecuteCrawling()
handleError("could not execute crawling", err)

// deduplicate the lines in each file in the store-field-dir
//use options.StoreFieldDir once https://github.com/projectdiscovery/katana/pull/877 is merged
// Deduplicate lines in each file in the store-field-dir
storeFieldDir := "katana_field"
_ = folderutil.DedupeLinesInFiles(storeFieldDir)

// remove the resume file in case it exists
// Remove the resume file if it exists
if fileutil.FileExists(resumeFilename) {
os.Remove(resumeFilename)
}

}

func readFlags() (*goflags.FlagSet, error) {
flagSet := goflags.NewFlagSet()
flagSet.SetDescription(`Katana is a fast crawler focused on execution in automation
pipelines offering both headless and non-headless crawling.`)
flagSet.SetDescription(`Katana is a fast crawler focused on execution in automation pipelines offering both headless and non-headless crawling.`)

// Input group
flagSet.CreateGroup("input", "Input",
flagSet.StringSliceVarP(&options.URLs, "list", "u", nil, "target url / list to crawl", goflags.FileCommaSeparatedStringSliceOptions),
flagSet.StringVar(&options.Resume, "resume", "", "resume scan using resume.cfg"),
flagSet.StringSliceVarP(&options.Exclude, "exclude", "e", nil, "exclude host matching specified filter ('cdn', 'private-ips', cidr, ip, regex)", goflags.CommaSeparatedStringSliceOptions),
)

// Configuration group
flagSet.CreateGroup("config", "Configuration",
flagSet.StringSliceVarP(&options.Resolvers, "resolvers", "r", nil, "list of custom resolver (file or comma separated)", goflags.FileCommaSeparatedStringSliceOptions),
flagSet.IntVarP(&options.MaxDepth, "depth", "d", 3, "maximum depth to crawl"),
Expand All @@ -115,13 +97,16 @@
flagSet.BoolVarP(&options.IgnoreQueryParams, "ignore-query-params", "iqp", false, "Ignore crawling same path with different query-param values"),
flagSet.BoolVarP(&options.TlsImpersonate, "tls-impersonate", "tlsi", false, "enable experimental client hello (ja3) tls randomization"),
flagSet.BoolVarP(&options.DisableRedirects, "disable-redirects", "dr", false, "disable following redirects (default false)"),
flagSet.BoolVarP(&options.UseDynamicScope, "use-dynamic-scope", "uds", false, "Use dynamic scoping to avoid crawling similar pages"),
)

// Debug group
flagSet.CreateGroup("debug", "Debug",
flagSet.BoolVarP(&options.HealthCheck, "hc", "health-check", false, "run diagnostic check up"),
flagSet.StringVarP(&options.ErrorLogFile, "error-log", "elog", "", "file to write sent requests error log"),
)

// Headless group
flagSet.CreateGroup("headless", "Headless",
flagSet.BoolVarP(&options.Headless, "headless", "hl", false, "enable headless hybrid crawling (experimental)"),
flagSet.BoolVarP(&options.UseInstalledChrome, "system-chrome", "sc", false, "use local installed chrome browser instead of katana installed"),
Expand All @@ -135,6 +120,7 @@
flagSet.BoolVarP(&options.XhrExtraction, "xhr-extraction", "xhr", false, "extract xhr request url,method in jsonl output"),
)

// Scope group
flagSet.CreateGroup("scope", "Scope",
flagSet.StringSliceVarP(&options.Scope, "crawl-scope", "cs", nil, "in scope url regex to be followed by crawler", goflags.FileCommaSeparatedStringSliceOptions),
flagSet.StringSliceVarP(&options.OutOfScope, "crawl-out-scope", "cos", nil, "out of scope url regex to be excluded by crawler", goflags.FileCommaSeparatedStringSliceOptions),
Expand All @@ -155,6 +141,7 @@
flagSet.StringVarP(&options.OutputFilterCondition, "filter-condition", "fdc", "", "filter response with dsl based condition"),
)

// Rate-Limit group
flagSet.CreateGroup("ratelimit", "Rate-Limit",
flagSet.IntVarP(&options.Concurrency, "concurrency", "c", 10, "number of concurrent fetchers to use"),
flagSet.IntVarP(&options.Parallelism, "parallelism", "p", 10, "number of concurrent inputs to process"),
Expand All @@ -163,11 +150,13 @@
flagSet.IntVarP(&options.RateLimitMinute, "rate-limit-minute", "rlm", 0, "maximum number of requests to send per minute"),
)

// Update group
flagSet.CreateGroup("update", "Update",
flagSet.CallbackVarP(runner.GetUpdateCallback(), "update", "up", "update katana to latest version"),
flagSet.BoolVarP(&options.DisableUpdateCheck, "disable-update-check", "duc", false, "disable automatic katana update check"),
)

// Output group
flagSet.CreateGroup("output", "Output",
flagSet.StringVarP(&options.OutputFile, "output", "o", "", "file to write output to"),
flagSet.BoolVarP(&options.StoreResponse, "store-response", "sr", false, "store http requests/responses"),
Expand Down Expand Up @@ -207,23 +196,43 @@

func defaultResumeFilename() string {
homedir, err := os.UserHomeDir()
if err != nil {
gologger.Fatal().Msgf("could not get home directory: %s", err)
}
handleError("could not get home directory", err)
configDir := filepath.Join(homedir, ".config", "katana")
return filepath.Join(configDir, fmt.Sprintf("resume-%s.cfg", xid.New().String()))
}

// setupCloseHandler installs a SIGINT/SIGTERM handler that gracefully
// shuts the runner down, persists the crawl state to resumeFilename so
// the scan can be resumed later, and then exits the process.
//
// The previous doc comment here described cleanupOldResumeFiles and was
// attached to the wrong function; it has been corrected.
func setupCloseHandler(runner *runner.Runner, resumeFilename string) {
	go func() {
		c := make(chan os.Signal, 1)
		signal.Notify(c, os.Interrupt, syscall.SIGTERM)
		for range c {
			gologger.DefaultLogger.Info().Msg("- Ctrl+C pressed in Terminal")
			runner.Close()

			gologger.Info().Msgf("Creating resume file: %s\n", resumeFilename)
			err := runner.SaveState(resumeFilename)
			if err != nil {
				gologger.Error().Msgf("Couldn't create resume file: %s\n", err)
			}

			os.Exit(0)
		}
	}()
}

func cleanupOldResumeFiles() {
homedir, err := os.UserHomeDir()
if err != nil {
gologger.Fatal().Msgf("could not get home directory: %s", err)
}
handleError("could not get home directory", err)
root := filepath.Join(homedir, ".config", "katana")
filter := fileutil.FileFilters{
OlderThan: 24 * time.Hour * 10, // cleanup on the 10th day
Prefix: "resume-",
}
_ = fileutil.DeleteFilesOlderThan(root, filter)
}

// handleError logs message together with err at fatal level (terminating
// the process) when err is non-nil; a nil error is a no-op.
func handleError(message string, err error) {
	if err == nil {
		return
	}
	gologger.Fatal().Msgf("%s: %s\n", message, err)
}
3 changes: 3 additions & 0 deletions internal/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/katana/pkg/engine"
"github.com/projectdiscovery/katana/pkg/engine/dynamic" // Import the dynamic package
"github.com/projectdiscovery/katana/pkg/engine/hybrid"
"github.com/projectdiscovery/katana/pkg/engine/parser"
"github.com/projectdiscovery/katana/pkg/engine/standard"
Expand Down Expand Up @@ -95,6 +96,8 @@ func New(options *types.Options) (*Runner, error) {
var crawler engine.Engine

switch {
case options.UseDynamicScope: // Add this case for dynamic scoping
crawler, err = dynamic.New(crawlerOptions)
case options.Headless:
crawler, err = hybrid.New(crawlerOptions)
default:
Expand Down
118 changes: 118 additions & 0 deletions pkg/engine/dynamic/crawl.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package dynamic

import (
"bytes"
"context"
"io"
"net/http"
"net/http/httputil"
"net/url"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/projectdiscovery/katana/pkg/engine/common"
"github.com/projectdiscovery/katana/pkg/navigation"
"github.com/projectdiscovery/katana/pkg/utils"
"github.com/projectdiscovery/retryablehttp-go"
errorutil "github.com/projectdiscovery/utils/errors"
mapsutil "github.com/projectdiscovery/utils/maps"
)

// makeRequest executes a single crawl request described by request within
// the crawl session s and converts the HTTP exchange into a
// *navigation.Response. Pages whose content fails the unique-content
// filter, or whose TF-IDF similarity score against previously seen
// documents exceeds similarityThreshold (dynamic scoping), are discarded
// by returning an empty, non-nil response with a nil error.
func (c *Crawler) makeRequest(s *common.CrawlSession, request *navigation.Request) (*navigation.Response, error) {
	response := &navigation.Response{
		Depth:        request.Depth + 1,
		RootHostname: s.Hostname,
	}
	// Carry the current crawl depth through the context for downstream
	// consumers keyed on navigation.Depth{}.
	ctx := context.WithValue(s.Ctx, navigation.Depth{}, request.Depth)
	httpReq, err := http.NewRequestWithContext(ctx, request.Method, request.URL, nil)
	if err != nil {
		return response, err
	}
	if request.Body != "" && request.Method != "GET" {
		// NOTE(review): assigning Body directly (instead of passing the
		// reader to NewRequestWithContext) leaves httpReq.ContentLength
		// unset — confirm target servers tolerate unsized/chunked bodies.
		httpReq.Body = io.NopCloser(strings.NewReader(request.Body))
	}
	req, err := retryablehttp.FromRequest(httpReq)
	if err != nil {
		return response, err
	}
	req.Header.Set("User-Agent", utils.WebUserAgent())

	// Apply per-request headers first, then crawler-wide headers; on a key
	// collision the crawler-wide value wins. A "Host" header must also be
	// mirrored into req.Host for it to affect the request actually sent.
	for k, v := range request.Headers {
		req.Header.Set(k, v)
		if k == "Host" {
			req.Host = v
		}
	}
	for k, v := range c.Headers {
		req.Header.Set(k, v)
		if k == "Host" {
			req.Host = v
		}
	}

	resp, err := s.HttpClient.Do(req)
	if resp != nil {
		// Drain and close the body so the underlying connection can be
		// reused; draining is skipped for 101 responses, whose body is a
		// hijacked protocol stream.
		defer func() {
			if resp.Body != nil && resp.StatusCode != http.StatusSwitchingProtocols {
				_, _ = io.Copy(io.Discard, resp.Body)
			}
			_ = resp.Body.Close()
		}()
	}

	// Record the raw request even when Do failed, so error output still
	// carries what was sent.
	rawRequestBytes, _ := req.Dump()
	request.Raw = string(rawRequestBytes)

	if err != nil {
		return response, err
	}
	if resp.StatusCode == http.StatusSwitchingProtocols {
		return response, nil
	}
	// Bound how much of the body is read, per the configured BodyReadSize.
	limitReader := io.LimitReader(resp.Body, int64(c.Options.Options.BodyReadSize))
	data, err := io.ReadAll(limitReader)
	if err != nil {
		return response, err
	}
	// Drop exact-duplicate content before any further processing.
	if !c.Options.UniqueFilter.UniqueContent(data) {
		return &navigation.Response{}, nil
	}

	// Dynamic scoping logic: update the TF-IDF model and check for similarity.
	// NOTE(review): tfidfModel is package-global mutable state shared by all
	// concurrent fetchers and is read (Calculate) and written (AddDocument)
	// here without synchronization — verify tfidf.TfIdf is thread-safe or
	// guard these calls with a mutex.
	words := strings.Fields(string(data))
	scores := tfidfModel.Calculate(words)

	// Skip the page entirely if it is too similar to anything already seen.
	for _, score := range scores {
		if score > similarityThreshold {
			return &navigation.Response{}, nil
		}
	}

	tfidfModel.AddDocument(request.URL, words)

	technologies := c.Options.Wappalyzer.Fingerprint(resp.Header, data)
	response.Technologies = mapsutil.GetKeys(technologies)

	// Re-seed resp.Body with the bytes we consumed so DumpResponse below can
	// include the body.
	resp.Body = io.NopCloser(strings.NewReader(string(data)))

	response.Body = string(data)
	response.Resp = resp
	response.Reader, err = goquery.NewDocumentFromReader(bytes.NewReader(data))
	// NOTE(review): response.Reader is dereferenced before err from
	// NewDocumentFromReader is checked (the check happens further below) —
	// confirm goquery never returns a nil document alongside an error.
	response.Reader.Url, _ = url.Parse(request.URL)
	response.StatusCode = resp.StatusCode
	response.Headers = utils.FlattenHeaders(resp.Header)
	if c.Options.Options.FormExtraction {
		response.Forms = append(response.Forms, utils.ParseFormFields(response.Reader)...)
	}

	// Reflect the truncated (LimitReader) body size in the dumped response.
	resp.ContentLength = int64(len(data))

	rawResponseBytes, _ := httputil.DumpResponse(resp, true)
	response.Raw = string(rawResponseBytes)

	if err != nil {
		return response, errorutil.NewWithTag("dynamic", "could not make document from reader").Wrap(err)
	}

	return response, nil
}
48 changes: 48 additions & 0 deletions pkg/engine/dynamic/dynamic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package dynamic

import (
"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/katana/pkg/engine/common"
"github.com/projectdiscovery/katana/pkg/tfidf" // Import the new tfidf package
"github.com/projectdiscovery/katana/pkg/types"
errorutil "github.com/projectdiscovery/utils/errors"
)

var (
	// tfidfModel is the package-level TF-IDF model shared by every request
	// made through this engine; it is (re)initialized in New.
	// NOTE(review): package-global mutable state — concurrent fetchers and
	// concurrent Crawler instances share and mutate it without
	// synchronization; confirm tfidf.TfIdf is thread-safe or move the model
	// onto the Crawler struct behind a mutex.
	tfidfModel *tfidf.TfIdf
	// similarityThreshold is the TF-IDF score above which a page is treated
	// as a near-duplicate of already-crawled content and skipped.
	similarityThreshold float64 = 0.7
)

// Crawler is a dynamic crawler instance. It embeds the shared crawler
// state (options, filters, HTTP client plumbing) and adds TF-IDF-based
// dynamic scoping on top of standard crawling.
type Crawler struct {
	*common.Shared
}

// New returns a new dynamic crawler instance built on the shared crawler
// state, and (re)initializes the package-level TF-IDF model used for
// dynamic scoping.
func New(options *types.CrawlerOptions) (*Crawler, error) {
	shared, sharedErr := common.NewShared(options)
	if sharedErr != nil {
		return nil, errorutil.NewWithErr(sharedErr).WithTag("dynamic")
	}
	tfidfModel = tfidf.New()
	crawler := &Crawler{Shared: shared}
	return crawler, nil
}

// Close shuts down the crawler process. The dynamic engine holds no
// resources of its own, so this always succeeds.
func (c *Crawler) Close() error {
	return nil
}

// Crawl crawls a single root URL with the dynamic scoping engine,
// returning any session-creation or crawl error tagged "dynamic".
func (c *Crawler) Crawl(rootURL string) error {
	crawlSession, sessionErr := c.NewCrawlSessionWithURL(rootURL)
	if sessionErr != nil {
		return errorutil.NewWithErr(sessionErr).WithTag("dynamic")
	}
	defer crawlSession.CancelFunc()

	gologger.Info().Msgf("Started dynamic crawling for => %v", rootURL)
	if doErr := c.Do(crawlSession, c.makeRequest); doErr != nil {
		return errorutil.NewWithErr(doErr).WithTag("dynamic")
	}
	return nil
}
Loading
Loading