Skip to content

Commit

Permalink
Merge pull request #827 from projectdiscovery/introduce_incremental_c…
Browse files Browse the repository at this point in the history
…rawling

add no-clobber flag
  • Loading branch information
Mzack9999 authored Jun 3, 2024
2 parents a0e85d9 + 0eb6841 commit 3a0a295
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 2 deletions.
1 change: 1 addition & 0 deletions cmd/katana/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.StringVarP(&options.OutputFile, "output", "o", "", "file to write output to"),
flagSet.BoolVarP(&options.StoreResponse, "store-response", "sr", false, "store http requests/responses"),
flagSet.StringVarP(&options.StoreResponseDir, "store-response-dir", "srd", "", "store http requests/responses to custom directory"),
flagSet.BoolVarP(&options.NoClobber, "no-clobber", "ncb", false, "do not overwrite output file"),
flagSet.BoolVarP(&options.OmitRaw, "omit-raw", "or", false, "omit raw requests/responses from jsonl output"),
flagSet.BoolVarP(&options.OmitBody, "omit-body", "ob", false, "omit response body from jsonl output"),
flagSet.BoolVarP(&options.JSON, "jsonl", "j", false, "write output in jsonl format"),
Expand Down
1 change: 1 addition & 0 deletions pkg/output/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ type Options struct {
JSON bool
Verbose bool
StoreResponse bool
NoClobber bool
OmitRaw bool
OmitBody bool
OutputFile string
Expand Down
60 changes: 58 additions & 2 deletions pkg/output/output.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"sync"

Expand All @@ -16,6 +17,7 @@ import (
"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/katana/pkg/utils/extensions"
errorutil "github.com/projectdiscovery/utils/errors"
fileutil "github.com/projectdiscovery/utils/file"
)

const (
Expand Down Expand Up @@ -48,6 +50,7 @@ type StandardWriter struct {
outputMutex *sync.Mutex
storeResponse bool
storeResponseDir string
noClobber bool
omitRaw bool
omitBody bool
errorFile *fileWriter
Expand All @@ -68,6 +71,7 @@ func New(options Options) (Writer, error) {
outputMutex: &sync.Mutex{},
storeResponse: options.StoreResponse,
storeResponseDir: options.StoreResponseDir,
noClobber: options.NoClobber,
omitRaw: options.OmitRaw,
omitBody: options.OmitBody,
matchRegex: options.MatchRegex,
Expand Down Expand Up @@ -117,8 +121,13 @@ func New(options Options) (Writer, error) {
if options.StoreResponseDir != DefaultResponseDir && options.StoreResponseDir != "" {
writer.storeResponseDir = options.StoreResponseDir
}
_ = os.RemoveAll(writer.storeResponseDir)
_ = os.MkdirAll(writer.storeResponseDir, os.ModePerm)
if options.NoClobber {
writer.storeResponseDir = createDirNameNoClobber(writer.storeResponseDir)
_ = os.MkdirAll(writer.storeResponseDir, os.ModePerm)
} else {
removeDirsWithSuffix(writer.storeResponseDir)
_ = os.MkdirAll(writer.storeResponseDir, os.ModePerm)
}
// todo: the index file seems never used?
_, err := newFileOutputWriter(filepath.Join(writer.storeResponseDir, indexFile))
if err != nil {
Expand Down Expand Up @@ -252,6 +261,53 @@ func (w *StandardWriter) Close() error {
return nil
}

// createDirNameNoClobber returns a directory path that does not collide with
// dir or with any of its numbered siblings.
//
// When fileutil.FolderExists reports that dir is not an existing folder, dir
// is returned unchanged. Otherwise the parent directory is listed, every entry
// named "<base><N>" (N being a run of decimal digits) is inspected, and the
// result is "<parent>/<base><highest N + 1>". With no numbered entry present
// the suffix starts at 1.
//
// The function only computes a name; it does not create anything on disk.
func createDirNameNoClobber(dir string) string {
	if !fileutil.FolderExists(dir) {
		return dir
	}

	parentDir, dirName := filepath.Dir(dir), filepath.Base(dir)
	entries, err := os.ReadDir(parentDir)
	if err != nil {
		// Fall back to the path that was requested. Returning only the base
		// name would drop the parent component and relocate the output
		// relative to the current working directory.
		return dir
	}

	highestNum := 0
	regex := regexp.MustCompile(fmt.Sprintf("^%s(\\d+)$", regexp.QuoteMeta(dirName)))
	for _, entry := range entries {
		// Every entry type is considered, not just directories: a regular
		// file already named "<base><N>" would make creating a directory of
		// that name fail, so its number must be skipped as well.
		matches := regex.FindStringSubmatch(entry.Name())
		if matches == nil {
			continue
		}
		// Atoi only fails here when the digit run does not fit in an int;
		// such entries are ignored.
		num, err := strconv.Atoi(matches[1])
		if err != nil {
			continue
		}
		if num > highestNum {
			highestNum = num
		}
	}

	return filepath.Join(parentDir, fmt.Sprintf("%s%d", dirName, highestNum+1))
}

// removeDirsWithSuffix deletes dir together with every sibling directory
// whose name is dir's base name followed by decimal digits (for example
// "resp", "resp1", "resp22").
//
// Only entries that report themselves as directories are removed; other
// entries with a matching name are left in place. Errors from listing the
// parent directory and from the removals themselves are ignored.
func removeDirsWithSuffix(dir string) {
	base := filepath.Base(dir)
	parent := filepath.Dir(dir)

	// The base name is quoted so that characters such as "." are matched
	// literally; "\d*" also accepts the empty suffix, i.e. dir itself.
	matcher := regexp.MustCompile(fmt.Sprintf("^%s(\\d*)$", regexp.QuoteMeta(base)))

	// The listing error is deliberately dropped: whatever entries were
	// returned are processed, and none means nothing is removed.
	siblings, _ := os.ReadDir(parent)
	for _, sibling := range siblings {
		if !sibling.IsDir() || !matcher.MatchString(sibling.Name()) {
			continue
		}
		_ = os.RemoveAll(filepath.Join(parent, sibling.Name()))
	}
}

// matchOutput checks if the event matches the output regex
func (w *StandardWriter) matchOutput(event *Result) bool {
if w.matchRegex == nil && w.outputMatchCondition == "" {
Expand Down
1 change: 1 addition & 0 deletions pkg/types/crawler_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ func NewCrawlerOptions(options *Options) (*CrawlerOptions, error) {
Fields: options.Fields,
StoreFields: options.StoreFields,
StoreResponseDir: options.StoreResponseDir,
NoClobber: options.NoClobber,
OmitRaw: options.OmitRaw,
OmitBody: options.OmitBody,
FieldConfig: options.FieldConfig,
Expand Down
2 changes: 2 additions & 0 deletions pkg/types/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ type Options struct {
StoreResponse bool
// StoreResponseDir specifies if katana should use a custom directory to store http requests/responses
StoreResponseDir string
// NoClobber specifies if katana should avoid overwriting existing output files
NoClobber bool
// OmitRaw omits raw requests/responses from the output
OmitRaw bool
// OmitBody omits the response body from the output
Expand Down

0 comments on commit 3a0a295

Please sign in to comment.