diff --git a/cmd/katana/main.go b/cmd/katana/main.go
index 4299bd45..3824ea7e 100644
--- a/cmd/katana/main.go
+++ b/cmd/katana/main.go
@@ -131,6 +131,7 @@ pipelines offering both headless and non-headless crawling.`)
 		flagSet.BoolVar(&options.Silent, "silent", false, "display output only"),
 		flagSet.BoolVarP(&options.Verbose, "verbose", "v", false, "display verbose output"),
 		flagSet.BoolVar(&options.Version, "version", false, "display project version"),
+		flagSet.StringVarP(&options.OutputGraph, "output-graph", "og", "", "graph folder (one graph per URL will be created)"),
 	)
 
 	if err := flagSet.Parse(); err != nil {
diff --git a/go.mod b/go.mod
index eed2d334..40193f73 100644
--- a/go.mod
+++ b/go.mod
@@ -4,10 +4,12 @@ go 1.18
 
 require (
 	github.com/PuerkitoBio/goquery v1.8.0
+	github.com/dominikbraun/graph v0.14.0
 	github.com/go-rod/rod v0.112.3
 	github.com/json-iterator/go v1.1.12
 	github.com/logrusorgru/aurora v2.0.3+incompatible
 	github.com/lukasbob/srcset v0.0.0-20190730101422-86b742e617f3
+	github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6
 	github.com/pkg/errors v0.9.1
 	github.com/projectdiscovery/fastdialer v0.0.20
 	github.com/projectdiscovery/goflags v0.1.6
diff --git a/go.sum b/go.sum
index 4e390adf..90cf3bdf 100644
--- a/go.sum
+++ b/go.sum
@@ -19,6 +19,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi/U=
 github.com/dimchansky/utfbom v1.1.1/go.mod h1:SxdoEBH5qIqFocHMyGOXVAybYJdr71b1Q/j0mACtrfE=
+github.com/dominikbraun/graph v0.14.0 h1:Q1q7OQIKMPDQVNkwRhWQ5BUxCGM1tkcISH5sY6yNj+8=
+github.com/dominikbraun/graph v0.14.0/go.mod h1:yOjYyogZLY1LSG9E33JWZJiq5k83Qy2C6POAuiViluc=
 github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
 github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
 github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
@@ -77,6 +79,8 @@ github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ
 github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
 github.com/lukasbob/srcset v0.0.0-20190730101422-86b742e617f3 h1:l1rIRmxNhzeQM+qA3D0CsDLo0Hx45q9JmK0BlCjt6Ks=
 github.com/lukasbob/srcset v0.0.0-20190730101422-86b742e617f3/go.mod h1:j16TYl5p17+vBMyaL6Nu4ojlOnfX8lc2k2cfmw6m5TQ=
+github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 h1:bjfMeqxWEJ6IRUvGkiTkSwx0a6UdQJsbirRSoXogteY=
+github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6/go.mod h1:WVJJvUw/pIOcwu2O8ZzHEhmigq2jzwRNfJVRMJB7bR8=
 github.com/mholt/archiver v3.1.1+incompatible h1:1dCVxuqs0dJseYEhi5pl7MYPH9zDa1wBi7mF09cbNkU=
 github.com/mholt/archiver v3.1.1+incompatible/go.mod h1:Dh2dOXnSdiLxRiPoVfIr/fI1TwETms9B8CTWfeh7ROU=
 github.com/microcosm-cc/bluemonday v1.0.21 h1:dNH3e4PSyE4vNX+KlRGHT5KrSvjeUkoNPwEORjffHJg=
diff --git a/pkg/engine/hybrid/crawl.go b/pkg/engine/hybrid/crawl.go
index a98c55ca..08de0283 100644
--- a/pkg/engine/hybrid/crawl.go
+++ b/pkg/engine/hybrid/crawl.go
@@ -2,7 +2,6 @@ package hybrid
 
 import (
     "bytes"
-    "context"
     "io"
     "net/http"
     "net/url"
@@ -14,13 +13,10 @@ import (
     "github.com/go-rod/rod/lib/proto"
     "github.com/pkg/errors"
     "github.com/projectdiscovery/gologger"
-
"github.com/projectdiscovery/katana/pkg/engine/parser" "github.com/projectdiscovery/katana/pkg/navigation" - "github.com/projectdiscovery/katana/pkg/utils/queue" - "github.com/projectdiscovery/retryablehttp-go" ) -func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp.Client, queue *queue.VarietyQueue, parseResponseCallback func(nr navigation.Request), browser *rod.Browser, request navigation.Request, rootHostname string) (*navigation.Response, error) { +func (c *Crawler) navigateRequest(parseResponseCallback func(nr navigation.Request), browser *rod.Browser, request navigation.Request, rootHostname string, crawlerGraph *navigation.Graph) ([]*navigation.Response, error) { depth := request.Depth + 1 response := &navigation.Response{ Depth: depth, @@ -34,6 +30,7 @@ func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp } defer page.Close() + var asyncronousResponses []*navigation.Response pageRouter := NewHijack(page) pageRouter.SetPattern(&proto.FetchRequestPattern{ URLPattern: "*", @@ -66,7 +63,7 @@ func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp } bodyReader, _ := goquery.NewDocumentFromReader(bytes.NewReader(body)) - resp := navigation.Response{ + resp := &navigation.Response{ Resp: httpresp, Body: []byte(body), Reader: bodyReader, @@ -74,10 +71,12 @@ func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp Depth: depth, RootHostname: rootHostname, } - _ = resp + + asyncronousResponses = append(asyncronousResponses, resp) // process the raw response - parser.ParseResponse(resp, parseResponseCallback) + // parser.ParseResponse(*resp, parseResponseCallback) + return FetchContinueRequest(page, e) })() //nolint defer func() { @@ -125,26 +124,26 @@ func (c *Crawler) navigateRequest(ctx context.Context, httpclient *retryablehttp response.Resp = &http.Response{Header: make(http.Header), Request: &http.Request{URL: parsed}} // Create a copy of intrapolated shadow DOM elements and parse them separately - responseCopy := *response - responseCopy.Body = []byte(builder.String()) - if !c.options.UniqueFilter.UniqueContent(responseCopy.Body) { - return &navigation.Response{}, nil + responseShadowDom := *response + responseShadowDom.Body = []byte(builder.String()) + if !c.options.UniqueFilter.UniqueContent(responseShadowDom.Body) { + return nil, nil } - responseCopy.Reader, _ = goquery.NewDocumentFromReader(bytes.NewReader(responseCopy.Body)) - if responseCopy.Reader != nil { - parser.ParseResponse(responseCopy, parseResponseCallback) - } + responseShadowDom.Reader, _ = goquery.NewDocumentFromReader(bytes.NewReader(responseShadowDom.Body)) response.Body = []byte(body) if !c.options.UniqueFilter.UniqueContent(response.Body) { - return &navigation.Response{}, nil + return nil, nil } response.Reader, err = goquery.NewDocumentFromReader(bytes.NewReader(response.Body)) if err != nil { return nil, errors.Wrap(err, "could not parse html") } - return response, nil + + responses := []*navigation.Response{response, &responseShadowDom} + + return append(responses, asyncronousResponses...), nil } // traverseDOMNode performs traversal of node completely building a pseudo-HTML diff --git a/pkg/engine/hybrid/hybrid.go b/pkg/engine/hybrid/hybrid.go index cac6e4e1..41ce6766 100644 --- a/pkg/engine/hybrid/hybrid.go +++ b/pkg/engine/hybrid/hybrid.go @@ -1,17 +1,15 @@ package hybrid import ( - "bytes" "context" "fmt" - "io" "net/http" "net/url" "os" + "path/filepath" "sync/atomic" "time" - 
"github.com/PuerkitoBio/goquery" "github.com/go-rod/rod" "github.com/go-rod/rod/lib/launcher" "github.com/go-rod/rod/lib/launcher/flags" @@ -186,6 +184,15 @@ func (c *Crawler) Crawl(rootURL string) error { } } + var crawlerGraph *navigation.Graph + if c.options.Options.OutputGraph != "" { + var err error + crawlerGraph, err = navigation.NewGraph() + if err != nil { + return err + } + } + wg := sizedwaitgroup.New(c.options.Options.Concurrency) running := int32(0) for { @@ -217,20 +224,45 @@ func (c *Crawler) Crawl(rootURL string) error { if c.options.Options.Delay > 0 { time.Sleep(time.Duration(c.options.Options.Delay) * time.Second) } - resp, err := c.navigateRequest(ctx, httpclient, queue, parseResponseCallback, newBrowser, req, hostname) + + // responses contains: + // index 0 => primary syncronous node + // indexes 1..n => secondary asyncronous nodes + responses, err := c.navigateRequest(ctx, httpclient, queue, parseResponseCallback, incognitoBrowser, req, hostname, crawlerGraph) if err != nil { gologger.Warning().Msgf("Could not request seed URL: %s\n", err) return } - if resp == nil || resp.Resp == nil && resp.Reader == nil { - return + + for idx, resp := range responses { + if resp == nil || resp.Resp == nil && resp.Reader == nil { + return + } + + if crawlerGraph != nil { + resp.State, _ = crawlerGraph.AddState(req, *resp, resp.Resp.Request.URL.String()) + // the web state for response zero becomes the root for asyncronous requests + if idx == 0 { + req.State = resp.State + } + } + + // process the dom-rendered response + parser.ParseResponse(*resp, parseResponseCallback) } - // process the dom-rendered response - parser.ParseResponse(*resp, parseResponseCallback) }() } + wg.Wait() + if crawlerGraph != nil { + // use the domain name as filename + outputFile := filepath.Join(c.options.Options.OutputGraph, hostname) + if err := crawlerGraph.ExportTo(outputFile); err != nil { + return err + } + } + return nil } diff --git a/pkg/engine/standard/standard.go b/pkg/engine/standard/standard.go index 676ed291..113c1edd 100644 --- a/pkg/engine/standard/standard.go +++ b/pkg/engine/standard/standard.go @@ -6,6 +6,7 @@ import ( "io" "net/http" "net/url" + "path/filepath" "sync/atomic" "time" @@ -85,6 +86,15 @@ func (c *Crawler) Crawl(rootURL string) error { return errors.Wrap(err, "could not create http client") } + var crawlerGraph *navigation.Graph + if c.options.Options.OutputGraph != "" { + var err error + crawlerGraph, err = navigation.NewGraph() + if err != nil { + return err + } + } + wg := sizedwaitgroup.New(c.options.Options.Concurrency) running := int32(0) for { @@ -124,11 +134,24 @@ func (c *Crawler) Crawl(rootURL string) error { if resp.Resp == nil || resp.Reader == nil { return } + + if crawlerGraph != nil { + resp.State, _ = crawlerGraph.AddState(req, resp, req.URL) + } + parser.ParseResponse(resp, parseResponseCallback) }() } wg.Wait() + if crawlerGraph != nil { + // use the domain name as filename + outputFile := filepath.Join(c.options.Options.OutputGraph, hostname) + if err := crawlerGraph.ExportTo(outputFile); err != nil { + return err + } + } + return nil } diff --git a/pkg/navigation/content.go b/pkg/navigation/content.go new file mode 100644 index 00000000..bae090d3 --- /dev/null +++ b/pkg/navigation/content.go @@ -0,0 +1,70 @@ +package navigation + +import ( + "fmt" +) + +// ContentType represent the nature of page content +type ContentType uint8 + +// Types of content type +const ( + Core ContentType = iota + Dynamic +) + +// TagType represents the tag type 
+type TagType uint8
+
+const (
+    StartTag TagType = iota
+    EndTag
+    SelfClosingTag
+    Doctype
+    Comment
+    Text
+)
+
+func (t TagType) String() string {
+    switch t {
+    case StartTag:
+        return "ST"
+    case EndTag:
+        return "ET"
+    case Doctype:
+        return "D"
+    case Comment:
+        return "C"
+    case Text:
+        return "T"
+    default:
+        return ""
+    }
+}
+
+const (
+    TextHtml string = "text/html"
+)
+
+type Attribute struct {
+    Name      string
+    Value     string
+    Namespace string
+}
+
+type Content struct {
+    TagType    TagType
+    Type       ContentType
+    Data       string
+    Short      string
+    Attributes []Attribute
+}
+
+func (c *Content) IDs() (ids []string) {
+    for _, attribute := range c.Attributes {
+        id := fmt.Sprintf("A:%s:%s", attribute.Name, attribute.Value)
+        ids = append(ids, id)
+    }
+    ids = append(ids, fmt.Sprintf("%s:%s", c.TagType.String(), c.Data))
+    return
+}
diff --git a/pkg/navigation/graph.go b/pkg/navigation/graph.go
new file mode 100644
index 00000000..495d6ffe
--- /dev/null
+++ b/pkg/navigation/graph.go
@@ -0,0 +1,114 @@
+package navigation
+
+import (
+    "fmt"
+
+    "github.com/dominikbraun/graph"
+)
+
+type GraphOption func(g *Graph) error
+
+func WithApproximation(g *Graph) error {
+    g.Approximate = true
+    return nil
+}
+
+type Graph struct {
+    graph       graph.Graph[string, State]
+    data        *GraphData
+    Approximate bool
+}
+
+func NewGraph(graphOptions ...GraphOption) (*Graph, error) {
+    g := &Graph{
+        graph: graph.New(StateHash, graph.Directed()),
+        data:  &GraphData{},
+    }
+
+    for _, graphOption := range graphOptions {
+        if err := graphOption(g); err != nil {
+            return nil, err
+        }
+    }
+
+    return g, nil
+}
+
+func (g *Graph) AddState(req Request, resp Response, name string) (*State, error) {
+    newState, err := g.nearApproximateOrNew(req, resp, name)
+    if err != nil {
+        return nil, err
+    }
+
+    g.data.Vertexes = append(g.data.Vertexes, newState)
+
+    // Color the vertex
+    // Html State  => Green
+    // Static File => Red
+    var color string
+    if ContentTypeIsTextHtml(resp.Resp.Header, resp.Body) {
+        color = "green"
+    } else {
+        color = "red"
+    }
+    if err := g.graph.AddVertex(*newState, graph.VertexAttribute("color", color)); err != nil {
+        return nil, err
+    }
+
+    // if req.State is nil => this is a root vertex => nothing to do
+    // otherwise we need to create an edge between the previous state and the current one
+    if req.State != nil {
+        properties := make(map[string]string)
+        properties["source"] = req.Source
+        properties["attribute"] = req.Attribute
+        properties["tag"] = req.Tag
+        properties["label"] = fmt.Sprintf("%s\n%s", req.Tag, req.Attribute)
+        edgeProperties := g.toEdgeProperties(properties)
+        if err := g.graph.AddEdge(StateHash(*req.State), StateHash(*newState), edgeProperties...); err != nil {
+            return nil, err
+        }
+        g.data.Edges = append(g.data.Edges, Edge{
+            From:       req.State,
+            To:         newState,
+            Properties: properties,
+        })
+    }
+
+    return newState, nil
+}
+
+func (g *Graph) toEdgeProperties(properties map[string]string) (edgeProperties []func(*graph.EdgeProperties)) {
+    for key, value := range properties {
+        edgeProperties = append(edgeProperties, graph.EdgeAttribute(key, value))
+    }
+    return
+}
+
+func (g *Graph) nearApproximateOrNew(req Request, resp Response, name string) (*State, error) {
+    newState, err := NewState(req, resp, name)
+    if err != nil {
+        return nil, err
+    }
+
+    if !g.Approximate {
+        return newState, nil
+    }
+
+    // Check if the current state was already visited previously
+    // using near approximate search (TODO: current linear complexity => binary search?)
+    for _, state := range g.data.Vertexes {
+        // exact match
+        if state.Digest == newState.Digest {
+            return state, nil
+        }
+
+        // simhash proximity
+        similarity := Similarity(newState, state)
+        if similarity >= 94 {
+            return state, nil
+        }
+    }
+
+    return newState, nil
+}
diff --git a/pkg/navigation/graphdb.go b/pkg/navigation/graphdb.go
new file mode 100644
index 00000000..31827c6d
--- /dev/null
+++ b/pkg/navigation/graphdb.go
@@ -0,0 +1,93 @@
+package navigation
+
+import (
+    "encoding/json"
+    "os"
+    "path/filepath"
+
+    "github.com/dominikbraun/graph/draw"
+    "github.com/projectdiscovery/fileutil"
+)
+
+type Edge struct {
+    From       *State
+    To         *State
+    Properties map[string]string
+}
+
+type GraphData struct {
+    Vertexes []*State
+    Edges    []Edge
+}
+
+func (g *Graph) ExportTo(outputFile string) error {
+    basepath := filepath.Dir(outputFile)
+    if !fileutil.FolderExists(basepath) {
+        _ = fileutil.CreateFolder(basepath)
+    }
+
+    if err := g.ExportToDotFile(outputFile + ".dot"); err != nil {
+        return err
+    }
+
+    if err := g.ExportToStructureFile(outputFile); err != nil {
+        return err
+    }
+
+    return nil
+}
+
+func (g *Graph) ExportToDotFile(outputFile string) error {
+    outputGraphFile, err := os.Create(outputFile)
+    if err != nil {
+        return err
+    }
+    defer outputGraphFile.Close()
+
+    return draw.DOT(g.graph, outputGraphFile)
+}
+
+func (g *Graph) ExportToStructureFile(outputFile string) error {
+    outputGraphFile, err := os.Create(outputFile + ".json")
+    if err != nil {
+        return err
+    }
+    defer outputGraphFile.Close()
+
+    data, err := json.Marshal(g.data)
+    if err != nil {
+        return err
+    }
+
+    _, err = outputGraphFile.Write(data)
+
+    return err
+}
+
+func (g *Graph) ImportFromStructureFile(inputFile string) error {
+    f, err := os.Open(inputFile)
+    if err != nil {
+        return err
+    }
+    defer f.Close()
+
+    return json.NewDecoder(f).Decode(&g.data)
+}
+
+func (g *Graph) RefreshStructure(inputFile string) error {
+    // re-add all the vertexes
+    for _, vertex := range g.data.Vertexes {
+        if err := g.graph.AddVertex(*vertex); err != nil {
+            return err
+        }
+    }
+
+    // re-add all the edges
+    for _, edge := range g.data.Edges {
+        if err := g.graph.AddEdge(StateHash(*edge.From), StateHash(*edge.To), g.toEdgeProperties(edge.Properties)...); err != nil {
+            return err
+        }
+    }
+
+    return nil
+}
diff --git a/pkg/navigation/request.go b/pkg/navigation/request.go
index 6a22cab1..d1044fda 100644
--- a/pkg/navigation/request.go
+++ b/pkg/navigation/request.go
@@ -9,6 +9,7 @@ type Depth struct{}
 
 // Request is a navigation request for the crawler
 type Request struct {
+    State        *State
     Method       string
     URL          string
     Body         string
@@ -41,5 +42,15 @@ func (n *Request) RequestURL() string {
 
 // newNavigationRequestURL generates a navigation request from a relative URL
 func NewNavigationRequestURLFromResponse(path, source, tag, attribute string, resp Response) Request {
     requestURL := resp.AbsoluteURL(path)
-    return Request{Method: "GET", URL: requestURL, RootHostname: resp.RootHostname, Depth: resp.Depth, Source: source, Attribute: attribute, Tag: tag}
+    request := Request{
+        Method:       "GET",
+        URL:          requestURL,
+        RootHostname: resp.RootHostname,
+        Depth:        resp.Depth,
+        Source:       source,
+        Attribute:    attribute,
+        Tag:          tag,
+        State:        resp.State,
+    }
+    return request
 }
diff --git a/pkg/navigation/response.go b/pkg/navigation/response.go
index 398995e1..3b86efe9 100644
--- a/pkg/navigation/response.go
+++ b/pkg/navigation/response.go
@@ -11,6 +11,7 @@ import (
 
 // Response is a response generated from crawler navigation
 type Response struct {
+    State        *State
     Resp         *http.Response
     Depth        int
     Reader       *goquery.Document
diff --git a/pkg/navigation/state.go b/pkg/navigation/state.go
new file mode 100644
index 00000000..4ae64da4
--- /dev/null
+++ b/pkg/navigation/state.go
@@ -0,0 +1,205 @@
+package navigation
+
+import (
+    "bytes"
+    "crypto/sha256"
+    "encoding/hex"
+    "math"
+    "math/rand"
+    "net/http"
+
+    "github.com/mfonda/simhash"
+    stringsutil "github.com/projectdiscovery/utils/strings"
+    "golang.org/x/net/html"
+)
+
+const maxFeatures = 10000
+
+// State identifies a unique navigation webapp state that might be reached by many means
+type State struct {
+    Name      string
+    Structure []Content
+    Hash      uint64
+    Digest    string
+    Data      []byte
+}
+
+// NewState calculates a state based only on the web page content
+func NewState(req Request, resp Response, name string) (*State, error) {
+    s := &State{}
+    s.Name = name
+
+    // first we collect the raw material
+    headers := resp.Resp.Header.Clone()
+    if err := s.hash(headers, resp.Body); err != nil {
+        return nil, err
+    }
+    return s, nil
+}
+
+func ContentTypeIsTextHtml(headers http.Header, body []byte) bool {
+    return ContentTypeIs(headers, body, TextHtml)
+}
+
+func ContentTypeIs(headers http.Header, body []byte, contentTypes ...string) bool {
+    contentType := headers.Get("Content-Type")
+    if contentType == "" {
+        contentType = http.DetectContentType(body)
+    }
+    return stringsutil.HasPrefixAny(contentType, contentTypes...)
+}
+
+func (s *State) hash(headers http.Header, body []byte) error {
+    if !ContentTypeIsTextHtml(headers, body) {
+        // static files get a deterministic digest based on content
+        // and a random simhash so they are counted as a unique node
+        s.Hash = s.randomHash(headers, body)
+        s.Digest = s.digest(headers, body)
+        return nil
+    }
+
+    // we need to perform feature engineering: identify, extract and process features from raw material
+    // then create a unique hash of the web state
+
+    // we handle the most common case of HTML and attempt to identify the page structure by considering only the most significant html items
+    var tokenizedContents []Content
+    htmlTokenizer := html.NewTokenizer(bytes.NewReader(body))
+    for {
+        // if the next token is an error we either reached the end of the file or the HTML is malformed
+        if tokenType := htmlTokenizer.Next(); tokenType == html.ErrorToken {
+            break
+        }
+        token := htmlTokenizer.Token()
+        tokenizedContent := Content{
+            Data:       token.Data,
+            Short:      token.String(),
+            Attributes: htmlAttributesToCoreAttributes(token.Attr),
+        }
+        switch token.Type {
+        case html.TextToken:
+            tokenizedContent.TagType = Text
+            tokenizedContent.Type = Core
+        case html.StartTagToken:
+            tokenizedContent.TagType = StartTag
+            tokenizedContent.Type = Core
+        case html.EndTagToken:
+            tokenizedContent.TagType = EndTag
+            tokenizedContent.Type = Core
+        case html.CommentToken:
+            tokenizedContent.TagType = Comment
+            tokenizedContent.Type = Core
+        case html.SelfClosingTagToken:
+            tokenizedContent.TagType = SelfClosingTag
+            tokenizedContent.Type = Core
+        case html.DoctypeToken:
+            tokenizedContent.TagType = Doctype
+            tokenizedContent.Type = Core
+        default:
+            continue
+        }
+        tokenizedContents = append(tokenizedContents, tokenizedContent)
+    }
+
+    // filter out dynamic content
+    filteredContents := filterContent(tokenizedContents)
+
+    // the extracted content will be used to build the vectorized set of weighted features
+    // Note #1: using unitary weight (for now)
+    // Note #2: the weight coefficient should take into account a boost ratio for significant content (e.g. forms) plus frequency (e.g. tf-idf)
+    // Note #3: more weight recommendations at http://www2007.org/papers/paper215.pdf
+    // The hash can then be used to compute the bitwise hamming distance with any other hash; normalized as a similarity:
+    // ≈1: structures can be considered the same
+    // ≈0: structures are different
+    hash, err := fingerprintFeatures(filteredContents, 2)
+    if err != nil {
+        return err
+    }
+
+    s.Hash = hash
+    s.Digest = s.digest(headers, body)
+
+    // During the vectorization process locality information (page structure) tends to be lost,
+    // so we save it for later to compute ordered sequence similarity
+    s.Structure = filteredContents
+    s.Data = body
+
+    return nil
+}
+
+func htmlAttributesToCoreAttributes(htmlAttributes []html.Attribute) (attributes []Attribute) {
+    for _, htmlAttribute := range htmlAttributes {
+        attributes = append(attributes, Attribute{
+            Name:      htmlAttribute.Key,
+            Value:     htmlAttribute.Val,
+            Namespace: htmlAttribute.Namespace,
+        })
+    }
+    return
+}
+
+func filterContent(contents []Content) []Content {
+    var filteredContent []Content
+    for _, content := range contents {
+        // removing dynamic content
+        if content.Type == Dynamic {
+            continue
+        }
+
+        filteredContent = append(filteredContent, content)
+    }
+    return filteredContent
+}
+
+func fingerprintFeatures(contents []Content, shingle int) (uint64, error) {
+    var (
+        simhashVector    simhash.Vector
+        numberOfFeatures uint
+    )
+
+    // sliding window holding the last `shingle` feature ids
+    skgram := make([][]byte, shingle)
+
+content_loop:
+    for _, contentItem := range contents {
+        for _, id := range contentItem.IDs() {
+            if numberOfFeatures >= maxFeatures {
+                break content_loop
+            }
+            // shingled k-gram feature
+            skgram = append(skgram[1:], []byte(id))
+            featureSum := simhash.NewFeature(bytes.Join(skgram, []byte(" "))).Sum()
+
+            for idx := uint8(0); idx < 64; idx++ {
+                bit := ((featureSum >> idx) & 1)
+                if bit == 1 {
+                    simhashVector[idx]++
+                } else {
+                    simhashVector[idx]--
+                }
+            }
+            numberOfFeatures++
+        }
+    }
+
+    return simhash.Fingerprint(simhashVector), nil
+}
+
+// generate a probabilistically distant hash so that the node is classified as unique
+func (s *State) randomHash(headers http.Header, body []byte) uint64 {
+    return rand.Uint64()
+}
+
+func (s *State) digest(headers http.Header, body []byte) string {
+    digest := sha256.Sum256(body)
+    return hex.EncodeToString(digest[:])
+}
+
+func StateHash(s State) string {
+    return s.Name
+    // return fmt.Sprintf("%v", s.Hash)
+}
+
+func Similarity(s1, s2 *State) float64 {
+    hammingDistance := simhash.Compare(s1.Hash, s2.Hash)
+    // normalize the distance in [0-100] range
+    normalizedDistance := float64(hammingDistance) / float64(math.MaxUint8)
+    return 100 - (normalizedDistance * 100)
+}
diff --git a/pkg/navigation/transition.go b/pkg/navigation/transition.go
new file mode 100644
index 00000000..c0285814
--- /dev/null
+++ b/pkg/navigation/transition.go
@@ -0,0 +1,6 @@
+package navigation
+
+// Transition represents the action required to pass from one state to another
+type Transition struct {
+    Type string
+}
diff --git a/pkg/types/options.go b/pkg/types/options.go
index 459c3a07..68cdba7a 100644
--- a/pkg/types/options.go
+++ b/pkg/types/options.go
@@ -89,6 +89,8 @@ type Options struct {
     HeadlessOptionalArguments goflags.StringSlice
     // HeadlessNoSandbox specifies if chrome should be start in --no-sandbox mode
     HeadlessNoSandbox bool
+    // OutputGraph is the folder where graphs are exported in .dot and .json format (one per URL)
+    OutputGraph string
     // SystemChromePath : Specify the chrome binary path for headless crawling
     SystemChromePath string
// OnResult allows callback function on a result
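
Usage note (outside the diff): with the flag wiring above, a run such as `katana -u https://example.com -og ./graphs` (the target and the ./graphs folder are illustrative) should write ./graphs/example.com.dot and ./graphs/example.com.json, one pair per seed URL named after its hostname, and the DOT file can be rendered with Graphviz, e.g. `dot -Tpng ./graphs/example.com.dot -o example.com.png`.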
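
To make the near-approximate state deduplication easier to follow, below is a minimal standalone sketch of the same scoring: the same normalization and the >= 94 threshold used by nearApproximateOrNew in the diff. The page bodies, the word-level feature set, and the upstream github.com/mfonda/simhash import path are illustrative assumptions; the PR itself derives features from tokenized HTML structure via fingerprintFeatures.

    // sketch_similarity.go - standalone sketch, not part of the PR
    package main

    import (
        "fmt"
        "math"

        "github.com/mfonda/simhash"
    )

    // similarity mirrors navigation.Similarity: the bitwise hamming distance between
    // two 64-bit simhashes is normalized against math.MaxUint8 (the range of
    // simhash.Compare's uint8 result) and mapped to a 0-100 score.
    func similarity(h1, h2 uint64) float64 {
        hammingDistance := simhash.Compare(h1, h2)
        normalizedDistance := float64(hammingDistance) / float64(math.MaxUint8)
        return 100 - (normalizedDistance * 100)
    }

    func main() {
        // Two hypothetical page bodies differing only in a small detail.
        pageA := []byte("<html><body><form action=/login><input name=user></form></body></html>")
        pageB := []byte("<html><body><form action=/login><input name=usr></form></body></html>")

        // Word-level features are a simplification of the PR's HTML-token features.
        hashA := simhash.Simhash(simhash.NewWordFeatureSet(pageA))
        hashB := simhash.Simhash(simhash.NewWordFeatureSet(pageB))

        score := similarity(hashA, hashB)
        fmt.Printf("similarity: %.2f\n", score)

        // nearApproximateOrNew treats two states as the same navigation state
        // when the score is >= 94.
        if score >= 94 {
            fmt.Println("states would be merged")
        } else {
            fmt.Println("states would be kept distinct")
        }
    }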