Skip to content

Commit

Permalink
Fixed encoded characters in the subdomain results
Browse files Browse the repository at this point in the history
We use QueryUnescape to unescape the body before finding the subdomains, which avoids unwanted characters in the results. We use it instead of TrimPrefix because TrimPrefix is case-sensitive and we can receive either %2f or %2F.
  • Loading branch information
vzamanillo committed Jul 16, 2020
1 parent 5e9dec7 commit d8ef589
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions pkg/subscraping/sources/commoncrawl/commoncrawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"io"
"io/ioutil"
"net/url"
"strings"

jsoniter "github.com/json-iterator/go"
Expand Down Expand Up @@ -66,8 +67,8 @@ func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Se
}
}

for _, url := range searchIndexes {
further := s.getSubdomains(ctx, url, domain, session, results)
for _, apiURL := range searchIndexes {
further := s.getSubdomains(ctx, apiURL, domain, session, results)
if !further {
break
}
Expand All @@ -83,13 +84,13 @@ func (s *Source) Name() string {
return "commoncrawl"
}

func (s *Source) getSubdomains(ctx context.Context, url string, domain string, session *subscraping.Session, results chan subscraping.Result) bool {
func (s *Source) getSubdomains(ctx context.Context, searchURL string, domain string, session *subscraping.Session, results chan subscraping.Result) bool {
for {
select {
case <-ctx.Done():
return false
default:
resp, err := session.NormalGetWithContext(ctx, fmt.Sprintf("%s?url=*.%s&output=json", url, domain))
resp, err := session.NormalGetWithContext(ctx, fmt.Sprintf("%s?url=*.%s&output=json", searchURL, domain))
if err != nil {
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
return false
Expand All @@ -103,11 +104,10 @@ func (s *Source) getSubdomains(ctx context.Context, url string, domain string, s
}
resp.Body.Close()

src := string(body)
src, _ := url.QueryUnescape(string(body))

for _, subdomain := range session.Extractor.FindAllString(src, -1) {
subdomain = strings.TrimPrefix(subdomain, "25")
subdomain = strings.TrimPrefix(subdomain, "2F")

results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}
}
Expand Down

0 comments on commit d8ef589

Please sign in to comment.