Skip to content

Commit

Permalink
refactor: Revise commoncrawl workings
Browse files (browse the repository at this point in the history)
  • Loading branch information
enenumxela committed Jul 26, 2023
1 parent c7c8615 commit 38cdb8a
Showing 1 changed file with 19 additions and 16 deletions.
35 changes: 19 additions & 16 deletions pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
// Package commoncrawl implements functions to search URLs from commoncrawl.
package commoncrawl

import (
Expand All @@ -13,12 +12,12 @@ import (
"github.com/valyala/fasthttp"
)

type indexesResponse []struct {
type getIndexesResponse []struct {
ID string `json:"id"`
API string `json:"cdx-API"`
}

type response struct {
type getURLsResponse struct {
URL string `json:"url"`
Error string `json:"error"`
}
Expand All @@ -28,59 +27,63 @@ type Source struct{}
func (source *Source) Run(config *sources.Configuration, domain string) (URLsChannel chan sources.URL) {
URLsChannel = make(chan sources.URL)

if config.IncludeSubdomains {
domain = "*." + domain
}

go func() {
defer close(URLsChannel)

var err error

var indexesRes *fasthttp.Response
var getIndexesRes *fasthttp.Response

indexesRes, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json")
getIndexesRes, err = httpclient.SimpleGet("https://index.commoncrawl.org/collinfo.json")
if err != nil {
return
}

var indexesResponseData indexesResponse
var getIndexesResData getIndexesResponse

if err = json.Unmarshal(indexesRes.Body(), &indexesResponseData); err != nil {
if err = json.Unmarshal(getIndexesRes.Body(), &getIndexesResData); err != nil {
return
}

wg := new(sync.WaitGroup)

for _, indexData := range indexesResponseData {
for _, indexData := range getIndexesResData {
wg.Add(1)

go func(API string) {
defer wg.Done()

contentReqHeaders := map[string]string{
getURLsReqHeaders := map[string]string{
"Host": "index.commoncrawl.org",
}

var err error

var contentRes *fasthttp.Response
var getURLsRes *fasthttp.Response

contentRes, err = httpclient.Get(fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url", API, domain), "", contentReqHeaders)
getURLsRes, err = httpclient.Get(fmt.Sprintf("%s?url=%s/*&output=json&fl=url", API, domain), "", getURLsReqHeaders)
if err != nil {
return
}

scanner := bufio.NewScanner(bytes.NewReader(contentRes.Body()))
scanner := bufio.NewScanner(bytes.NewReader(getURLsRes.Body()))

for scanner.Scan() {
var data response
var getURLsResData getURLsResponse

if err = json.Unmarshal(scanner.Bytes(), &data); err != nil {
if err = json.Unmarshal(scanner.Bytes(), &getURLsResData); err != nil {
return
}

if data.Error != "" {
if getURLsResData.Error != "" {
return
}

URL := data.URL
URL := getURLsResData.URL

if !sources.IsInScope(URL, domain, config.IncludeSubdomains) {
return
Expand Down

0 comments on commit 38cdb8a

Please sign in to comment.