Skip to content

Commit

Permalink
refactor: Extract wayback functions
Browse files Browse the repository at this point in the history
enenumxela committed May 17, 2023
1 parent 1c41de9 commit 7c3fcfa
Showing 2 changed files with 111 additions and 57 deletions.
116 changes: 105 additions & 11 deletions pkg/xurlfind3r/sources/wayback/wayback.go
Original file line number Diff line number Diff line change
@@ -3,8 +3,10 @@ package wayback
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"regexp"
"strings"
"sync"

hqratelimiter "github.com/hueristiq/hqgoutils/ratelimiter"
@@ -33,27 +35,21 @@ func (source *Source) Run(config sources.Configuration, domain string) (URLs cha
defer close(waybackURLs)

var (
err error
res *fasthttp.Response
err error
results []string
)

if config.IncludeSubdomains {
domain = "*." + domain
}

limiter.Wait()

reqURL := fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s/*&output=txt&fl=original&collapse=urlkey", domain)

res, err = httpclient.SimpleGet(reqURL)
results, err = getWaybackURLs(domain)
if err != nil {
return
}

scanner := bufio.NewScanner(bytes.NewReader(res.Body()))

for scanner.Scan() {
URL := scanner.Text()
for index := range results {
URL := results[index]
if URL == "" {
continue
}
@@ -87,6 +83,104 @@ func (source *Source) Run(config sources.Configuration, domain string) (URLs cha
return
}

func getWaybackURLs(domain string) (URLs []string, err error) {
URLs = []string{}

var (
res *fasthttp.Response
)

limiter.Wait()

reqURL := fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s/*&output=txt&fl=original&collapse=urlkey", domain)

res, err = httpclient.SimpleGet(reqURL)
if err != nil {
return
}

scanner := bufio.NewScanner(bytes.NewReader(res.Body()))

for scanner.Scan() {
URL := scanner.Text()
if URL == "" {
continue
}

URLs = append(URLs, URL)
}

if err = scanner.Err(); err != nil {
return
}

return
}

// getWaybackSnapshots queries the Wayback Machine CDX API for captures of URL
// and returns them as [timestamp, original] pairs.
//
// The request asks for JSON output limited to the timestamp and original
// fields, collapsed on digest. URL is interpolated verbatim into the query.
//
// The first decoded row is never returned.
// NOTE(review): presumably that row is the CDX field-name header — confirm
// against the CDX server documentation.
//
// snapshots is nil when the request fails, when the response reports a zero
// content length, when the body cannot be decoded, or when the response holds
// no rows beyond the first one. err is non-nil only for a request or decoding
// failure.
func getWaybackSnapshots(URL string) (snapshots [][2]string, err error) {
	var (
		res *fasthttp.Response
	)

	// limiter is declared elsewhere in the package; it is waited on before the
	// request is issued.
	limiter.Wait()

	reqURL := fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=timestamp,original&collapse=digest", URL)

	res, err = httpclient.SimpleGet(reqURL)
	if err != nil {
		return
	}

	if res.Header.ContentLength() == 0 {
		return
	}

	// Decode into a local so that neither a half-decoded slice (on error) nor
	// the leading row (when it is the only row) can reach the caller.
	var rows [][2]string

	if err = json.Unmarshal(res.Body(), &rows); err != nil {
		return
	}

	// Fewer than two rows means there is nothing after the leading row.
	if len(rows) < 2 {
		return
	}

	snapshots = rows[1:]

	return
}

// getWaybackContent downloads the archived body of one snapshot.
//
// snapshot is a [timestamp, original URL] pair; the body is requested from
// https://web.archive.org/web/<timestamp>if_/<URL>.
//
// It returns the response body as a string. An empty body yields an empty
// string and a nil error. When the body contains the "page can't be
// displayed" fingerprint, the body is still returned, together with an error
// whose message is that fingerprint. A failed request returns the request
// error and an empty string.
func getWaybackContent(snapshot [2]string) (content string, err error) {
	// Text looked for in a response body to detect a snapshot that is not
	// available.
	const snapshotNotFoundFingerprint = "This page can't be displayed. Please use the correct URL address to access"

	var (
		timestamp = snapshot[0]
		URL       = snapshot[1]
		res       *fasthttp.Response
	)

	// limiter is declared elsewhere in the package; it is waited on before the
	// request is issued.
	limiter.Wait()

	reqURL := fmt.Sprintf("https://web.archive.org/web/%sif_/%s", timestamp, URL)

	res, err = httpclient.SimpleGet(reqURL)
	if err != nil {
		return
	}

	content = string(res.Body())

	if content == "" {
		return
	}

	if strings.Contains(content, snapshotNotFoundFingerprint) {
		// Use an explicit format verb: the message must never be interpreted
		// as a format string itself.
		err = fmt.Errorf("%s", snapshotNotFoundFingerprint)

		return
	}

	return
}

// Name returns the identifier of this source, "wayback".
func (source *Source) Name() string {
	const name = "wayback"

	return name
}
52 changes: 6 additions & 46 deletions pkg/xurlfind3r/sources/wayback/waybackrobots.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
package wayback

import (
"encoding/json"
"fmt"
"path/filepath"
"regexp"
"strings"
"sync"

hqurl "github.com/hueristiq/hqgoutils/url"
"github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/httpclient"
"github.com/valyala/fasthttp"
)

func parseWaybackRobots(URL string) (URLs chan string) {
@@ -21,35 +17,15 @@ func parseWaybackRobots(URL string) (URLs chan string) {

// retrieve snapshots
var (
err error
res *fasthttp.Response
err error
snapshots [][2]string
)

limiter.Wait()

reqURL := fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest", URL)

res, err = httpclient.SimpleGet(reqURL)
snapshots, err = getWaybackSnapshots(URL)
if err != nil {
return
}

if res.Header.ContentLength() == 0 {
return
}

snapshots := [][2]string{}

if err = json.Unmarshal(res.Body(), &snapshots); err != nil {
return
}

if len(snapshots) < 2 {
return
}

snapshots = snapshots[1:]

// retrieve content
wg := &sync.WaitGroup{}

@@ -60,31 +36,15 @@ func parseWaybackRobots(URL string) (URLs chan string) {
defer wg.Done()

var (
err error
res *fasthttp.Response
err error
content string
)

limiter.Wait()

reqURL := fmt.Sprintf("https://web.archive.org/web/%sif_/%s", row[0], row[1])

res, err = httpclient.SimpleGet(reqURL)
content, err = getWaybackContent(row)
if err != nil {
return
}

content := string(res.Body())

if content == "" {
return
}

contentNotFoundFingerprint := "This page can't be displayed. Please use the correct URL address to access"

if strings.Contains(content, contentNotFoundFingerprint) {
return
}

pattern := regexp.MustCompile(`Disallow:\s?.+`)

disallowed := pattern.FindAllStringSubmatch(content, -1)

0 comments on commit 7c3fcfa

Please sign in to comment.