From 1f07814e73d21133c416fc97f025a035be9c5492 Mon Sep 17 00:00:00 2001 From: Saswata Mukherjee Date: Fri, 11 Jun 2021 13:51:39 +0530 Subject: [PATCH 1/3] Add retries to link check Signed-off-by: Saswata Mukherjee --- pkg/mdformatter/linktransformer/link.go | 28 ++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pkg/mdformatter/linktransformer/link.go b/pkg/mdformatter/linktransformer/link.go index f992750..29abbbf 100644 --- a/pkg/mdformatter/linktransformer/link.go +++ b/pkg/mdformatter/linktransformer/link.go @@ -11,8 +11,10 @@ import ( "path/filepath" "regexp" "sort" + "strconv" "strings" "sync" + "time" "github.com/bwplotka/mdox/pkg/mdformatter" "github.com/efficientgo/tools/core/pkg/merrors" @@ -33,6 +35,7 @@ var ( const ( originalURLKey = "originalURLKey" + retryKey = "retryKey" ) type chain struct { @@ -175,7 +178,30 @@ func NewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir strin v.c.OnError(func(response *colly.Response, err error) { v.rMu.Lock() defer v.rMu.Unlock() - v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible; status code %v", response.Request.URL.String(), response.StatusCode) + retriesStr := response.Ctx.Get(retryKey) + retries, _ := strconv.Atoi(retriesStr) + switch response.StatusCode { + case 429: + if retries <= 2 { + response.Ctx.Put(retryKey, strconv.Itoa(retries+1)) + retryAfter, convErr := strconv.Atoi(response.Headers.Get("Retry-After")) + if convErr == nil { + time.Sleep(time.Duration(retryAfter) * time.Second) + } + + retryErr := response.Request.Retry() + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(retryErr, "%q rate limited even after retry; status code %v", response.Request.URL.String(), response.StatusCode) + } + case 301, 307, 503: + if retries <= 2 { + response.Ctx.Put(retryKey, strconv.Itoa(retries+1)) + + retryErr := response.Request.Retry() + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(retryErr, "%q not accessible even after retry; status code %v", response.Request.URL.String(), response.StatusCode) + } + default: + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible; status code %v", response.Request.URL.String(), response.StatusCode) + } }) return v, nil } From 45dc9560c7619522b610dd8711062d4a28bbe06e Mon Sep 17 00:00:00 2001 From: Saswata Mukherjee Date: Sun, 13 Jun 2021 21:00:28 +0530 Subject: [PATCH 2/3] Implement suggestions Signed-off-by: Saswata Mukherjee --- pkg/mdformatter/linktransformer/link.go | 41 ++++++++++++++++--------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pkg/mdformatter/linktransformer/link.go b/pkg/mdformatter/linktransformer/link.go index 29abbbf..b4d061f 100644 --- a/pkg/mdformatter/linktransformer/link.go +++ b/pkg/mdformatter/linktransformer/link.go @@ -7,6 +7,7 @@ import ( "bufio" "bytes" "io" + "net/http" "os" "path/filepath" "regexp" @@ -34,8 +35,8 @@ var ( ) const ( - originalURLKey = "originalURLKey" - retryKey = "retryKey" + originalURLKey = "originalURLKey" + numberOfRetriesKey = "retryKey" ) type chain struct { @@ -178,26 +179,36 @@ func NewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir strin v.c.OnError(func(response *colly.Response, err error) { v.rMu.Lock() defer v.rMu.Unlock() - retriesStr := response.Ctx.Get(retryKey) + retriesStr := response.Ctx.Get(numberOfRetriesKey) retries, _ := strconv.Atoi(retriesStr) switch response.StatusCode { - case 429: - if retries <= 2 { - response.Ctx.Put(retryKey, strconv.Itoa(retries+1)) + case http.StatusTooManyRequests: + if retries <= 0 { + var retryAfter int + // Retry calls same methods as Visit and makes request with same options. + // So retryKey is incremented here if onError is called again after Retry. By default retries once. + response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1)) retryAfter, convErr := strconv.Atoi(response.Headers.Get("Retry-After")) - if convErr == nil { - time.Sleep(time.Duration(retryAfter) * time.Second) + if convErr != nil { + retryAfter = 1 } + time.Sleep(time.Duration(retryAfter) * time.Second) - retryErr := response.Request.Retry() - v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(retryErr, "%q rate limited even after retry; status code %v", response.Request.URL.String(), response.StatusCode) + if retryErr := response.Request.Retry(); retryErr != nil { + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "remote link retry %v", response.Ctx.Get(originalURLKey)) + break + } + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q rate limited even after retry; status code %v", response.Request.URL.String(), response.StatusCode) } - case 301, 307, 503: - if retries <= 2 { - response.Ctx.Put(retryKey, strconv.Itoa(retries+1)) + case http.StatusMovedPermanently, http.StatusTemporaryRedirect, http.StatusServiceUnavailable: + if retries <= 0 { + response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1)) - retryErr := response.Request.Retry() - v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(retryErr, "%q not accessible even after retry; status code %v", response.Request.URL.String(), response.StatusCode) + if retryErr := response.Request.Retry(); retryErr != nil { + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "remote link retry %v", response.Ctx.Get(originalURLKey)) + break + } + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible even after retry; status code %v", response.Request.URL.String(), response.StatusCode) } default: v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible; status code %v", response.Request.URL.String(), response.StatusCode) From d7e06553a67defd7127b51efd08ae4e059b82b7e Mon Sep 17 00:00:00 2001 From: Saswata Mukherjee Date: Wed, 16 Jun 2021 15:47:07 +0530 Subject: [PATCH 3/3] Add ctx for clean cancellation Signed-off-by: Saswata Mukherjee --- go.mod | 3 +- go.sum | 11 +++-- main.go | 2 +- pkg/mdformatter/linktransformer/link.go | 61 ++++++++++++++----------- 4 files changed, 44 insertions(+), 33 deletions(-) diff --git a/go.mod b/go.mod index 30fa2ae..3cbbc58 100644 --- a/go.mod +++ b/go.mod @@ -4,11 +4,12 @@ go 1.14 require ( github.com/Kunde21/markdownfmt/v2 v2.1.0 + github.com/antchfx/xmlquery v1.3.4 // indirect github.com/efficientgo/tools/core v0.0.0-20210609125236-d73259166f20 github.com/efficientgo/tools/extkingpin v0.0.0-20210609125236-d73259166f20 github.com/go-kit/kit v0.10.0 github.com/gobwas/glob v0.2.3 - github.com/gocolly/colly/v2 v2.1.0 + github.com/gocolly/colly/v2 v2.1.1-0.20201013153555-8252c346cfb0 github.com/gohugoio/hugo v0.74.3 github.com/kr/pretty v0.2.1 // indirect github.com/mattn/go-shellwords v1.0.10 diff --git a/go.sum b/go.sum index 0e7facd..3644993 100644 --- a/go.sum +++ b/go.sum @@ -60,11 +60,13 @@ github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5 github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= -github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4= github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= +github.com/antchfx/xmlquery v1.3.4 h1:RuhsI4AA5Ma4XoXhaAr2VjJxU0Xp0W2zy/f9ZIpsF4s= +github.com/antchfx/xmlquery v1.3.4/go.mod h1:64w0Xesg2sTaawIdNqMB+7qaW/bSqkQm+ssPaCMWNnc= github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= -github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk= github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.1.10 h1:cJ0pOvEdN/WvYXxvRrzQH9x5QWKpzHacYO8qzCcDYAg= +github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= github.com/apache/thrift v0.13.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= @@ -164,8 +166,8 @@ github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= -github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= -github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= +github.com/gocolly/colly/v2 v2.1.1-0.20201013153555-8252c346cfb0 h1:f+kHjWsqjft+/nCpQ6TcV3Lgs4lc+6rvBt2sfL4XsbE= +github.com/gocolly/colly/v2 v2.1.1-0.20201013153555-8252c346cfb0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= github.com/gogo/googleapis v1.1.0/go.mod h1:gf4bu3Q80BeJ6H1S1vYPm8/ELATdvryBaNFGgqEef3s= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= @@ -545,6 +547,7 @@ golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200822124328-c89045814202 h1:VvcQYSHwXgi7W+TpUR6A9g6Up98WAHf3f/ulnJ62IyA= golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= diff --git a/main.go b/main.go index ff90d46..9148166 100644 --- a/main.go +++ b/main.go @@ -156,7 +156,7 @@ This directive runs executable with arguments and put its stderr and stdout outp if err != nil { return err } - v, err := linktransformer.NewValidator(logger, validateConfigContent, anchorDir) + v, err := linktransformer.NewValidator(ctx, logger, validateConfigContent, anchorDir) if err != nil { return err } diff --git a/pkg/mdformatter/linktransformer/link.go b/pkg/mdformatter/linktransformer/link.go index b4d061f..bfca382 100644 --- a/pkg/mdformatter/linktransformer/link.go +++ b/pkg/mdformatter/linktransformer/link.go @@ -6,6 +6,7 @@ package linktransformer import ( "bufio" "bytes" + "context" "io" "net/http" "os" @@ -139,7 +140,7 @@ type futureResult struct { // NewValidator returns mdformatter.LinkTransformer that crawls all links. // TODO(bwplotka): Add optimization and debug modes - this is the main source of latency and pain. -func NewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir string) (mdformatter.LinkTransformer, error) { +func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []byte, anchorDir string) (mdformatter.LinkTransformer, error) { var err error config := Config{} if string(linksValidateConfig) != "" { @@ -154,7 +155,7 @@ func NewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir strin validateConfig: config, localLinks: map[string]*[]string{}, remoteLinks: map[string]error{}, - c: colly.NewCollector(colly.Async()), + c: colly.NewCollector(colly.Async(), colly.StdlibContext(ctx)), destFutures: map[futureKey]*futureResult{}, } // Set very soft limits. @@ -183,33 +184,39 @@ func NewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir strin retries, _ := strconv.Atoi(retriesStr) switch response.StatusCode { case http.StatusTooManyRequests: - if retries <= 0 { - var retryAfter int - // Retry calls same methods as Visit and makes request with same options. - // So retryKey is incremented here if onError is called again after Retry. By default retries once. - response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1)) - retryAfter, convErr := strconv.Atoi(response.Headers.Get("Retry-After")) - if convErr != nil { - retryAfter = 1 - } - time.Sleep(time.Duration(retryAfter) * time.Second) - - if retryErr := response.Request.Retry(); retryErr != nil { - v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "remote link retry %v", response.Ctx.Get(originalURLKey)) - break - } - v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q rate limited even after retry; status code %v", response.Request.URL.String(), response.StatusCode) + if retries > 0 { + break } + var retryAfterSeconds int + // Retry calls same methods as Visit and makes request with same options. + // So retryKey is incremented here if onError is called again after Retry. By default retries once. + response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1)) + retryAfterSeconds, convErr := strconv.Atoi(response.Headers.Get("Retry-After")) + if convErr != nil { + retryAfterSeconds = 1 + } + select { + case <-time.After(time.Duration(retryAfterSeconds) * time.Second): + case <-v.c.Context.Done(): + return + } + + if retryErr := response.Request.Retry(); retryErr != nil { + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "remote link retry %v", response.Ctx.Get(originalURLKey)) + break + } + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q rate limited even after retry; status code %v", response.Request.URL.String(), response.StatusCode) case http.StatusMovedPermanently, http.StatusTemporaryRedirect, http.StatusServiceUnavailable: - if retries <= 0 { - response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1)) - - if retryErr := response.Request.Retry(); retryErr != nil { - v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "remote link retry %v", response.Ctx.Get(originalURLKey)) - break - } - v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible even after retry; status code %v", response.Request.URL.String(), response.StatusCode) + if retries > 0 { + break + } + response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1)) + + if retryErr := response.Request.Retry(); retryErr != nil { + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "remote link retry %v", response.Ctx.Get(originalURLKey)) + break } + v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible even after retry; status code %v", response.Request.URL.String(), response.StatusCode) default: v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible; status code %v", response.Request.URL.String(), response.StatusCode) } @@ -219,7 +226,7 @@ func NewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir strin // MustNewValidator returns mdformatter.LinkTransformer that crawls all links. func MustNewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir string) mdformatter.LinkTransformer { - v, err := NewValidator(logger, linksValidateConfig, anchorDir) + v, err := NewValidator(context.TODO(), logger, linksValidateConfig, anchorDir) if err != nil { panic(err) }