From e78e511bf1037d46f05d0c73947e1189826bd64c Mon Sep 17 00:00:00 2001 From: Massimiliano Giovagnoli Date: Wed, 29 Mar 2023 21:21:48 +0200 Subject: [PATCH] initial commit Signed-off-by: Massimiliano Giovagnoli --- .github/workflows/integration.yaml | 20 ++ .github/workflows/release.yaml | 48 +++++ .gitignore | 3 + .golangci.yml | 44 ++++ .goreleaser.yaml | 78 +++++++ Dockerfile | 4 + Makefile | 80 +++++++ README.md | 32 +++ cmd/find/find.go | 105 +++++++++ doc/doc.go | 12 ++ doc/wfind.md | 19 ++ go.mod | 41 ++++ go.sum | 107 ++++++++++ internal/output/output.go | 37 ++++ main.go | 23 ++ pkg/find/constants.go | 14 ++ pkg/find/find.go | 329 +++++++++++++++++++++++++++++ pkg/find/find_test.go | 250 ++++++++++++++++++++++ pkg/find/utils.go | 23 ++ 19 files changed, 1269 insertions(+) create mode 100644 .github/workflows/integration.yaml create mode 100644 .github/workflows/release.yaml create mode 100644 .gitignore create mode 100644 .golangci.yml create mode 100644 .goreleaser.yaml create mode 100644 Dockerfile create mode 100644 Makefile create mode 100644 README.md create mode 100644 cmd/find/find.go create mode 100644 doc/doc.go create mode 100644 doc/wfind.md create mode 100644 go.mod create mode 100644 go.sum create mode 100644 internal/output/output.go create mode 100644 main.go create mode 100644 pkg/find/constants.go create mode 100644 pkg/find/find.go create mode 100644 pkg/find/find_test.go create mode 100644 pkg/find/utils.go diff --git a/.github/workflows/integration.yaml b/.github/workflows/integration.yaml new file mode 100644 index 0000000..9cc46ff --- /dev/null +++ b/.github/workflows/integration.yaml @@ -0,0 +1,20 @@ +name: Integration + +on: + push: + branches: [ "*" ] + pull_request: + branches: [ "*" ] + +jobs: + golangci: + name: lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Run golangci-lint + uses: golangci/golangci-lint-action@v3.2.0 + with: + version: v1.50.0 + only-new-issues: false + args: --config .golangci.yml --timeout=5m diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 0000000..f16fc8c --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,48 @@ +name: Release + +on: + push: + tags: + - v* + +permissions: + contents: write # needed to write releases + id-token: write # needed for keyless signing + packages: write # needed for ghcr access + +jobs: + goreleaser: + runs-on: ubuntu-22.04 + steps: + + - name: Checkout + uses: actions/checkout@2541b1294d2704b0964813337f33b291d3f8596b #v3.0.2 + with: + fetch-depth: 0 + + - name: Fetch + run: git fetch --prune --force --tags + + - name: Setup Go + uses: actions/setup-go@84cbf8094393cdc5fe1fe1671ff2647332956b1a #v3.2.1 + with: + go-version: '1.20' + + - uses: sigstore/cosign-installer@c3667d99424e7e6047999fb6246c0da843953c65 #v3.0.1 + + - uses: anchore/sbom-action/download-syft@448520c4f19577ffce70a8317e619089054687e3 #v0.13.4 + + - uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Publish release + uses: goreleaser/goreleaser-action@f82d6c1c344bcacabba2c841718984797f664a6b #4.2.0 + with: + version: latest + args: release --clean + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..62d011f --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.idea +.*.swp +/wfind diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..f2dec27 --- 
/dev/null +++ b/.golangci.yml @@ -0,0 +1,44 @@ +linters-settings: + govet: + check-shadowing: true + maligned: + suggest-new: true + goconst: + min-len: 2 + min-occurrences: 3 + gci: + sections: + - standard # Captures all standard packages if they do not match another section. + - default # Contains all imports that could not be matched to another section type. + - prefix(github.com/maxgio92/wfind) # Groups all imports with the specified Prefix. + tagliatelle: + case: + rules: + json: snake + +linters: + enable-all: true + disable: + - interfacer + - godox + - golint + - scopelint + - maligned + - gochecknoglobals + - gochecknoinits + - exhaustivestruct + - exhaustruct + - ireturn + - lll + - nonamedreturns + - wrapcheck + - varnamelen + +issues: + exclude-rules: + - path: / + linters: + - typecheck + +run: + skip-dirs: [] diff --git a/.goreleaser.yaml b/.goreleaser.yaml new file mode 100644 index 0000000..0d7d710 --- /dev/null +++ b/.goreleaser.yaml @@ -0,0 +1,78 @@ +project_name: wfind + +before: + hooks: + - go mod tidy + +builds: +- env: + - CGO_ENABLED=0 + - GO111MODULE=on + goos: + - linux + goarch: + - amd64 + - arm64 + ldflags: | + -X main.buildVersion={{ .Version }} + -s + -w + +checksum: + name_template: '{{ .ProjectName }}_{{ .Version }}_SHA256SUMS' + algorithm: sha256 + +changelog: + sort: asc + +# creates SBOMs of all archives and the source tarball using syft +# https://goreleaser.com/customization/sbom +sboms: +- id: archive + artifacts: archive +- id: source + artifacts: source + +# signs the checksum file +# all files (including the sboms) are included in the checksum, so we don't need to sign each one if we don't want to +# https://goreleaser.com/customization/sign +signs: +- cmd: cosign + env: + - COSIGN_EXPERIMENTAL=1 + certificate: '${artifact}.pem' + args: + - sign-blob + - '--output-certificate=${certificate}' + - '--output-signature=${signature}' + - '${artifact}' + - "--yes" # needed on cosign 2.0.0+ + artifacts: all + output: true + +kos: +- base_image: cgr.dev/chainguard/static + repository: ghcr.io/maxgio92/wfind + bare: true + tags: + - '{{ .Version }}' + - '{{ .Major }}.{{ .Minor }}' + - latest + platforms: + - linux/amd64 + - linux/arm64 + sbom: spdx + +# signs our docker image +# https://goreleaser.com/customization/docker_sign +docker_signs: +- cmd: cosign + env: + - COSIGN_EXPERIMENTAL=1 + artifacts: all + output: true + args: + - 'sign' + - '${artifact}' + - "--yes" # needed on cosign 2.0.0+ + diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6d6bd0c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,4 @@ +FROM scratch +COPY wfind /wfind +ENTRYPOINT ["/wfind"] + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0f34b69 --- /dev/null +++ b/Makefile @@ -0,0 +1,80 @@ +APP := wfind +VERSION := 0.1.0 + +user := maxgio92 +oci_image := quay.io/$(user)/$(APP) + +bins := docker git go gofumpt golangci-lint + +PACKAGE_NAME := github.com/$(user)/$(APP) +GOLANG_CROSS_VERSION ?= v$(shell sed -nE 's/go[[:space:]]+([[:digit:]]\.[[:digit:]]+)/\1/p' go.mod) + +GIT_HEAD_COMMIT ?= $$($(git) rev-parse --short HEAD) +GIT_TAG_COMMIT ?= $$($(git) rev-parse --short $(VERSION)) +GIT_MODIFIED_1 ?= $$($(git) diff $(GIT_HEAD_COMMIT) $(GIT_TAG_COMMIT) --quiet && echo "" || echo ".dev") +GIT_MODIFIED_2 ?= $$($(git) diff --quiet && echo "" || echo ".dirty") +GIT_MODIFIED ?= $$(echo "$(GIT_MODIFIED_1)$(GIT_MODIFIED_2)") +GIT_REPO ?= $$($(git) config --get remote.origin.url) +BUILD_DATE ?= $$($(git) log -1 --format="%at" | xargs -I{} date -d @{} 
+%Y-%m-%dT%H:%M:%S) + +define declare_binpaths +$(1) = $(shell command -v 2>/dev/null $(1)) +endef + +$(foreach bin,$(bins),\ + $(eval $(call declare_binpaths,$(bin)))\ +) + +.PHONY: doc +doc: + @go run doc/doc.go + +.PHONY: build +build: + @$(go) build . + +.PHONY: run +run: + @$(go) run . + +.PHONY: test +test: + @$(go) test -v -cover -gcflags=-l ./... + +.PHONY: lint +lint: golangci-lint + @$(golangci-lint) run ./... + +.PHONY: golangci-lint +golangci-lint: + @$(go) install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.50.0 + +.PHONY: gofumpt +gofumpt: + @$(go) install mvdan.cc/gofumpt@v0.3.1 + +.PHONY: oci/build +oci/build: test + @$(docker) build . -t $(oci_image):$(VERSION) -f Dockerfile \ + --build-arg GIT_HEAD_COMMIT=$(GIT_HEAD_COMMIT) \ + --build-arg GIT_TAG_COMMIT=$(GIT_TAG_COMMIT) \ + --build-arg GIT_MODIFIED=$(GIT_MODIFIED) \ + --build-arg GIT_REPO=$(GIT_REPO) \ + --build-arg GIT_LAST_TAG=$(VERSION) \ + --build-arg BUILD_DATE=$(BUILD_DATE) + +.PHONY: oci/push +oci/push: oci/build + @$(docker) push $(oci_image):$(VERSION) + +.PHONY: clean +clean: + @rm -f $(APP) + +.PHONY: help +help: list + +.PHONY: list +list: + @LC_ALL=C $(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' + diff --git a/README.md b/README.md new file mode 100644 index 0000000..b33ae0e --- /dev/null +++ b/README.md @@ -0,0 +1,32 @@ +# wfind: like find but for web sites + +`wfind` (world wide web find) search for files in a web site directory hierarchy over HTTP and HTTPS, through HTML references. + +The tool is inspired by GNU `find(1)` and `wget(1)`. + +### Usage + +``` +wfind URL [flags] +``` + +#### Options + +``` + -h, --help help for wfind + -n, --name string Base of file name (the path with the leading directories removed) pattern. + -r, --recursive Whether to examine entries recursing into directories. Disable to behave like GNU find -maxdepth=0 option. (default true) + -t, --type string The file type + -v, --verbose Enable verbosity to log all visited HTTP(s) files +``` + +### In action + +```shell +$ wfind https://mirrors.edge.kernel.org/debian/dists/ -t f -n Release +https://mirrors.edge.kernel.org/debian/dists/bullseye/Release +https://mirrors.edge.kernel.org/debian/dists/buster/Release +https://mirrors.edge.kernel.org/debian/dists/sid/Release +https://mirrors.edge.kernel.org/debian/dists/stretch/Release +... +``` diff --git a/cmd/find/find.go b/cmd/find/find.go new file mode 100644 index 0000000..7023c35 --- /dev/null +++ b/cmd/find/find.go @@ -0,0 +1,105 @@ +/* +Copyright © 2023 maxgio92 me@maxgio.me + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package find + +import ( + "fmt" + + "github.com/pkg/errors" + "github.com/spf13/cobra" + + "github.com/maxgio92/wfind/internal/output" + "github.com/maxgio92/wfind/pkg/find" +) + +type options struct { + *find.Options +} + +// NewCmd returns a new find command. 
+func NewCmd() *cobra.Command { + o := &options{ + Options: &find.Options{}, + } + + cmd := &cobra.Command{ + Use: "wfind URL", + Short: "Find folders and files in web sites using HTTP or HTTPS", + Args: cobra.MinimumNArgs(1), + RunE: o.Run, + } + + var filename string + + cmd.Flags().StringVarP(&filename, "name", "n", "", "Base of file name (the path with the leading directories removed) exact pattern.") + + // As of now only exact glob pattern expressions are allowed. The expression then translated to an exact-match regular expression. + o.FilenameRegexp = fmt.Sprintf("^%s$", filename) + + cmd.Flags().StringVarP(&o.FileType, "type", "t", "", "The file type") + cmd.Flags().BoolVarP(&o.Verbose, "verbose", "v", false, "Enable verbosity to log all visited HTTP(s) files") + cmd.Flags().BoolVarP(&o.Recursive, "recursive", "r", true, "Whether to examine entries recursing into directories. Disable to behave like GNU find -maxdepth=0 option.") + + return cmd +} + +// Execute adds all child commands to the root command and sets flags appropriately. +// This is called by main.main(). It only needs to happen once to the rootCmd. +func Execute() { + cmd := NewCmd() + output.ExitOnErr(cmd.Execute()) +} + +func (o *options) validate() error { + if err := o.Validate(); err != nil { + return errors.Wrap(err, "error validating options") + } + + return nil +} + +func (o *options) Run(_ *cobra.Command, args []string) error { + var seed string + if len(args) > 0 { + seed = args[0] + } + + o.SeedURLs = append(o.SeedURLs, seed) + + if err := o.validate(); err != nil { + return err + } + + finder := find.NewFind( + find.WithSeedURLs(o.SeedURLs), + find.WithFilenameRegexp(o.FilenameRegexp), + find.WithFileType(o.FileType), + find.WithRecursive(o.Recursive), + find.WithVerbosity(o.Verbose), + ) + + found, err := finder.Find() + if err != nil { + return errors.Wrap(err, "error finding the file") + } + + for _, v := range found.URLs { + output.Print(v) + } + + return nil +} diff --git a/doc/doc.go b/doc/doc.go new file mode 100644 index 0000000..1398b7c --- /dev/null +++ b/doc/doc.go @@ -0,0 +1,12 @@ +package main + +import ( + "github.com/spf13/cobra/doc" + + "github.com/maxgio92/wfind/cmd/find" + "github.com/maxgio92/wfind/internal/output" +) + +func main() { + output.ExitOnErr(doc.GenMarkdownTree(find.NewCmd(), "./doc")) +} diff --git a/doc/wfind.md b/doc/wfind.md new file mode 100644 index 0000000..59a13fc --- /dev/null +++ b/doc/wfind.md @@ -0,0 +1,19 @@ +## wfind + +Find folders and files in web sites using HTTP or HTTPS + +``` +wfind URL [flags] +``` + +### Options + +``` + -h, --help help for wfind + -n, --name string Base of file name (the path with the leading directories removed) pattern. + -r, --recursive Whether to examine entries recursing into directories. Disable to behave like GNU find -maxdepth=0 option. 
(default true) + -t, --type string The file type + -v, --verbose Enable verbosity to log all visited HTTP(s) files +``` + +###### Auto generated by spf13/cobra on 29-Mar-2023 diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..c277e4b --- /dev/null +++ b/go.mod @@ -0,0 +1,41 @@ +module github.com/maxgio92/wfind + +go 1.20 + +require ( + github.com/gocolly/colly v1.2.0 + github.com/pkg/errors v0.9.1 + github.com/spf13/cobra v1.6.1 + github.com/stretchr/testify v1.8.1 + github.com/vitorsalgado/mocha/v3 v3.0.2 +) + +require ( + github.com/PuerkitoBio/goquery v1.8.0 // indirect + github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/antchfx/htmlquery v1.2.4 // indirect + github.com/antchfx/xmlquery v1.3.9 // indirect + github.com/antchfx/xpath v1.2.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/google/go-cmp v0.5.8 // indirect + github.com/inconshreveable/mousetrap v1.0.1 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect + github.com/kr/pretty v0.3.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/rogpeppe/go-internal v1.8.0 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/stretchr/objx v0.5.0 // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + golang.org/x/net v0.0.0-20220412020605-290c469a71a5 // indirect + golang.org/x/text v0.3.7 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.28.0 // indirect + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..f4c9ebf --- /dev/null +++ b/go.sum @@ -0,0 +1,107 @@ +github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= +github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/antchfx/htmlquery v1.2.4 h1:qLteofCMe/KGovBI6SQgmou2QNyedFUW+pE+BpeZ494= +github.com/antchfx/htmlquery v1.2.4/go.mod h1:2xO6iu3EVWs7R2JYqBbp8YzG50gj/ofqs5/0VZoDZLc= +github.com/antchfx/xmlquery v1.3.9 h1:Y+zyMdiUZ4fasTQTkDb3DflOXP7+obcYEh80SISBmnQ= +github.com/antchfx/xmlquery v1.3.9/go.mod h1:wojC/BxjEkjJt6dPiAqUzoXO5nIMWtxHS8PD8TmN4ks= +github.com/antchfx/xpath v1.2.0 h1:mbwv7co+x0RwgeGAOHdrKy89GvHaGvxxBtPK0uF9Zr8= +github.com/antchfx/xpath v1.2.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= +github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew 
v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= +github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= +github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/inconshreveable/mousetrap v1.0.1 h1:U3uMjPSQEBMNp1lFxmllqCPM6P5u/Xq7Pgzkat/bFNc= +github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= +github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8= +github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= 
+github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA= +github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/vitorsalgado/mocha/v3 v3.0.2 h1:uTx/+7kZvTWddXzoF34vUQTa3OL9OE+f5fPjD2XCMoY= +github.com/vitorsalgado/mocha/v3 v3.0.2/go.mod h1:ZMpyjuNfWPqLP2v7ztaaLJwOcyl4jmmHVQCEoDsFD0Q= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220412020605-290c469a71a5 h1:bRb386wvrE+oBNdF1d/Xh9mQrfQ4ecYhW5qJ5GvTGT4= +golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7 
h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw= +google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/output/output.go b/internal/output/output.go new file mode 100644 index 0000000..8bf0ed0 --- /dev/null +++ b/internal/output/output.go @@ -0,0 +1,37 @@ +/* +Copyright © 2023 maxgio92 me@maxgio.me + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package output + +import ( + "fmt" + "os" +) + +func ExitOnErr(err error) { + if err != nil { + //nolint:forbidigo + fmt.Println(err) + os.Exit(1) + } + + os.Exit(0) +} + +func Print(s string) { + //nolint:forbidigo + fmt.Println(s) +} diff --git a/main.go b/main.go new file mode 100644 index 0000000..1bf1dff --- /dev/null +++ b/main.go @@ -0,0 +1,23 @@ +/* +Copyright © 2023 maxgio92 me@maxgio.me + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import cmd "github.com/maxgio92/wfind/cmd/find" + +func main() { + cmd.Execute() +} diff --git a/pkg/find/constants.go b/pkg/find/constants.go new file mode 100644 index 0000000..cc68d8c --- /dev/null +++ b/pkg/find/constants.go @@ -0,0 +1,14 @@ +package find + +const ( + folderRegex = `.+\/$` + + HTMLTagLink = "a[href]" + HTMLAttrRef = "href" + + UpDir = "../" + RootDir = "/" + + FileTypeReg string = "f" + FileTypeDir string = "d" +) diff --git a/pkg/find/find.go b/pkg/find/find.go new file mode 100644 index 0000000..cb596f8 --- /dev/null +++ b/pkg/find/find.go @@ -0,0 +1,329 @@ +package find + +import ( + "fmt" + "net/url" + "path" + "regexp" + "strings" + + "github.com/gocolly/colly" + d "github.com/gocolly/colly/debug" + "github.com/pkg/errors" +) + +// Result represents the output of the Find job. +type Result struct { + // BaseNames are the path base of the files found. + BaseNames []string + + // URLs are the universal resource location of the files found. + URLs []string +} + +// Options represents the options for the Find job. +type Options struct { + // SeedURLs are the URLs used as root URLs from which for the find's web scraping. + SeedURLs []string + + // FilenameRegexp is a regular expression for which a pattern should match the file names in the Result. + FilenameRegexp string + + // FileType is the file type for which the Find job examines the web hierarchy. + FileType string + + // Recursive enables the Find job to examine files referenced to by the seeds files recursively. + Recursive bool + + // Verbose enables the Find job verbosity printing every visited URL. + Verbose bool +} + +type Option func(opts *Options) + +func WithSeedURLs(seedURLs []string) Option { + return func(opts *Options) { + opts.SeedURLs = seedURLs + } +} + +func WithFilenameRegexp(filenameRegexp string) Option { + return func(opts *Options) { + opts.FilenameRegexp = filenameRegexp + } +} + +func WithFileType(fileType string) Option { + return func(opts *Options) { + opts.FileType = fileType + } +} + +func WithRecursive(recursive bool) Option { + return func(opts *Options) { + opts.Recursive = recursive + } +} + +func WithVerbosity(verbosity bool) Option { + return func(opts *Options) { + opts.Verbose = verbosity + } +} + +// NewFind returns a new Find object to find files over HTTP and HTTPS. +func NewFind(opts ...Option) *Options { + o := &Options{} + + for _, f := range opts { + f(o) + } + + return o +} + +// Validate validates the Find job options and returns an error. +func (o *Options) Validate() error { + // Validate seed URLs. + if len(o.SeedURLs) == 0 { + return errors.New("no seed URLs specified") + } + + for k, v := range o.SeedURLs { + _, err := url.Parse(v) + if err != nil { + return errors.New("a seed URL is not a valid URL") + } + + if !strings.HasSuffix(v, "/") { + o.SeedURLs[k] = v + "/" + } + } + + // Validate filename regular expression. + if o.FilenameRegexp == "" { + return errors.New("no filename regular expression specified") + } + + if _, err := regexp.Compile(o.FilenameRegexp); err != nil { + return errors.Wrap(err, "error validating the file name expression") + } + + // Validate file type. 
+ if o.FileType == "" { + o.FileType = FileTypeReg + } else if o.FileType != FileTypeReg && o.FileType != FileTypeDir { + return errors.New("file type not supported") + } + + o.sanitize() + + return nil +} + +func (o *Options) sanitize() { + if strings.HasPrefix(o.FilenameRegexp, "^") && !strings.HasPrefix(o.FilenameRegexp, "^./") { + o.FilenameRegexp = strings.Replace(o.FilenameRegexp, "^", `^(\./)?`, 1) + } + + if o.FileType == FileTypeDir { + if strings.HasSuffix(o.FilenameRegexp, "$") && !strings.HasSuffix(o.FilenameRegexp, "/$") { + last := strings.LastIndex(o.FilenameRegexp, "$") + o.FilenameRegexp = o.FilenameRegexp[:last] + strings.Replace(o.FilenameRegexp[last:], "$", "/?$", 1) + } + } +} + +func (o *Options) Find() (*Result, error) { + if err := o.Validate(); err != nil { + return nil, errors.Wrap(err, "error validating find options") + } + + switch o.FileType { + case FileTypeReg: + return o.crawlFiles() + case FileTypeDir: + return o.crawlFolders() + default: + return o.crawlFiles() + } +} + +// crawlFiles returns a list of file names found from the seed URL, filtered by file name regex. +// +//nolint:funlen,cyclop +func (o *Options) crawlFiles() (*Result, error) { + seeds := []*url.URL{} + + err := o.Validate() + if err != nil { + return nil, err + } + + for _, v := range o.SeedURLs { + u, _ := url.Parse(v) + + seeds = append(seeds, u) + } + + var files, urls []string + + folderPattern := regexp.MustCompile(folderRegex) + + exactFilePattern := regexp.MustCompile(o.FilenameRegexp) + + fileRegex := strings.TrimPrefix(o.FilenameRegexp, "^") + filePattern := regexp.MustCompile(fileRegex) + + allowedDomains := getHostnamesFromURLs(seeds) + + // Create the collector settings + coOptions := []func(*colly.Collector){ + colly.AllowedDomains(allowedDomains...), + colly.Async(false), + } + + if o.Verbose { + coOptions = append(coOptions, colly.Debugger(&d.LogDebugger{})) + } + + // Create the collector. + co := colly.NewCollector(coOptions...) + + // Add the callback to Visit the linked resource, for each HTML element found + co.OnHTML(HTMLTagLink, func(e *colly.HTMLElement) { + link := e.Attr(HTMLAttrRef) + + // Do not traverse the hierarchy in reverse order. + if o.Recursive && !(strings.Contains(link, UpDir)) && link != RootDir { + //nolint:errcheck + co.Visit(e.Request.AbsoluteURL(link)) + } + }) + + // Add the analysis callback to find file URLs, for each Visit call + co.OnRequest(func(r *colly.Request) { + folderMatch := folderPattern.FindStringSubmatch(r.URL.String()) + + // If the URL is not of a folder. + if len(folderMatch) == 0 { + fileMatch := filePattern.FindStringSubmatch(r.URL.String()) + + // If the URL is of a file. + if len(fileMatch) > 0 { + fileName := path.Base(r.URL.String()) + fileNameMatch := exactFilePattern.FindStringSubmatch(fileName) + + // If the URL matches the file filter regex. + if len(fileNameMatch) > 0 { + files = append(files, fileName) + urls = append(urls, r.URL.String()) + } + } + // Otherwise abort the request. + r.Abort() + } + }) + + // Visit each root folder. + for _, seedURL := range seeds { + err := co.Visit(seedURL.String()) + if err != nil { + return nil, errors.Wrap(err, fmt.Sprintf("error scraping file with URL %seedURLs", seedURL.String())) + } + } + + return &Result{BaseNames: files, URLs: urls}, nil +} + +// crawlFolders returns a list of folder names found from each seed URL, filtered by folder name regex. 
+// +//nolint:funlen,cyclop +func (o *Options) crawlFolders() (*Result, error) { + seeds := []*url.URL{} + + err := o.Validate() + if err != nil { + return nil, err + } + + for _, v := range o.SeedURLs { + u, _ := url.Parse(v) + + seeds = append(seeds, u) + } + + var folders, urls []string + + folderPattern := regexp.MustCompile(folderRegex) + + exactFolderPattern := regexp.MustCompile(o.FilenameRegexp) + + allowedDomains := getHostnamesFromURLs(seeds) + if len(allowedDomains) < 1 { + //nolint:goerr113 + return nil, fmt.Errorf("invalid seed urls") + } + + // Create the collector settings + coOptions := []func(*colly.Collector){ + colly.AllowedDomains(allowedDomains...), + colly.Async(false), + } + + if o.Verbose { + coOptions = append(coOptions, colly.Debugger(&d.LogDebugger{})) + } + + // Create the collector. + co := colly.NewCollector(coOptions...) + + // Visit each specific folder. + co.OnHTML(HTMLTagLink, func(e *colly.HTMLElement) { + href := e.Attr(HTMLAttrRef) + + folderMatch := folderPattern.FindStringSubmatch(href) + + // if the URL is of a folder. + //nolint:nestif + if len(folderMatch) > 0 { + // Do not traverse the hierarchy in reverse order. + if strings.Contains(href, UpDir) || href == RootDir { + return + } + + exactFolderMatch := exactFolderPattern.FindStringSubmatch(href) + if len(exactFolderMatch) > 0 { + hrefAbsURL, _ := url.Parse(e.Request.AbsoluteURL(href)) + + if !urlSliceContains(seeds, hrefAbsURL) { + folders = append(folders, path.Base(hrefAbsURL.Path)) + urls = append(urls, hrefAbsURL.String()) + } + } + if o.Recursive { + //nolint:errcheck + co.Visit(e.Request.AbsoluteURL(href)) + } + } + }) + + co.OnRequest(func(r *colly.Request) { + folderMatch := folderPattern.FindStringSubmatch(r.URL.String()) + + // if the URL is not of a folder. + if len(folderMatch) == 0 { + r.Abort() + } + }) + + // Visit each root folder. + for _, seedURL := range seeds { + err := co.Visit(seedURL.String()) + if err != nil { + return nil, errors.Wrap(err, fmt.Sprintf("error scraping folder with URL %seedURLs", seedURL.String())) + } + } + + return &Result{BaseNames: folders, URLs: urls}, nil +} diff --git a/pkg/find/find_test.go b/pkg/find/find_test.go new file mode 100644 index 0000000..345de38 --- /dev/null +++ b/pkg/find/find_test.go @@ -0,0 +1,250 @@ +package find_test + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/vitorsalgado/mocha/v3" + "github.com/vitorsalgado/mocha/v3/expect" + "github.com/vitorsalgado/mocha/v3/reply" + + "github.com/maxgio92/wfind/pkg/find" +) + +const ( + homedir string = "home" + filename string = "File" + dirname string = "Dir" +) + +var ( + subdirs = []string{"foo", "bar", "baz"} + homedirBody = fmt.Sprintf(` + +Index of %s/ + +

Index of /%s/


../
+%s/
+%s/
+%s/
+

+ +`, homedir, homedir, subdirs[0], subdirs[0], subdirs[1], subdirs[1], subdirs[2], subdirs[2]) + subdirBodyF = ` + +Index of %s/%s/ + +

Index of /%s/%s/


../
+%s/
+%s
+

+` + subdirBodyDotSlashF = ` + +Index of %s/%s/ + +

Index of /%s/%s/


../
+%s/
+%s
+

+` + subdirBodies []string + subdirBodiesDotSlash []string +) + +func initFileHierarchy() { + for _, v := range subdirs { + subdirBodies = append(subdirBodies, + fmt.Sprintf(subdirBodyF, homedir, v, homedir, v, dirname, dirname, filename, filename), + ) + + subdirBodiesDotSlash = append(subdirBodiesDotSlash, + fmt.Sprintf(subdirBodyDotSlashF, homedir, v, homedir, v, dirname, dirname, filename, filename), + ) + } +} + +//nolint:dupl +func TestFindFileRecursive(t *testing.T) { + t.Parallel() + + initFileHierarchy() + + m := mocha.New(t).CloseOnCleanup(t) + m.Start() + + m.AddMocks( + // Home directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/", homedir)). + Or(expect.URLPath(fmt.Sprintf("/%s", homedir)))). + Reply(reply.OK().BodyString(homedirBody))) + + // Sub directories. + for i := range subdirs { + m.AddMocks( + // Sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/", homedir, subdirs[i])). + Or(expect.URLPath(fmt.Sprintf("/%s/%s", homedir, subdirs[i])))). + Reply(reply.OK().BodyString(subdirBodies[0])), + // File in sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/%s/", homedir, subdirs[i], dirname)). + Or(expect.URLPath(fmt.Sprintf("/%s/%s/%s", homedir, subdirs[i], dirname)))). + Reply(reply.OK().BodyString("")), + // Directory in sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/%s", homedir, subdirs[i], filename))). + Reply(reply.OK().BodyString(""))) + } + + finder := find.NewFind( + find.WithSeedURLs([]string{fmt.Sprintf("%s/%s", m.URL(), homedir)}), + find.WithFilenameRegexp(fmt.Sprintf("^%s$", filename)), + find.WithFileType(find.FileTypeReg), + find.WithRecursive(true), + find.WithVerbosity(false), + ) + + found, err := finder.Find() + + assert.Nil(t, err) + assert.NotNil(t, found) + assert.Len(t, found.URLs, len(subdirs)) +} + +//nolint:dupl +func TestFindDirRecursive(t *testing.T) { + t.Parallel() + + initFileHierarchy() + + m := mocha.New(t).CloseOnCleanup(t) + m.Start() + + m.AddMocks( + // home dir. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/", homedir)). + Or(expect.URLPath(fmt.Sprintf("/%s", homedir)))). + Reply(reply.OK().BodyString(homedirBody))) + + // Sub directories. + for i := range subdirs { + m.AddMocks( + // Sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/", homedir, subdirs[i])). + Or(expect.URLPath(fmt.Sprintf("/%s/%s", homedir, subdirs[i])))). + Reply(reply.OK().BodyString(subdirBodies[0])), + // File in sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/%s/", homedir, subdirs[i], dirname)). + Or(expect.URLPath(fmt.Sprintf("/%s/%s/%s", homedir, subdirs[i], dirname)))). + Reply(reply.OK().BodyString("")), + // Directory in sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/%s", homedir, subdirs[i], filename))). + Reply(reply.OK().BodyString(""))) + } + + finder := find.NewFind( + find.WithSeedURLs([]string{fmt.Sprintf("%s/%s", m.URL(), homedir)}), + find.WithFilenameRegexp(fmt.Sprintf("^%s$", dirname)), + find.WithFileType(find.FileTypeDir), + find.WithRecursive(true), + find.WithVerbosity(false), + ) + + found, err := finder.Find() + + assert.Nil(t, err) + assert.NotNil(t, found) + assert.Len(t, found.URLs, len(subdirs)) +} + +//nolint:dupl +func TestFindFileRecursiveDotSlash(t *testing.T) { + t.Parallel() + + initFileHierarchy() + + m := mocha.New(t).CloseOnCleanup(t) + m.Start() + + m.AddMocks( + // Home directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/", homedir)). + Or(expect.URLPath(fmt.Sprintf("/%s", homedir)))). 
+ Reply(reply.OK().BodyString(homedirBody))) + + // Sub directories. + for i := range subdirs { + m.AddMocks( + // Sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/", homedir, subdirs[i])). + Or(expect.URLPath(fmt.Sprintf("/%s/%s", homedir, subdirs[i])))). + Reply(reply.OK().BodyString(subdirBodiesDotSlash[0])), + // File in sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/%s/", homedir, subdirs[i], dirname)). + Or(expect.URLPath(fmt.Sprintf("/%s/%s/%s", homedir, subdirs[i], dirname)))). + Reply(reply.OK().BodyString("")), + // Directory in sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/%s", homedir, subdirs[i], filename))). + Reply(reply.OK().BodyString(""))) + } + + finder := find.NewFind( + find.WithSeedURLs([]string{fmt.Sprintf("%s/%s", m.URL(), homedir)}), + find.WithFilenameRegexp(fmt.Sprintf("^%s$", filename)), + find.WithFileType(find.FileTypeReg), + find.WithRecursive(true), + find.WithVerbosity(false), + ) + + found, err := finder.Find() + + assert.Nil(t, err) + assert.NotNil(t, found) + assert.Len(t, found.URLs, len(subdirs)) +} + +//nolint:dupl +func TestFindDirRecursiveDotSlash(t *testing.T) { + t.Parallel() + + initFileHierarchy() + + m := mocha.New(t).CloseOnCleanup(t) + m.Start() + + m.AddMocks( + // home dir. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/", homedir)). + Or(expect.URLPath(fmt.Sprintf("/%s", homedir)))). + Reply(reply.OK().BodyString(homedirBody))) + + // Sub directories. + for i := range subdirs { + m.AddMocks( + // Sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/", homedir, subdirs[i])). + Or(expect.URLPath(fmt.Sprintf("/%s/%s", homedir, subdirs[i])))). + Reply(reply.OK().BodyString(subdirBodiesDotSlash[0])), + // File in sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/%s/", homedir, subdirs[i], dirname)). + Or(expect.URLPath(fmt.Sprintf("/%s/%s/%s", homedir, subdirs[i], dirname)))). + Reply(reply.OK().BodyString("")), + // Directory in sub directory. + mocha.Get(expect.URLPath(fmt.Sprintf("/%s/%s/%s", homedir, subdirs[i], filename))). + Reply(reply.OK().BodyString(""))) + } + + finder := find.NewFind( + find.WithSeedURLs([]string{fmt.Sprintf("%s/%s", m.URL(), homedir)}), + find.WithFilenameRegexp(fmt.Sprintf("^%s$", dirname)), + find.WithFileType(find.FileTypeDir), + find.WithRecursive(true), + find.WithVerbosity(false), + ) + + found, err := finder.Find() + + assert.Nil(t, err) + assert.NotNil(t, found) + assert.Len(t, found.URLs, len(subdirs)) +} diff --git a/pkg/find/utils.go b/pkg/find/utils.go new file mode 100644 index 0000000..f99a70f --- /dev/null +++ b/pkg/find/utils.go @@ -0,0 +1,23 @@ +package find + +import "net/url" + +func getHostnamesFromURLs(urls []*url.URL) []string { + hostnames := []string{} + + for _, v := range urls { + hostnames = append(hostnames, v.Host) + } + + return hostnames +} + +func urlSliceContains(us []*url.URL, u *url.URL) bool { + for _, v := range us { + if v == u { + return true + } + } + + return false +}
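
For reference, here is a minimal sketch of driving the `pkg/find` package introduced by this patch directly from Go, without going through the Cobra CLI. All identifiers (`NewFind`, the `With*` options, `FileTypeReg`, `Find`, `Result.URLs`) come from `pkg/find` as added above; the seed URL reuses the Debian mirror example from the README, and the error handling is intentionally simple.

```go
package main

import (
	"fmt"
	"log"

	"github.com/maxgio92/wfind/pkg/find"
)

func main() {
	// Build the Find job with the functional options exposed by pkg/find.
	finder := find.NewFind(
		find.WithSeedURLs([]string{"https://mirrors.edge.kernel.org/debian/dists/"}),
		// The CLI wraps the --name flag into an exact-match expression the same way.
		find.WithFilenameRegexp("^Release$"),
		find.WithFileType(find.FileTypeReg),
		find.WithRecursive(true),
		find.WithVerbosity(false),
	)

	// Find validates the options, crawls the seed URLs over HTTP(S),
	// and returns the matching base names and absolute URLs.
	result, err := finder.Find()
	if err != nil {
		log.Fatal(err)
	}

	for _, u := range result.URLs {
		fmt.Println(u)
	}
}
```

This mirrors what `cmd/find/find.go` does for the `wfind URL -t f -n Release` invocation shown in the README.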