From 40c72a4481459eea805c26549de1e4c6720bb86a Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Thu, 9 Mar 2023 16:28:14 +0100 Subject: [PATCH 01/10] Refactor source and filename as `URL` attributes, instead of `Runner` * They are needed in `URL` methods to check internal links, and this way we avoid having to set them as `Runner` attributes while looping over collected links. * This also allows a more consistent definition and usage of internal links metadata. --- lib/html_proofer/attribute/url.rb | 25 ++++++++++++---------- lib/html_proofer/check.rb | 6 +++--- lib/html_proofer/check/images.rb | 2 +- lib/html_proofer/element.rb | 2 +- lib/html_proofer/url_validator/internal.rb | 9 +++----- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/lib/html_proofer/attribute/url.rb b/lib/html_proofer/attribute/url.rb index 67691843..7507413a 100644 --- a/lib/html_proofer/attribute/url.rb +++ b/lib/html_proofer/attribute/url.rb @@ -3,13 +3,16 @@ module HTMLProofer class Attribute class Url < HTMLProofer::Attribute - attr_reader :url, :size + attr_reader :url, :size, :source, :filename REMOTE_SCHEMES = ["http", "https"].freeze - def initialize(runner, link_attribute, base_url: nil, extract_size: false) + def initialize(runner, link_attribute, base_url: nil, source: nil, filename: nil, extract_size: false) super + @source = source + @filename = filename + if @raw_attribute.nil? @url = nil else @@ -125,7 +128,7 @@ def base64? end def absolute_path - path = file_path || @runner.current_filename + path = file_path || @filename File.expand_path(path, Dir.pwd) end @@ -139,22 +142,22 @@ def file_path base = if absolute_path?(path) # path relative to root # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname - @runner.options[:root_dir] || (File.directory?(@runner.current_source) ? @runner.current_source : File.dirname(@runner.current_source)) + @runner.options[:root_dir] || (File.directory?(@source) ? @source : File.dirname(@source)) # relative links, path is a file elsif File.exist?(File.expand_path( path, - @runner.current_source, - )) || File.exist?(File.expand_path(path_dot_ext, @runner.current_source)) - File.dirname(@runner.current_filename) + @source, + )) || File.exist?(File.expand_path(path_dot_ext, @source)) + File.dirname(@filename) # relative links in nested dir, path is a file elsif File.exist?(File.join( - File.dirname(@runner.current_filename), + File.dirname(@filename), path, - )) || File.exist?(File.join(File.dirname(@runner.current_filename), path_dot_ext)) - File.dirname(@runner.current_filename) + )) || File.exist?(File.join(File.dirname(@filename), path_dot_ext)) + File.dirname(@filename) # relative link, path is a directory else - @runner.current_filename + @filename end file = File.join(base, path) diff --git a/lib/html_proofer/check.rb b/lib/html_proofer/check.rb index 32e40a5f..bf09c5cc 100644 --- a/lib/html_proofer/check.rb +++ b/lib/html_proofer/check.rb @@ -45,8 +45,8 @@ def add_to_internal_urls(url, line) @internal_urls[url_string] = [] if @internal_urls[url_string].nil? metadata = { - source: @runner.current_source, - filename: @runner.current_filename, + source: url.source, + filename: url.filename, line: line, base_url: base_url, found: false, @@ -59,7 +59,7 @@ def add_to_external_urls(url, line) @external_urls[url_string] = [] if @external_urls[url_string].nil? - @external_urls[url_string] << { filename: @runner.current_filename, line: line } + @external_urls[url_string] << { filename: url.filename, line: line } end class << self diff --git a/lib/html_proofer/check/images.rb b/lib/html_proofer/check/images.rb index b5b34aab..dee3b4fa 100644 --- a/lib/html_proofer/check/images.rb +++ b/lib/html_proofer/check/images.rb @@ -37,7 +37,7 @@ def run ) elsif @img.multiple_srcsets? || @img.multiple_sizes? @img.srcsets_wo_sizes.each do |srcset| - srcset_url = HTMLProofer::Attribute::Url.new(@runner, srcset, base_url: @img.base_url, extract_size: true) + srcset_url = HTMLProofer::Attribute::Url.new(@runner, srcset, base_url: @img.base_url, source: @img.url.source, filename: @img.url.filename, extract_size: true) if srcset_url.protocol_relative? add_failure( diff --git a/lib/html_proofer/element.rb b/lib/html_proofer/element.rb index a0bbe55c..1b66313a 100644 --- a/lib/html_proofer/element.rb +++ b/lib/html_proofer/element.rb @@ -16,7 +16,7 @@ def initialize(runner, node, base_url: nil) swap_attributes! @base_url = base_url - @url = Attribute::Url.new(runner, link_attribute, base_url: base_url) + @url = Attribute::Url.new(runner, link_attribute, base_url: base_url, source: @runner.current_source, filename: @runner.current_filename) @line = node.line @content = node.content diff --git a/lib/html_proofer/url_validator/internal.rb b/lib/html_proofer/url_validator/internal.rb index 8af8f5cb..daec0b03 100644 --- a/lib/html_proofer/url_validator/internal.rb +++ b/lib/html_proofer/url_validator/internal.rb @@ -29,15 +29,12 @@ def run_internal_link_checker(links) matched_count_to_log = pluralize(matched_files.count, "reference", "references") @logger.log(:debug, "(#{i + 1} / #{links.count}) Internal link #{link}: Checking #{matched_count_to_log}") matched_files.each do |metadata| - url = HTMLProofer::Attribute::Url.new(@runner, link, base_url: metadata[:base_url]) - - @runner.current_source = metadata[:source] - @runner.current_filename = metadata[:filename] + url = HTMLProofer::Attribute::Url.new(@runner, link, base_url: metadata[:base_url], source: metadata[:source], filename: metadata[:filename]) target_file_path = url.absolute_path unless file_exists?(target_file_path) @failed_checks << Failure.new( - @runner.current_filename, + metadata[:filename], "Links > Internal", "internally linking to #{url}, which does not exist", line: metadata[:line], @@ -62,7 +59,7 @@ def run_internal_link_checker(links) end unless hash_exists @failed_checks << Failure.new( - @runner.current_filename, + metadata[:filename], "Links > Internal", "internally linking to #{url}; the file exists, but the hash '#{url.hash}' does not", line: metadata[:line], From 41af0a18c0b5725eff319de35231d0fc3e7cc22f Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Thu, 9 Mar 2023 18:04:28 +0100 Subject: [PATCH 02/10] Extend `Check.add_failure` to allow an `Element` argument * So we can extract `line` and `content` from there instead of explicitly passing them every time. --- lib/html_proofer/check.rb | 6 ++--- lib/html_proofer/check/favicon.rb | 6 ++--- lib/html_proofer/check/images.rb | 25 ++++++++------------- lib/html_proofer/check/links.rb | 33 +++++++++++----------------- lib/html_proofer/check/open_graph.rb | 12 +++++----- lib/html_proofer/check/scripts.rb | 17 +++++--------- spec/html-proofer/check_spec.rb | 2 +- 7 files changed, 39 insertions(+), 62 deletions(-) diff --git a/lib/html_proofer/check.rb b/lib/html_proofer/check.rb index bf09c5cc..77ac6df2 100644 --- a/lib/html_proofer/check.rb +++ b/lib/html_proofer/check.rb @@ -24,14 +24,14 @@ def run raise NotImplementedError, "HTMLProofer::Check subclasses must implement #run" end - def add_failure(description, line: nil, status: nil, content: nil) + def add_failure(description, element: nil, line: nil, status: nil, content: nil) @failures << Failure.new( @runner.current_filename, short_name, description, - line: line, + line: element.nil? ? line : element.line, status: status, - content: content, + content: element.nil? ? content : element.content, ) end diff --git a/lib/html_proofer/check/favicon.rb b/lib/html_proofer/check/favicon.rb index 0c22d34d..19f5601b 100644 --- a/lib/html_proofer/check/favicon.rb +++ b/lib/html_proofer/check/favicon.rb @@ -19,16 +19,14 @@ def run if @favicon.url.protocol_relative? add_failure( "favicon link #{@favicon.url} is a protocol-relative URL, use explicit https:// instead", - line: @favicon.line, - content: @favicon.content, + element: @favicon, ) elsif @favicon.url.remote? add_to_external_urls(@favicon.url, @favicon.line) elsif !@favicon.url.exists? add_failure( "internal favicon #{@favicon.url.raw_attribute} does not exist", - line: @favicon.line, - content: @favicon.content, + element: @favicon, ) end else diff --git a/lib/html_proofer/check/images.rb b/lib/html_proofer/check/images.rb index dee3b4fa..864300f7 100644 --- a/lib/html_proofer/check/images.rb +++ b/lib/html_proofer/check/images.rb @@ -14,26 +14,23 @@ def run # screenshot filenames should return because of terrible names add_failure( "image has a terrible filename (#{@img.url.raw_attribute})", - line: @img.line, - content: @img.content, + element: @img, ) if terrible_filename? # does the image exist? if missing_src? - add_failure("image has no src or srcset attribute", line: @img.line, content: @img.content) + add_failure("image has no src or srcset attribute", element: @img) elsif @img.url.protocol_relative? add_failure( "image link #{@img.url} is a protocol-relative URL, use explicit https:// instead", - line: @img.line, - content: @img.content, + element: @img, ) elsif @img.url.remote? add_to_external_urls(@img.url, @img.line) elsif !@img.url.exists? && !@img.multiple_srcsets? && !@img.multiple_sizes? add_failure( "internal image #{@img.url.raw_attribute} does not exist", - line: @img.line, - content: @img.content, + element: @img, ) elsif @img.multiple_srcsets? || @img.multiple_sizes? @img.srcsets_wo_sizes.each do |srcset| @@ -42,13 +39,12 @@ def run if srcset_url.protocol_relative? add_failure( "image link #{srcset_url.url} is a protocol-relative URL, use explicit https:// instead", - line: @img.line, - content: @img.content, + element: @img, ) elsif srcset_url.remote? add_to_external_urls(srcset_url.url, @img.line) elsif !srcset_url.exists? - add_failure("internal image #{srcset} does not exist", line: @img.line, content: @img.content) + add_failure("internal image #{srcset} does not exist", element: @img) end end end @@ -58,22 +54,19 @@ def run if missing_alt_tag? && !ignore_missing_alt? add_failure( "image #{@img.url.raw_attribute} does not have an alt attribute", - line: @img.line, - content: @img.content, + element: @img, ) elsif (empty_alt_tag? || alt_all_spaces?) && !ignore_empty_alt? add_failure( "image #{@img.url.raw_attribute} has an alt attribute, but no content", - line: @img.line, - content: @img.content, + element: @img, ) end end add_failure( "image #{@img.url.raw_attribute} uses the http scheme", - line: @img.line, - content: @img.content, + element: @img, ) if @runner.enforce_https? && @img.url.http? end diff --git a/lib/html_proofer/check/links.rb b/lib/html_proofer/check/links.rb index ec72f001..3fc47ac0 100644 --- a/lib/html_proofer/check/links.rb +++ b/lib/html_proofer/check/links.rb @@ -10,7 +10,7 @@ def run next if @link.ignore? if !allow_hash_href? && @link.node["href"] == "#" - add_failure("linking to internal hash #, which points to nowhere", line: @link.line, content: @link.content) + add_failure("linking to internal hash #, which points to nowhere", element: @link) next end @@ -18,21 +18,20 @@ def run if blank?(@link.url.raw_attribute) next if allow_missing_href? - add_failure("'#{@link.node.name}' tag is missing a reference", line: @link.line, content: @link.content) + add_failure("'#{@link.node.name}' tag is missing a reference", element: @link) next end # is it even a valid URL? unless @link.url.valid? - add_failure("#{@link.href} is an invalid URL", line: @link.line, content: @link.content) + add_failure("#{@link.href} is an invalid URL", element: @link) next end if @link.url.protocol_relative? add_failure( "#{@link.url} is a protocol-relative URL, use explicit https:// instead", - line: @link.line, - content: @link.content, + element: @link, ) next end @@ -50,7 +49,7 @@ def run next if @link.node["rel"] == "dns-prefetch" unless @link.url.path? - add_failure("#{@link.url.raw_attribute} is an invalid URL", line: @link.line, content: @link.content) + add_failure("#{@link.url.raw_attribute} is an invalid URL", element: @link) next end @@ -60,8 +59,7 @@ def run if @link.url.unslashed_directory?(@link.url.absolute_path) add_failure( "internally linking to a directory #{@link.url.raw_attribute} without trailing slash", - line: @link.line, - content: @link.content, + element: @link, ) next end @@ -88,7 +86,7 @@ def check_schemes when "http" return unless @runner.options[:enforce_https] - add_failure("#{@link.url.raw_attribute} is not an HTTPS link", line: @link.line, content: @link.content) + add_failure("#{@link.url.raw_attribute} is not an HTTPS link", element: @link) end end @@ -96,14 +94,12 @@ def handle_mailto if @link.url.path.empty? add_failure( "#{@link.url.raw_attribute} contains no email address", - line: @link.line, - content: @link.content, + element: @link, ) unless ignore_empty_mailto? elsif !/#{URI::MailTo::EMAIL_REGEXP}/o.match?(@link.url.path) add_failure( "#{@link.url.raw_attribute} contains an invalid email address", - line: @link.line, - content: @link.content, + element: @link, ) end end @@ -111,8 +107,7 @@ def handle_mailto def handle_tel add_failure( "#{@link.url.raw_attribute} contains no phone number", - line: @link.line, - content: @link.content, + element: @link, ) if @link.url.path.empty? end @@ -130,16 +125,14 @@ def check_sri if blank?(@link.node["integrity"]) && blank?(@link.node["crossorigin"]) add_failure( "SRI and CORS not provided in: #{@link.url.raw_attribute}", - line: @link.line, - content: @link.content, + element: @link, ) elsif blank?(@link.node["integrity"]) - add_failure("Integrity is missing in: #{@link.url.raw_attribute}", line: @link.line, content: @link.content) + add_failure("Integrity is missing in: #{@link.url.raw_attribute}", element: @link) elsif blank?(@link.node["crossorigin"]) add_failure( "CORS not provided for external resource in: #{@link.link.url.raw_attribute}", - line: @link.line, - content: @link.content, + element: @link, ) end end diff --git a/lib/html_proofer/check/open_graph.rb b/lib/html_proofer/check/open_graph.rb index fd32aef2..3f10ee67 100644 --- a/lib/html_proofer/check/open_graph.rb +++ b/lib/html_proofer/check/open_graph.rb @@ -11,24 +11,22 @@ def run # does the open_graph exist? if missing_content? - add_failure("open graph has no content attribute", line: @open_graph.line, content: @open_graph.content) + add_failure("open graph has no content attribute", element: @open_graph) elsif empty_content? - add_failure("open graph content attribute is empty", line: @open_graph.line, content: @open_graph.content) + add_failure("open graph content attribute is empty", element: @open_graph) elsif !@open_graph.url.valid? - add_failure("#{@open_graph.src} is an invalid URL", line: @open_graph.line) + add_failure("#{@open_graph.src} is an invalid URL", element: @open_graph) elsif @open_graph.url.protocol_relative? add_failure( "open graph link #{@open_graph.url} is a protocol-relative URL, use explicit https:// instead", - line: @open_graph.line, - content: @open_graph.content, + element: @open_graph, ) elsif @open_graph.url.remote? add_to_external_urls(@open_graph.url, @open_graph.line) else add_failure( "internal open graph #{@open_graph.url.raw_attribute} does not exist", - line: @open_graph.line, - content: @open_graph.content, + element: @open_graph, ) unless @open_graph.url.exists? end end diff --git a/lib/html_proofer/check/scripts.rb b/lib/html_proofer/check/scripts.rb index d07999fc..f80e20f8 100644 --- a/lib/html_proofer/check/scripts.rb +++ b/lib/html_proofer/check/scripts.rb @@ -12,12 +12,11 @@ def run # does the script exist? if missing_src? - add_failure("script is empty and has no src attribute", line: @script.line, content: @script.content) + add_failure("script is empty and has no src attribute", element: @script) elsif @script.url.protocol_relative? add_failure( "script link #{@script.url} is a protocol-relative URL, use explicit https:// instead", - line: @script.line, - content: @script.content, + element: @script, ) elsif @script.url.remote? add_to_external_urls(@script.url, @script.line) @@ -25,8 +24,7 @@ def run elsif !@script.url.exists? add_failure( "internal script reference #{@script.src} does not exist", - line: @script.line, - content: @script.content, + element: @script, ) end end @@ -42,20 +40,17 @@ def check_sri if blank?(@script.node["integrity"]) && blank?(@script.node["crossorigin"]) add_failure( "SRI and CORS not provided in: #{@script.url.raw_attribute}", - line: @script.line, - content: @script.content, + element: @script, ) elsif blank?(@script.node["integrity"]) add_failure( "Integrity is missing in: #{@script.url.raw_attribute}", - line: @script.line, - content: @script.content, + element: @script, ) elsif blank?(@script.node["crossorigin"]) add_failure( "CORS not provided for external resource in: #{@script.url.raw_attribute}", - line: @script.line, - content: @script.content, + element: @script, ) end end diff --git a/spec/html-proofer/check_spec.rb b/spec/html-proofer/check_spec.rb index 311077a4..09a348fb 100644 --- a/spec/html-proofer/check_spec.rb +++ b/spec/html-proofer/check_spec.rb @@ -13,7 +13,7 @@ def run next if @link.ignore? - return add_failure("Don't email the Octocat directly!", line: @link.line) if mailto_octocat? + return add_failure("Don't email the Octocat directly!", element: @link) if mailto_octocat? end end end From 4f5d22714c51596f5bea2471ed82c67dc2026b10 Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Thu, 9 Mar 2023 18:16:04 +0100 Subject: [PATCH 03/10] Align "Custom tests" example to the current functionality / code style * Example code as in the `check_spec.rb`. --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7d0a5f64..7e6ed6e4 100644 --- a/README.md +++ b/README.md @@ -422,23 +422,23 @@ Want to write your own test? Sure, that's possible! Just create a class that inherits from `HTMLProofer::Check`. This subclass must define one method called `run`. This is called on your content, and is responsible for performing the validation on whatever elements you like. When you catch a broken issue, call `add_failure(message, line: line, content: content)` to explain the error. `line` refers to the line numbers, and `content` is the node content of the broken element. -If you're working with the element's attributes (as most checks do), you'll also want to call `create_element(node)` as part of your suite. This constructs an object that contains all the attributes of the HTML element you're iterating on. +If you're working with the element's attributes (as most checks do), you'll also want to call `create_element(node)` as part of your suite. This constructs an object that contains all the attributes of the HTML element you're iterating on, and can also be used directly to call `add_failure(message, element: element)`. Here's an example custom test demonstrating these concepts. It reports `mailto` links that point to `octocat@github.com`: ``` ruby -class MailToOctocat < ::HTMLProofer::Check +class MailToOctocat < HTMLProofer::Check def mailto_octocat? - @link.url.raw_attribute == 'mailto:octocat@github.com' + @link.url.raw_attribute == "mailto:octocat@github.com" end def run - @html.css('a').each do |node| + @html.css("a").each do |node| @link = create_element(node) next if @link.ignore? - return add_failure("Don't email the Octocat directly!", line: @link.line) if mailto_octocat? + return add_failure("Don't email the Octocat directly!", element: @link) if mailto_octocat? end end end @@ -448,7 +448,7 @@ Don't forget to include this new check in HTMLProofer's options, for example: ```ruby # removes default checks and just runs this one -HTMLProofer.check_directories(["out/"], {checks: ['MailToOctocat']}) +HTMLProofer.check_directories(["out/"], { checks: ["MailToOctocat"] }) ``` See our [list of third-party custom classes](https://github.com/gjtorikian/html-proofer/wiki/Extensions-(custom-classes)) and add your own to this list. From b4c76f73971248f4932e41311827e992a7e7fd9a Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Thu, 9 Mar 2023 18:40:11 +0100 Subject: [PATCH 04/10] Lint Ruby code chunks in README.md * Using `rubocop-md` --- README.md | 82 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 7e6ed6e4..a5848ce8 100644 --- a/README.md +++ b/README.md @@ -88,26 +88,26 @@ It can also run through the command-line. Here's an example: ```ruby -require 'html-proofer' -require 'html/pipeline' -require 'find' +require "html-proofer" +require "html/pipeline" +require "find" # make an out dir Dir.mkdir("out") unless File.exist?("out") -pipeline = HTML::Pipeline.new [ +pipeline = HTML::Pipeline.new([ HTML::Pipeline::MarkdownFilter, - HTML::Pipeline::TableOfContentsFilter -], gfm: true + HTML::Pipeline::TableOfContentsFilter, +], +gfm: true) # iterate over files, and generate HTML from Markdown Find.find("./docs") do |path| - if File.extname(path) == ".md" - contents = File.read(path) - result = pipeline.call(contents) + next unless File.extname(path) == ".md" + contents = File.read(path) + result = pipeline.call(contents) - File.open("out/#{path.split("/").pop.sub('.md', '.html')}", 'w') { |file| file.write(result[:output].to_s) } - end + File.open("out/#{path.split("/").pop.sub(".md", ".html")}", "w") { |file| file.write(result[:output].to_s) } end # test your out dir! @@ -119,7 +119,7 @@ HTMLProofer.check_directory("./out").run If you simply want to check a single file, use the `check_file` method: ``` ruby -HTMLProofer.check_file('/path/to/a/file.html').run +HTMLProofer.check_file("/path/to/a/file.html").run ``` ### Checking directories @@ -127,13 +127,13 @@ HTMLProofer.check_file('/path/to/a/file.html').run If you want to check a directory, use `check_directory`: ``` ruby -HTMLProofer.check_directory('./out').run +HTMLProofer.check_directory("./out").run ``` If you want to check multiple directories, use `check_directories`: ``` ruby -HTMLProofer.check_directories(['./one', './two']).run +HTMLProofer.check_directories(["./one", "./two"]).run ``` ### Checking an array of links @@ -141,7 +141,7 @@ HTMLProofer.check_directories(['./one', './two']).run With `check_links`, you can also pass in an array of links: ``` ruby -HTMLProofer.check_links(['https://github.com', 'https://jekyllrb.com']).run +HTMLProofer.check_links(["https://github.com", "https://jekyllrb.com"]).run ``` ### Swapping information @@ -149,7 +149,7 @@ HTMLProofer.check_links(['https://github.com', 'https://jekyllrb.com']).run Sometimes, the information in your HTML is not the same as how your server serves content. In these cases, you can use `swap_urls` to map the URL in a file to the URL you'd like it to become. For example: ```ruby -run_proofer(file, :file, swap_urls: { %r{^https//placeholder.com}: 'https://website.com' }) +run_proofer(file, :file, swap_urls: { %r{^https//placeholder.com} => "https://website.com" }) ``` In this case, any link that matches the `^https://placeholder.com` will be converted to `https://website.com`. @@ -157,7 +157,7 @@ In this case, any link that matches the `^https://placeholder.com` will be conve A similar swapping process can be done for attributes: ```ruby -run_proofer(file, :file, swap_attributes: { 'img': [['data-src', 'src']] }) +run_proofer(file, :file, swap_attributes: { "img": [["data-src", "src"]] }) ``` In this case, we are telling HTMLProofer that, for any `img` tag detected, for any `src` attribute, pretend it's actually the `src` attribute instead. Since the value is an array of arrays, you can pass in as many attribute swaps as you need for each element. @@ -216,7 +216,7 @@ htmlproofer --assume-extension ./_site --swap-urls '^/BASEURL/:/' or in your `Rakefile` ```ruby -require 'html-proofer' +require "html-proofer" task :test do sh "bundle exec jekyll build" @@ -251,15 +251,16 @@ This can also apply to parent elements, all the way up to the `` tag: Say you've got some new files in a pull request, and your tests are failing because links to those files are not live yet. One thing you can do is run a diff against your base branch and explicitly ignore the new files, like this: ```ruby - directories = %w(content) - merge_base = `git merge-base origin/production HEAD`.chomp - diffable_files = `git diff -z --name-only --diff-filter=AC #{merge_base}`.split("\0") - diffable_files = diffable_files.select do |filename| - next true if directories.include?(File.dirname(filename)) - filename.end_with?('.md') - end.map { |f| Regexp.new(File.basename(f, File.extname(f))) } +directories = ['content'] +merge_base = %x(git merge-base origin/production HEAD).chomp +diffable_files = %x(git diff -z --name-only --diff-filter=AC #{merge_base}).split("\0") +diffable_files = diffable_files.select do |filename| + next true if directories.include?(File.dirname(filename)) + + filename.end_with?(".md") +end.map { |f| Regexp.new(File.basename(f, File.extname(f))) } - HTMLProofer.check_directory('./output', { ignore_urls: diffable_files }).run +HTMLProofer.check_directory("./output", { ignore_urls: diffable_files }).run ``` ## Configuration @@ -301,7 +302,7 @@ In addition, there are a few "namespaced" options. These are: [Typhoeus](https://github.com/typhoeus/typhoeus) is used to make fast, parallel requests to external URLs. You can pass in any of Typhoeus' options for the external link checks with the options namespace of `:typhoeus`. For example: ``` ruby -HTMLProofer.new("out/", {extensions: [".htm"], typhoeus: { verbose: true, ssl_verifyhost: 2 } }) +HTMLProofer.new("out/", { extensions: [".htm"], typhoeus: { verbose: true, ssl_verifyhost: 2 } }) ``` This sets `HTMLProofer`'s extensions to use _.htm_, gives Typhoeus a configuration for it to be verbose, and use specific SSL settings. Check the [Typhoeus documentation](https://github.com/typhoeus/typhoeus#other-curl-options) for more information on what options it can receive. @@ -316,9 +317,9 @@ The default value is: { followlocation: true, connecttimeout: 10, - timeout: 30 + timeout: 30, }, - hydra: { max_concurrency: 50 } + hydra: { max_concurrency: 50 }, } ``` @@ -331,7 +332,7 @@ You can provide a block to set some logic before an external link is checked. Fo ```ruby proofer = HTMLProofer.check_directory(item, opts) proofer.before_request do |request| - request.options[:headers]['Authorization'] = "Bearer " if request.base_url == "https://github.com" + request.options[:headers]["Authorization"] = "Bearer " if request.base_url == "https://github.com" end proofer.run ``` @@ -352,25 +353,25 @@ You can enable caching for this by passing in the configuration option `:cache`, For example, passing the following options means "recheck external links older than thirty days": ``` ruby -{ cache: { timeframe: { external: '30d' } } } +{ cache: { timeframe: { external: "30d" } } } ``` And the following options means "recheck internal links older than two weeks": ``` ruby -{ cache: { timeframe: { internal: '2w' } } } +{ cache: { timeframe: { internal: "2w" } } } ``` Naturally, to support both internal and external link caching, both keys would need to be provided. The following checks external links every two weeks, but internal links only once a week: ``` ruby -{ cache: { timeframe: { external: '2w', internal: '1w' } } } +{ cache: { timeframe: { external: "2w", internal: "1w" } } } ``` You can change the filename or the directory where the cache file is kept by also providing the `storage_dir` key: ``` ruby -{ cache: { cache_file: 'stay_cachey.json', storage_dir: '/tmp/html-proofer-cache-money' } } +{ cache: { cache_file: "stay_cachey.json", storage_dir: "/tmp/html-proofer-cache-money" } } ``` Links that were failures are kept in the cache and *always* rechecked. If they pass, the cache is updated to note the new timestamp. @@ -479,7 +480,8 @@ To ignore SSL certificates, turn off Typhoeus' SSL verification: HTMLProofer.check_directory("out/", { typhoeus: { ssl_verifypeer: false, - ssl_verifyhost: 0} + ssl_verifyhost: 0, +}, }).run ``` @@ -490,8 +492,9 @@ To change the User-Agent used by Typhoeus: ``` ruby HTMLProofer.check_directory("out/", { typhoeus: { - headers: { "User-Agent" => "Mozilla/5.0 (compatible; My New User-Agent)" } -}}).run + headers: { "User-Agent" => "Mozilla/5.0 (compatible; My New User-Agent)" }, + } +}).run ``` Alternatively, you can specifify these options on the commandline with: @@ -508,8 +511,9 @@ Sometimes links fail because they don't have access to cookies. To fix this you HTMLProofer.check_directory("out/", { typhoeus: { cookiefile: ".cookies", - cookiejar: ".cookies" -}}).run + cookiejar: ".cookies", + } +}).run ``` ```bash From e0e01702ad2bc34d3de41bb71667ef8553721946 Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Fri, 10 Mar 2023 11:16:23 +0100 Subject: [PATCH 05/10] Simplify relative link logic, removing unnecessary code * Most of the logic is legacy from the very early HTMLProofer, and is irrelevant not since `@filename` can only be the file where the link is defined, with `File.join` properly handling both same-directory, nested, and parent links, together with the ultimate `File.expand_path` in `absolute_path. * The legacy logic comes pretty much from #6 and #23. * This way we can also avoid `File.exist`, which can ultimately be delegated to checking the existence, not constructing the path. --- lib/html_proofer/attribute/url.rb | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/lib/html_proofer/attribute/url.rb b/lib/html_proofer/attribute/url.rb index 7507413a..59a76cac 100644 --- a/lib/html_proofer/attribute/url.rb +++ b/lib/html_proofer/attribute/url.rb @@ -136,28 +136,12 @@ def absolute_path def file_path return if path.nil? || path.empty? - path_dot_ext = "" - - path_dot_ext = path + @runner.options[:assume_extension] unless blank?(@runner.options[:assume_extension]) - base = if absolute_path?(path) # path relative to root # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname @runner.options[:root_dir] || (File.directory?(@source) ? @source : File.dirname(@source)) # relative links, path is a file - elsif File.exist?(File.expand_path( - path, - @source, - )) || File.exist?(File.expand_path(path_dot_ext, @source)) - File.dirname(@filename) - # relative links in nested dir, path is a file - elsif File.exist?(File.join( - File.dirname(@filename), - path, - )) || File.exist?(File.join(File.dirname(@filename), path_dot_ext)) - File.dirname(@filename) - # relative link, path is a directory else - @filename + File.dirname(@filename) end file = File.join(base, path) From d52d82844f382e81290630a502d9a45df43c5e71 Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Fri, 10 Mar 2023 11:34:42 +0100 Subject: [PATCH 06/10] Refactor internal link paths resolution / existence check * We now construct and maintain a hash of resolved paths, as a way to have a single instance of going through OS for checking the existence of alternative resolved paths, including assumed extension and directory index file. --- lib/html_proofer/attribute/url.rb | 38 ++++++++++++++-------- lib/html_proofer/runner.rb | 4 +-- lib/html_proofer/url_validator/internal.rb | 10 ++---- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/lib/html_proofer/attribute/url.rb b/lib/html_proofer/attribute/url.rb index 59a76cac..68d2d221 100644 --- a/lib/html_proofer/attribute/url.rb +++ b/lib/html_proofer/attribute/url.rb @@ -118,9 +118,29 @@ def query_values def exists? return true if base64? - return @runner.checked_paths[absolute_path] if @runner.checked_paths.key?(absolute_path) + !resolved_path.nil? + end + + def resolved_path + path_to_resolve = absolute_path + + return @runner.resolved_paths[path_to_resolve] if @runner.resolved_paths.key?(path_to_resolve) + + # extensionless URLs + path_with_extension = "#{path_to_resolve}#{@runner.options[:assume_extension]}" + resolved = if @runner.options[:assume_extension] && File.file?(path_with_extension) + path_with_extension # existence checked implicitly by File.file? + # implicit index support + elsif File.directory?(path_to_resolve) && !unslashed_directory?(path_to_resolve) + path_with_index = File.join(path_to_resolve, @runner.options[:directory_index_file]) + path_with_index if File.file?(path_with_index) + # explicit file or directory + elsif File.exist?(path_to_resolve) + path_to_resolve + end + @runner.resolved_paths[path_to_resolve] = resolved - @runner.checked_paths[absolute_path] = File.exist?(absolute_path) + resolved end def base64? @@ -128,12 +148,12 @@ def base64? end def absolute_path - path = file_path || @filename + path = resolve_path || @filename File.expand_path(path, Dir.pwd) end - def file_path + def resolve_path return if path.nil? || path.empty? base = if absolute_path?(path) # path relative to root @@ -144,15 +164,7 @@ def file_path File.dirname(@filename) end - file = File.join(base, path) - - if @runner.options[:assume_extension] && File.file?("#{file}#{@runner.options[:assume_extension]}") - file = "#{file}#{@runner.options[:assume_extension]}" - elsif File.directory?(file) && !unslashed_directory?(file) # implicit index support - file = File.join(file, @runner.options[:directory_index_file]) - end - - file + File.join(base, path) end def unslashed_directory?(file) diff --git a/lib/html_proofer/runner.rb b/lib/html_proofer/runner.rb index ce5040f4..36330775 100644 --- a/lib/html_proofer/runner.rb +++ b/lib/html_proofer/runner.rb @@ -6,7 +6,7 @@ module HTMLProofer class Runner include HTMLProofer::Utils - attr_reader :options, :cache, :logger, :internal_urls, :external_urls, :checked_paths, :current_check + attr_reader :options, :cache, :logger, :internal_urls, :external_urls, :resolved_paths, :current_check attr_accessor :current_filename, :current_source, :reporter URL_TYPES = [:external, :internal].freeze @@ -26,7 +26,7 @@ def initialize(src, opts = {}) @before_request = [] - @checked_paths = {} + @resolved_paths = {} @current_check = nil @current_source = nil diff --git a/lib/html_proofer/url_validator/internal.rb b/lib/html_proofer/url_validator/internal.rb index daec0b03..7527ba8e 100644 --- a/lib/html_proofer/url_validator/internal.rb +++ b/lib/html_proofer/url_validator/internal.rb @@ -31,8 +31,7 @@ def run_internal_link_checker(links) matched_files.each do |metadata| url = HTMLProofer::Attribute::Url.new(@runner, link, base_url: metadata[:base_url], source: metadata[:source], filename: metadata[:filename]) - target_file_path = url.absolute_path - unless file_exists?(target_file_path) + unless url.exists? @failed_checks << Failure.new( metadata[:filename], "Links > Internal", @@ -48,6 +47,7 @@ def run_internal_link_checker(links) hash_exists = hash_exists_for_url?(url) if hash_exists.nil? # the hash needs to be checked in the target file, we collect the url and metadata + target_file_path = url.resolved_path unless file_paths_hashes_to_check.key?(target_file_path) file_paths_hashes_to_check[target_file_path] = {} end @@ -106,12 +106,6 @@ def run_internal_link_checker(links) @failed_checks end - private def file_exists?(absolute_path) - return @runner.checked_paths[absolute_path] if @runner.checked_paths.key?(absolute_path) - - @runner.checked_paths[absolute_path] = File.exist?(absolute_path) - end - # verify the hash w/o just based on the URL, w/o looking at the target file # => returns nil if the has could not be verified private def hash_exists_for_url?(url) From 6913d4efb3577ad318b8392a48bb9609cbb7b004 Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Tue, 14 Mar 2023 23:10:28 +0100 Subject: [PATCH 07/10] Rename handling of internal full path construction * To not clash / be confusing with `resolved_path`. --- lib/html_proofer/attribute/url.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/html_proofer/attribute/url.rb b/lib/html_proofer/attribute/url.rb index 68d2d221..5b2fd004 100644 --- a/lib/html_proofer/attribute/url.rb +++ b/lib/html_proofer/attribute/url.rb @@ -148,12 +148,12 @@ def base64? end def absolute_path - path = resolve_path || @filename + path = full_path || @filename File.expand_path(path, Dir.pwd) end - def resolve_path + def full_path return if path.nil? || path.empty? base = if absolute_path?(path) # path relative to root From aba17cc6b7aade4936a522f73293602619bcb65d Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Tue, 14 Mar 2023 23:11:27 +0100 Subject: [PATCH 08/10] Fix inline comments for full internal path construction --- lib/html_proofer/attribute/url.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/html_proofer/attribute/url.rb b/lib/html_proofer/attribute/url.rb index 5b2fd004..5c5c59f3 100644 --- a/lib/html_proofer/attribute/url.rb +++ b/lib/html_proofer/attribute/url.rb @@ -157,10 +157,10 @@ def full_path return if path.nil? || path.empty? base = if absolute_path?(path) # path relative to root - # either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname + # either overwrite with root_dir; or, if source is directory, use that; or, just get the source file's dirname @runner.options[:root_dir] || (File.directory?(@source) ? @source : File.dirname(@source)) - # relative links, path is a file else + # path relative to the file where the link is defined File.dirname(@filename) end From 3d6df6e31f32c0603df023b74a97e61162fcf7d3 Mon Sep 17 00:00:00 2001 From: Riccardo Porreca Date: Wed, 15 Mar 2023 09:08:57 +0100 Subject: [PATCH 09/10] Read-only `current_filename`/`_source` attributes in `Runner` --- lib/html_proofer/runner.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/html_proofer/runner.rb b/lib/html_proofer/runner.rb index 36330775..05b2a8d9 100644 --- a/lib/html_proofer/runner.rb +++ b/lib/html_proofer/runner.rb @@ -6,8 +6,8 @@ module HTMLProofer class Runner include HTMLProofer::Utils - attr_reader :options, :cache, :logger, :internal_urls, :external_urls, :resolved_paths, :current_check - attr_accessor :current_filename, :current_source, :reporter + attr_reader :options, :cache, :logger, :internal_urls, :external_urls, :resolved_paths, :current_check, :current_filename, :current_source + attr_accessor :reporter URL_TYPES = [:external, :internal].freeze From 27bf3eedae9c41f2584b927573a9b1480163094a Mon Sep 17 00:00:00 2001 From: "Garen J. Torikian" Date: Wed, 15 Mar 2023 13:43:00 -0400 Subject: [PATCH 10/10] :gem: bump to 5.0.6 --- lib/html_proofer/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/html_proofer/version.rb b/lib/html_proofer/version.rb index 6954358c..e07675f5 100644 --- a/lib/html_proofer/version.rb +++ b/lib/html_proofer/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module HTMLProofer - VERSION = "5.0.5" + VERSION = "5.0.6" end