Make sure UrlSkipper doesn’t ignore fragments
We're decoupling fragments (`#anchor`) from new destination URLs so that linkcheck tries to access each physical URL only once. (Otherwise, it would treat `/path#anchor1` and `/path#anchor2` as two different URLs that both need checking.)

But skipping by fragment should still work: a user may want to skip all links that go to `/path/#angular`, for example.

So this CL moves the skipping logic a bit higher up, before the fragment is decoupled from the URL. The destination is still added.
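
To make the new ordering concrete, here is a condensed sketch of the crawl-loop change (the `handleLinks` wrapper is purely illustrative; the real logic lives inline in crawl.dart below, with verbose logging and bin bookkeeping omitted):

```dart
// A condensed sketch (not verbatim) of the new ordering in crawl().
void handleLinks(Iterable<Link> links, UrlSkipper skipper,
    Set<Destination> newDestinations) {
  for (final link in links) {
    // The skipper is consulted against the resolved URL *with* its
    // fragment, so a skip-file pattern like `/path/#angular` can match.
    if (skipper.skips(link.destinationUrlWithFragment)) {
      link.wasSkipped = true;
      continue; // a skipped link never contributes a new destination
    }
    // Only non-skipped links add their fragment-less destination, so
    // `/path#anchor1` and `/path#anchor2` still collapse into a single
    // physical fetch of `/path`.
    newDestinations.add(link.destination);
  }
}
```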
filiph committed Dec 17, 2016
1 parent 83de58d commit d213f2c
Showing 10 changed files with 116 additions and 40 deletions.
9 changes: 4 additions & 5 deletions lib/linkcheck.dart
@@ -38,7 +38,6 @@ void printStats(CrawlResult result, int broken, int withWarning, int withInfo,
int count = result.destinations.length;
int ignored = result.destinations
.where((destination) =>
destination.wasSkipped ||
destination.wasDeniedByRobotsTxt ||
destination.isUnsupportedScheme ||
(destination.isExternal && !destination.wasTried))
@@ -200,8 +199,8 @@ Future<int> run(List<String> arguments, Stdout stdout) async {
var file = new File(inputFile);
try {
urls.addAll(file.readAsLinesSync().where((url) => url.isNotEmpty));
} on FileSystemException {
print("Can't read file '$inputFile'.");
} on FileSystemException catch (e) {
print("Can't read input file '$inputFile': $e");
return 2;
}
}
@@ -210,8 +209,8 @@ Future<int> run(List<String> arguments, Stdout stdout) async {
var file = new File(skipFile);
try {
skipper = new UrlSkipper(file.path, file.readAsLinesSync());
} on FileSystemException {
print("Can't read file '$skipFile'.");
} on FileSystemException catch (e) {
print("Can't read skip file '$skipFile': $e");
return 2;
}
}
36 changes: 17 additions & 19 deletions lib/src/crawl.dart
@@ -283,6 +283,16 @@ Future<CrawlResult> crawl(
// Add links' destinations to [newDestinations] if they haven't been
// seen before.
for (var link in result.links) {
// Mark links as skipped first.
if (skipper.skips(link.destinationUrlWithFragment)) {
link.wasSkipped = true;
if (verbose) {
print("- will not be checking: ${link.destination} - "
"${skipper.explain(link.destinationUrlWithFragment)}");
}
continue;
}

if (bin[link.destination.url] == null) {
// Completely new destination.
assert(open.where((d) => d.url == link.destination.url).isEmpty);
@@ -297,12 +307,14 @@
print("- destination: ${link.destination} already "
"seen on this page");
}
} else {
if (verbose) {
print("- completely new destination: ${link.destination}");
}
newDestinations.add(link.destination);
continue;
}

if (verbose) {
print("- completely new destination: ${link.destination}");
}

newDestinations.add(link.destination);
}
}

@@ -326,24 +338,12 @@ Future<CrawlResult> crawl(
continue;
}

destination.wasSkipped = skipper.skips(destination.url);

// Making sure this is set. The next (wasSkipped) section could
// short-circuit this loop so we have to assign to isExternal here
// while we have the chance.
destination.isExternal =
!uriGlobs.any((glob) => glob.matches(destination.uri));

if (destination.wasSkipped) {
closed.add(destination);
bin[destination.url] = Bin.closed;
if (verbose) {
print("Will not be checking: $destination - "
"${skipper.explain(destination.url)}");
}
continue;
}

// The URL is external and wasn't skipped. We'll find out whether to
// check it according to the [shouldCheckExternal] option.
if (destination.isExternal) {
@@ -374,7 +374,6 @@ Future<CrawlResult> crawl(
// Do any destinations have different hosts? Add them to unknownServers.
Iterable<String> newHosts = newDestinations
.where((destination) => !destination.isInvalid)
.where((destination) => !destination.wasSkipped)
.where((destination) => shouldCheckExternal || !destination.isExternal)
.map((destination) => destination.uri.authority)
.where((String host) =>
@@ -420,7 +419,6 @@ Future<CrawlResult> crawl(
destination.wasTried ||
(destination.isExternal && !shouldCheckExternal) ||
destination.isUnsupportedScheme ||
destination.wasSkipped ||
destination.wasDeniedByRobotsTxt));

if (verbose) {
6 changes: 0 additions & 6 deletions lib/src/destination.dart
@@ -40,12 +40,6 @@ class Destination {

bool isExternal;

/// Whether or not this destination was marked as skipped.
///
/// User has an option to skip URLs via regexp patterns. When this destination
/// has a match, [wasSkipped] will be `true`.
bool wasSkipped = false;

/// True if this [Destination] is parseable and could contain links to
/// other destinations. For example, HTML and CSS files are sources. JPEGs
/// and
44 changes: 37 additions & 7 deletions lib/src/link.dart
@@ -8,28 +8,58 @@ class Link {
Destination destination;
String fragment;

Link(this.origin, this.destination, String fragment)
/// Whether or not this link was marked as skipped.
///
/// User has an option to skip URLs via regexp patterns. When this destination
/// has a match, [wasSkipped] will be `true`.
bool wasSkipped = false;

Link(this.origin, this.destination, String fragment,
[this.wasSkipped = false])
: fragment = fragment == null || fragment.isEmpty ? null : fragment;

Link.fromMap(Map<String, Object> map)
: this(
new Origin.fromMap(map["origin"] as Map<String, Object>),
new Destination.fromMap(map["destination"] as Map<String, Object>),
map["destinationAnchor"]);
map["destinationAnchor"],
map["wasSkipped"]);

bool get hasError => destination.isBroken;
bool get breaksAnchor =>
!wasSkipped &&
destination.wasParsed &&
!destination.satisfiesFragment(fragment);

bool get hasWarning => breaksAnchor;
/// Returns the destination URL with [fragment] (if there was any).
///
/// For example, let's say the original HTML at `http://example.com/path/`
/// includes this code:
///
/// <a href="../about/#contact">...</a>
///
/// In this case [destination]'s [Destination.url] will be
/// `http://example.com/about/` (because destinations shouldn't be duplicated
/// when there are more anchors on the page).
///
/// That works for most needs of linkcheck but sometimes we need
/// the resolved destination URL _with_ the original fragment. For that,
/// there is [destinationUrlWithFragment].
String get destinationUrlWithFragment {
if (fragment == null) return destination.url;
return "${destination.url}#$fragment";
}

bool get hasError => destination.isBroken; // TODO: add wasSkipped?

bool get hasInfo => destination.wasDeniedByRobotsTxt;

bool get breaksAnchor =>
destination.wasParsed && !destination.satisfiesFragment(fragment);
bool get hasWarning => breaksAnchor;

Map<String, Object> toMap() => {
"origin": origin.toMap(),
"destination": destination.toMap(),
"destinationAnchor": fragment
"destinationAnchor": fragment,
"wasSkipped": wasSkipped
};

String toString() => "$origin => $destination"
2 changes: 1 addition & 1 deletion lib/src/parsers/url_skipper.dart
@@ -35,7 +35,7 @@ class UrlSkipper {
.map((rec) => "${rec.pattern.pattern} (line ${rec.line})")
.join(", ");
return "URL '$url' skipped because it was matched by the following "
"regural expressions of skip file '$path': $list";
"regular expressions of skip file '$path': $list";
}

static Iterable<_UrlSkipperRecord> _parse(Iterable<String> lines) sync* {
11 changes: 11 additions & 0 deletions test/case8/index.html
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<a href="subdirectory/other.html">Other</a>
<a href="subdirectory/ignored#something">Ignored</a>
</body>
</html>
1 change: 1 addition & 0 deletions test/case8/skip-file.txt
@@ -0,0 +1 @@
subdirectory/ignored#something
11 changes: 11 additions & 0 deletions test/case8/subdirectory/other.html
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<a href="../index.html">Home</a>
<a href="./ignored#something">Ignored</a>
</body>
</html>
16 changes: 14 additions & 2 deletions test/e2e_test.dart
@@ -49,8 +49,7 @@ void main() {
}
});

test("reports info when link is behind robots.txt rule",
() async {
test("reports info when link is behind robots.txt rule", () async {
var server = await Dhttpd.start(path: getServingPath(5), port: port);
try {
int result = await run([":$port"], out);
@@ -127,6 +126,19 @@ void main() {
await server.destroy();
}
});

test("skips URLs according to their resolved URL with fragment", () async {
var server = await Dhttpd.start(path: getServingPath(8), port: port);
try {
int result = await run(
[":$port", "--skip-file", "test/case8/skip-file.txt"], out);
expect(result, 0);
expect(out.output, contains("0 warnings"));
expect(out.output, contains("0 errors"));
} finally {
await server.destroy();
}
});
}, tags: ["integration"]);
}

20 changes: 20 additions & 0 deletions test/url_skipper_test.dart
@@ -72,4 +72,24 @@ void main() {
expect(skipper.skips("http://google.com/something/else"), true);
expect(skipper.skips("http://google.com/#about"), true);
});

test("blank lines are ignored", () {
var contents = r"""
# This is a comment
\.com$
/else
""".trim().split("\n");
var skipper = new UrlSkipper(dummyPath, contents);
expect(skipper.skips("http://google.com"), true);
expect(skipper.skips("http://google.com/something/"), false);
expect(skipper.skips("http://google.com/something/else"), true);
});

test("hash (#) at end of regexp works", () {
var contents = ["/path/to/page#"];
var skipper = new UrlSkipper(dummyPath, contents);
expect(skipper.skips("http://example.com/path/to/page"), false);
expect(skipper.skips("http://example.com/path/to/page#rxjs"), true);
});
}
