Make sure UrlSkipper doesn’t ignore fragments
We're decoupling fragments (`#anchor`) from new destination URLs so that linkcheck tries to access each physical URL only once. (Otherwise, it would treat `/path#anchor1` and `/path#anchor2` as two different URLs that both need checking.)

But skipping by fragment should still work: a user may want to skip all links that go to `/path/#angular`, for example.

So this CL moves the skipping logic a bit higher up, before the fragment is decoupled from the URL. The destination is still added.
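
To make the new ordering concrete, here is a condensed sketch of the crawl-loop change (the `handleLinks` wrapper is purely illustrative; the real logic lives inline in crawl.dart below, with verbose logging and bin bookkeeping omitted):

```dart
// A condensed sketch (not verbatim) of the new ordering in crawl().
void handleLinks(Iterable<Link> links, UrlSkipper skipper,
    Set<Destination> newDestinations) {
  for (final link in links) {
    // The skipper is consulted against the resolved URL *with* its
    // fragment, so a skip-file pattern like `/path/#angular` can match.
    if (skipper.skips(link.destinationUrlWithFragment)) {
      link.wasSkipped = true;
      continue; // a skipped link never contributes a new destination
    }
    // Only non-skipped links add their fragment-less destination, so
    // `/path#anchor1` and `/path#anchor2` still collapse into a single
    // physical fetch of `/path`.
    newDestinations.add(link.destination);
  }
}
```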
filiph committed Dec 17, 2016
1 parent 83de58d commit d213f2c
Showing 10 changed files with 116 additions and 40 deletions.
9 changes: 4 additions & 5 deletions lib/linkcheck.dart
@@ -38,7 +38,6 @@ void printStats(CrawlResult result, int broken, int withWarning, int withInfo,
int count = result.destinations.length;
int ignored = result.destinations
.where((destination) =>
destination.wasSkipped ||
destination.wasDeniedByRobotsTxt ||
destination.isUnsupportedScheme ||
(destination.isExternal && !destination.wasTried))
@@ -200,8 +199,8 @@ Future<int> run(List<String> arguments, Stdout stdout) async {
var file = new File(inputFile);
try {
urls.addAll(file.readAsLinesSync().where((url) => url.isNotEmpty));
} on FileSystemException {
print("Can't read file '$inputFile'.");
} on FileSystemException catch (e) {
print("Can't read input file '$inputFile': $e");
return 2;
}
}
@@ -210,8 +209,8 @@ Future<int> run(List<String> arguments, Stdout stdout) async {
var file = new File(skipFile);
try {
skipper = new UrlSkipper(file.path, file.readAsLinesSync());
} on FileSystemException {
print("Can't read file '$skipFile'.");
} on FileSystemException catch (e) {
print("Can't read skip file '$skipFile': $e");
return 2;
}
}
36 changes: 17 additions & 19 deletions lib/src/crawl.dart
@@ -283,6 +283,16 @@ Future<CrawlResult> crawl(
// Add links' destinations to [newDestinations] if they haven't been
// seen before.
for (var link in result.links) {
// Mark links as skipped first.
if (skipper.skips(link.destinationUrlWithFragment)) {
link.wasSkipped = true;
if (verbose) {
print("- will not be checking: ${link.destination} - "
"${skipper.explain(link.destinationUrlWithFragment)}");
}
continue;
}

if (bin[link.destination.url] == null) {
// Completely new destination.
assert(open.where((d) => d.url == link.destination.url).isEmpty);
@@ -297,12 +307,14 @@
print("- destination: ${link.destination} already "
"seen on this page");
}
} else {
if (verbose) {
print("- completely new destination: ${link.destination}");
}
newDestinations.add(link.destination);
continue;
}

if (verbose) {
print("- completely new destination: ${link.destination}");
}

newDestinations.add(link.destination);
}
}

@@ -326,24 +338,12 @@ Future<CrawlResult> crawl(
continue;
}

destination.wasSkipped = skipper.skips(destination.url);

// Making sure this is set. The next (wasSkipped) section could
// short-circuit this loop so we have to assign to isExternal here
// while we have the chance.
destination.isExternal =
!uriGlobs.any((glob) => glob.matches(destination.uri));

if (destination.wasSkipped) {
closed.add(destination);
bin[destination.url] = Bin.closed;
if (verbose) {
print("Will not be checking: $destination - "
"${skipper.explain(destination.url)}");
}
continue;
}

// The URL is external and wasn't skipped. We'll find out whether to
// check it according to the [shouldCheckExternal] option.
if (destination.isExternal) {
@@ -374,7 +374,6 @@ Future<CrawlResult> crawl(
// Do any destinations have different hosts? Add them to unknownServers.
Iterable<String> newHosts = newDestinations
.where((destination) => !destination.isInvalid)
.where((destination) => !destination.wasSkipped)
.where((destination) => shouldCheckExternal || !destination.isExternal)
.map((destination) => destination.uri.authority)
.where((String host) =>
@@ -420,7 +419,6 @@ Future<CrawlResult> crawl(
destination.wasTried ||
(destination.isExternal && !shouldCheckExternal) ||
destination.isUnsupportedScheme ||
destination.wasSkipped ||
destination.wasDeniedByRobotsTxt));

if (verbose) {
6 changes: 0 additions & 6 deletions lib/src/destination.dart
@@ -40,12 +40,6 @@ class Destination {

bool isExternal;

/// Whether or not this destination was marked as skipped.
///
/// User has an option to skip URLs via regexp patterns. When this destination
/// has a match, [wasSkipped] will be `true`.
bool wasSkipped = false;

/// True if this [Destination] is parseable and could contain links to
/// other destinations. For example, HTML and CSS files are sources. JPEGs
/// and
44 changes: 37 additions & 7 deletions lib/src/link.dart
@@ -8,28 +8,58 @@ class Link {
Destination destination;
String fragment;

Link(this.origin, this.destination, String fragment)
/// Whether or not this link was marked as skipped.
///
/// User has an option to skip URLs via regexp patterns. When this destination
/// has a match, [wasSkipped] will be `true`.
bool wasSkipped = false;

Link(this.origin, this.destination, String fragment,
[this.wasSkipped = false])
: fragment = fragment == null || fragment.isEmpty ? null : fragment;

Link.fromMap(Map<String, Object> map)
: this(
new Origin.fromMap(map["origin"] as Map<String, Object>),
new Destination.fromMap(map["destination"] as Map<String, Object>),
map["destinationAnchor"]);
map["destinationAnchor"],
map["wasSkipped"]);

bool get hasError => destination.isBroken;
bool get breaksAnchor =>
!wasSkipped &&
destination.wasParsed &&
!destination.satisfiesFragment(fragment);

bool get hasWarning => breaksAnchor;
/// Returns the destination URL with [fragment] (if there was any).
///
/// For example, let's say the original HTML at `http://example.com/path/`
/// includes this code:
///
/// <a href="../about/#contact">...</a>
///
/// In this case [destination]'s [Destination.url] will be
/// `http://example.com/about/` (because destinations shouldn't be duplicated
/// when there are more anchors on the page).
///
/// That works for most needs of linkcheck but sometimes we need
/// the resolved destination URL _with_ the original fragment. For that,
/// there is [destinationUrlWithFragment].
String get destinationUrlWithFragment {
if (fragment == null) return destination.url;
return "${destination.url}#$fragment";
}

bool get hasError => destination.isBroken; // TODO: add wasSkipped?

bool get hasInfo => destination.wasDeniedByRobotsTxt;

bool get breaksAnchor =>
destination.wasParsed && !destination.satisfiesFragment(fragment);
bool get hasWarning => breaksAnchor;

Map<String, Object> toMap() => {
"origin": origin.toMap(),
"destination": destination.toMap(),
"destinationAnchor": fragment
"destinationAnchor": fragment,
"wasSkipped": wasSkipped
};

String toString() => "$origin => $destination"
2 changes: 1 addition & 1 deletion lib/src/parsers/url_skipper.dart
@@ -35,7 +35,7 @@ class UrlSkipper {
.map((rec) => "${rec.pattern.pattern} (line ${rec.line})")
.join(", ");
return "URL '$url' skipped because it was matched by the following "
"regural expressions of skip file '$path': $list";
"regular expressions of skip file '$path': $list";
}

static Iterable<_UrlSkipperRecord> _parse(Iterable<String> lines) sync* {
11 changes: 11 additions & 0 deletions test/case8/index.html
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<a href="subdirectory/other.html">Other</a>
<a href="subdirectory/ignored#something">Ignored</a>
</body>
</html>
1 change: 1 addition & 0 deletions test/case8/skip-file.txt
@@ -0,0 +1 @@
subdirectory/ignored#something
11 changes: 11 additions & 0 deletions test/case8/subdirectory/other.html
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<a href="../index.html">Home</a>
<a href="./ignored#something">Ignored</a>
</body>
</html>
16 changes: 14 additions & 2 deletions test/e2e_test.dart
@@ -49,8 +49,7 @@ void main() {
}
});

test("reports info when link is behind robots.txt rule",
() async {
test("reports info when link is behind robots.txt rule", () async {
var server = await Dhttpd.start(path: getServingPath(5), port: port);
try {
int result = await run([":$port"], out);
@@ -127,6 +126,19 @@ void main() {
await server.destroy();
}
});

test("skips URLs according to their resolved URL with fragment", () async {
var server = await Dhttpd.start(path: getServingPath(8), port: port);
try {
int result = await run(
[":$port", "--skip-file", "test/case8/skip-file.txt"], out);
expect(result, 0);
expect(out.output, contains("0 warnings"));
expect(out.output, contains("0 errors"));
} finally {
await server.destroy();
}
});
}, tags: ["integration"]);
}

20 changes: 20 additions & 0 deletions test/url_skipper_test.dart
@@ -72,4 +72,24 @@ void main() {
expect(skipper.skips("http://google.com/something/else"), true);
expect(skipper.skips("http://google.com/#about"), true);
});

test("blank lines are ignored", () {
var contents = r"""
# This is a comment
\.com$
/else
""".trim().split("\n");
var skipper = new UrlSkipper(dummyPath, contents);
expect(skipper.skips("http://google.com"), true);
expect(skipper.skips("http://google.com/something/"), false);
expect(skipper.skips("http://google.com/something/else"), true);
});

test("hash (#) at end of regexp works", () {
var contents = ["/path/to/page#"];
var skipper = new UrlSkipper(dummyPath, contents);
expect(skipper.skips("http://example.com/path/to/page"), false);
expect(skipper.skips("http://example.com/path/to/page#rxjs"), true);
});
}
