Skip to content

Commit

Permalink
Fix #45 Las respuestas a las peticiones de robots.txt lanzadas por Ro…
Browse files Browse the repository at this point in the history
…botsFilter terminan en Ma...
  • Loading branch information
david committed Jun 10, 2014
1 parent e4b2b29 commit 0deefb2
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 6 deletions.
24 changes: 18 additions & 6 deletions src/main/scala/es/udc/scrawl/pipeline/RobotsFilter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ class RobotsFilter(config: Config) extends Stage with ActorLogging {
url.withPath(new Path.Slash(new Path.Segment("robots.txt", Path.Empty))), 0), headers)
}

/** Tells whether `task` looks like a robots.txt fetch task.
  *
  * A task qualifies only when it is a `DefaultTask` whose id is exactly
  * `robots.txt-<authority of its url>` and whose url path is exactly `/robots.txt`
  * (the comparison is case-sensitive). Any other task yields `false`.
  *
  * NOTE(review): the id format presumably mirrors the one this stage uses when it issues
  * its own robots.txt requests; the code that builds that id is not visible here, so confirm.
  *
  * @param task the task to inspect
  * @return `true` when both the id and the path match the robots.txt pattern, `false` otherwise
  */
def isRobotsTask(task: Task): Boolean = task match {
  case DefaultTask(taskId, taskUrl, _) =>
    // Both conditions must hold: the id derived from the authority and the exact path.
    val expectedId = s"robots.txt-${taskUrl.authority}"
    taskId == expectedId && taskUrl.path.toString() == "/robots.txt"
  case _ =>
    false
}

case object CheckTimeouts

override def preStart() {
Expand Down Expand Up @@ -63,7 +70,7 @@ class RobotsFilter(config: Config) extends Stage with ActorLogging {

case response@Response(task@Task(_, url, _), code, headers, body) =>
if (url.path.toString().toLowerCase == "/robots.txt" &&
(!robotFiles.contains(url.authority) || !allAllowed.contains(url.authority))) {
!robotFiles.contains(url.authority) && !allAllowed.contains(url.authority)) {
if (code != StatusCodes.OK) {
allAllowed += url.authority
waiting.get(url.authority) match {
Expand All @@ -83,7 +90,8 @@ class RobotsFilter(config: Config) extends Stage with ActorLogging {
parser.allowed(r.headers.getOrElse("User-Agent", "*"), r.task.url))

allow.foreach(right ! _)
deny.foreach(r => left ! new Error(task, new RobotsPathFiltered(r.headers.getOrElse("User-Agent", "*"))))
deny.foreach(r =>
left ! new Error(task, new RobotsPathFiltered(r.headers.getOrElse("User-Agent", "*"))))

waiting.remove(url.authority) //All the request were sent, clear the entry
case None => left ! response
Expand All @@ -95,11 +103,13 @@ class RobotsFilter(config: Config) extends Stage with ActorLogging {
}
}
} else {
left ! response
if (!isRobotsTask(task)) {
left ! response
}
}
case error@Error(Task(_, url, _), _) =>
case error@Error(task@Task(_, url, _), _) =>
if (url.path.toString().toLowerCase == "/robots.txt" &&
(!robotFiles.contains(url.authority) || !allAllowed.contains(url.authority))) {
!robotFiles.contains(url.authority) && !allAllowed.contains(url.authority)) {
allAllowed += url.authority
waiting.get(url.authority) match {
case Some((_, _, set)) =>
Expand All @@ -108,7 +118,9 @@ class RobotsFilter(config: Config) extends Stage with ActorLogging {
case None => left ! error //No waiting request, the robots request comes from another stage
}
} else {
left ! error
if (!isRobotsTask(task)) {
left ! error
}
}
case CheckTimeouts =>
waiting.foreach {
Expand Down
28 changes: 28 additions & 0 deletions src/test/scala/es/udc/scrawl/pipeline/RobotsFilterTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -291,5 +291,33 @@ with BeforeAndAfterAll {
&& headers == Map("User-Agent" -> "Mozilla") => Unit
}
}

// Regression scenario: two requests for the same authority arrive before any robots.txt
// response, then the same robots.txt response is delivered twice. Neither delivery may
// produce a message on the left probe.
"filter the robot.txt request sent by the stage if the stage has already a robots.txt" in {
val (robots, left, right) = initRobots()
// Robots file with a single rule: an empty Disallow for every user agent.
val robotsFile =
"""User-Agent: *
|Disallow:
""".stripMargin
// First request for test.com, sent into the stage from the left side.
left.send(robots, new Request(new DefaultTask("id", Uri("http://test.com/path"), 0), Map("User-Agent" -> "Mozilla")))

// The stage must emit a robots.txt request for the same host with the same headers on the
// right side; it is captured so its task can be reused to build the response below.
val robotsRequest = right.expectMsgPF() {
case request@Request(Task(_, url, _), headers)
if url == Uri("http://test.com/robots.txt")
&& headers == Map("User-Agent" -> "Mozilla") => request
}

// Second request for the same authority (note: same task id "id", different path).
// The stage will send another robots.txt request after receiving this one.
left.send(robots, new Request(new DefaultTask("id", Uri("http://test.com/ping"), 0), Map("User-Agent" -> "Mozilla")))

// The first robots.txt response will be parsed and stored; nothing may reach the left probe.
val robotsResponse = new Response(robotsRequest.task, StatusCodes.OK, Map(), robotsFile)
right.send(robots, robotsResponse)
left.expectNoMsg()

// The second, duplicate robots.txt response must be filtered: again nothing on the left probe.
right.send(robots, robotsResponse)
left.expectNoMsg()
}
}
}

0 comments on commit 0deefb2

Please sign in to comment.