utils/simple.py

#!/usr/bin/env python2.7

from lxml import etree
import regex

# XXX: this doesn't work for "from" patterns that use a non-capturing group
#      (?:...) for the optional prefix, as in (?:www\.)?
#      (one of many examples is in Zoosk.com.xml)
# XXX: this doesn't figure out if a target host causes a particular rule
#      to be completely inapplicable (in which case it should probably be
#      ignored) for determining simplicity
# XXX: this doesn't catch simple rules that use alternation with
#      backreferences, like from="^http://(foo|bar)\.example\.com/"
#      to="\1.example.com"

def simple(f):
    """Decide whether the ruleset parsed from ``f`` counts as simple.

    ``f`` is handed unchanged to ``etree.parse``.  Every criterion below is
    evaluated before the verdict is formed, and the result is True only when
    all of them hold.
    """
    doc = etree.parse(f)
    hosts = [node.attrib["host"] for node in doc.xpath("/ruleset/target")]
    ruleset_attrs = doc.xpath("/ruleset")[0].attrib

    # ruleset must not be default_off
    on_by_default = "default_off" not in ruleset_attrs
    # ruleset must not contain a match_rule
    lacks_match_rule = "match_rule" not in ruleset_attrs
    # XXX: maybe also check for platform="mixedcontent" here
    # ruleset must not apply any securecookie patterns
    lacks_securecookie = not doc.xpath("/ruleset/securecookie")
    # ruleset must not contain any exclusions
    lacks_exclusion = not doc.xpath("/ruleset/exclusion")
    # targets must not contain any wildcards
    wildcard_free = all("*" not in host for host in hosts)

    rules = doc.xpath("/ruleset/rule")
    # ruleset must not contain any downgrade rules
    never_downgrades = all("downgrade" not in entry.attrib for entry in rules)
    # and every rule must itself be simple according to simple_rule()
    rules_all_simple = all(simple_rule(entry, hosts) for entry in rules)

    return (
        on_by_default
        and lacks_match_rule
        and lacks_securecookie
        and lacks_exclusion
        and wildcard_free
        and never_downgrades
        and rules_all_simple
    )

def simple_rule(rule, targets):
    r"""Is this rule a simple rule?

    A simple rule rewrites a single hostname, perhaps with an optional
    leading ``(www\.)?``, to itself or to itself plus ``www.``, at the top
    level with no other effects.

    rule    -- element-like object whose ``attrib`` mapping provides the
               ``from`` pattern and the ``to`` replacement string.
    targets -- target hosts of the enclosing ruleset.  Accepted for
               interface compatibility; this function does not consult it.

    Returns True or False.  Raises KeyError if ``rule`` has no ``from`` or
    ``to`` attribute.
    """
    rule_from = rule.attrib["from"]
    rule_to = rule.attrib["to"]

    # Accept exactly two shapes of "from": "^http://HOST/" and
    # "^http://(www\.)?HOST/".  The whole HOST is captured (the "+" sits
    # inside the group), not just its final character.
    shape = regex.match(
        r"^\^http://(?:\(www\\\.\)\?)?([-A-Za-z0-9.\\]+)/$", rule_from)
    if shape is None:
        return False
    host_pattern = shape.group(1)

    # HOST is itself a regex fragment, so match "to" against it as written.
    # Keeping the "\." escapes means a dot only matches a literal dot
    # instead of any character.
    try:
        hit = regex.match(r"^https://(?:www\.)?%s/" % host_pattern, rule_to)
    except regex.error:
        # A fragment that does not compile is not a plain hostname.
        return False
    return hit is not None

def unescape(s):
    """Return ``s`` with every backslash-escaped dot turned into a bare dot.

    Only the two-character sequence backslash + dot is rewritten; all other
    text, including other backslash escapes, is left as it is.
    """
    pieces = s.split(r"\.")
    return ".".join(pieces)