test: add random testing of crates.io regex
This patch adds some infrastructure to scrape crates.io
for regexes, then run each regex found in this way through
a random testing gauntlet to make sure that all the different
backends behave in the same way. These random tests are
expensive, so we only run them when the magic
`RUST_REGEX_RANDOM_TEST` environment variable is set.
In debug mode, these tests take quite a while, so we special
case them in CI to run in release mode.

To make this better, we should add something which can generate
a matching string from a regex. As it stands, we just focus on
the negative case.

There is one bug that this uncovered but that this patch does not
fix. A minimal version of it is commented out in the
`tests/test_crates_regex.rs` file.
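
For a sense of what these consistency checks look like, here is a minimal
sketch using the `quickcheck` crate. The property name, the fixed pattern,
and the pairing of the Unicode `Regex` with its `bytes` counterpart are
illustrative assumptions; the actual tests are generated from scraped
crates.io regexes and exercise all of the backends.

```rust
// A minimal sketch, assuming the `regex` and `quickcheck` crates.
use quickcheck::QuickCheck;

// For a fixed pattern, two different backends must agree on whether a
// randomly generated haystack matches.
fn prop_backends_agree(haystack: String) -> bool {
    let pattern = r"a(b+)c";
    let re_str = regex::Regex::new(pattern).unwrap();
    let re_bytes = regex::bytes::Regex::new(pattern).unwrap();
    re_str.is_match(&haystack) == re_bytes.is_match(haystack.as_bytes())
}

fn main() {
    QuickCheck::new().quickcheck(prop_backends_agree as fn(String) -> bool);
}
```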

PR #472
ethanpailes authored and BurntSushi committed Jun 21, 2018
1 parent d107c80 commit 5fd3bd1
Showing 7 changed files with 3,615 additions and 3 deletions.
6 changes: 6 additions & 0 deletions Cargo.toml
@@ -108,6 +108,12 @@ name = "backtrack-utf8bytes"
path = "tests/test_backtrack_bytes.rs"
name = "backtrack-bytes"

# Run all backends against each regex found on crates.io and make sure
# that they all do the same thing.
[[test]]
path = "tests/test_crates_regex.rs"
name = "crates-regex"

[profile.release]
debug = true

11 changes: 11 additions & 0 deletions HACKING.md
@@ -249,6 +249,11 @@ matching engine we want to test. The entry points are:
  backtracking on every regex and use *arbitrary* byte based programs.
* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use
  backtracking on every regex and use *UTF-8* byte based programs.
* `tests/test_crates_regex.rs` - tests to make sure that all of the
  backends behave in the same way against a number of quickcheck-generated
  random inputs. These tests need to be enabled through the
  `RUST_REGEX_RANDOM_TEST` environment variable (see below).

The lazy DFA and pure literal engines are absent from this list because
they cannot be used on every regular expression. Instead, we rely on
@@ -259,6 +264,12 @@ entry points, it can take a while to compile everything. To reduce compile
times slightly, try using `cargo test --test default`, which will only use the
`tests/test_default.rs` entry point.

The random testing takes quite a while, so it is not enabled by default.
To run it, set the `RUST_REGEX_RANDOM_TEST` environment variable to
anything before invoking `cargo test`. Note that this variable is
inspected at compile time, so if the tests don't seem to be running, you
may need to run `cargo clean`.
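
As a hedged illustration of why `cargo clean` can be needed, the sketch
below shows one way a test can read an environment variable at compile
time rather than at run time; the function and test names here are
hypothetical and may not match the crate's actual gating code.

```rust
// Illustrative only; not the crate's actual gating mechanism.
// `option_env!` is expanded when the test binary is built, so changing
// RUST_REGEX_RANDOM_TEST afterwards has no effect until a rebuild.
fn random_tests_enabled() -> bool {
    option_env!("RUST_REGEX_RANDOM_TEST").is_some()
}

#[test]
fn random_regex_gauntlet() {
    if !random_tests_enabled() {
        // The variable was unset when this test was compiled.
        return;
    }
    // ... run the expensive randomized checks here ...
}
```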

## Benchmarking

13 changes: 10 additions & 3 deletions ci/script.sh
@@ -9,11 +9,18 @@ cargo build --verbose
cargo doc --verbose

# Run tests. If we have nightly, then enable our nightly features.
# Right now there are no nightly features, but that may change in the
# future.
CARGO_TEST_EXTRA_FLAGS=""
if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
  CARGO_TEST_EXTRA_FLAGS=""
fi
cargo test --verbose ${CARGO_TEST_EXTRA_FLAGS}

# Run the random tests in release mode, as this is faster.
RUST_REGEX_RANDOM_TEST=1 \
  cargo test --release --verbose \
  ${CARGO_TEST_EXTRA_FLAGS} --test crates-regex

# Run a test that confirms the shootout benchmarks are correct.
ci/run-shootout-test
189 changes: 189 additions & 0 deletions scripts/scrape_crates_io.py
@@ -0,0 +1,189 @@
#!/usr/bin/env python3

from subprocess import call
import argparse
import datetime
import glob
import json
import os
import re
import shutil
import tempfile
import time
import urllib3

CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")
KNOWN_UNMAINTAINED_CRATES = set(["queryst-prime", "oozz"])

# if only requests was in the standard library...
urllib3.disable_warnings()
http = urllib3.PoolManager()


def argparser():
    p = argparse.ArgumentParser("A script to scrape crates.io for regex.")
    p.add_argument("-c", "--crates-index", metavar="CRATES_INDEX_DIR",
                   help=("A directory where we can find crates.io-index "
                         + "(if this isn't set it will be automatically "
                         + "downloaded)."))
    p.add_argument("-o", "--output-file", metavar="OUTPUT",
                   default="crates_regex.rs",
                   help="The name of the output file to create.")
    return p


PRELUDE = """// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'
// on {date}.
"""


def main():
    args = argparser().parse_args()
    out = open(os.path.abspath(args.output_file), "w")
    out.write(PRELUDE.format(date=str(datetime.datetime.now())))
    if args.crates_index:
        args.crates_index = os.path.abspath(args.crates_index)

    # enter our scratch directory
    old_dir = os.getcwd()
    work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
    os.chdir(work_dir)

    # Use the given index (made absolute above) if one was provided,
    # otherwise clone a fresh copy.
    crates_index = (args.crates_index
                    if args.crates_index
                    else download_crates_index())

    for (name, vers) in iter_crates(crates_index):
        if name in KNOWN_UNMAINTAINED_CRATES:
            continue

        with Crate(work_dir, name, vers) as c:
            i = 0
            for line in c.iter_lines():
                for r in RE_REGEX.findall(line):
                    print((name, vers, r))
                    # Skip truncated captures where the closing quote
                    # is escaped.
                    if len(r) >= 2 and r[-2] == "\\":
                        continue
                    out.write("// {}-{}: {}\n".format(name, vers, r))
                    out.write("consistent!({}_{}, {});\n\n".format(
                        name.replace("-", "_"), i, r))
                    out.flush()
                    i += 1

    # Leave the scratch directory
    os.chdir(old_dir)
    shutil.rmtree(work_dir)
    out.close()


def download_crates_index():
    if call(["git", "clone", CRATES_IO_INDEX_GIT_LOC]) != 0:
        print("Error cloning the crates.io index")
        exit(1)
    return "crates.io-index"


def iter_crates(crates_index):
    exclude = set(["config.json", ".git"])
    for crate_index_file in iter_files(crates_index, exclude=exclude):
        with open(crate_index_file) as f:
            most_recent = list(f)
            most_recent = most_recent[len(most_recent) - 1]

            crate_info = json.loads(most_recent)
            if "regex" not in set(d["name"] for d in crate_info["deps"]):
                continue

            if crate_info["yanked"]:
                continue
            yield (crate_info["name"], crate_info["vers"])


def iter_files(d, exclude=set()):
    for x in os.listdir(d):
        if x in exclude:
            continue

        fullfp = os.path.abspath(d + "/" + x)
        if os.path.isfile(fullfp):
            yield fullfp
        elif os.path.isdir(fullfp):
            for f in iter_files(fullfp, exclude):
                yield f


class Crate(object):
    def __init__(self, work_dir, name, version):
        self.name = name
        self.version = version
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
            work_dir, self.name, self.version)

    def __enter__(self):
        max_retries = 1
        retries = 0
        while retries < max_retries:
            retries += 1

            r = http.request("GET", self.url, preload_content=False)
            try:
                print("[{}/{}] Downloading {}".format(
                    retries, max_retries, self.url))
                with open(self.filename, "wb") as f:
                    while True:
                        data = r.read(1024)
                        if not data:
                            break
                        f.write(data)
            except urllib3.exceptions.HTTPError:
                time.sleep(1)
                r.release_conn()
                continue

            r.release_conn()
            break

        call(["tar", "-xf", self.filename])

        return self

    def __exit__(self, ty, value, tb):
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. It's nice to clean up
        # after ourselves though.
        try:
            shutil.rmtree(self.filename[:-len(".tar.gz")])
            os.remove(self.filename)
        except OSError:
            pass

    def iter_srcs(self):
        g = "{crate}/**/*.rs".format(crate=self.filename[:-len(".tar.gz")])
        for rsrc in glob.iglob(g, recursive=True):
            yield rsrc

    def iter_lines(self):
        for src in self.iter_srcs():
            with open(src) as f:
                for line in f:
                    yield line


if __name__ == "__main__":
    main()
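
For reference, each regex that survives the filter above is written out as a
comment line followed by a `consistent!` invocation. The snippet below only
illustrates that shape; the crate name, version, and pattern are made up, and
the `consistent!` macro is assumed to be defined elsewhere in the test harness.

```rust
// some-crate-0.1.0: r"^\d{4}-\d{2}-\d{2}$"
consistent!(some_crate_0, r"^\d{4}-\d{2}-\d{2}$");
```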
