-
Notifications
You must be signed in to change notification settings - Fork 450
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
test: add random testing of crates.io regex
This patch adds some infrastructure to scrape crates.io for regexes, then runs each of the regexes found this way through a random testing gauntlet to make sure that all the different backends behave in the same way. These random tests are expensive, so we only run them when the magic `RUST_REGEX_RANDOM_TEST` environment variable is set. In debug mode, these tests take quite a while, so we special-case them in CI to run in release mode. To make this better we should add something which can generate a matching string from a regex. As is, we just focus on the negative case. There is one bug that this uncovered that this patch does not fix. A minimal version of it is commented out in the `tests/test_crates_regex.rs` file. PR #472
- Loading branch information
1 parent
d107c80
commit 5fd3bd1
Showing
7 changed files
with
3,615 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from subprocess import call | ||
import argparse | ||
import datetime | ||
import glob | ||
import json | ||
import os | ||
import re | ||
import shutil | ||
import tempfile | ||
import time | ||
import urllib3 | ||
|
||
# Git URL of the crates.io registry index.
CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"

# Captures the (optionally raw) string literal passed to `Regex::new(...)`
# when the whole call sits on a single source line.
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")

# Crate names that are never scraped.
KNOWN_UNMAINTAINED_CRATES = {"queryst-prime", "oozz"}
|
||
# if only requests was in the standard library...
# Silence urllib3's warnings process-wide and create the module-level
# connection pool manager.
# NOTE(review): presumably this hides TLS certificate warnings -- confirm
# that unverified HTTPS is acceptable for these downloads.
urllib3.disable_warnings()
http = urllib3.PoolManager()
|
||
|
||
def argparser():
    """Build the command-line parser for the scraper.

    Returns:
        An ``argparse.ArgumentParser`` that accepts ``-c/--crates-index``
        (optional path to an existing crates.io-index checkout) and
        ``-o/--output-file`` (defaults to ``crates_regex.rs``).
    """
    # The first positional parameter of ArgumentParser is `prog`, not the
    # description; pass the summary by keyword so that `--help` shows the
    # real program name in the usage line and this text underneath it.
    p = argparse.ArgumentParser(
        description="A script to scrape crates.io for regex.")
    p.add_argument("-c", "--crates-index", metavar="CRATES_INDEX_DIR",
                   help=("A directory where we can find crates.io-index "
                         "(if this isn't set it will be automatically "
                         "downloaded)."))
    p.add_argument("-o", "--output-file", metavar="OUTPUT",
                   default="crates_regex.rs",
                   help="The name of the output file to create.")
    return p
|
||
|
||
# Header written at the top of the generated Rust file: the license notice
# followed by a "do not edit" banner. `{date}` is a `str.format` placeholder
# for the generation timestamp.
PRELUDE = """// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'
// on {date}.
"""
|
||
|
||
def _emit_crate_regexes(out, name, vers, lines):
    """Write one `consistent!` test per regex literal found in `lines`."""
    i = 0
    for line in lines:
        for r in RE_REGEX.findall(line):
            print((name, vers, r))
            # A literal whose last character before the closing quote is a
            # backslash was cut short at an escaped quote; it is not a
            # complete string literal, so skip it.
            if len(r) >= 2 and r[-2] == "\\":
                continue
            out.write("// {}-{}: {}\n".format(name, vers, r))
            out.write("consistent!({}_{}, {});\n\n".format(
                name.replace("-", "_"), i, r))
            # Flush per regex so partial results survive an interruption.
            out.flush()
            i += 1


def main():
    """Scrape crates that depend on `regex` and write the generated tests.

    Parses the command line, writes the prelude to the output file, then
    walks every crate reported by `iter_crates`, downloads it and emits one
    `consistent!` invocation per `Regex::new("...")` literal found.

    All work happens inside a temporary scratch directory, which becomes the
    current working directory for the duration of the run. The scratch
    directory is removed and the previous working directory restored even if
    scraping fails part-way.
    """
    args = argparser().parse_args()

    # Resolve user-supplied paths before changing directory below.
    # The output is Rust source, so it is written as UTF-8.
    with open(os.path.abspath(args.output_file), "w",
              encoding="utf-8") as out:
        out.write(PRELUDE.format(date=str(datetime.datetime.now())))
        if args.crates_index:
            args.crates_index = os.path.abspath(args.crates_index)

        # enter our scratch directory
        old_dir = os.getcwd()
        work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
        os.chdir(work_dir)
        try:
            # Use the supplied index if there is one; only clone when the
            # option was omitted.
            crates_index = (args.crates_index
                            if args.crates_index
                            else download_crates_index())

            for (name, vers) in iter_crates(crates_index):
                if name in KNOWN_UNMAINTAINED_CRATES:
                    continue

                with Crate(work_dir, name, vers) as c:
                    _emit_crate_regexes(out, name, vers, c.iter_lines())
        finally:
            # Leave the scratch directory
            os.chdir(old_dir)
            shutil.rmtree(work_dir)
|
||
|
||
def download_crates_index():
    """Clone the crates.io index into the current working directory.

    Returns:
        The relative directory name ``"crates.io-index"`` that the clone
        creates.

    Raises:
        SystemExit: with status 1 when ``git clone`` exits non-zero.
    """
    # Local import: `sys` is not among the file's module-level imports.
    import sys

    if call(["git", "clone", CRATES_IO_INDEX_GIT_LOC]) != 0:
        print("Error cloning the crates.io index", file=sys.stderr)
        # `sys.exit` rather than the bare `exit` helper, which is only
        # injected by the `site` module and is meant for interactive use.
        sys.exit(1)
    return "crates.io-index"
|
||
|
||
def iter_crates(crates_index):
    """Yield ``(name, version)`` for crates whose latest entry uses `regex`.

    Every file under `crates_index` (except ``config.json`` and ``.git``) is
    read as a sequence of JSON records, one per line. Only the last
    non-blank record of each file is considered; it is reported when it
    lists a dependency named ``regex`` and is not yanked.

    Args:
        crates_index: Path to a checkout of the crates.io index.

    Yields:
        Tuples of the record's ``"name"`` and ``"vers"`` fields.
    """
    exclude = {"config.json", ".git"}
    for crate_index_file in iter_files(crates_index, exclude=exclude):
        with open(crate_index_file) as f:
            # Ignore blank lines so a trailing empty line is not mistaken
            # for the most recent record.
            records = [line for line in f if line.strip()]

        # An empty file has no record to inspect.
        if not records:
            continue

        crate_info = json.loads(records[-1])
        if not any(d["name"] == "regex" for d in crate_info["deps"]):
            continue

        if crate_info["yanked"]:
            continue
        yield (crate_info["name"], crate_info["vers"])
|
||
|
||
def iter_files(d, exclude=frozenset()):
    """Recursively yield the absolute path of every regular file under `d`.

    Args:
        d: Directory to walk.
        exclude: Collection of entry names (not paths) to skip at every
            level; an excluded directory is not descended into.

    Yields:
        Absolute file paths, in the order reported by ``os.listdir``.
    """
    for x in os.listdir(d):
        if x in exclude:
            continue

        fullfp = os.path.abspath(os.path.join(d, x))
        if os.path.isfile(fullfp):
            yield fullfp
        elif os.path.isdir(fullfp):
            yield from iter_files(fullfp, exclude)
|
||
|
||
class Crate(object):
    """Context manager that downloads and unpacks one crate release.

    Entering downloads the crate tarball into `work_dir` and extracts it
    with ``tar``; exiting removes the tarball and the extracted directory.
    Extraction happens in the current working directory, so callers are
    expected to have changed into `work_dir` beforehand.
    """

    def __init__(self, work_dir, name, version):
        """Record the crate identity and derive its URL and tarball path.

        Args:
            work_dir: Directory in which the tarball is stored.
            name: Crate name.
            version: Crate version string.
        """
        self.name = name
        self.version = version
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
            work_dir, self.name, self.version)

    def _crate_dir(self):
        """Return the directory the tarball unpacks to (`<name>-<version>`)."""
        return self.filename[:-len(".tar.gz")]

    def __enter__(self):
        """Download the tarball (with one retry) and unpack it.

        Returns:
            This `Crate`, for use as the ``with`` target.
        """
        max_retries = 1
        max_attempts = max_retries + 1
        for attempt in range(1, max_attempts + 1):
            print("[{}/{}] Downloading {}".format(
                attempt, max_attempts, self.url))
            try:
                r = http.request("GET", self.url, preload_content=False)
            except urllib3.exceptions.HTTPError:
                # Could not even open the connection; wait and retry.
                time.sleep(1)
                continue

            try:
                with open(self.filename, "wb") as f:
                    while True:
                        data = r.read(1024)
                        if not data:
                            break
                        f.write(data)
            except urllib3.exceptions.HTTPError:
                # The transfer broke part-way; wait and retry from scratch.
                time.sleep(1)
                continue
            finally:
                # Always hand the connection back to the pool.
                r.release_conn()

            break

        # Best effort: if the download or extraction failed there will
        # simply be no sources to iterate over for this crate.
        call(["tar", "-xf", self.filename])

        return self

    def __exit__(self, ty, value, tb):
        """Remove the extracted directory and the tarball, ignoring errors."""
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. Its nice to clean up
        # after ourselves though.
        shutil.rmtree(self._crate_dir(), ignore_errors=True)
        try:
            os.remove(self.filename)
        except OSError:
            pass

    def iter_srcs(self):
        """Yield the path of every ``.rs`` file at any depth in the crate."""
        # `**` only spans directory levels when recursive=True; without it
        # the pattern matches files exactly one directory deep.
        g = "{crate}/**/*.rs".format(crate=glob.escape(self._crate_dir()))
        for rsrc in glob.iglob(g, recursive=True):
            yield rsrc

    def iter_lines(self):
        """Yield every line of every Rust source file in the crate."""
        for src in self.iter_srcs():
            # Rust sources are UTF-8; replace undecodable bytes rather than
            # aborting the whole scrape on one bad file.
            with open(src, encoding="utf-8", errors="replace") as f:
                for line in f:
                    yield line
|
||
|
||
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Oops, something went wrong.