Bind anserini-tools submodule to tools/ (castorini#1282)
+ Remove src/main/python/msmarco/ since the scripts are in tools/ already.
+ Update documentation for MS MARCO passage and doc ranking.
lintool authored Jun 17, 2020
1 parent 8b01a48 commit 8a7c250
Showing 20 changed files with 90 additions and 833 deletions.
6 changes: 3 additions & 3 deletions .gitmodules
@@ -1,3 +1,3 @@
[submodule "eval"]
path = eval
url = https://github.com/castorini/anserini-eval.git
[submodule "tools"]
path = tools
url = https://github.com/castorini/anserini-tools.git
2 changes: 1 addition & 1 deletion .travis.yml
@@ -16,7 +16,7 @@ install: true
script:
- mvn clean package appassembler:assemble
- find target/appassembler/bin/ -type f -exec sed -i 's/-Xmx.*G/-Xmx2G/g' {} \;
- cd eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd .. && cd ..
- cd tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
- python src/main/python/run_regression.py --collection cacm --index --fail_eval
after_success:
- bash <(curl -s https://codecov.io/bash)
10 changes: 5 additions & 5 deletions README.md
@@ -33,12 +33,12 @@ After cloning our repo (use `--recurse-submodules` option to make sure the `eval
mvn clean package appassembler:assemble
```

The `eval/` directory, which contains evaluation tools and scripts, is actually [this repo](https://github.com/castorini/anserini-eval), integrated as a [Git submodule](https://git-scm.com/book/en/v2/Git-Tools-Submodules) (so that it can be shared across related projects).
The `tools/` directory, which contains evaluation tools and other scripts, is actually [this repo](https://github.com/castorini/anserini-tools), integrated as a [Git submodule](https://git-scm.com/book/en/v2/Git-Tools-Submodules) (so that it can be shared across related projects).
Build as follows (you might get warnings, but they are okay to ignore):

```bash
cd eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../..
cd eval/ndeval && make && cd ../..
cd tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
cd tools/eval/ndeval && make && cd ../../..
```

With that, you should be ready to go!
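
If the repository was cloned without `--recurse-submodules`, the `tools/` submodule can still be fetched afterwards; a minimal sketch using standard Git commands (not part of this commit):

```bash
# Fetch the tools/ submodule (and any nested submodules) into an existing clone
git submodule update --init --recursive
```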
@@ -89,8 +89,8 @@ For the most part, manual copying and pasting of commands into a shell is requir
+ [Baselines for the TREC-COVID Challenge](docs/experiments-covid.md)
+ [Working with the 20 Newsgroups Dataset](docs/experiments-20newsgroups.md)
+ [Replicating "Neural Hype" Experiments](docs/experiments-forum2018.md)
+ [Guide to running BM25 baselines on the MS MARCO Passage Retrieval Task](docs/experiments-msmarco-passage.md)
+ [Guide to running BM25 baselines on the MS MARCO Document Retrieval Task](docs/experiments-msmarco-doc.md)
+ [Guide to BM25 baselines for the MS MARCO Passage Retrieval Task](docs/experiments-msmarco-passage.md)
+ [Guide to BM25 baselines for the MS MARCO Document Retrieval Task](docs/experiments-msmarco-doc.md)
+ [Guide to replicating doc2query results](docs/experiments-doc2query.md)
+ [Guide to replicating docTTTTTquery results](docs/experiments-docTTTTTquery.md)
+ [Guide to running experiments on the AI2 Open Research Corpus](docs/experiments-openresearch.md)
43 changes: 30 additions & 13 deletions docs/experiments-msmarco-doc.md
@@ -1,6 +1,6 @@
# Anserini: BM25 Baselines on [MS MARCO Doc Retrieval Task](https://github.com/microsoft/TREC-2019-Deep-Learning)
# Anserini: BM25 Baselines for MS MARCO Doc Retrieval

This page contains instructions for running BM25 baselines on the MS MARCO *document* ranking task.
This page contains instructions for running BM25 baselines on the [MS MARCO *document* ranking task](https://microsoft.github.io/msmarco/).
Note that there is a separate [MS MARCO *passage* ranking task](experiments-msmarco-passage.md).

## Data Prep
@@ -10,9 +10,11 @@ First, we need to download and extract the MS MARCO document dataset:

```
mkdir collections/msmarco-doc
mkdir indexes/msmarco-doc
wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.trec.gz -P collections/msmarco-doc
# Alternative mirror:
# wget https://www.dropbox.com/s/w6caao3sfx9nluo/msmarco-docs.trec.gz -P collections/msmarco-doc
```

To confirm, `msmarco-docs.trec.gz` should have MD5 checksum of `d4863e4f342982b51b9a8fc668b2d0c0`.
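
To compute the checksum, something like the following works (assuming GNU coreutils; on macOS, `md5` is the rough equivalent):

```bash
md5sum collections/msmarco-doc/msmarco-docs.trec.gz
```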
@@ -23,8 +25,8 @@ Build the index with the following command:
```
nohup sh target/appassembler/bin/IndexCollection -collection CleanTrecCollection \
-generator DefaultLuceneDocumentGenerator -threads 1 -input collections/msmarco-doc \
-index indexes/msmarco-doc/lucene-index.msmarco-doc.pos+docvectors+rawdocs -storePositions -storeDocvectors -storeRaw \
>& logs/log.msmarco-doc.pos+docvectors+rawdocs &
-index indexes/msmarco-doc/lucene-index.msmarco-doc.pos+docvectors+rawdocs \
-storePositions -storeDocvectors -storeRaw >& logs/log.msmarco-doc.pos+docvectors+rawdocs &
```
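
Since the command above backgrounds the job via `nohup`, indexing progress can be monitored by tailing the log, e.g.:

```bash
tail -f logs/log.msmarco-doc.pos+docvectors+rawdocs
```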

On a modern desktop with an SSD, indexing takes around 40 minutes.
@@ -40,7 +42,7 @@ The final log lines should look something like this:
2020-01-14 16:36:30,961 INFO [main] index.IndexCollection (IndexCollection.java:859) - Total 3,213,835 documents indexed in 00:45:32
```

## Retrieving and Evaluating the Dev set
## Performing Retrieval on the Dev Queries

Let's download the queries and qrels:

@@ -54,6 +56,14 @@ wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries
wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-top100.gz -P collections/msmarco-doc/queries-and-qrels
wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-qrels.tsv.gz -P collections/msmarco-doc/queries-and-qrels
# Alternative mirrors:
# wget https://www.dropbox.com/s/p6k7ph7v0r400ab/msmarco-doctrain-queries.tsv.gz -P collections/msmarco-doc/queries-and-qrels
# wget https://www.dropbox.com/s/zyt1n2gpylt0dhj/msmarco-doctrain-top100.gz -P collections/msmarco-doc/queries-and-qrels
# wget https://www.dropbox.com/s/7xw812wpf4t3fpu/msmarco-doctrain-qrels.tsv.gz -P collections/msmarco-doc/queries-and-qrels
# wget https://www.dropbox.com/s/d5wcox23s17wpf1/msmarco-docdev-queries.tsv.gz -P collections/msmarco-doc/queries-and-qrels
# wget https://www.dropbox.com/s/vamkn5dppjhygm5/msmarco-docdev-top100.gz -P collections/msmarco-doc/queries-and-qrels
# wget https://www.dropbox.com/s/9ad6f8midcmlrrx/msmarco-docdev-qrels.tsv.gz -P collections/msmarco-doc/queries-and-qrels
gunzip collections/msmarco-doc/queries-and-qrels/*.gz
```

@@ -73,26 +83,31 @@ In this guide, to save time, we are only going to perform retrieval on the dev q
This can be accomplished as follows:

```
target/appassembler/bin/SearchCollection -topicreader TsvInt -index indexes/msmarco-doc/lucene-index.msmarco-doc.pos+docvectors+rawdocs \
-topics collections/msmarco-doc/queries-and-qrels/msmarco-docdev-queries.tsv -output runs/run.msmarco-doc.dev.bm25.txt -bm25
target/appassembler/bin/SearchCollection -topicreader TsvInt \
-index indexes/msmarco-doc/lucene-index.msmarco-doc.pos+docvectors+rawdocs \
-topics collections/msmarco-doc/queries-and-qrels/msmarco-docdev-queries.tsv \
-output runs/run.msmarco-doc.dev.bm25.txt -bm25
```

On a modern desktop with an SSD, the run takes around 12 minutes.

## Evaluating the Results

After the run completes, we can evaluate with `trec_eval`:

```
$ eval/trec_eval.9.0.4/trec_eval -c -mmap -mrecall.1000 collections/msmarco-doc/queries-and-qrels/msmarco-docdev-qrels.tsv runs/run.msmarco-doc.dev.bm25.txt
$ tools/eval/trec_eval.9.0.4/trec_eval -c -mmap -mrecall.1000 collections/msmarco-doc/queries-and-qrels/msmarco-docdev-qrels.tsv runs/run.msmarco-doc.dev.bm25.txt
map all 0.2310
recall_1000 all 0.8856
```

Let's compare to the baselines provided by Microsoft (note that, to be fair, we restrict evaluation to the top 100 hits per topic):

```
$ eval/trec_eval.9.0.4/trec_eval -c -mmap -M 100 collections/msmarco-doc/queries-and-qrels/msmarco-docdev-qrels.tsv collections/msmarco-doc/queries-and-qrels/msmarco-docdev-top100
$ tools/eval/trec_eval.9.0.4/trec_eval -c -mmap -M 100 collections/msmarco-doc/queries-and-qrels/msmarco-docdev-qrels.tsv collections/msmarco-doc/queries-and-qrels/msmarco-docdev-top100
map all 0.2219
$ eval/trec_eval.9.0.4/trec_eval -c -mmap -M 100 collections/msmarco-doc/queries-and-qrels/msmarco-docdev-qrels.tsv runs/run.msmarco-doc.dev.bm25.txt
$ tools/eval/trec_eval.9.0.4/trec_eval -c -mmap -M 100 collections/msmarco-doc/queries-and-qrels/msmarco-docdev-qrels.tsv runs/run.msmarco-doc.dev.bm25.txt
map all 0.2303
```

@@ -112,8 +127,10 @@ The tuned parameters using this approach are `k1=3.44`, `b=0.87`.
To perform a run with these parameters, issue the following command:

```
target/appassembler/bin/SearchCollection -topicreader TsvString -index indexes/msmarco-doc/lucene-index.msmarco-doc.pos+docvectors+rawdocs \
-topics collections/msmarco-doc/queries-and-qrels/msmarco-docdev-queries.tsv -output runs/run.msmarco-doc.dev.bm25.tuned.txt -bm25 -bm25.k1 3.44 -bm25.b 0.87
target/appassembler/bin/SearchCollection -topicreader TsvString \
-index indexes/msmarco-doc/lucene-index.msmarco-doc.pos+docvectors+rawdocs \
-topics collections/msmarco-doc/queries-and-qrels/msmarco-docdev-queries.tsv \
-output runs/run.msmarco-doc.dev.bm25.tuned.txt -bm25 -bm25.k1 3.44 -bm25.b 0.87
```

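For reference, `k1` and `b` enter the standard BM25 scoring function (the form implemented in Lucene, sketched here from the textbook definition) as follows, where $f(t,d)$ is the frequency of term $t$ in document $d$, $|d|$ is the document length, and $\mathrm{avgdl}$ is the average document length:

$$
\mathrm{score}(q,d) = \sum_{t \in q} \mathrm{idf}(t) \cdot \frac{f(t,d)\,(k_1+1)}{f(t,d) + k_1 \left(1 - b + b \cdot \frac{|d|}{\mathrm{avgdl}}\right)}
$$

Intuitively, a larger `k1` lets term frequency saturate more slowly, while `b` closer to 1 applies stronger document-length normalization, so `k1=3.44`, `b=0.87` sit well above Anserini's defaults (`k1=0.9`, `b=0.4`).
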
Here's the comparison between the Anserini default and tuned parameters:
71 changes: 36 additions & 35 deletions docs/experiments-msmarco-passage.md
@@ -1,6 +1,6 @@
# Anserini: BM25 Baselines on [MS MARCO Passage Retrieval](https://github.com/microsoft/MSMARCO-Passage-Ranking)
# Anserini: BM25 Baselines for MS MARCO Passage Retrieval

This page contains instructions for running BM25 baselines on the MS MARCO *passage* ranking task.
This page contains instructions for running BM25 baselines on the [MS MARCO *passage* ranking task](https://microsoft.github.io/msmarco/).
Note that there is a separate [MS MARCO *document* ranking task](experiments-msmarco-doc.md).
We also have a [separate page](experiments-doc2query.md) describing document expansion experiments (Doc2query) for this task.

@@ -11,74 +11,73 @@ First, we need to download and extract the MS MARCO passage dataset:

```bash
mkdir collections/msmarco-passage
mkdir indexes/msmarco-passage

wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz -P collections/msmarco-passage
tar -xzvf collections/msmarco-passage/collectionandqueries.tar.gz -C collections/msmarco-passage

# Alternative mirror:
# wget https://www.dropbox.com/s/9f54jg2f71ray3b/collectionandqueries.tar.gz -P collections/msmarco-passage

tar xvfz collections/msmarco-passage/collectionandqueries.tar.gz -C collections/msmarco-passage
```

To confirm, `collectionandqueries.tar.gz` should have MD5 checksum of `31644046b18952c1386cd4564ba2ae69`.

Next, we need to convert the MS MARCO tsv collection into Anserini's jsonl files (which have one json object per line):

```bash
python src/main/python/msmarco/convert_collection_to_jsonl.py \
--collection_path collections/msmarco-passage/collection.tsv --output_folder collections/msmarco-passage/collection_jsonl
python tools/scripts/msmarco/convert_collection_to_jsonl.py \
--collection-path collections/msmarco-passage/collection.tsv \
--output-folder collections/msmarco-passage/collection_jsonl
```

The above script should generate 9 jsonl files in `collections/msmarco-passage/collection_jsonl`, each with 1M lines (except for the last one, which should have 841,823 lines).
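
Each line in these files is a single JSON object with an `id` and a `contents` field, along the lines of the following (passage text elided):

```
{"id": "0", "contents": "..."}
```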

We can now index these docs as a `JsonCollection` using Anserini:

```bash
sh target/appassembler/bin/IndexCollection -collection JsonCollection \
-generator DefaultLuceneDocumentGenerator -threads 9 -input collections/msmarco-passage/collection_jsonl \
sh target/appassembler/bin/IndexCollection -threads 9 -collection JsonCollection \
-generator DefaultLuceneDocumentGenerator -input collections/msmarco-passage/collection_jsonl \
-index indexes/msmarco-passage/lucene-index-msmarco -storePositions -storeDocvectors -storeRaw
```

Upon completion, we should have an index with 8,841,823 documents.
The indexing speed may vary... on a modern desktop with an SSD, indexing takes less than two minutes.
The indexing speed may vary; on a modern desktop with an SSD, indexing takes a couple of minutes.

## Retrieving and Evaluating the Dev set
## Performing Retrieval on the Dev Queries

Since the dev set contains a large number of queries (100k+), it would take a long time to retrieve all of them. To speed this up, we use only the queries that appear in the qrels file:

```bash
python src/main/python/msmarco/filter_queries.py --qrels collections/msmarco-passage/qrels.dev.small.tsv \
--queries collections/msmarco-passage/queries.dev.tsv --output_queries collections/msmarco-passage/queries.dev.small.tsv
python tools/scripts/msmarco/filter_queries.py \
--qrels collections/msmarco-passage/qrels.dev.small.tsv \
--queries collections/msmarco-passage/queries.dev.tsv \
--output collections/msmarco-passage/queries.dev.small.tsv
```

The output queries file should contain 6980 lines.
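
Conceptually, the filtering step just keeps the queries whose ids appear in the qrels file; a rough shell equivalent, for illustration only (the actual script is `tools/scripts/msmarco/filter_queries.py`):

```bash
# Collect the unique query ids from the qrels, then keep matching lines from the queries file
cut -f1 collections/msmarco-passage/qrels.dev.small.tsv | sort -u > qids.txt
awk -F'\t' 'NR==FNR {ids[$1]; next} $1 in ids' qids.txt \
  collections/msmarco-passage/queries.dev.tsv > queries.dev.small.tsv
```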

We can now retrieve this smaller set of queries. Note that the following command requires that the [Pyserini](https://github.com/castorini/pyserini/) Python package is installed .
We can now perform a retrieval run using this smaller set of queries:

```bash
python src/main/python/msmarco/retrieve.py --hits 1000 --threads 1 \
--index indexes/msmarco-passage/lucene-index-msmarco --qid_queries collections/msmarco-passage/queries.dev.small.tsv \
--output runs/run.msmarco-passage.dev.small.tsv
sh target/appassembler/bin/SearchMsmarco -hits 1000 -threads 1 \
-index indexes/msmarco-passage/lucene-index-msmarco \
-queries collections/msmarco-passage/queries.dev.small.tsv \
-output runs/run.msmarco-passage.dev.small.tsv
```

Note that by default, the above script uses BM25 with tuned parameters `k1=0.82`, `b=0.68` (more details below).
Note that by default, the above script uses BM25 with tuned parameters `k1=0.82`, `b=0.68`.
The option `-hits` specifies the number of documents per query to be retrieved.
Thus, the output file should have approximately 6980 * 1000 = 6.9M lines.
Thus, the output file should have approximately 6980 × 1000 = 6.9M lines.

Retrieval speed will vary by machine:
On a modern desktop with an SSD, we can get ~0.06 s/query (taking about seven minutes). We can also perform multithreaded retrieval by changing the `--threads` argument.

Alternatively, we can run the same script implemented in Java, which is a bit faster:

```bash
sh target/appassembler/bin/SearchMsmarco -hits 1000 -threads 1 \
-index indexes/msmarco-passage/lucene-index-msmarco -qid_queries collections/msmarco-passage/queries.dev.small.tsv \
-output runs/run.msmarco-passage.dev.small.tsv
```
On a modern desktop with an SSD, we can get ~0.07 s/query, so the run should finish in under ten minutes.
We can perform multi-threaded retrieval by changing the `-threads` argument.

Similarly, we can perform multithreaded retrieval by changing the `-threads` argument.
## Evaluating the Results

Finally, we can evaluate the retrieved documents using the official MS MARCO evaluation script:

```bash
python src/main/python/msmarco/msmarco_eval.py \
python tools/scripts/msmarco/msmarco_eval.py \
collections/msmarco-passage/qrels.dev.small.tsv runs/run.msmarco-passage.dev.small.tsv
```
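
The headline metric this script reports is MRR@10, i.e., the reciprocal rank of the first relevant passage, averaged over queries, counting queries with no relevant passage in the top 10 as zero:

$$
\mathrm{MRR@10} = \frac{1}{|Q|} \sum_{q \in Q} \begin{cases} 1/\mathrm{rank}_q & \text{if } \mathrm{rank}_q \le 10 \\ 0 & \text{otherwise} \end{cases}
$$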

@@ -95,17 +94,19 @@ We can also use the official TREC evaluation tool, `trec_eval`, to compute other
For that we first need to convert runs and qrels files to the TREC format:

```bash
python src/main/python/msmarco/convert_msmarco_to_trec_run.py \
--input_run runs/run.msmarco-passage.dev.small.tsv --output_run runs/run.msmarco-passage.dev.small.trec
python tools/scripts/msmarco/convert_msmarco_to_trec_run.py \
--input runs/run.msmarco-passage.dev.small.tsv \
--output runs/run.msmarco-passage.dev.small.trec

python src/main/python/msmarco/convert_msmarco_to_trec_qrels.py \
--input_qrels collections/msmarco-passage/qrels.dev.small.tsv --output_qrels collections/msmarco-passage/qrels.dev.small.trec
python tools/scripts/msmarco/convert_msmarco_to_trec_qrels.py \
--input collections/msmarco-passage/qrels.dev.small.tsv \
--output collections/msmarco-passage/qrels.dev.small.trec
```
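
The two formats carry the same information in different layouts; hypothetical example lines (ids and score are illustrative):

```
# MS MARCO run format (tab-separated): <qid> <docid> <rank>
1048585	7187158	1

# TREC run format: <qid> Q0 <docid> <rank> <score> <run_tag>
1048585 Q0 7187158 1 15.6700 Anserini
```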

And run the `trec_eval` tool:

```bash
eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap \
tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap \
collections/msmarco-passage/qrels.dev.small.trec runs/run.msmarco-passage.dev.small.trec
```

1 change: 0 additions & 1 deletion eval
Submodule eval deleted from c9ea46
27 changes: 13 additions & 14 deletions src/main/java/io/anserini/search/SearchMsmarco.java
@@ -35,49 +35,48 @@
import java.util.stream.Collectors;

/**
* Class that performs retrieval for the MS MARCO passage ranking task. This is the Java version of the Python script
* <code>src/main/python/msmarco/retrieve.py</code>.
* Class that performs retrieval for the MS MARCO passage ranking task.
*/
public class SearchMsmarco {
public static class Args {
// required arguments
@Option(name = "-qid_queries", metaVar = "[file]", required = true, usage="query id - query mapping file")
@Option(name = "-queries", metaVar = "[file]", required = true, usage="Queries file.")
public String qid_queries = "";

@Option(name = "-output", metaVar = "[file]", required = true, usage = "output file")
@Option(name = "-output", metaVar = "[file]", required = true, usage = "Output run file.")
public String output = "";

@Option(name = "-index", metaVar = "[path]", required = true, usage = "index path")
@Option(name = "-index", metaVar = "[path]", required = true, usage = "Index path.")
public String index = "";

// optional arguments
@Option(name = "-threads", metaVar = "[number]", usage = "maximum number of threads")
@Option(name = "-threads", metaVar = "[number]", usage = "Maximum number of threads.")
public int threads = 1;

@Option(name = "-hits", metaVar = "[number]", usage = "number of hits to retrieve")
@Option(name = "-hits", metaVar = "[number]", usage = "Number of hits to retrieve.")
public int hits = 10;

@Option(name = "-k1", metaVar = "[value]", usage = "BM25 k1 parameter")
@Option(name = "-k1", metaVar = "[value]", usage = "BM25 k1 parameter.")
public float k1 = 0.82f;

@Option(name = "-b", metaVar = "[value]", usage = "BM25 b parameter")
@Option(name = "-b", metaVar = "[value]", usage = "BM25 b parameter.")
public float b = 0.68f;

// See our MS MARCO documentation to understand how these parameter values were tuned.
@Option(name = "-rm3", usage = "use RM3 query expansion model")
@Option(name = "-rm3", usage = "Use RM3.")
public boolean rm3 = false;

@Option(name = "-fbTerms", metaVar = "[number]", usage = "RM3 parameter: number of expansion terms")
@Option(name = "-fbTerms", metaVar = "[number]", usage = "RM3: number of expansion terms.")
public int fbTerms = 10;

@Option(name = "-fbDocs", metaVar = "[number]", usage = "RM3 parameter: number of documents")
@Option(name = "-fbDocs", metaVar = "[number]", usage = "RM3: number of documents.")
public int fbDocs = 10;

@Option(name = "-originalQueryWeight", metaVar = "[value]", usage = "RM3 parameter: weight to assign to the original query")
@Option(name = "-originalQueryWeight", metaVar = "[value]", usage = "RM3: weight of original query.")
public float originalQueryWeight = 0.5f;

@Option(name = "-fields", metaVar = "[key=value]", handler = MapOptionHandler.class,
usage = "Fields to search with assigned float weight")
usage = "Fields to search with assigned float weights.")
public Map<String, String> fields = new HashMap<>();
}

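Putting the options above together, a hypothetical RM3 invocation might look like this (the output filename and feedback parameter values are illustrative, not tuned recommendations):

```bash
sh target/appassembler/bin/SearchMsmarco -hits 1000 -threads 1 \
  -index indexes/msmarco-passage/lucene-index-msmarco \
  -queries collections/msmarco-passage/queries.dev.small.tsv \
  -output runs/run.msmarco-passage.dev.small.rm3.tsv \
  -rm3 -fbTerms 10 -fbDocs 10 -originalQueryWeight 0.5
```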
1 change: 0 additions & 1 deletion src/main/python/msmarco/.gitkeep

This file was deleted.
