-
-
Notifications
You must be signed in to change notification settings - Fork 699
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
512 additions
and
104 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
// # Basic Example | ||
// | ||
// This example covers the basic functionalities of | ||
// tantivy. | ||
// | ||
// We will : | ||
// - define our schema | ||
// - create an index in a directory | ||
// - index a few documents into our index | ||
// - search for the documents matching a fuzzy term query | ||
// - retrieve the original content of each matching document. | ||
|
||
use std::collections::HashSet; | ||
|
||
// --- | ||
// Importing tantivy... | ||
use tantivy::collector::{Count, TopDocs}; | ||
use tantivy::query::{FuzzyTermQuery, QueryParser}; | ||
use tantivy::schema::*; | ||
use tantivy::{doc, DocId, Index, ReloadPolicy, Score, SegmentReader}; | ||
use tempfile::TempDir; | ||
|
||
fn main() -> tantivy::Result<()> { | ||
// Let's create a temporary directory for the | ||
// sake of this example | ||
let index_path = TempDir::new()?; | ||
|
||
// # Defining the schema | ||
// | ||
// The Tantivy index requires a very strict schema. | ||
// The schema declares which fields are in the index, | ||
// and for each field, its type and "the way it should | ||
// be indexed". | ||
|
||
// First we need to define a schema ... | ||
let mut schema_builder = Schema::builder(); | ||
|
||
// Our first field is title. | ||
// We want full-text search for it, and we also want | ||
// to be able to retrieve the document after the search. | ||
// | ||
// `TEXT | STORED` is some syntactic sugar to describe | ||
// that. | ||
// | ||
// `TEXT` means the field should be tokenized and indexed, | ||
// along with its term frequency and term positions. | ||
// | ||
// `STORED` means that the field will also be saved | ||
// in a compressed, row-oriented key-value store. | ||
// This store is useful for reconstructing the | ||
// documents that were selected during the search phase. | ||
let title = schema_builder.add_text_field("title", TEXT | STORED); | ||
|
||
let schema = schema_builder.build(); | ||
|
||
// # Indexing documents | ||
// | ||
// Let's create a brand new index. | ||
// | ||
// This will actually just save a meta.json | ||
// with our schema in the directory. | ||
let index = Index::create_in_dir(&index_path, schema.clone())?; | ||
|
||
// To insert a document we will need an index writer. | ||
// There must be only one writer at a time. | ||
// This single `IndexWriter` is already | ||
// multithreaded. | ||
// | ||
// Here we give tantivy a budget of `50MB`. | ||
// Using a bigger memory_arena for the indexer may increase | ||
// throughput, but 50 MB is already plenty. | ||
let mut index_writer = index.writer(50_000_000)?; | ||
|
||
// Let's index our documents! | ||
// We first need a handle on the title and the body field. | ||
|
||
// ### Adding documents | ||
// | ||
index_writer.add_document(doc!( | ||
title => "The Name of the Wind", | ||
))?; | ||
index_writer.add_document(doc!( | ||
title => "The Diary of Muadib", | ||
))?; | ||
index_writer.add_document(doc!( | ||
title => "A Dairy Cow", | ||
))?; | ||
index_writer.add_document(doc!( | ||
title => "The Diary of a Young Girl", | ||
))?; | ||
index_writer.commit()?; | ||
|
||
// ### Committing | ||
// | ||
// At this point our documents are not searchable. | ||
// | ||
// | ||
// We need to call `.commit()` explicitly to force the | ||
// `index_writer` to finish processing the documents in the queue, | ||
// flush the current index to the disk, and advertise | ||
// the existence of new documents. | ||
// | ||
// This call is blocking. | ||
index_writer.commit()?; | ||
|
||
// If `.commit()` returns correctly, then all of the | ||
// documents that have been added are guaranteed to be | ||
// persistently indexed. | ||
// | ||
// In the scenario of a crash or a power failure, | ||
// tantivy behaves as if it has rolled back to its last | ||
// commit. | ||
|
||
// # Searching | ||
// | ||
// ### Searcher | ||
// | ||
// A reader is required first in order to search an index. | ||
// It acts as a `Searcher` pool that reloads itself, | ||
// depending on a `ReloadPolicy`. | ||
// | ||
// For a search server you will typically create one reader for the entire lifetime of your | ||
// program, and acquire a new searcher for every single request. | ||
// | ||
// In the code below, we rely on the 'ON_COMMIT' policy: the reader | ||
// will reload the index automatically after each commit. | ||
let reader = index | ||
.reader_builder() | ||
.reload_policy(ReloadPolicy::OnCommit) | ||
.try_into()?; | ||
|
||
// We now need to acquire a searcher. | ||
// | ||
// A searcher points to a snapshotted, immutable version of the index. | ||
// | ||
// Some search experience might require more than | ||
// one query. Using the same searcher ensures that all of these queries will run on the | ||
// same version of the index. | ||
// | ||
// Acquiring a `searcher` is very cheap. | ||
// | ||
// You should acquire a searcher every time you start processing a request and | ||
// and release it right after your query is finished. | ||
let searcher = reader.searcher(); | ||
|
||
// ### FuzzyTermQuery | ||
{ | ||
let term = Term::from_field_text(title, "Diary"); | ||
let query = FuzzyTermQuery::new(term, 2, true); | ||
|
||
let (top_docs, count) = searcher | ||
.search(&query, &(TopDocs::with_limit(5), Count)) | ||
.unwrap(); | ||
assert_eq!(count, 3); | ||
assert_eq!(top_docs.len(), 3); | ||
for (score, doc_address) in top_docs { | ||
let retrieved_doc = searcher.doc(doc_address)?; | ||
// Note that the score is not lower for the fuzzy hit. | ||
// There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563 | ||
println!("score {score:?} doc {}", schema.to_json(&retrieved_doc)); | ||
// score 1.0 doc {"title":["The Diary of Muadib"]} | ||
// | ||
// score 1.0 doc {"title":["The Diary of a Young Girl"]} | ||
// | ||
// score 1.0 doc {"title":["A Dairy Cow"]} | ||
} | ||
} | ||
|
||
Ok(()) | ||
} |
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters