Skip to content

Commit

Permalink
Basic search table implementation; fix #6766
Browse files Browse the repository at this point in the history
This is a limited implementation, so please back up your database before
running this search feature branch from now on as we may change things.

It's using a Unicode Snowball stemming tokenizer available from
https://github.com/littlesavage/sqlite3-unicodesn, also handily
available in src/sqlite3-unicodesn in Geary.  If you want to look at the
search tables on the command line, cd into the unicodesn source folder,
run make and make install, then load sqlite3 like:

   sqlite3 -cmd '.load unicodesn.sqlext' /path/to/geary.db
  • Loading branch information
chazomaticus committed May 17, 2013
1 parent f5ba36c commit a4f680b
Show file tree
Hide file tree
Showing 103 changed files with 30,586 additions and 43 deletions.
23 changes: 23 additions & 0 deletions COPYING.snowball
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Copyright (c) 2001, Dr Martin Porter, and (for the Java developments) Copyright
(c) 2002, Richard Boulton
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 changes: 24 additions & 0 deletions debian/copyright
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,27 @@ License: LGPL-2.1
On Debian systems, the complete text of the GNU Lesser General Public
License 2.1, can be found in /usr/share/common-licenses/LGPL-2.1.

Files: src/sqlite3-unicodesn/libstemmer_c/*
Copyright: 2001, Dr Martin Porter
2002, Richard Boulton
License: BSD-2-clause
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
.
Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1 change: 1 addition & 0 deletions sql/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ install(FILES version-006.sql DESTINATION ${SQL_DEST})
install(FILES version-007.sql DESTINATION ${SQL_DEST})
install(FILES version-008.sql DESTINATION ${SQL_DEST})
install(FILES version-009.sql DESTINATION ${SQL_DEST})
install(FILES version-010.sql DESTINATION ${SQL_DEST})
5 changes: 5 additions & 0 deletions sql/version-010.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
--
-- Dummy database upgrade to add MessageSearchTable, whose parameters depend on
-- things we need at run-time. See src/engine/imap-db/imap-db-database.vala in
-- post_upgrade() for the code that runs the upgrade.
--
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ OPTIONS
)

add_library(geary-static STATIC ${ENGINE_VALA_C})
target_link_libraries(geary-static ${DEPS_LIBRARIES} gthread-2.0)
target_link_libraries(geary-static ${DEPS_LIBRARIES} sqlite3-unicodesn gthread-2.0)

# Geary client app
#################################################
Expand Down Expand Up @@ -580,3 +580,4 @@ set_property(
gearyd
)

add_subdirectory(sqlite3-unicodesn)
9 changes: 9 additions & 0 deletions src/engine/api/geary-email.vala
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,15 @@ public class Geary.Email : BaseObject {
this.attachments.add_all(attachments);
}

/**
 * Builds the attachment text handed to the search index: the filename of each
 * entry in the attachments collection, each one followed by a newline.
 */
public string get_searchable_attachment_list() {
    StringBuilder filenames = new StringBuilder();
    foreach (Geary.Attachment item in attachments)
        filenames.append(item.filename).append("\n");

    return filenames.str;
}

/**
* This method requires Geary.Email.Field.HEADER and Geary.Email.Field.BODY be present.
* If not, EngineError.INCOMPLETE_MESSAGE is thrown.
Expand Down
12 changes: 12 additions & 0 deletions src/engine/common/common-message-data.vala
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ public abstract class Geary.MessageData.AbstractMessageData : BaseObject {
public abstract string to_string();
}

/**
 * Allows message data fields to define how they'll expose themselves to search
 * queries.
 *
 * Implementors supply a single method, to_searchable_string(), which produces
 * the text that represents the field when it is searched.
 */
public interface Geary.MessageData.SearchableMessageData {
/**
 * Return a string representing the data as a corpus of text to be searched
 * against.  Return values from this may be stored in the search index, so the
 * result should contain only text that is meant to be matched by a query.
 */
public abstract string to_searchable_string();
}

public abstract class Geary.MessageData.StringMessageData : AbstractMessageData,
Gee.Hashable<StringMessageData> {
public string value { get; private set; }
Expand Down
71 changes: 71 additions & 0 deletions src/engine/imap-db/imap-db-database.vala
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* (version 2.1 or later). See the COPYING file in this distribution.
*/

extern int sqlite3_unicodesn_register_tokenizer(Sqlite.Database db);

private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
private const string DB_FILENAME = "geary.db";
private string account_owner_email;
Expand Down Expand Up @@ -32,6 +34,10 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
case 6:
post_upgrade_encode_folder_names();
break;

case 10:
post_upgrade_add_search_table();
break;
}
}

Expand Down Expand Up @@ -77,11 +83,76 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
}
}

// Version 10.
//
// Creates the MessageSearchTable fts4 virtual table, then walks MessageTable
// in id order, `limit` rows per read-write transaction, and adds every
// existing message to the new table.
//
// A single message that cannot be loaded or indexed is logged and skipped.
// If a whole chunk's transaction fails, population stops: that failure can
// happen before `done` is set, so carrying on would loop forever whenever the
// error is persistent.
private void post_upgrade_add_search_table() {
    try {
        // This can't go in the .sql file because its schema (the stemmer
        // algorithm) is determined at runtime.
        string stemmer = "english"; // TODO
        exec("""
CREATE VIRTUAL TABLE MessageSearchTable USING fts4(
id INTEGER PRIMARY KEY,
body,
attachment,
subject,
from_field,
receivers,
cc,
bcc,
tokenize=unicodesn "stemmer=%s",
prefix="2,4,6,8,10",
);
""".printf(stemmer));
    } catch (Error e) {
        error("Error creating search table: %s", e.message);
    }

    bool done = false;
    int limit = 100;
    for (int offset = 0; !done; offset += limit) {
        try {
            exec_transaction(Db.TransactionType.RW, (cx) => {
                Db.Statement stmt = prepare(
                    "SELECT id FROM MessageTable ORDER BY id LIMIT ? OFFSET ?");
                stmt.bind_int(0, limit);
                stmt.bind_int(1, offset);

                Db.Result result = stmt.exec();
                // An empty page means every row of MessageTable has been visited.
                if (result.finished)
                    done = true;

                while (!result.finished) {
                    int64 id = result.rowid_at(0);

                    try {
                        MessageRow row = Geary.ImapDB.Folder.do_fetch_message_row(
                            cx, id, Geary.ImapDB.Folder.REQUIRED_FOR_SEARCH, null);
                        Geary.Email email = row.to_email(-1, new Geary.ImapDB.EmailIdentifier(id));
                        Geary.ImapDB.Folder.do_add_attachments(cx, email, id);

                        Geary.ImapDB.Folder.do_add_email_to_search_table(cx, id, email, null);
                    } catch (Error e) {
                        // Best effort: one bad message must not block the rest.
                        // The id is rendered with to_string() so the format does
                        // not depend on the platform's int64 length modifier.
                        debug("Error adding message %s to the search table: %s",
                            id.to_string(), e.message);
                    }

                    result.next();
                }

                return Db.TransactionOutcome.DONE;
            });
        } catch (Error e) {
            debug("Error populating search table: %s", e.message);

            // The chunk query itself failed, so `done` may never have been
            // set.  Stop here instead of retrying at ever larger offsets.
            done = true;
        }
    }
}

// Configures a database connection: sets the recommended busy timeout, turns
// on foreign keys and recursive triggers, sets synchronous mode to OFF, and
// registers the unicodesn tokenizer (used by MessageSearchTable) on the
// connection's underlying Sqlite.Database.
private void on_prepare_database_connection(Db.Connection cx) throws Error {
cx.set_busy_timeout_msec(Db.Connection.RECOMMENDED_BUSY_TIMEOUT_MSEC);
cx.set_foreign_keys(true);
cx.set_recursive_triggers(true);
cx.set_synchronous(Db.SynchronousMode.OFF);
// NOTE(review): the int returned here is discarded.  It is presumably an
// SQLite result code -- confirm whether a failed registration should be
// reported, since the search table depends on this tokenizer.
sqlite3_unicodesn_register_tokenizer(cx.db);
}
}

77 changes: 75 additions & 2 deletions src/engine/imap-db/imap-db-folder.vala
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics {
public const Geary.Email.Field REQUIRED_FOR_DUPLICATE_DETECTION = Geary.Email.Field.PROPERTIES;
// Email fields requested when a message row is fetched so that it can be
// written to the search table: originators, receivers, subject, header and
// body, plus whatever fields attachments require.
public const Geary.Email.Field REQUIRED_FOR_SEARCH =
Geary.Email.Field.ORIGINATORS | Geary.Email.Field.RECEIVERS |
Geary.Email.Field.SUBJECT | Geary.Email.Field.HEADER | Geary.Email.Field.BODY |
Geary.Attachment.REQUIRED_FIELDS;

private const int LIST_EMAIL_CHUNK_COUNT = 5;
private const int LIST_EMAIL_FIELDS_CHUNK_COUNT = 500;
Expand Down Expand Up @@ -909,6 +913,8 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics {
if (email.fields.fulfills(Attachment.REQUIRED_FIELDS))
do_save_attachments(cx, message_id, email.get_message().get_attachments(), cancellable);

do_add_email_to_search_table(cx, message_id, email, cancellable);

MessageAddresses message_addresses =
new MessageAddresses.from_email(account_owner_email, email);
foreach (Contact contact in message_addresses.contacts)
Expand All @@ -918,6 +924,38 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics {
return true;
}

/**
 * Inserts one row for the given message into MessageSearchTable.
 *
 * The searchable body and recipients come from the email's message; if either
 * cannot be produced the error is dropped and NULL is stored for that column.
 * Subject, from, cc and bcc are stored as NULL when the email does not carry
 * them.  Errors from preparing, binding or executing the statement propagate
 * to the caller.
 */
internal static void do_add_email_to_search_table(Db.Connection cx, int64 message_id,
    Geary.Email email, Cancellable? cancellable) throws Error {
    string? searchable_body = null;
    try {
        searchable_body = email.get_message().get_searchable_body();
    } catch (Error err) {
        // Deliberately ignored: the column is simply left NULL.
    }

    string? searchable_recipients = null;
    try {
        searchable_recipients = email.get_message().get_searchable_recipients();
    } catch (Error err) {
        // Deliberately ignored: the column is simply left NULL.
    }

    Db.Statement insert = cx.prepare("""
INSERT INTO MessageSearchTable
(id, body, attachment, subject, from_field, receivers, cc, bcc)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""");

    insert.bind_rowid(0, message_id);
    insert.bind_string(1, searchable_body);
    insert.bind_string(2, email.get_searchable_attachment_list());

    string? subject_text = null;
    if (email.subject != null)
        subject_text = email.subject.to_searchable_string();
    insert.bind_string(3, subject_text);

    string? from_text = null;
    if (email.from != null)
        from_text = email.from.to_searchable_string();
    insert.bind_string(4, from_text);

    insert.bind_string(5, searchable_recipients);

    string? cc_text = null;
    if (email.cc != null)
        cc_text = email.cc.to_searchable_string();
    insert.bind_string(6, cc_text);

    string? bcc_text = null;
    if (email.bcc != null)
        bcc_text = email.bcc.to_searchable_string();
    insert.bind_string(7, bcc_text);

    insert.exec_insert();
}

private Gee.List<Geary.Email>? do_list_email(Db.Connection cx, Gee.List<LocationIdentifier> locations,
Geary.Email.Field required_fields, ListFlags flags, Cancellable? cancellable) throws Error {
Gee.List<Geary.Email> emails = new Gee.ArrayList<Geary.Email>();
Expand Down Expand Up @@ -1242,6 +1280,38 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics {
updated_contacts = message_addresses.contacts;
}

/**
 * Overwrites the MessageSearchTable row whose id is message_id with the
 * searchable text of the given email.
 *
 * The searchable body and recipients come from the email's message; if either
 * cannot be produced the error is dropped and NULL is written for that column.
 * Subject, from, cc and bcc are written as NULL when the email does not carry
 * them.  Errors from preparing, binding or executing the statement propagate
 * to the caller.
 */
private void do_merge_email_in_search_table(Db.Connection cx, int64 message_id,
    Geary.Email email, Cancellable? cancellable) throws Error {
    string? searchable_body = null;
    try {
        searchable_body = email.get_message().get_searchable_body();
    } catch (Error err) {
        // Deliberately ignored: the column is simply set to NULL.
    }

    string? searchable_recipients = null;
    try {
        searchable_recipients = email.get_message().get_searchable_recipients();
    } catch (Error err) {
        // Deliberately ignored: the column is simply set to NULL.
    }

    Db.Statement update = cx.prepare("""
UPDATE MessageSearchTable
SET body=?, attachment=?, subject=?, from_field=?, receivers=?, cc=?, bcc=?
WHERE id=?
""");

    update.bind_string(0, searchable_body);
    update.bind_string(1, email.get_searchable_attachment_list());

    string? subject_text = null;
    if (email.subject != null)
        subject_text = email.subject.to_searchable_string();
    update.bind_string(2, subject_text);

    string? from_text = null;
    if (email.from != null)
        from_text = email.from.to_searchable_string();
    update.bind_string(3, from_text);

    update.bind_string(4, searchable_recipients);

    string? cc_text = null;
    if (email.cc != null)
        cc_text = email.cc.to_searchable_string();
    update.bind_string(5, cc_text);

    string? bcc_text = null;
    if (email.bcc != null)
        bcc_text = email.bcc.to_searchable_string();
    update.bind_string(6, bcc_text);

    // The WHERE clause parameter comes last in the statement.
    update.bind_rowid(7, message_id);

    update.exec();
}

private void do_merge_email(Db.Connection cx, int64 message_id, Geary.Email email,
out Gee.Collection<Contact> updated_contacts, Cancellable? cancellable) throws Error {
assert(message_id != Db.INVALID_ROWID);
Expand All @@ -1253,13 +1323,14 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics {
return;

// fetch message from database and merge in this email
MessageRow row = do_fetch_message_row(cx, message_id, email.fields | Attachment.REQUIRED_FIELDS,
cancellable);
MessageRow row = do_fetch_message_row(cx, message_id,
email.fields | REQUIRED_FOR_SEARCH | Attachment.REQUIRED_FIELDS, cancellable);
Geary.Email.Field db_fields = row.fields;
row.merge_from_remote(email);

// Build the combined email from the merge, which will be used to save the attachments
Geary.Email combined_email = row.to_email(email.position, email.id);
do_add_attachments(cx, combined_email, message_id, cancellable);

// Merge in any fields in the submitted email that aren't already in the database or are mutable
if (((db_fields & email.fields) != email.fields) || email.fields.is_any_set(Geary.Email.MUTABLE_FIELDS)) {
Expand All @@ -1272,6 +1343,8 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics {
cancellable);
}
}

do_merge_email_in_search_table(cx, message_id, combined_email, cancellable);
}

private static Gee.List<Geary.Attachment>? do_list_attachments(Db.Connection cx, int64 message_id,
Expand Down
33 changes: 32 additions & 1 deletion src/engine/rfc822/rfc822-mailbox-address.vala
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
* (version 2.1 or later). See the COPYING file in this distribution.
*/

public class Geary.RFC822.MailboxAddress : BaseObject {
public class Geary.RFC822.MailboxAddress : Geary.MessageData.SearchableMessageData, BaseObject {
internal delegate string ListToStringDelegate(MailboxAddress address);

public string? name { get; private set; }
public string? source_route { get; private set; }
public string mailbox { get; private set; }
Expand Down Expand Up @@ -124,8 +126,37 @@ public class Geary.RFC822.MailboxAddress : BaseObject {
: "%s <%s>".printf(GMime.utils_quote_string(name), address);
}

/**
* Returns the text used to represent this address in search: the value of
* get_full_address().
*
* See Geary.MessageData.SearchableMessageData.
*/
public string to_searchable_string() {
return get_full_address();
}

/**
* Returns this address as a string: the value of get_full_address().
*/
public string to_string() {
return get_full_address();
}

/**
 * Renders a list of addresses as a single string, using to_s to render each
 * address.
 *
 * Returns empty when the list has no entries and the rendering of the sole
 * entry when it has exactly one.  Otherwise the renderings are concatenated
 * in list order, with ", " written before an entry whenever the text built so
 * far is not empty.
 */
internal static string list_to_string(Gee.List<MailboxAddress> addrs,
    string empty, ListToStringDelegate to_s) {
    int count = addrs.size;

    if (count == 0)
        return empty;

    if (count == 1)
        return to_s(addrs[0]);

    StringBuilder joined = new StringBuilder();
    foreach (MailboxAddress address in addrs) {
        bool has_text = !String.is_empty(joined.str);
        if (has_text)
            joined.append(", ");

        joined.append(to_s(address));
    }

    return joined.str;
}
}

Loading

0 comments on commit a4f680b

Please sign in to comment.