From b3e5b61fc1ac972f0772151a3748098870742071 Mon Sep 17 00:00:00 2001 From: Julien Letessier Date: Thu, 25 Oct 2012 10:09:49 +0100 Subject: [PATCH] First working version, specs not complete --- ..gemspec | 19 ------- .rspec | 3 ++ LICENSE.txt | 2 +- README.md | 46 +++++++++++++++- Rakefile | 5 ++ acts_as_fuzzy.rb | 95 --------------------------------- fuzzily.gemspec | 12 ++++- lib/..rb | 5 -- lib/fuzzily.rb | 8 +-- lib/fuzzily/migration.rb | 35 ++++++++++++ lib/fuzzily/model.rb | 51 ++++++++++++++++++ lib/fuzzily/searchable.rb | 55 +++++++++++++++++++ lib/fuzzily/trigram.rb | 25 +++++++++ lib/version.rb | 3 -- spec/fuzzily/migration_spec.rb | 33 ++++++++++++ spec/fuzzily/model_spec.rb | 79 +++++++++++++++++++++++++++ spec/fuzzily/searchable_spec.rb | 72 +++++++++++++++++++++++++ spec/fuzzily/trigram_spec.rb | 8 +++ spec/meta_spec.rb | 8 +++ spec/spec_helper.rb | 48 +++++++++++++++++ 20 files changed, 482 insertions(+), 130 deletions(-) delete mode 100644 ..gemspec create mode 100644 .rspec delete mode 100644 acts_as_fuzzy.rb delete mode 100644 lib/..rb create mode 100644 lib/fuzzily/migration.rb create mode 100644 lib/fuzzily/model.rb create mode 100644 lib/fuzzily/searchable.rb create mode 100644 lib/fuzzily/trigram.rb delete mode 100644 lib/version.rb create mode 100644 spec/fuzzily/migration_spec.rb create mode 100644 spec/fuzzily/model_spec.rb create mode 100644 spec/fuzzily/searchable_spec.rb create mode 100644 spec/fuzzily/trigram_spec.rb create mode 100644 spec/meta_spec.rb create mode 100644 spec/spec_helper.rb diff --git a/..gemspec b/..gemspec deleted file mode 100644 index c7bcf04..0000000 --- a/..gemspec +++ /dev/null @@ -1,19 +0,0 @@ -# -*- encoding: utf-8 -*- -lib = File.expand_path('../lib', __FILE__) -$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) -require './version' - -Gem::Specification.new do |gem| - gem.name = "." - gem.version = .::VERSION - gem.authors = ["Julien Letessier"] - gem.email = ["julien.letessier@gmail.com"] - gem.description = %q{TODO: Write a gem description} - gem.summary = %q{TODO: Write a gem summary} - gem.homepage = "" - - gem.files = `git ls-files`.split($/) - gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } - gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) - gem.require_paths = ["lib"] -end diff --git a/.rspec b/.rspec new file mode 100644 index 0000000..0ad0b6c --- /dev/null +++ b/.rspec @@ -0,0 +1,3 @@ +--colour +--format d +--backtrace \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt index c385889..6b5a4e4 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -19,4 +19,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index 63f5424..d681d82 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Fuzzily -TODO: Write a gem description +A fast, trigram-based, database-backed fuzzy string search/match engine for Rails. ## Installation @@ -18,7 +18,49 @@ Or install it yourself as: ## Usage -TODO: Write usage instructions here +You'll need to setup 3 things: + +- a trigram model (your search index) +- its migration +- the model you want to search for + +Create and ActiveRecord model in your app: + + class Trigram < ActiveRecord::Base + include Fuzzily::Model + end + +Create a migration file: + + class AddTrigramsModel < ActiveRecord::Migration + extend Fuzzily::Migration + + # if you named your trigram model anything but 'Trigram', e.g. 'CustomTrigram' + # trigrams_table_name = :custom_trigrams + end + +Instrument your model (your searchable fields do not have to be stored, they can be dynamic methods too): + + class MyStuff < ActiveRecord::Base + # assuming my_stuffs has a 'name' attribute + fuzzily_searchable :name + end + +Index your model (will happen automatically for new/updated records): + + MyStuff.find_each do |record| + record.update_fuzzy_name! + end + +Search! + + MyStuff.find_by_fuzzy_name('Some Name', :limit => 10) + # => records + + +## License + +MIT licence. Quite permissive if you ask me. ## Contributing diff --git a/Rakefile b/Rakefile index 2995527..cebdd46 100644 --- a/Rakefile +++ b/Rakefile @@ -1 +1,6 @@ require "bundler/gem_tasks" +require 'rspec/core/rake_task' + +RSpec::Core::RakeTask.new(:spec) + +task :default => :spec \ No newline at end of file diff --git a/acts_as_fuzzy.rb b/acts_as_fuzzy.rb deleted file mode 100644 index d8b978d..0000000 --- a/acts_as_fuzzy.rb +++ /dev/null @@ -1,95 +0,0 @@ -module Fuzzily - module StringExt - def trigrams - normalized_words.map do |word| - (0..(word.length - 3)).map { |index| word[index,3] } - end.flatten.uniq - end - - private - - # Remove accents, downcase, replace spaces and word start with '*', - # return list of normalized words - def normalized_words - self.split(/\s+/).map { |word| - Iconv.iconv('ascii//translit//ignore', 'utf-8', word).first.downcase.gsub(/\W/,'') - }. - delete_if(&:empty?). - map { |word| - "**#{word}" - } - end - end - - module Trigram - # Needs fields: trigram, owner_type, owner_id, score - # Needs index on [owner_type, trigram] and [owner_type, owner_id] - - def self.included(by) - by.kind_of?(ActiveRecord::Base) or raise 'Not included in an ActiveRecord subclass' - by.class_eval do - return if class_variable_get(:@@fuzzily_trigram_model) - - belongs_to :owner, :polymorphic => true - validates_presence_of :owner - validates_uniqueness_of :trigram, :scope => [:owner_type, :owner_id] - validates_length_of :trigram, :is => 3 - validates_presence_of :score - - class_variable_set(:@@fuzzily_trigram_model, true) - end - end - - # options: - # - model (mandatory) - # - field (mandatory) - # - limit (default 10) - def matches_for(options = {}) - options[:limit] ||= 10 - self. - scoped(:select => 'owner_id, owner_type, SUM(score) AS score'). - scoped(:group => :owner_id). - scoped(:order => 'score DESC', :limit => options[:limit]). - scoped(:conditions => { :owner_type => options[:model], :field => options[:field] }). - end - end - - - module Searchable - # fuzzily_searchable [, ...] [, ] - def fuzzily_searchable(fields*) - options = args.last.kind_of?(Hash) ? args.pop : {} - - fields.each do |field| - make_field_fuzzily_searchable(field, options) - end - end - - private - - def make_field_fuzzily_searchable(field, options={}) - trigram_class_name = options.fetch(:class_name, 'Trigram') - trigram_association = "trigrams_for_#{field}".to_sym - has_many trigram_association, - :class_name => trigram_class_name, - :as => :owner, - :conditions => { :field => field }, - :dependent => destroy - - define_method "find_by_fuzzy_#{field}".to_sym do |pattern, options={}| - Trigram.matches_for(options.merge(:model => self.name, :field => field)) - end - - after_save do |record| - next unless record.send("#{field}_changed?".to_sym) - self.send(trigram_association).destroy_all - self.send(field).extend(StringExt).trigrams.each do |trigram| - self.send(trigram_association).create!(:score => 1, :trigram => trigram) - end - end - end - - end -end - # ActiveRecord::Base.extend(FuzzilySearchable) - diff --git a/fuzzily.gemspec b/fuzzily.gemspec index 7aba098..7f0c00c 100644 --- a/fuzzily.gemspec +++ b/fuzzily.gemspec @@ -8,10 +8,18 @@ Gem::Specification.new do |gem| gem.version = Fuzzily::VERSION gem.authors = ["Julien Letessier"] gem.email = ["julien.letessier@gmail.com"] - gem.description = %q{TODO: Write a gem description} - gem.summary = %q{TODO: Write a gem summary} + gem.description = %q{Fast fuzzy string matching for rails} + gem.summary = %q{A fast, trigram-based, database-backed fuzzy string search/match engine for Rails.} gem.homepage = "" + gem.add_runtime_dependency 'activerecord', '~> 2.3' + + gem.add_development_dependency 'rspec' + gem.add_development_dependency 'appraisal' + gem.add_development_dependency 'pry' + gem.add_development_dependency 'pry-nav' + gem.add_development_dependency 'sqlite3' + gem.files = `git ls-files`.split($/) gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) diff --git a/lib/..rb b/lib/..rb deleted file mode 100644 index 2b88097..0000000 --- a/lib/..rb +++ /dev/null @@ -1,5 +0,0 @@ -require "./version" - -module . - # Your code goes here... -end diff --git a/lib/fuzzily.rb b/lib/fuzzily.rb index a7c9eef..88f3de5 100644 --- a/lib/fuzzily.rb +++ b/lib/fuzzily.rb @@ -1,5 +1,7 @@ require "fuzzily/version" +require "fuzzily/searchable" +require "fuzzily/migration" +require "fuzzily/model" +require "active_record" -module Fuzzily - # Your code goes here... -end +ActiveRecord::Base.extend(Fuzzily::Searchable) \ No newline at end of file diff --git a/lib/fuzzily/migration.rb b/lib/fuzzily/migration.rb new file mode 100644 index 0000000..b3f661d --- /dev/null +++ b/lib/fuzzily/migration.rb @@ -0,0 +1,35 @@ +require 'active_record' + +module Fuzzily + module Migration + + def trigrams_table_name=(custom_name) + @trigrams_table_name = custom_name + end + + def trigrams_table_name + @trigrams_table_name ||= :trigrams + end + + def up + create_table trigrams_table_name do |t| + t.string :trigram, :limit => 3 + t.integer :score + t.integer :owner_id + t.string :owner_type + t.string :fuzzy_field + end + + add_index trigrams_table_name, + [:owner_type, :fuzzy_field, :trigram, :owner_id, :score], + :name => :index_for_match + add_index trigrams_table_name, + [:owner_type, :owner_id], + :name => :index_by_owner + end + + def down + drop_table trigrams_table_name + end + end +end diff --git a/lib/fuzzily/model.rb b/lib/fuzzily/model.rb new file mode 100644 index 0000000..62e85ab --- /dev/null +++ b/lib/fuzzily/model.rb @@ -0,0 +1,51 @@ +module Fuzzily + module Model + # Needs fields: trigram, owner_type, owner_id, score + # Needs index on [owner_type, trigram] and [owner_type, owner_id] + + def self.included(by) + by.ancestors.include?(ActiveRecord::Base) or raise 'Not included in an ActiveRecord subclass' + by.class_eval do + return if class_variable_defined?(:@@fuzzily_trigram_model) + + belongs_to :owner, :polymorphic => true + validates_presence_of :owner + validates_uniqueness_of :trigram, :scope => [:owner_type, :owner_id] + validates_length_of :trigram, :is => 3 + validates_presence_of :score + validates_presence_of :fuzzy_field + + named_scope :for_model, lambda { |model| { + :conditions => { :owner_type => model.kind_of?(Class) ? model.name : model } + }} + named_scope :for_field, lambda { |field_name| { + :conditions => { :fuzzy_field => field_name } + }} + named_scope :with_trigram, lambda { |trigrams| { + :conditions => { :trigram => trigrams } + }} + + class_variable_set(:@@fuzzily_trigram_model, true) + end + + by.extend(ClassMethods) + end + + module ClassMethods + # options: + # - model (mandatory) + # - field (mandatory) + # - limit (default 10) + def matches_for(text, options = {}) + options[:limit] ||= 10 + self. + scoped(:select => 'owner_id, owner_type, SUM(score) AS score'). + scoped(:group => :owner_id). + scoped(:order => 'score DESC', :limit => options[:limit]). + with_trigram(text.extend(String).trigrams). + map(&:owner) + end + end + end +end + diff --git a/lib/fuzzily/searchable.rb b/lib/fuzzily/searchable.rb new file mode 100644 index 0000000..41aa166 --- /dev/null +++ b/lib/fuzzily/searchable.rb @@ -0,0 +1,55 @@ +require 'fuzzily/trigram' + +module Fuzzily + module Searchable + # fuzzily_searchable [, ...] [, ] + def fuzzily_searchable(*fields) + options = fields.last.kind_of?(Hash) ? fields.pop : {} + + fields.each do |field| + make_field_fuzzily_searchable(field, options) + end + end + + private + + def make_field_fuzzily_searchable(field, options={}) + class_variable_defined?(:"@@fuzzily_searchable_#{field}") and return + + trigram_class_name = options.fetch(:class_name, 'Trigram') + trigram_association = "trigrams_for_#{field}".to_sym + update_trigrams_method = "update_fuzzy_#{field}!".to_sym + + has_many trigram_association, + :class_name => trigram_class_name, + :as => :owner, + :conditions => { :fuzzy_field => field.to_s }, + :dependent => :destroy + + singleton_class.send(:define_method,"find_by_fuzzy_#{field}".to_sym) do |*args| + case args.size + when 1 then pattern = args.first ; options = {} + when 2 then pattern, options = args + else raise 'Wrong # of arguments' + end + Trigram.scoped(options).for_model(self.name).for_field(field).matches(pattern) + end + + define_method update_trigrams_method do + self.send(trigram_association).destroy_all + self.send(field).extend(String).trigrams.each do |trigram| + self.send(trigram_association).create!(:score => 1, :trigram => trigram) + end + end + + after_save do |record| + next unless record.send("#{field}_changed?".to_sym) + self.send(update_trigrams_method) + end + + class_variable_set(:"@@fuzzily_searchable_#{field}", true) + self + end + + end +end diff --git a/lib/fuzzily/trigram.rb b/lib/fuzzily/trigram.rb new file mode 100644 index 0000000..bf31e44 --- /dev/null +++ b/lib/fuzzily/trigram.rb @@ -0,0 +1,25 @@ +require 'iconv' + +module Fuzzily + module String + def trigrams + normalized_words.map do |word| + (0..(word.length - 3)).map { |index| word[index,3] } + end.flatten.uniq + end + + private + + # Remove accents, downcase, replace spaces and word start with '*', + # return list of normalized words + def normalized_words + self.split(/\s+/).map { |word| + Iconv.iconv('ascii//translit//ignore', 'utf-8', word).first.downcase.gsub(/\W/,'') + }. + delete_if(&:empty?). + map { |word| + "**#{word}" + } + end + end +end diff --git a/lib/version.rb b/lib/version.rb deleted file mode 100644 index 5132cfc..0000000 --- a/lib/version.rb +++ /dev/null @@ -1,3 +0,0 @@ -module . - VERSION = "0.0.1" -end diff --git a/spec/fuzzily/migration_spec.rb b/spec/fuzzily/migration_spec.rb new file mode 100644 index 0000000..af29a1e --- /dev/null +++ b/spec/fuzzily/migration_spec.rb @@ -0,0 +1,33 @@ +require 'spec_helper' + +describe Fuzzily::Migration do + subject { Class.new(ActiveRecord::Migration).extend(described_class) } + + it 'is a proper migration' do + subject.ancestors.should include(ActiveRecord::Migration) + end + + it 'applies cleanly' do + silence_stream(STDOUT) { subject.up } + end + + it 'rolls back cleanly' do + silence_stream(STDOUT) { subject.up ; subject.down } + end + + it 'has a customizable table name' do + subject.trigrams_table_name = :foobars + silence_stream(STDOUT) { subject.up } + expect { + ActiveRecord::Base.connection.execute('INSERT INTO `foobars` (score) VALUES (1)') + }.to_not raise_error + end + + it 'results in a functional model' do + silence_stream(STDOUT) { subject.up } + model_class = Class.new(ActiveRecord::Base) + model_class.table_name = 'trigrams' + model_class.create(:trigram => 'abc') + model_class.count.should == 1 + end +end diff --git a/spec/fuzzily/model_spec.rb b/spec/fuzzily/model_spec.rb new file mode 100644 index 0000000..cf1dda8 --- /dev/null +++ b/spec/fuzzily/model_spec.rb @@ -0,0 +1,79 @@ +require 'spec_helper' + +describe Fuzzily::Model do + subject do + Class.new(ActiveRecord::Base).tap do |model| + model.table_name = :trigrams + end + end + + before(:each) { prepare_trigrams_table } + + it 'can be included into an ActiveRecord model' do + subject.send(:include, described_class) + end + + it 'can be included twice' do + subject.send(:include, described_class) + subject.send(:include, described_class) + end + + context '(derived model instance)' do + before { prepare_owners_table } + let(:model) { subject.send(:include, described_class) } + + it 'belongs to an owner' do + model.new.should respond_to(:owner) + end + + describe '.create' do + it 'can create instances' do + model.create(:owner => Stuff.create, :score => 1, :trigram => 'abc', :fuzzy_field => :name) + end + end + + describe '.matches_for' do + before do + @paris = Stuff.create(:name => 'Paris') + %w(**p *pa par ari ris).each do |trigram| + model.create(:owner => @paris, :score => 1, :fuzzy_field => :name, :trigram => trigram) + end + end + + it 'finds matches' do + model.matches_for('Paris').should == [@paris] + end + + it 'finds close matches' do + model.matches_for('Piriss').should == [@paris] + end + + it 'does not confuse fields' do + model.for_field(:name).matches_for('Paris').should == [@paris] + model.for_field(:data).matches_for('Paris').should be_empty + end + + it 'does not confuse owner types' do + model.for_model(Stuff).matches_for('Paris').should == [@paris] + model.for_model(Object).matches_for('Paris').should be_empty + end + + context '(with more than one entry)' do + before do + @palma = Stuff.create(:name => 'Palma') + %w(**p *pa pal alm lma).each do |trigram| + model.create(:owner => @palma, :score => 1, :fuzzy_field => :name, :trigram => trigram) + end + end + + it 'honors the limit option' do + model.matches_for('Palmyre', :limit => 1).should == [@palma] + end + + it 'returns ordered results' do + model.matches_for('Palmyre').should == [@palma, @paris] + end + end + end + end +end diff --git a/spec/fuzzily/searchable_spec.rb b/spec/fuzzily/searchable_spec.rb new file mode 100644 index 0000000..680d33a --- /dev/null +++ b/spec/fuzzily/searchable_spec.rb @@ -0,0 +1,72 @@ +require 'spec_helper' + +describe Fuzzily::Searchable do + # Prepare ourselves a Trigram repository + class Trigram < ActiveRecord::Base + include Fuzzily::Model + end + + before(:each) { prepare_trigrams_table } + before(:each) { prepare_owners_table } + + subject do + Stuff.clone.class_eval do + def self.name ; 'Stuff' ; end + self + end + end + + describe '.fuzzily_searchable' do + it 'is available to all of ActiveRecord' do + subject.should respond_to(:fuzzily_searchable) + end + + it 'adds a find_by_fuzzy_ method' do + subject.fuzzily_searchable :name + subject.should respond_to(:find_by_fuzzy_name) + end + + it 'is idempotent' do + subject.fuzzily_searchable :name + subject.fuzzily_searchable :name + subject.should respond_to(:find_by_fuzzy_name) + end + + it 'creates the trigrams_for_ association' do + subject.fuzzily_searchable :name + subject.new.should respond_to(:trigrams_for_name) + end + end + + describe '(callbacks)' do + it 'generates trigram records on creation' do + subject.fuzzily_searchable :name + subject.create(:name => 'Paris') + subject.last.trigrams_for_name.should_not be_empty + end + + it 'generates the correct trigrams' do + subject.fuzzily_searchable :name + record = subject.create(:name => 'FOO') + Trigram.first.trigram.should == '**f' + Trigram.first.owner_id.should == record.id + Trigram.first.owner_type.should == 'Stuff' + end + + it 'updates all trigram records on save' do + subject.fuzzily_searchable :name + subject.create(:name => 'Paris') + subject.first.update_attribute :name, 'Rome' + Trigram.all.map(&:trigram).should =~ %w(**r *ro rom ome) + end + end + + describe '#find_by_fuzzy_' do + it 'works' + end + + describe '#update_fuzzy_!' do + it 'works' + end + +end \ No newline at end of file diff --git a/spec/fuzzily/trigram_spec.rb b/spec/fuzzily/trigram_spec.rb new file mode 100644 index 0000000..9def454 --- /dev/null +++ b/spec/fuzzily/trigram_spec.rb @@ -0,0 +1,8 @@ +require 'spec_helper' + +describe Fuzzily::String do + it 'splits strings into trigrams' + it 'removes accents' + it 'removes symbols' + it 'handles multi word strings' +end \ No newline at end of file diff --git a/spec/meta_spec.rb b/spec/meta_spec.rb new file mode 100644 index 0000000..8c5db91 --- /dev/null +++ b/spec/meta_spec.rb @@ -0,0 +1,8 @@ +require 'spec_helper' +# This tests our RSpec setup works + +describe 'Test suite' do + it 'has a working ActiveRecord connection' do + ActiveRecord::Base.connection.execute('SELECT * FROM `sqlite_master`') + end +end \ No newline at end of file diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..193cc7c --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,48 @@ +require 'fuzzily' +require 'pathname' +require 'yaml' + +Database = Pathname.new 'test.sqlite3' + +# A test model we'll need as a source of trigrams +class Stuff < ActiveRecord::Base ; end +class StuffMigration < ActiveRecord::Migration + def self.up + create_table :stuffs do |t| + t.string :name + t.string :data + t.timestamps + end + end + + def self.down + drop_table :stuffs + end +end + +RSpec.configure do |config| + config.before(:each) do + # Setup test database + ActiveRecord::Base.establish_connection( + :adapter => 'sqlite3', + :database => Database.to_s + ) + + def prepare_trigrams_table + silence_stream(STDOUT) do + Class.new(ActiveRecord::Migration).extend(Fuzzily::Migration).up + end + end + + def prepare_owners_table + silence_stream(STDOUT) do + StuffMigration.up + end + end + + end + + config.after(:each) do + Database.delete if Database.exist? + end +end \ No newline at end of file