diff --git a/.rspec b/.rspec
new file mode 100644
index 0000000..b83d9b7
--- /dev/null
+++ b/.rspec
@@ -0,0 +1,3 @@
+--color
+--format documentation
+--require spec_helper
diff --git a/Rakefile b/Rakefile
index a9b8975..6d97660 100644
--- a/Rakefile
+++ b/Rakefile
@@ -1,20 +1,22 @@
-require 'bundler/setup'
+namespace :profile do
+  require_relative "profile/profile"
 
-task default: 'test:unit'
-task test: 'test:unit'
+  task :normalize_url do |task|
+    require "twingly/url/normalizer"
 
-require 'rake/testtask'
-namespace :test do
-  Rake::TestTask.new(:unit) do |test|
-    test.pattern = "test/unit/*_test.rb"
-    test.libs << 'lib'
-    test.libs << 'test'
+    Profile.measure "normalizing a short URL", 1000 do
+      Twingly::URL::Normalizer.normalize_url('http://www.duh.se/')
+    end
   end
+end
+
+begin
+  require "rspec/core/rake_task"
+
+  task default: "spec"
 
-  Rake::TestTask.new(:profile) do |test|
-    test.pattern = "test/profile/*_test.rb"
-    test.libs << 'lib'
-    test.libs << 'test'
-    test.libs << 'test/lib'
+  RSpec::Core::RakeTask.new(:spec) do |task|
+    task.pattern = "spec/lib/**/*_spec.rb"
   end
+rescue LoadError
 end
diff --git a/lib/twingly/url.rb b/lib/twingly/url.rb
index 757c906..6a45dd8 100644
--- a/lib/twingly/url.rb
+++ b/lib/twingly/url.rb
@@ -3,13 +3,13 @@
 
 PublicSuffix::List.private_domains = false
 
-SCHEMES = %w(http https)
-
 module Twingly
   module URL
     module_function
 
     UrlObject = Struct.new(:url, :domain) do
+      SCHEMES = %w(http https)
+
       def valid?
         url && domain && SCHEMES.include?(url.normalized_scheme)
       end
diff --git a/profile/profile.rb b/profile/profile.rb
new file mode 100644
index 0000000..9905c88
--- /dev/null
+++ b/profile/profile.rb
@@ -0,0 +1,23 @@
+require "ruby-prof"
+
+class Profile
+  def self.measure(name, count, &block)
+    RubyProf.start
+
+    count.times do
+      block.call
+    end
+
+    result = RubyProf.stop
+    result_directory = "tmp"
+    Dir.mkdir(result_directory) unless File.exists?(result_directory)
+    printer = RubyProf::MultiPrinter.new(result)
+    printer.print(path: result_directory)
+
+    puts "Measured #{name} #{count} times"
+    puts "Generated reports:"
+    Dir.entries(result_directory).reject { |entry| entry.end_with?(".") }.each do |file|
+      puts "  #{result_directory}/#{file}"
+    end
+  end
+end
diff --git a/spec/lib/twingly/url/hasher_spec.rb b/spec/lib/twingly/url/hasher_spec.rb
new file mode 100644
index 0000000..433daee
--- /dev/null
+++ b/spec/lib/twingly/url/hasher_spec.rb
@@ -0,0 +1,33 @@
+require "spec_helper"
+
+describe Twingly::URL::Hasher do
+  describe ".taskdb_hash" do
+    it "returns a MD5 hexdigest" do
+      expect(Twingly::URL::Hasher.taskdb_hash("http://blog.twingly.com/")).to eq "B1E2D5AECF6649C2E44D17AEA3E0F4"
+    end
+  end
+
+  describe ".blogstream_hash" do
+    it "returns a MD5 hexdigest" do
+      expect(Twingly::URL::Hasher.blogstream_hash("http://blog.twingly.com/")).to eq "B1E2D5AECF6649C2E44D17AEA3E0F4"
+    end
+  end
+
+  describe ".documentdb_hash" do
+    it "returns a SHA256 unsigned long, native endian digest" do
+      expect(Twingly::URL::Hasher.documentdb_hash("http://blog.twingly.com/")).to eq 15340752212397415993
+    end
+  end
+
+  describe ".autopingdb_hash" do
+    it "returns a SHA256 64-bit signed, native endian digest" do
+      expect(Twingly::URL::Hasher.autopingdb_hash("http://blog.twingly.com/")).to eq -3105991861312135623
+    end
+  end
+
+  describe ".pingloggerdb_hash" do
+    it "returns a SHA256 64-bit unsigned, native endian digest" do
+      expect(Twingly::URL::Hasher.pingloggerdb_hash("http://blog.twingly.com/")).to eq 15340752212397415993
+    end
+  end
+end
diff --git a/spec/lib/twingly/url/normalization_spec.rb b/spec/lib/twingly/url/normalization_spec.rb
new file mode 100644
index 0000000..ae89bde
--- /dev/null
+++ b/spec/lib/twingly/url/normalization_spec.rb
@@ -0,0 +1,125 @@
+require "spec_helper"
+
+describe Twingly::URL::Normalizer do
+  let (:normalizer) { Twingly::URL::Normalizer }
+
+  describe ".normalize" do
+    it "accepts a String" do
+      expect { normalizer.normalize("") }.not_to raise_error
+    end
+
+    it "accepts an Array" do
+      expect { normalizer.normalize([]) }.not_to raise_error
+    end
+
+    it "handles URL with ] in it" do
+      url = "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy"
+      expect { normalizer.normalize(url) }.not_to raise_error
+    end
+
+    it "handles URL with reference to another URL in it" do
+      url = "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun"
+      expect { normalizer.normalize(url) }.not_to raise_error
+    end
+
+    it "handles URL with umlauts in host" do
+      url = "http://www.åäö.se/"
+      expect(normalizer.normalize(url)).to eq([url])
+    end
+
+    it "handles URL with umlauts in path" do
+      url = "http://www.aoo.se/öö"
+      expect(normalizer.normalize(url)).to eq([url])
+    end
+
+    it "does not blow up when there's only protocol in the text" do
+      url = "http://"
+      expect { normalizer.normalize(url) }.not_to raise_error
+    end
+
+    it "does not blow up when there's no URL in the text" do
+      url = "Just some text"
+      expect { normalizer.normalize(url) }.not_to raise_error
+    end
+
+    it "does not create URLs for normal words" do
+      url = "This is, just, some words. Yay!"
+      expect(normalizer.normalize(url)).to eq([])
+    end
+  end
+
+  describe ".extract_urls" do
+    it "detects two urls in a String" do
+      urls = "http://blog.twingly.com/ http://twingly.com/"
+      response = normalizer.extract_urls(urls)
+
+      expect(response.size).to eq(2)
+    end
+
+    it "detects two urls in an Array" do
+      urls = %w(http://blog.twingly.com/ http://twingly.com/)
+      response = normalizer.extract_urls(urls)
+
+      expect(response.size).to eq(2)
+    end
+
+    it "always returns an Array" do
+      response = normalizer.extract_urls(nil)
+
+      expect(response).to be_instance_of(Array)
+    end
+  end
+
+  describe ".normalize_url" do
+    it "adds www if host is missing a subdomain" do
+      url = "http://twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq("http://www.twingly.com/")
+    end
+
+    it "does not add www if the host has a subdomain" do
+      url = "http://blog.twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(url)
+    end
+
+    it "keeps www if the host already has it" do
+      url = "http://www.twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(url)
+    end
+
+    it "adds a trailing slash if missing" do
+      url = "http://www.twingly.com"
+      expected = "http://www.twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "is able to normalize a url without protocol" do
+      url = "www.twingly.com/"
+      expected = "http://www.twingly.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+
+    it "does not return broken URLs" do
+      url = "http://www.twingly."
+
+      expect(normalizer.normalize_url(url)).to eq(nil)
+    end
+
+    it "does not add www. to blogspot blogs" do
+      url = "http://jlchen1026.blogspot.com/"
+
+      expect(normalizer.normalize_url(url)).to eq(url)
+    end
+
+    it "downcases the URL" do
+      url = "http://www.Twingly.com/"
+      expected = url.downcase
+
+      expect(normalizer.normalize_url(url)).to eq(expected)
+    end
+  end
+end
diff --git a/spec/lib/twingly/url/url_spec.rb b/spec/lib/twingly/url/url_spec.rb
new file mode 100644
index 0000000..4235992
--- /dev/null
+++ b/spec/lib/twingly/url/url_spec.rb
@@ -0,0 +1,37 @@
+require "spec_helper"
+
+describe Twingly::URL do
+  describe ".parse" do
+    %w(http://http http:/// http:// http:/ http: htttp a 1 ?).each do |invalid_url|
+      it "handles the invalid url '#{invalid_url}'" do
+        expect { described_class.parse(invalid_url) }.not_to raise_error
+      end
+    end
+
+    describe ".valid?" do
+      %w(ftp://blog.twingly.com/ blablahttp://blog.twingly.com/).each do |invalid_url|
+        it "returns false for non-http and https" do
+          expect(described_class.parse(invalid_url).valid?).to be false
+        end
+      end
+
+      %w(http://blog.twingly.com/ hTTP://blog.twingly.com/ https://blog.twingly.com).each do |valid_url|
+        it "returns true for the valid url '#{valid_url}" do
+          expect(described_class.parse(valid_url).valid?).to be true
+        end
+      end
+    end
+  end
+
+  describe ".validate" do
+    it "returns true for a valid url" do
+      expect(described_class.validate("http://blog.twingly.com/")).to be true
+    end
+
+    %w(http:// feedville.com,2007-06-19:/blends/16171).each do |invalid_url|
+      it "returns false for the invalid url '#{invalid_url}'" do
+        expect(described_class.validate(invalid_url)).to be_falsey
+      end
+    end
+  end
+end
diff --git a/spec/lib/twingly/url/utilities_spec.rb b/spec/lib/twingly/url/utilities_spec.rb
new file mode 100644
index 0000000..556d817
--- /dev/null
+++ b/spec/lib/twingly/url/utilities_spec.rb
@@ -0,0 +1,80 @@
+require "spec_helper"
+
+describe Twingly::URL::Utilities do
+  describe ".normalize" do
+    it "does not remove scheme from non HTTP(S) URLs" do
+      url = "gopher://www.duh.se/"
+
+      expect(described_class.remove_scheme(url)).to eq(url)
+    end
+
+    it "removes scheme from mixed case HTTP URL" do
+      url = "HttP://www.duh.se/"
+      expected = "//www.duh.se/"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "removes scheme from mixed case HTTPS URL" do
+      url = "hTTpS://www.duh.se/"
+      expected = "//www.duh.se/"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "removes scheme from lowercase HTTP URL" do
+      url = "http://www.duh.se/"
+      expected = "//www.duh.se/"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "removes scheme from lowercase HTTPS URL" do
+      url = "https://www.duh.se/"
+      expected = "//www.duh.se/"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "removes scheme from uppercase HTTP URL" do
+      url = "HTTP://WWW.DUH.SE/"
+      expected = "//WWW.DUH.SE/"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "removes scheme from uppercase HTTPS URL" do
+      url = "HTTPS://WWW.DUH.SE/"
+      expected = "//WWW.DUH.SE/"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "removes scheme from URL with non ASCII characters" do
+      url = "http://www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα"
+      expected = "//www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "only removes scheme from HTTP URL" do
+      url = "http://feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml"
+      expected = "//feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "only removes scheme from HTTPS URL" do
+      url = "https://feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss"
+      expected = "//feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss"
+
+      expect(described_class.remove_scheme(url)).to eq(expected)
+    end
+
+    it "does not remove scheme from non HTTP(S) URLs with parameter" do
+      url = "ftp://ftp.example.com/?url=https://www.example.com/"
+
+      expect(described_class.remove_scheme(url)).to eq(url)
+    end
+  end
+end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
new file mode 100644
index 0000000..29d6ac5
--- /dev/null
+++ b/spec/spec_helper.rb
@@ -0,0 +1,18 @@
+require "twingly/url"
+require "twingly/url/hasher"
+require "twingly/url/normalizer"
+require "twingly/url/utilities"
+
+RSpec.configure do |config|
+  config.expect_with :rspec do |expectations|
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+
+  config.mock_with :rspec do |mocks|
+    mocks.verify_partial_doubles = true
+  end
+
+  config.order = :random
+
+  Kernel.srand config.seed
+end
diff --git a/test/test_helper.rb b/test/test_helper.rb
deleted file mode 100644
index 2872c53..0000000
--- a/test/test_helper.rb
+++ /dev/null
@@ -1,11 +0,0 @@
-require 'bundler/setup'
-require 'minitest/autorun'
-require 'minitest/reporters'
-require 'shoulda-context'
-
-require 'twingly/url'
-require 'twingly/url/hasher'
-require 'twingly/url/normalizer'
-require 'twingly/url/utilities'
-
-Minitest::Reporters.use! Minitest::Reporters::SpecReporter.new
diff --git a/test/unit/hasher_test.rb b/test/unit/hasher_test.rb
deleted file mode 100644
index 02c60ea..0000000
--- a/test/unit/hasher_test.rb
+++ /dev/null
@@ -1,38 +0,0 @@
-require 'test_helper'
-
-class HasherTest < Minitest::Test
-  context ".taskdb_hash" do
-    should "return a MD5 hexdigest" do
-      assert_equal Twingly::URL::Hasher.taskdb_hash("http://blog.twingly.com/"),
-                   "B1E2D5AECF6649C2E44D17AEA3E0F4"
-    end
-  end
-
-  context ".blogstream_hash" do
-    should "return a MD5 hexdigest" do
-      assert_equal Twingly::URL::Hasher.blogstream_hash("http://blog.twingly.com/"),
-                   "B1E2D5AECF6649C2E44D17AEA3E0F4"
-    end
-  end
-
-  context ".documentdb_hash" do
-    should "return a SHA256 unsigned long, native endian digest" do
-      assert_equal Twingly::URL::Hasher.documentdb_hash("http://blog.twingly.com/"),
-                   15340752212397415993
-    end
-  end
-
-  context ".autopingdb_hash" do
-    should "return a SHA256 64-bit signed, native endian digest" do
-      assert_equal Twingly::URL::Hasher.autopingdb_hash("http://blog.twingly.com/"),
-                   -3105991861312135623
-    end
-  end
-
-  context ".pingloggerdb_hash" do
-    should "return a SHA256 64-bit unsigned, native endian digest" do
-      assert_equal Twingly::URL::Hasher.pingloggerdb_hash("http://blog.twingly.com/"),
-                   15340752212397415993
-    end
-  end
-end
diff --git a/test/unit/normalization_test.rb b/test/unit/normalization_test.rb
deleted file mode 100644
index 6eb6822..0000000
--- a/test/unit/normalization_test.rb
+++ /dev/null
@@ -1,140 +0,0 @@
-require 'test_helper'
-
-class NormalizerTest < Minitest::Test
-  context ".normalize" do
-    setup do
-      @normalizer = Twingly::URL::Normalizer
-    end
-
-    should "accept a String" do
-      assert @normalizer.normalize("")
-    end
-
-    should "accept an Array" do
-      assert @normalizer.normalize([])
-    end
-
-    should "handle URL with ] in it" do
-      url = "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy"
-      assert @normalizer.normalize(url)
-    end
-
-    should "handle URL with reference to another URL in it" do
-      url = "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun"
-      assert @normalizer.normalize(url)
-    end
-
-    should "handle URL with umlauts in host" do
-      url = "http://www.åäö.se/"
-      assert_equal [url], @normalizer.normalize(url)
-    end
-
-    should "handle URL with umlauts in path" do
-      url = "http://www.aoo.se/öö"
-      assert_equal [url], @normalizer.normalize(url)
-    end
-
-    should "not blow up when there's only protocol in the text" do
-      url = "http://"
-      assert @normalizer.normalize(url)
-    end
-
-    should "not blow up when there's no URL in the text" do
-      url = "Just some text"
-      assert @normalizer.normalize(url)
-    end
-
-    should "not create URLs for normal words" do
-      url = "This is, just, some words. Yay!"
-      assert_equal [], @normalizer.normalize(url)
-    end
-  end
-
-  context ".extract_urls" do
-    setup do
-      @normalizer = Twingly::URL::Normalizer
-    end
-
-    should "detect two urls in a String" do
-      urls = "http://blog.twingly.com/ http://twingly.com/"
-      response = @normalizer.extract_urls(urls)
-
-      assert_equal 2, response.size
-    end
-
-    should "detect two urls in an Array" do
-      urls = %w(http://blog.twingly.com/ http://twingly.com/)
-      response = @normalizer.extract_urls(urls)
-
-      assert_equal 2, response.size
-    end
-
-    should "return an Array" do
-      response = @normalizer.extract_urls(nil)
-
-      assert_instance_of Array, response
-    end
-  end
-
-  context ".normalize_url" do
-    setup do
-      @normalizer = Twingly::URL::Normalizer
-    end
-
-    should "add www if host is missing a subdomain" do
-      url = "http://twingly.com/"
-      result = @normalizer.normalize_url(url)
-
-      assert_equal "http://www.twingly.com/", result
-    end
-
-    should "not add www if the host has a subdomain" do
-      url = "http://blog.twingly.com/"
-      result = @normalizer.normalize_url(url)
-
-      assert_equal "http://blog.twingly.com/", result
-    end
-
-    should "keep www if the host already has it" do
-      url = "http://www.twingly.com/"
-      result = @normalizer.normalize_url(url)
-
-      assert_equal "http://www.twingly.com/", result
-    end
-
-    should "add an ending slash if missing" do
-      url = "http://www.twingly.com"
-      result = @normalizer.normalize_url(url)
-
-      assert_equal "http://www.twingly.com/", result
-    end
-
-    should "be able to normalize url without protocol" do
-      url = "www.twingly.com/"
-      result = @normalizer.normalize_url(url)
-
-      assert_equal "http://www.twingly.com/", result
-    end
-
-    should "not return broken URLs" do
-      url = "http://www.twingly."
-      result = @normalizer.normalize_url(url)
-
-      assert_equal nil, result
-    end
-
-    should "not add www. to blogspot blogs" do
-      url = "http://jlchen1026.blogspot.com/"
-      result = @normalizer.normalize_url(url)
-
-      assert_equal url, result
-    end
-
-    should "downcase URL" do
-      url = "http://www.Twingly.com/"
-      result = @normalizer.normalize_url(url)
-
-      assert_equal url.downcase, result
-    end
-  end
-end
diff --git a/test/unit/url_test.rb b/test/unit/url_test.rb
deleted file mode 100644
index 51fd64a..0000000
--- a/test/unit/url_test.rb
+++ /dev/null
@@ -1,37 +0,0 @@
-require 'test_helper'
-
-class UrlTest < Minitest::Test
-  context ".parse" do
-    should "not blow up for invalid url" do
-      invalid_urls = %w(http://http http:/// http:// http:/ http: htttp a 1 ?)
-      invalid_urls.each do |url|
-        Twingly::URL.parse(url)
-      end
-    end
-  end
-
-  context ".validate" do
-    should "return true for a valid url" do
-      assert Twingly::URL.validate("http://blog.twingly.com/"), "Should be valid"
-    end
-
-    should "return false for a invalid url" do
-      refute Twingly::URL.validate("http://"), "Should not be valid"
-      refute Twingly::URL.validate("feedville.com,2007-06-19:/blends/16171"), "Should not be valid"
-    end
-
-    should "should return false for non-http and https" do
-      invalid_urls = %w(ftp://blog.twingly.com/ blablahttp://blog.twingly.com/)
-      invalid_urls.each do |url|
-        refute Twingly::URL.parse(url).valid?, "Should not be valid"
-      end
-    end
-
-    should "should return true for http and https" do
-      valid_urls = %w(http://blog.twingly.com/ hTTP://blog.twingly.com/ https://blog.twingly.com)
-      valid_urls.each do |url|
-        assert Twingly::URL.parse(url).valid?, "Should be valid"
-      end
-    end
-  end
-end
diff --git a/test/unit/utilities_test.rb b/test/unit/utilities_test.rb
deleted file mode 100644
index d596a29..0000000
--- a/test/unit/utilities_test.rb
+++ /dev/null
@@ -1,82 +0,0 @@
-require 'test_helper'
-
-class TestUtilities < Minitest::Test
-  context ".normalize" do
-    should "not remove scheme from non HTTP(S) URLs" do
-      url = 'gopher://www.duh.se/'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal 'gopher://www.duh.se/', result
-    end
-
-    should "remove scheme from mixed case HTTP URL" do
-      url = 'HttP://www.duh.se/'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//www.duh.se/', result
-    end
-
-    should "remove scheme from mixed case HTTPS URL" do
-      url = 'hTTpS://www.duh.se/'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//www.duh.se/', result
-    end
-
-    should "remove scheme from lowercase HTTP URL" do
-      url = 'http://www.duh.se/'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//www.duh.se/', result
-    end
-
-    should "remove scheme from lowercase HTTPS URL" do
-      url = 'https://www.duh.se/'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//www.duh.se/', result
-    end
-
-    should "remove scheme from uppercase HTTP URL" do
-      url = 'HTTP://WWW.DUH.SE/'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//WWW.DUH.SE/', result
-    end
-
-    should "remove scheme from uppercase HTTPS URL" do
-      url = 'HTTPS://WWW.DUH.SE/'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//WWW.DUH.SE/', result
-    end
-
-    should "remove scheme from URL with non ASCII characters" do
-      url = 'http://www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα', result
-    end
-
-    should "only remove scheme from HTTP URL" do
-      url = 'http://feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml', result
-    end
-
-    should "only remove scheme from HTTPS URL" do
-      url = 'https://feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal '//feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss', result
-    end
-
-    should "not remove scheme from non HTTP(S) URLs with parameter" do
-      url = 'ftp://ftp.example.com/?url=https://www.example.com/'
-
-      result = Twingly::URL::Utilities.remove_scheme(url)
-      assert_equal 'ftp://ftp.example.com/?url=https://www.example.com/', result
-    end
-  end
-end
diff --git a/twingly-url.gemspec b/twingly-url.gemspec
index c560a2b..326fdaa 100644
--- a/twingly-url.gemspec
+++ b/twingly-url.gemspec
@@ -15,9 +15,8 @@ Gem::Specification.new do |s|
   s.add_dependency "addressable", "~> 2"
   s.add_dependency "public_suffix", "~> 1.4"
 
-  s.add_development_dependency "minitest-reporters", "~> 1"
   s.add_development_dependency "rake", "~> 10"
-  s.add_development_dependency "shoulda-context", "~> 1"
+  s.add_development_dependency "rspec", "~> 3"
   s.add_development_dependency "ruby-prof", "~> 0"
 
   s.files = Dir.glob("{lib}/**/*") + %w(README.md)
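Note on the profiling setup: the new rake profile:normalize_url task is the intended entry point for the ruby-prof helper added in profile/profile.rb. A minimal sketch of the same measurement run outside rake, assuming the gem's lib directory is on the load path and the ruby-prof development dependency is installed (reports are written to tmp/, as in the task above):

    # Illustrative sketch only -- mirrors the rake profile:normalize_url task in the Rakefile.
    require_relative "profile/profile"
    require "twingly/url/normalizer"

    # Runs the block 1000 times under ruby-prof and prints where the reports were generated.
    Profile.measure "normalizing a short URL", 1000 do
      Twingly::URL::Normalizer.normalize_url("http://www.duh.se/")
    end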
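Note on the migrated expectations: the new specs mirror the assertions from the removed Minitest files, so the library behaviour they pin down is unchanged. For reference, a sketch of the calls they exercise, with return values taken from the spec expectations in this diff (assuming the requires from spec/spec_helper.rb):

    # Illustrative sketch only -- values are copied from the specs above.
    require "twingly/url"
    require "twingly/url/hasher"
    require "twingly/url/normalizer"

    Twingly::URL::Normalizer.normalize_url("http://twingly.com/")  # => "http://www.twingly.com/"
    Twingly::URL.parse("https://blog.twingly.com").valid?          # => true
    Twingly::URL.validate("http://blog.twingly.com/")              # => true
    Twingly::URL::Hasher.taskdb_hash("http://blog.twingly.com/")   # => "B1E2D5AECF6649C2E44D17AEA3E0F4"

The suite itself runs through the RSpec rake task, which is also the default task (bundle exec rake, or bundle exec rake spec).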