module Twingly
  module URL
    # Stateless helpers for working with URL strings.
    module Utilities
      module_function

      # Matches an HTTP or HTTPS scheme (any letter case) including its
      # trailing colon, only at the very beginning of the string.
      #
      # `\A` is used instead of `^` on purpose: in Ruby `^` matches at the
      # start of *every* line, so a multi-line string could have a scheme
      # stripped from one of its later lines.
      PROTOCOL_EXPRESSION = /\Ahttps?:/i

      # Removes a leading HTTP/HTTPS scheme from a URL, producing a
      # scheme-relative URL (`http://twingly.com` -> `//twingly.com`).
      #
      # Only the scheme at the start of the string is removed; URLs embedded
      # later (for example in a query parameter) and URLs with any other
      # scheme are left untouched.
      #
      # @param url [String] the URL to strip the scheme from
      # @return [String] a new string without the leading `http:`/`https:`;
      #   the argument itself is not modified
      def remove_scheme(url)
        url.sub(PROTOCOL_EXPRESSION, '')
      end
    end
  end
end
'hTTpS://www.duh.se/' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal '//www.duh.se/', result + end + + should "remove scheme from lowercase HTTP URL" do + url = 'http://www.duh.se/' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal '//www.duh.se/', result + end + + should "remove scheme from lowercase HTTPS URL" do + url = 'https://www.duh.se/' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal '//www.duh.se/', result + end + + should "remove scheme from uppercase HTTP URL" do + url = 'HTTP://WWW.DUH.SE/' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal '//WWW.DUH.SE/', result + end + + should "remove scheme from uppercase HTTPS URL" do + url = 'HTTPS://WWW.DUH.SE/' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal '//WWW.DUH.SE/', result + end + + should "remove scheme from URL with non ASCII characters" do + url = 'http://www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal '//www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα', result + end + + should "only remove scheme from HTTP URL" do + url = 'http://feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal '//feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml', result + end + + should "only remove scheme from HTTPS URL" do + url = 'https://feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal '//feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss', result + end + + should "not remove scheme from non HTTP(S) URLs with parameter" do + url = 'ftp://ftp.example.com/?url=https://www.example.com/' + + result = Twingly::URL::Utilities.remove_scheme(url) + assert_equal 'ftp://ftp.example.com/?url=https://www.example.com/', result + end + end +end