Skip to content

Commit

Permalink
Work with Twingly::URL objects instead of strings
Browse files Browse the repository at this point in the history
...and maybe more.

Related to #11, #31, #35.
  • Loading branch information
twingly-mob committed Oct 13, 2015
1 parent 5288cd2 commit 3c6986b
Show file tree
Hide file tree
Showing 18 changed files with 631 additions and 499 deletions.
23 changes: 2 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,20 @@
Twingly URL tools.

* `twingly/url` - Parse and validate URLs
* `Twingly::URL.parse` - Returns a Struct with `#url` and `#domain` accessors
* `Twingly::URL.validate` - Validates a URL
* `Twingly::URL.extract_urls` - Extracts URLs from string or array
* `twingly/url/normalizer` - Normalize URLs
* `Twingly::URL::Normalizer.normalize(string)` - Extracts and normalizes URLs from string (Array)
* `Twingly::URL.parse` - Returns a `Twingly::URL` instance (or a `NullURL` when the input cannot be parsed)
* `twingly/url/hasher` - Generate URL hashes suitable for primary keys
* `Twingly::URL::Hasher.taskdb_hash(url)` - MD5 hexdigest
* `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest
* `Twingly::URL::Hasher.documentdb_hash(url)` - SHA256 unsigned long, native endian digest
* `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
* `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
* `twingly/url/utilities` - Utilities to work with URLs
* `Twingly::URL::Utilities.remove_scheme(url)` - Removes scheme from HTTP/HTTPS URLs (`http://twingly.com` -> `//twingly.com`)
* `Twingly::URL::Utilities.extract_valid_urls` - Returns Array of valid `Twingly::URL`

## Installation

gem install twingly-url

## Normalization example

```ruby
require 'twingly/url/normalizer'

Twingly::URL::Normalizer.normalize('http://duh.se')
# => ["http://www.duh.se/"]

Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
# => ["http://www.duh.se/", "http://blog.twingly.com/"]

Twingly::URL::Normalizer.normalize('no URL')
# => []
```

## Tests

Run tests with
Expand Down
7 changes: 4 additions & 3 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
namespace :profile do
require_relative "profile/profile"

task :normalize_url do |task|
require "twingly/url/normalizer"
desc "Profile"
task :normalize do |task|
require "twingly/url"

Profile.measure "normalizing a short URL", 1000 do
Twingly::URL::Normalizer.normalize_url('http://www.duh.se/')
Twingly::URL.parse('http://www.duh.se/').normalized
end
end
end
Expand Down
2 changes: 0 additions & 2 deletions lib/twingly-url-normalizer.rb

This file was deleted.

133 changes: 106 additions & 27 deletions lib/twingly/url.rb
Original file line number Diff line number Diff line change
@@ -1,50 +1,129 @@
require 'addressable/uri'
require 'public_suffix'
require "addressable/uri"
require "public_suffix"

require_relative "url/null_url"
require_relative "url/error"

PublicSuffix::List.private_domains = false

module Twingly
module URL
module_function
class URL
# URL schemes that #valid? accepts. Frozen so callers cannot mutate the list.
SCHEMES = %w(http https).freeze
# One or more slashes at the very end of the string (\z, not end-of-line $).
ENDS_WITH_SLASH = /\/+\z/

# Builds a URL from +potential_url+ (coerced with Kernel#String).
# Returns a NullURL when setup raises an error tagged with
# Twingly::URL::Error or a Twingly::URL::Error::ParseError.
def self.parse(potential_url)
  candidate = String(potential_url)

  # TODO: Can we make this less send-y?
  new.send(:setup, candidate)
rescue Twingly::URL::Error, Twingly::URL::Error::ParseError
  NullURL.new
end

# Returns the scheme of the underlying Addressable URI, as stored (not lowercased).
def scheme
addressable_uri.scheme
end

# Delegates to `trd` of the PublicSuffix-parsed domain
# (presumably the third-level/subdomain part — confirm against PublicSuffix docs).
def trd
public_suffix_domain.trd
end

# Delegates to `sld` of the PublicSuffix-parsed domain
# (presumably the second-level domain label — confirm against PublicSuffix docs).
def sld
public_suffix_domain.sld
end

UrlObject = Struct.new(:url, :domain) do
SCHEMES = %w(http https)
# Delegates to `tld` of the PublicSuffix-parsed domain
# (presumably the public suffix / top-level part — confirm against PublicSuffix docs).
def tld
public_suffix_domain.tld
end

def valid?
url && domain && SCHEMES.include?(url.normalized_scheme)
# Delegates to `domain` of the PublicSuffix-parsed domain.
def domain
public_suffix_domain.domain
end

# Returns the host of the underlying Addressable URI, as stored (not normalized).
def host
addressable_uri.host
end

# Returns the path of the underlying Addressable URI, as stored (not normalized).
def path
addressable_uri.path
end

# Returns the URL as a string with its leading "<scheme>:" removed,
# e.g. "http://twingly.com" becomes "//twingly.com".
def without_scheme
  # Escape the scheme before interpolating: schemes may contain regex
  # metacharacters ("+", "."), e.g. "svn+ssh", which would otherwise be
  # read as quantifiers/wildcards and fail to match the literal prefix.
  # to_s keeps the nil-scheme case working as before.
  to_s.sub(/\A#{Regexp.escape(scheme.to_s)}:/, "")
end

# Returns a new URL of the same class holding the normalized form of this
# one: on a copy of the underlying URI, scheme, host and path are replaced
# by normalized_scheme, normalized_host and normalized_path.
# The receiver is left untouched.
def normalized
  normalized_uri = addressable_uri.dup

  normalized_uri.scheme = normalized_scheme
  normalized_uri.host = normalized_host
  normalized_uri.path = normalized_path

  # Build a fresh instance rather than calling setup on self: setup assigns
  # the instance's URI and returns self, which would silently overwrite the
  # receiver. setup is private, hence send (the same approach parse uses).
  self.class.new.send(:setup, normalized_uri)
end

# Returns the URI's scheme in lowercase.
# NOTE(review): downcase is called directly on the scheme, so a nil scheme
# would raise NoMethodError here — confirm the scheme is always set by this point.
def normalized_scheme
addressable_uri.scheme.downcase
end

# Returns the host in normalized form: starts from the URI's own
# normalized host, prefixes "www." when the parsed domain reports no
# subdomain, then hands the result to normalize_blogspot.
def normalized_host
  bare_host = addressable_uri.normalized_host
  suffix_domain = public_suffix_domain

  prefixed_host = suffix_domain.subdomain? ? bare_host : "www.#{bare_host}"

  normalize_blogspot(prefixed_host, suffix_domain)
end

# Returns the URI's path with trailing slashes stripped; a path that ends
# up empty is reported as "/".
def normalized_path
  stripped = strip_trailing_slashes(addressable_uri.path)
  return "/" if stripped.empty?

  stripped
end

def parse(potential_url)
url, domain = extract_url_and_domain(potential_url)
UrlObject.new(url, domain)
# Truthy when an Addressable URI and a PublicSuffix domain are both present
# and the lowercased scheme is one of SCHEMES; otherwise nil or false.
def valid?
addressable_uri && public_suffix_domain && SCHEMES.include?(normalized_scheme)
end

def extract_urls(text_or_array)
potential_urls = Array(text_or_array).flat_map(&:split)
potential_urls.map do |potential_url|
potential_url if validate(potential_url)
end.compact
# Returns the string form of the underlying Addressable URI.
def to_s
addressable_uri.to_s
end

def extract_url_and_domain(potential_url)
addressable_uri = Addressable::URI.heuristic_parse(potential_url)
private

return invalid_url unless addressable_uri
attr_reader :addressable_uri, :public_suffix_domain

domain = PublicSuffix.parse(addressable_uri.display_uri.host)
# Populates this instance from +potential_url+ and returns self.
# An Addressable::URI is stored as-is; anything else goes through
# Addressable::URI.heuristic_parse. The URI's display host is then parsed
# with PublicSuffix.
def setup(potential_url)
if potential_url.is_a?(Addressable::URI)
@addressable_uri = potential_url
else
@addressable_uri = Addressable::URI.heuristic_parse(potential_url)
end

# NOTE(review): no `Twingly::Error` namespace is defined in the visible code
# (only Twingly::URL::Error), so reaching this line likely raises NameError
# instead of the intended error — probably meant Twingly::URL::Error::ParseError; confirm.
raise Twingly::Error::ParseError if addressable_uri.nil?

# NOTE(review): the next three lines look like removed-side diff lines from
# an earlier implementation, interleaved by the page rendering — verify
# before treating them as part of this method.
[addressable_uri, domain]
rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
invalid_url
@public_suffix_domain = PublicSuffix.parse(addressable_uri.display_uri.host)

self
rescue Addressable::URI::InvalidURIError, PublicSuffix::DomainInvalid => error
# Tag the third-party exception with the Twingly::URL::Error module before
# re-raising, so that a `rescue Twingly::URL::Error` matches it.
error.extend(Twingly::URL::Error)
raise
end

def validate(potential_url)
parse(potential_url).valid?
# Canonicalizes Blogspot hosts: when the domain's second-level label is
# "blogspot" (case-insensitive), drops a leading "www." and replaces the
# trailing TLD with "com". Any other host is returned unchanged.
def normalize_blogspot(host, domain)
  return host unless domain.sld.downcase == "blogspot"

  # Escape the TLD before building the pattern: a multi-label suffix such as
  # "co.uk" contains dots that would otherwise match any character.
  trailing_tld = /#{Regexp.escape(domain.tld.to_s)}\z/i

  host.sub(/\Awww\./i, "").sub(trailing_tld, "com")
end

def invalid_url
[nil, nil]
# Returns +path+ with the first match of ENDS_WITH_SLASH removed
# (String#sub, so +path+ itself is not modified).
def strip_trailing_slashes(path)
path.sub(ENDS_WITH_SLASH, "")
end
end
end
8 changes: 8 additions & 0 deletions lib/twingly/url/error.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
module Twingly
  class URL
    # Tag module for this library's errors: any exception that carries it
    # (by inclusion or by being extended with it) is matched by
    # `rescue Twingly::URL::Error`.
    module Error
      # Raised when input cannot be turned into a URL. Includes the Error
      # tag so that rescuing Twingly::URL::Error alone also catches it.
      class ParseError < StandardError
        include Error
      end
    end
  end
end
2 changes: 1 addition & 1 deletion lib/twingly/url/hasher.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
require 'digest'

module Twingly
module URL
class URL
module Hasher
module_function

Expand Down
66 changes: 0 additions & 66 deletions lib/twingly/url/normalizer.rb

This file was deleted.

24 changes: 24 additions & 0 deletions lib/twingly/url/null_url.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
module Twingly
  class URL
    # Null object standing in for a Twingly::URL that could not be parsed.
    # It is never valid, normalizes to itself, and answers every public
    # instance method of Twingly::URL with an empty string.
    class NullURL
      # Returns "" for any method Twingly::URL instances respond to;
      # everything else falls through to the default NoMethodError.
      def method_missing(name, *)
        return "" if Twingly::URL.instance_methods.include?(name)

        super
      end

      # Keeps respond_to? consistent with method_missing.
      def respond_to_missing?(name, include_private = false)
        Twingly::URL.instance_methods.include?(name) || super
      end

      # A null URL has nothing to normalize, so it returns itself.
      def normalized
        self
      end

      # Always false.
      def valid?
        false
      end

      # Always the empty string.
      def to_s
        ""
      end
    end
  end
end
12 changes: 7 additions & 5 deletions lib/twingly/url/utilities.rb
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
module Twingly
module URL
class URL
module Utilities
module_function

PROTOCOL_EXPRESSION = /^https?:/i

def remove_scheme(url)
url.sub(PROTOCOL_EXPRESSION, '')
# Splits the given string (or every string in the given array) on
# whitespace, parses each piece with Twingly::URL.parse and returns an
# Array of the parsed URLs that report themselves valid.
def extract_valid_urls(text_or_array)
  candidates = Array(text_or_array).flat_map(&:split)
  parsed_urls = candidates.map { |candidate| Twingly::URL.parse(candidate) }

  parsed_urls.select(&:valid?)
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion lib/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
module Twingly
module URL
class URL
VERSION = '1.3.4'
end
end
Loading

0 comments on commit 3c6986b

Please sign in to comment.