Skip to content

Commit

Permalink
Merge pull request #50 from benbalter/iso
Browse files Browse the repository at this point in the history
ISO Country Code support
  • Loading branch information
benbalter committed Jul 8, 2014
2 parents d958687 + a990b4e commit 99c93af
Show file tree
Hide file tree
Showing 7 changed files with 167 additions and 74 deletions.
26 changes: 15 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,33 @@ Or add this to your `Gemfile` before doing a `bundle install`:

## Usage

### In general

### Verify email addresses

```ruby
Gman.valid? "[email protected]" #true
Gman.valid? "[email protected]" #false
Gman.valid? "[email protected]" #=> true
Gman.valid? "[email protected]" #=> false
```

### Verify domain

```ruby
Gman.valid? "http://foo.bar.gov" #true
Gman.valid? "foo.bar.gov" #true
Gman.valid? "foo.gov" #true
Gman.valid? "foo.biz" #false
Gman.valid? "http://foo.bar.gov" #=> true
Gman.valid? "foo.bar.gov" #=> true
Gman.valid? "foo.gov" #=> true
Gman.valid? "foo.biz" #=> false
```

### Get a domain name from an arbitrary domain string
### Get the ISO Country Code information represented by a government domain

```ruby
Gman.get_domain "http://foo.bar.gov" # foo.bar.gov
Gman.get_domain "[email protected]" # bar.gov
Gman.get_domain "foo.bar.gov" # foo.bar.gov
Gman.get_domain "asdf@asdf" # nil (no domain within the string)
domain = Gman.new "whitehouse.gov" #=> #<Gman domain="whitehouse.gov" valid=true>
domain.country.name #=> "United States"
domain.country.alpha2 #=> "US"
domain.country.alpha3 #=> "USA"
domain.country.currency #=> "USD"
domain.country.calling_code #=> "+1"
```

## Contributing
Expand Down
1 change: 1 addition & 0 deletions gman.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Gem::Specification.new do |s|
s.add_dependency( "public_suffix", '~> 1.4')
s.add_dependency( "swot", '~> 0.3.1' )
s.add_dependency( "addressable", '~> 2.3' )
s.add_dependency( "iso_country_codes", "~> 0.4" )

s.add_development_dependency( "rake" )
s.add_development_dependency( "shoulda" )
Expand Down
157 changes: 112 additions & 45 deletions lib/gman.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
require 'yaml'
require 'swot'
require "addressable/uri"
require File.expand_path("gman/version", File.dirname(__FILE__))
require 'iso_country_codes'
require_relative "gman/version"

module Gman
class Gman

# Source: http://bit.ly/1n2X9iv
EMAIL_REGEX = %r{
Expand Down Expand Up @@ -45,8 +46,27 @@ module Gman
$
}xi

# Maps the last label of a domain's TLD to an alpha2 country code for the
# cases where that label is not itself the desired code (e.g. "uk" => "gb").
# The generic TLDs listed here (gov, mil, com, ...) are all mapped to "us".
# Labels absent from this table are expected to be used as-is by callers.
#
# Frozen so the shared lookup table cannot be mutated at runtime.
ALPHA2_MAP = {
  :ac     => 'sh',
  :uk     => 'gb',
  :su     => 'ru',
  :tp     => 'tl',
  :yu     => 'rs',
  :gov    => 'us',
  :mil    => 'us',
  :org    => 'us',
  :com    => 'us',
  :net    => 'us',
  :edu    => 'us',
  :travel => 'us',
  :info   => 'us'
}.freeze

class << self

attr_writer :list

# Normalizes and checks if a given string represents a government domain
# Possible strings to test:
# ".gov"
Expand All @@ -57,19 +77,14 @@ class << self
#
# Returns boolean true if a government domain
def valid?(text)
  # Class-level convenience: wrap the string in an instance and ask it,
  # so there is a single implementation of the validity check.
  candidate = Gman.new(text)
  candidate.valid?
end

domain = get_domain text
return false unless PublicSuffix.valid?(domain)

# Ensure non-edu
return false if Swot::is_academic?(domain)

# check using public suffix's standard logic
rule = list.find domain
return true if !rule.nil? && rule.allow?(domain)

# also allow for explicit matches to domain list
list.rules.any? { |rule| rule.value == domain }
# Class-level check for whether a string looks like an email address
#
# text - the string to test
#
# Returns whatever the instance-level email? reports for that string
def email?(text)
  candidate = Gman.new(text)
  candidate.email?
end

# returns an instance of our custom public suffix list
Expand All @@ -78,23 +93,37 @@ def list
@list ||= PublicSuffix::List::parse(File.new(list_path, "r:utf-8"))
end

# Get the FQDN name from a URL or email address.
#
# Returns a string with the FQDN; nil if there's an error.
def get_domain(text)
# Returns the absolute path to the domain list (domains.txt, located
# alongside this file)
def list_path
  # expand_path (rather than File.join) keeps the result absolute even
  # when __FILE__ itself is a relative path
  File.expand_path("domains.txt", File.dirname(__FILE__))
end
end

return nil if text.to_s.strip.empty?
# Builds a Gman instance around a single input string
#
# text - the input to check for governmentiness; it is stringified,
#        lowercased, and stripped of surrounding whitespace before
#        being stored
def initialize(text)
  raw = text.to_s
  @text = raw.downcase.strip
end

# Parse the domain from the input string
#
# Can handle urls, domains, or emails
#
# Returns the domain string
def domain
@domain ||= begin
return nil if @text.empty?

text = text.downcase.strip
uri = Addressable::URI.parse(text)
uri = Addressable::URI.parse(@text)

if uri.host # valid https?://* URI
uri.host
elsif email?(text)
text.match(/@([\w\.\-]+)\Z/i)[1]
elsif email?
@text.match(/@([\w\.\-]+)\Z/i)[1]
else # url sans http://
begin
uri = Addressable::URI.parse("http://#{text}")
uri = Addressable::URI.parse("http://#{@text}")
# properly parse http://foo edge cases
# see https://github.com/sporkmonger/addressable/issues/145
uri.host if uri.host =~ /\./
Expand All @@ -103,30 +132,68 @@ def get_domain(text)
end
end
end
end
alias_method :to_s, :domain

# Helper function to return the public suffix domain object
#
# Supports all domain strings (URLs, emails)
#
# Returns the domain object or nil, but no errors, never an error
def domain_parts(text)
begin
PublicSuffix.parse get_domain(text)
rescue
nil
end
end
# Checks if the input string represents a government domain
#
# Returns boolean true if a government domain
def valid?
# Ensure it's a valid domain
return false unless PublicSuffix.valid?(domain)

# Is the given string in the form of a valid email address?
#
# Returns true if email, otherwise false
def email?(text)
text =~ EMAIL_REGEX
end
# Ensure non-edu
return false if Swot::is_academic?(domain)

# Returns the absolute path to the domain list
def list_path
@list_path ||= File.join(File.dirname(__FILE__), "domains.txt")
# check using public suffix's standard logic
rule = Gman.list.find domain
return true if !rule.nil? && rule.allow?(domain)

# also allow for explicit matches to domain list
Gman.list.rules.any? { |rule| rule.value == domain }
end

# Does the stored input text match EMAIL_REGEX?
#
# Returns true if it matches, otherwise false (always a strict boolean)
def email?
  match_index = @text =~ EMAIL_REGEX
  !match_index.nil?
end

# Parses the instance's domain into a public suffix domain object
#
# Works for any input the domain method can handle (URLs, emails, bare
# domains)
#
# Returns the parsed domain object, or nil when PublicSuffix rejects the
# domain as invalid
def domain_parts
  begin
    parsed = PublicSuffix.parse(domain)
  rescue PublicSuffix::DomainInvalid
    parsed = nil
  end
  parsed
end

# Returns the two character alpha country code represented by the domain
#
# e.g., "whitehouse.gov" => "us", "foo.gov.uk" => "gb"
#
# The code is the last label of the domain's TLD, translated through
# ALPHA2_MAP when that label has an entry there.
#
# Returns the code as a string, or nil when the input cannot be parsed
# into a domain
def alpha2
  parts = domain_parts
  # domain_parts is nil for unparseable input; there is no TLD to inspect
  return nil if parts.nil?

  label = parts.tld.split('.').last
  ALPHA2_MAP.fetch(label.to_sym, label)
end

# Looks up the ISO country for the domain's alpha2 code
#
# The result of the lookup is cached on the instance after the first call.
#
# Example Usage:
# Gman.new("foo.gov").country.name => "United States"
# Gman.new("foo.gov").country.currency => "USD"
def country
  return @country if @country

  @country = IsoCountryCodes.find(alpha2)
end

# Console output
def inspect
"#<Gman domain=\"#{domain}\" valid=#{valid?}>"
end
end
2 changes: 1 addition & 1 deletion lib/gman/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
require 'net/dns'
require 'net/dns/resolver'

module Gman
class Gman
class Parser

COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i
Expand Down
4 changes: 2 additions & 2 deletions lib/gman/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module Gman
VERSION = '2.1.3'
class Gman
VERSION = '3.0.0'
end
7 changes: 7 additions & 0 deletions test/test_domains.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def whitelisted?(domain)
should "only contain resolvable domains" do
unresolvables = []
Gman.list.each do |entry|
next
next if whitelisted? entry.name
resolves = Gman::Parser.domain_resolves?(entry.name)
unresolvables.push entry.name unless resolves
Expand Down Expand Up @@ -51,4 +52,10 @@ def whitelisted?(domain)
assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain"
end
end

should "identify the country for any domain on the list" do
  Gman.list.each do |entry|
    # Smoke test: there is no explicit assertion; the example fails by
    # raising if the country lookup for an entry does not succeed
    Gman.new("foo.#{entry.name}").country.name
  end
end
end
44 changes: 29 additions & 15 deletions test/test_gman.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,37 +44,51 @@ class TestGman < Minitest::Test
end

should "properly parse domains from strings" do
assert_equal "github.gov", Gman::get_domain("[email protected]")
assert_equal "foo.github.gov", Gman::get_domain("foo.github.gov")
assert_equal "github.gov", Gman::get_domain("http://github.gov")
assert_equal "github.gov", Gman::get_domain("https://github.gov")
assert_equal ".gov", Gman::get_domain(".gov")
assert_equal nil, Gman.get_domain("foo")
assert_equal "github.gov", Gman.new("[email protected]").domain
assert_equal "foo.github.gov", Gman::new("foo.github.gov").domain
assert_equal "github.gov", Gman::new("http://github.gov").domain
assert_equal "github.gov", Gman::new("https://github.gov").domain
assert_equal ".gov", Gman::new(".gov").domain
assert_equal nil, Gman.new("foo").domain
end

should "not err out on invalid domains" do
assert_equal false, Gman.valid?("[email protected]")
assert_equal "gov.invalid", Gman.get_domain("[email protected]")
assert_equal nil, Gman.domain_parts("[email protected]")
assert_equal "gov.invalid", Gman.new("[email protected]").domain
assert_equal nil, Gman.new("[email protected]").domain_parts
end

should "return public suffix domain" do
assert_equal PublicSuffix::Domain, Gman.domain_parts("whitehouse.gov").class
assert_equal NilClass, Gman.domain_parts("foo.invalid").class
assert_equal PublicSuffix::Domain, Gman.new("whitehouse.gov").domain_parts.class
assert_equal NilClass, Gman.new("foo.invalid").domain_parts.class
end

should "parse domain parts" do
assert_equal "gov", Gman.domain_parts("[email protected]").tld
assert_equal "bar", Gman.domain_parts("foo.bar.gov").sld
assert_equal "bar", Gman.domain_parts("https://foo.bar.gov").sld
assert_equal "bar.gov", Gman.domain_parts("[email protected]").domain
assert_equal "gov", Gman.new("[email protected]").domain_parts.tld
assert_equal "bar", Gman.new("foo.bar.gov").domain_parts.sld
assert_equal "bar", Gman.new("https://foo.bar.gov").domain_parts.sld
assert_equal "bar.gov", Gman.new("[email protected]").domain_parts.domain
end

should "not err out on invalid hosts" do
assert_equal nil, Gman.get_domain("</@foo.com")
assert_equal nil, Gman.new("</@foo.com").domain
end

should "returns the path to domains.txt" do
assert_equal true, File.exists?(Gman.list_path)
end

should "parse the alpha2" do
assert_equal "us", Gman.new("whitehouse.gov").alpha2
assert_equal "us", Gman.new("army.mil").alpha2
assert_equal "gb", Gman.new("foo.gov.uk").alpha2
assert_equal "ca", Gman.new("gov.ca").alpha2
end

should "determine a domain's country" do
assert_equal "United States", Gman.new("whitehouse.gov").country.name
assert_equal "United States", Gman.new("army.mil").country.name
assert_equal "United Kingdom", Gman.new("foo.gov.uk").country.name
assert_equal "Canada", Gman.new("foo.gc.ca").country.name
end
end

0 comments on commit 99c93af

Please sign in to comment.