diff --git a/lib/domains.txt b/lib/domains.txt index fe429475..46b3d655 100644 --- a/lib/domains.txt +++ b/lib/domains.txt @@ -1796,7 +1796,6 @@ peachtree-city.org pearson-ga.com pembrokega.net pinelakega.com -polkcountygeorgia.us pooler-ga.us putnamcountyga.us rockdalecounty.org @@ -2451,7 +2450,6 @@ madisoncty.com michianashores.com middleburyin.org morgancounty-il.com -newburgh.org newhavenin.org newtoncountyin.com nirpc.org @@ -3196,7 +3194,6 @@ townofdresden.com townofgreene.net townofguilford.com townofhartlandme.com -townofmadawaska.com townofnaples.org townofnorridgewock.com townofnorthberwick.org @@ -6093,7 +6090,6 @@ lloydharbor.org lowville.ny.us lynbrookvillage.net macedontown.net -madisoncounty.org madridny.us malonetown.com malta-town.org @@ -7490,7 +7486,6 @@ paehealth.com palmertonborough.com palmertwp.com paradisetownship.com -parkercity.org parkesburg.org parksideboro.com patientsafetyauthority.org @@ -7682,7 +7677,6 @@ zelieboro.org fajardopr.org gobierno.pr loteriaelectronicapr.com -mayaguez.pr municipiocarolina.com municipiodearecibo.com municipiodebayamon.com diff --git a/lib/gman.rb b/lib/gman.rb index d495fe5c..07ab8c0a 100644 --- a/lib/gman.rb +++ b/lib/gman.rb @@ -74,7 +74,7 @@ def valid?(text) # returns an instance of our custom public suffix list # list behaves like PublicSuffix::List but is limited to our whitelisted domains def list - @list ||= PublicSuffix::List::parse(File.new(File.join(File.dirname(__FILE__), "domains.txt"), "r:utf-8")) + @list ||= PublicSuffix::List::parse(File.new(list_path, "r:utf-8")) end # Get the FQDN name from a URL or email address. 
@@ -122,5 +122,10 @@ def domain_parts(text) def email?(text) text =~ EMAIL_REGEX end + + # Returns the absolute path to the domain list + def list_path + @list_path ||= File.join(File.dirname(__FILE__), "domains.txt") + end end end diff --git a/lib/gman/parser.rb b/lib/gman/parser.rb new file mode 100644 index 00000000..96f15dec --- /dev/null +++ b/lib/gman/parser.rb @@ -0,0 +1,51 @@ +# Utility functions for parsing and manipulating public-suffix formatted domain lists +require 'net/dns' +require 'net/dns/resolver' + +module Gman + class Parser + + COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i + + class << self + + # Given a public-suffix list formatted file + # Converts to a hash in the form of :group => [domain1, domain2...] + def file_to_hash(file) + array_to_hash(file_to_array(file)) + end + + # Given a public-suffix list formatted file + # Convert it into an array of comments and domains representing each line + def file_to_array(file) + domains = File.open(file).read + domains.gsub! /\r\n?/, "\n" # Normalize line endings + domains = domains.split("\n") + end + + # Given an array of comments/domains in public suffix format + # Converts to a hash in the form of :group => [domain1, domain2...] + def array_to_hash(domains) + group = "" + domain_hash = {} + domains.each do |line| + next if line.empty? + if match = COMMENT_REGEX.match(line) + group = match[1] + else + domain_hash[group] = [] if domain_hash[group].nil? 
+ domain_hash[group].push line.downcase + end + end + domain_hash + end + + # Verifies that the given domain has an NS or MX record, and thus is valid + def domain_resolves?(domain) + resolver = Net::DNS::Resolver.new + resolver.nameservers = ["8.8.8.8","8.8.4.4", "208.67.222.222", "208.67.220.220"] + resolver.search(domain, Net::DNS::NS).header.anCount > 0 || resolver.search(domain, Net::DNS::MX).header.anCount > 0 + end + end + end +end diff --git a/script/vendor-us b/script/vendor-us index 74a585b0..b4fea77a 100755 --- a/script/vendor-us +++ b/script/vendor-us @@ -17,15 +17,12 @@ require 'rubygems' require 'fileutils' require 'public_suffix' require 'swot' -require 'net/dns' -require 'net/dns/resolver' +require './lib/gman/parser' -CURRENT_LIST = File.expand_path("../lib/domains.txt", File.dirname(__FILE__)) TMP_DIR = File.expand_path("../tmp/govt-urls", File.dirname(__FILE__)) REPO = "https://github.com/GSA-OCSIT/govt-urls" TXT_FILE = "government-urls-hierarchical-list.txt" YAML_FILE = "governent-urls.yaml" -COMMENT_REGEX = /\/\/[\/\s]*(.*)$/i BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"] domain_hash = {} @@ -34,41 +31,9 @@ FileUtils.rm_rf TMP_DIR FileUtils.mkdir_p TMP_DIR Dir.chdir TMP_DIR -# Given a public-suffix list formatted file -# Convert it into an array of comments and domains representing each line -def file_to_array(file) - domains = File.open(file).read - domains.gsub! /\r\n?/, "\n" # Normalize line endings - domains = domains.split("\n") -end - -# Given an array of comments/domains in public suffix format -# Converts to a hash in the form of :group => [domain1, domain2...] -def array_to_hash(domains) - group = "" - domain_hash = {} - domains.each do |line| - next if line.empty? - if match = COMMENT_REGEX.match(line) - group = match[1] - else - domain_hash[group] = [] if domain_hash[group].nil? 
- domain_hash[group].push line.downcase - end - end - domain_hash -end - -def domain_resolves?(domain) - res = Net::DNS::Resolver.new - res.nameservers = ["8.8.8.8","8.8.4.4", "208.67.222.222", "208.67.220.220"] - packet = res.search(domain, Net::DNS::MX) - packet.header.anCount > 0 -end - # Clone down the lastest version of the list system "git clone --depth 1 #{REPO} #{TMP_DIR}" -domains = file_to_array(TXT_FILE) +domains = Gman::Parser.file_to_array(TXT_FILE) # Normalize ALL THE THINGS domains.map! { |domain| domain.strip } # Strip trailing slashes @@ -76,7 +41,7 @@ domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes domains.reject! { |domain| domain.empty? } # Reject empty strings # build our hash -domain_hash = array_to_hash(domains) +domain_hash = Gman::Parser.array_to_hash(domains) # filter domain_hash.reject! { |group,domain| BLACKLIST.include?(group) } # Group blacklist @@ -84,12 +49,12 @@ domain_hash.each do |group, domains| domains.reject! { |domain| domain.match /\// } # Reject URLs domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains - domains.select! { |domain| domain_resolves?(domain) } # Domain + domains.select! 
{ |domain| Gman::Parser.domain_resolves?(domain) } # Domain end # Grab existing list -current = file_to_array( CURRENT_LIST ) -current_hash = array_to_hash(current) +current = Gman::Parser.file_to_array( Gman.list_path ) +current_hash = Gman::Parser.array_to_hash(current) # Lazy deep merge domain_hash.each do |group,domains| @@ -114,4 +79,4 @@ current_hash.each do |group, domains| output << domains.join("\n") end -File.open(CURRENT_LIST, "w") { |file| file.write output } +File.open(Gman.list_path, "w") { |file| file.write output } diff --git a/test/helper.rb b/test/helper.rb index 9ef6b83d..5a665b4e 100644 --- a/test/helper.rb +++ b/test/helper.rb @@ -15,6 +15,7 @@ require 'gman' require 'net/dns' require 'net/dns/resolver' +require './lib/gman/parser' class Test::Unit::TestCase end diff --git a/test/test_domains.rb b/test/test_domains.rb index 962664be..45583956 100644 --- a/test/test_domains.rb +++ b/test/test_domains.rb @@ -1,17 +1,22 @@ require File.join(File.dirname(__FILE__), 'helper') -class TestRemote < Test::Unit::TestCase +class TestDomains < Test::Unit::TestCase - def domain_resolves?(domain) - res = Net::DNS::Resolver.new - res.nameservers = ["8.8.8.8","8.8.4.4", "208.67.222.222", "208.67.220.220"] - packet = res.search(domain, Net::DNS::NS) - packet.header.anCount > 0 + WHITELIST = [ "non-us gov", "non-us mil", "US Federal"] + DOMAINS = Gman::Parser.file_to_hash(Gman.list_path) + + def whitelisted?(domain) + WHITELIST.each do |group| + return true if DOMAINS[group].include? domain + end + false end should "only contain resolvable domains" do Gman.list.each do |entry| - assert_equal true, domain_resolves?(entry.name), "Could not resolve #{entry.name}" + next if whitelisted? 
entry.name + resolves = Gman::Parser.domain_resolves?(entry.name) + assert_equal true, resolves, "Could not resolve #{entry.name}" end end @@ -44,5 +49,4 @@ def domain_resolves?(domain) assert_equal true, Gman.valid?("foo.#{entry.name}"), "foo.#{entry.name} is not a valid domain" end end - end diff --git a/test/test_gman.rb b/test/test_gman.rb index 2e1f9255..283263ce 100644 --- a/test/test_gman.rb +++ b/test/test_gman.rb @@ -76,4 +76,8 @@ class TestGman < Test::Unit::TestCase assert_equal nil, Gman.get_domain("