diff --git a/lib/gman/domain_list.rb b/lib/gman/domain_list.rb
index 1ee07f7a..b34e8088 100644
--- a/lib/gman/domain_list.rb
+++ b/lib/gman/domain_list.rb
@@ -15,7 +15,7 @@ def groups
     end
 
     def domains
-      list.values.flatten
+      list.values.flatten.sort.uniq
     end
 
     def count
diff --git a/lib/gman/importer.rb b/lib/gman/importer.rb
index 6abe877e..9a20ebbd 100644
--- a/lib/gman/importer.rb
+++ b/lib/gman/importer.rb
@@ -81,7 +81,9 @@ def valid_domain?(domain, options={})
       true
     end
 
+    # If RECONCILING=true, return the reason rather than a bool, and silence log output
     def reject(domain, reason)
+      return reason if ENV["RECONCILING"]
       logger.info "👎 `#{domain}`: #{reason}"
       false
     end
diff --git a/script/reconcile-us b/script/reconcile-us
new file mode 100755
index 00000000..85765605
--- /dev/null
+++ b/script/reconcile-us
@@ -0,0 +1,64 @@
+#!/usr/bin/env ruby
+#
+# Reconciles the USA.gov-maintained list of US domains with domains.txt
+# to show domains listed in the USA.gov-maintained list that we reject and why
+#
+# Usage: script/reconcile-us
+
+require './lib/gman/importer'
+require 'yaml'
+
+ENV["RECONCILING"] = "true"
+blacklist = ["usagovQUASI"]
+source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"
+
+data = open(source).read
+data = data.split("__________________________________________________________________________")
+data = data.last.strip
+data = data.split(/\r?\n/).reject { |r| r.empty? }
+
+domains = {}
+group = ""
+data.each do |row|
+  if row =~ /^\w/
+    group = row
+    domains[group] = []
+  else
+    domains[group].push row.sub("\.\t", "").strip
+  end
+end
+
+domains.reject! { |group,domain| blacklist.include?(group) }
+importer = Gman::Importer.new(domains)
+
+importer.logger.info "Starting with #{importer.domains.count} domains"
+
+importer.domains.list.each do |group, domains|
+  domains.map! { |domain| Gman.new(domain).to_s }
+  domains.map! { |domain| importer.normalize_domain(domain) }
+end
+
+importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"
+
+missing = {}
+importer.domains.list.each do |group, usagovdomains|
+  next unless importer.current.list[group]
+  missing[group] = importer.current.list[group] - usagovdomains
+end
+
+missing.reject! { |key, value| value.empty? }
+
+importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
+puts "Here's the list of missing domains:"
+puts YAML.dump(missing)
+
+domains = importer.domains.domains
+domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
+domains.delete(true)
+domains.delete(false)
+domains.delete("locality")
+
+importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"
+
+puts "Here are the rejected domains and why they were rejected (excluding locality regexes):"
+puts YAML.dump(domains)
diff --git a/script/vendor-gov-list b/script/vendor-gov-list
index 282bbbbb..43e4bd8a 100755
--- a/script/vendor-gov-list
+++ b/script/vendor-gov-list
@@ -1,5 +1,17 @@
 #!/bin/sh
+#
+# Vendors the full list of US .gov domains from https://github.com/GSA/data
+# Usage: script/vendor-gov-list
 
-DATE=2015-03-15
+# Set up
+mkdir tmp
+rm -Rf tmp/gsa-data
 
-wget "https://raw.githubusercontent.com/GSA/data/gh-pages/dotgov-domains/$DATE-full.csv" -O config/vendor/dotgovs.csv
+# Vendor the last file in the dotgov-domains folder that ends in `-full.csv`
+git clone https://github.com/GSA/data tmp/gsa-data
+pattern="tmp/gsa-data/dotgov-domains/*-full.csv"
+files=( $pattern )
+cp -f "${files[@]:(-1)}" config/vendor/dotgovs.csv
+
+# Clean up
+rm -Rf tmp/gsa-data
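
For reference, a minimal sketch (not part of the patch) of what the RECONCILING switch changes in Gman::Importer#reject, assuming the method is publicly callable and using a made-up group name, domain, and reason:

    require './lib/gman/importer'

    importer = Gman::Importer.new("exampleGroup" => ["example.town.dc.us"])

    # Default behavior: logs the rejection and returns false
    importer.reject("example.town.dc.us", "just testing")   # => false

    # Reconcile mode: the same call skips the log line and returns
    # the reason string instead of a bool
    ENV["RECONCILING"] = "true"
    importer.reject("example.town.dc.us", "just testing")   # => "just testing"

Returning the reason rather than false is what lets script/reconcile-us group rejected domains by reason via valid_domain? and group_by, without changing the importer's normal contract.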