Merge pull request #115 from benbalter/reconcile-us
Create script to reconcile US list
benbalter committed Oct 14, 2015
2 parents b13339c + e5123a9 commit e844089
Showing 4 changed files with 81 additions and 3 deletions.
2 changes: 1 addition & 1 deletion lib/gman/domain_list.rb
@@ -15,7 +15,7 @@ def groups
end

def domains
list.values.flatten
list.values.flatten.sort.uniq
end

def count
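
For context, a minimal sketch of what the added `.sort.uniq` buys (the hash below is hypothetical, not data shipped with the gem): domains that appear in more than one group are collapsed, and the combined list comes back in a stable, sorted order.

list = {
  "usagovFederal" => ["whitehouse.gov", "nasa.gov"],
  "usagovState"   => ["alaska.gov", "nasa.gov"]
}
list.values.flatten            # => ["whitehouse.gov", "nasa.gov", "alaska.gov", "nasa.gov"]
list.values.flatten.sort.uniq  # => ["alaska.gov", "nasa.gov", "whitehouse.gov"]
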
2 changes: 2 additions & 0 deletions lib/gman/importer.rb
@@ -81,7 +81,9 @@ def valid_domain?(domain, options={})
true
end

# If RECONCILING=true, return the rejection reason rather than a bool, and silence log output
def reject(domain, reason)
return reason if ENV["RECONCILING"]
logger.info "👎 `#{domain}`: #{reason}"
false
end
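
A rough sketch of what the new guard changes for callers, assuming an importer can be constructed from a bare hash the way the reconcile script below does: with RECONCILING set, `reject` hands back the reason string instead of `false`, and the log line is skipped.

importer = Gman::Importer.new({})

importer.reject("example.com", "not a government domain")
# => false, after logging "👎 `example.com`: not a government domain"

ENV["RECONCILING"] = "true"
importer.reject("example.com", "not a government domain")
# => "not a government domain", with no log output
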
64 changes: 64 additions & 0 deletions script/reconcile-us
@@ -0,0 +1,64 @@
#!/usr/bin/env ruby
#
# Reconciles the USA.gov-maintained list of US domains with domains.txt,
# showing domains in domains.txt that are missing from the USA.gov list,
# and domains on the USA.gov list that we reject and why
#
# Usage: script/reconcile-us

require './lib/gman/importer'
require 'open-uri'
require 'yaml'

ENV["RECONCILING"] = "true"
blacklist = ["usagovQUASI"]
source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt"

data = open(source).read
data = data.split("__________________________________________________________________________")
data = data.last.strip
data = data.split(/\r?\n/).reject { |r| r.empty? }

domains = {}
group = ""
data.each do |row|
if row =~ /^\w/
group = row
domains[group] = []
else
domains[group].push row.sub("\.\t", "").strip
end
end

domains.reject! { |group,domain| blacklist.include?(group) }
importer = Gman::Importer.new(domains)

importer.logger.info "Starting with #{importer.domains.count} domains"

importer.domains.list.each do |group, domains|
domains.map! { |domain| Gman.new(domain).to_s }
domains.map! { |domain| importer.normalize_domain(domain) }
end

importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains"

missing = {}
importer.domains.list.each do |group, usagovdomains|
next unless importer.current.list[group]
missing[group] = importer.current.list[group] - usagovdomains
end

missing.reject! { |key, value| value.empty? }

importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list"
puts "Here's the list of missing domains:"
puts YAML.dump(missing)

domains = importer.domains.domains
domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) }
domains.delete(true)
domains.delete(false)
domains.delete("locality")

importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains"

puts "Here are the rejected domains and why they were rejected (excluding locality regexs):"
puts YAML.dump(domains)
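
The final `group_by` works because, with RECONCILING set, `valid_domain?` propagates the reason string that `reject` now returns instead of `false`, so domains bucket by rejection reason; the `true`, `false`, and "locality" buckets are then discarded. A standalone illustration of the pattern with made-up reasons:

reasons = {
  "a.gov"       => true,                        # valid, not reported
  "b.example"   => "not a government domain",   # hypothetical reason string
  "c.k12.tx.us" => "locality"
}
grouped = reasons.keys.group_by { |domain| reasons[domain] }
grouped.delete(true)
grouped.delete(false)
grouped.delete("locality")
grouped # => { "not a government domain" => ["b.example"] }
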
16 changes: 14 additions & 2 deletions script/vendor-gov-list
@@ -1,5 +1,17 @@
#!/bin/bash
#
# Vendors the full list of US .gov domains from https://github.com/GSA/data
# Usage: script/vendor-gov-list

DATE=2015-03-15
# Set up
mkdir tmp
rm -Rf tmp/gsa-data

wget "https://raw.githubusercontent.com/GSA/data/gh-pages/dotgov-domains/$DATE-full.csv" -O config/vendor/dotgovs.csv
# Vendor the last file in the dotgov-domains folder that ends in `-full.csv`
git clone https://github.com/GSA/data tmp/gsa-data
pattern="tmp/gsa-data/dotgov-domains/*-full.csv"
files=( $pattern )
cp -f "${files[@]:(-1)}" config/vendor/dotgovs.csv

# Clean up
rm -Rf tmp/gsa-data
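
The glob-into-array trick above relies on the shell expanding the pattern in sorted order, so the ISO-dated filenames sort chronologically and `${files[@]:(-1)}` picks the newest export. A standalone bash sketch of the same idiom (paths are illustrative):

files=( tmp/gsa-data/dotgov-domains/*-full.csv )   # e.g. 2014-12-01-full.csv ... 2015-03-15-full.csv
echo "Newest export: ${files[@]:(-1)}"             # last element of the sorted glob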
