From d8fb955eb4d1af9a9a22ac83031593236c52b73a Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Mon, 12 Oct 2015 15:11:04 -0400 Subject: [PATCH 1/4] add script to reconcile us domains --- lib/gman/domain_list.rb | 2 +- lib/gman/importer.rb | 2 ++ script/reconcile-us | 64 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) create mode 100755 script/reconcile-us diff --git a/lib/gman/domain_list.rb b/lib/gman/domain_list.rb index 1ee07f7a..b34e8088 100644 --- a/lib/gman/domain_list.rb +++ b/lib/gman/domain_list.rb @@ -15,7 +15,7 @@ def groups end def domains - list.values.flatten + list.values.flatten.sort.uniq end def count diff --git a/lib/gman/importer.rb b/lib/gman/importer.rb index 0ca4331a..da89d175 100644 --- a/lib/gman/importer.rb +++ b/lib/gman/importer.rb @@ -81,7 +81,9 @@ def valid_domain?(domain, options={}) true end + # if RECONCILING=true, return the reason, rather than a bool and silence log output def reject(domain, reason) + return reason if ENV["RECONCILING"] logger.info "👎 `#{domain}`: #{reason}" false end diff --git a/script/reconcile-us b/script/reconcile-us new file mode 100755 index 00000000..85765605 --- /dev/null +++ b/script/reconcile-us @@ -0,0 +1,64 @@ +#!/usr/bin/env ruby +# +# Reconciles the USA.gov-maintained list of US domains with domains.txt +# to show domains listed in the USA.gov-maintained list that we reject and why +# +# Usage: script/reconcile-us + +require './lib/gman/importer' +require 'yaml' + +ENV["RECONCILING"] = "true" +blacklist = ["usagovQUASI"] +source = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls-hierarchical-list.txt" + +data = open(source).read +data = data.split("__________________________________________________________________________") +data = data.last.strip +data = data.split(/\r?\n/).reject { |r| r.empty? } + +domains = {} +group = "" +data.each do |row| + if row =~ /^\w/ + group = row + domains[group] = [] + else + domains[group].push row.sub("\.\t", "").strip + end +end + +domains.reject! { |group,domain| blacklist.include?(group) } +importer = Gman::Importer.new(domains) + +importer.logger.info "Starting with #{importer.domains.count} domains" + +importer.domains.list.each do |group, domains| + domains.map! { |domain| Gman.new(domain).to_s } + domains.map! { |domain| importer.normalize_domain(domain) } +end + +importer.logger.info "Filtered down to #{importer.domains.domains.count} normalized domains" + +missing = {} +importer.domains.list.each do |group, usagovdomains| + next unless importer.current.list[group] + missing[group] = importer.current.list[group] - usagovdomains +end + +missing.reject! { |key, value| value.empty? } + +importer.logger.info "Found #{missing.values.count} domains not on the USA.gov list" +puts "Here's the list of missing domains:" +puts YAML.dump(missing) + +domains = importer.domains.domains +domains = domains.group_by { |domain| importer.valid_domain?(domain, :skip_dupe => true) } +domains.delete(true) +domains.delete(false) +domains.delete("locality") + +importer.logger.info "Calling out #{domains.values.flatten.count} rejected domains" + +puts "Here are the rejected domains and why they were rejected (excluding locality regexs):" +puts YAML.dump(domains) From 8fb52222e570bba50ff0fab474f24d0b087c5b96 Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Mon, 12 Oct 2015 16:11:57 -0400 Subject: [PATCH 2/4] dynamically get latest from gov list without date --- script/vendor-gov-list | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/script/vendor-gov-list b/script/vendor-gov-list index 282bbbbb..3ea28426 100755 --- a/script/vendor-gov-list +++ b/script/vendor-gov-list @@ -2,4 +2,12 @@ DATE=2015-03-15 -wget "https://raw.githubusercontent.com/GSA/data/gh-pages/dotgov-domains/$DATE-full.csv" -O config/vendor/dotgovs.csv +mkdir tmp +rm -Rf tmp/gsa-data + +git clone https://github.com/GSA/data tmp/gsa-data +pattern="tmp/gsa-data/dotgov-domains/*-full.csv" +files=( $pattern ) +cp "${files[@]:(-1)}" config/vendor/dotgovs.csv + +rm -Rf tmp/gsa-data From 6ff681a73a0a6c990fd0b4f1fddd3745d541a9fe Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Mon, 12 Oct 2015 16:14:05 -0400 Subject: [PATCH 3/4] comment --- script/vendor-gov-list | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/script/vendor-gov-list b/script/vendor-gov-list index 3ea28426..678138a2 100755 --- a/script/vendor-gov-list +++ b/script/vendor-gov-list @@ -1,13 +1,17 @@ #!/bin/sh +# +# Vendors the full list of US .gov domains from https://github.com/GSA/data +# Usage: script/vendor-gov-list -DATE=2015-03-15 - +# Set up mkdir tmp rm -Rf tmp/gsa-data +# Vendor the last file in the dotgov-domains folder that ends in `-full.csv` git clone https://github.com/GSA/data tmp/gsa-data pattern="tmp/gsa-data/dotgov-domains/*-full.csv" files=( $pattern ) cp "${files[@]:(-1)}" config/vendor/dotgovs.csv +# Clean up rm -Rf tmp/gsa-data From e5123a9eaad7635a3754d24aa8cf7ff859916ba9 Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Mon, 12 Oct 2015 16:14:26 -0400 Subject: [PATCH 4/4] force the copy --- script/vendor-gov-list | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/vendor-gov-list b/script/vendor-gov-list index 678138a2..43e4bd8a 100755 --- a/script/vendor-gov-list +++ b/script/vendor-gov-list @@ -11,7 +11,7 @@ rm -Rf tmp/gsa-data git clone https://github.com/GSA/data tmp/gsa-data pattern="tmp/gsa-data/dotgov-domains/*-full.csv" files=( $pattern ) -cp "${files[@]:(-1)}" config/vendor/dotgovs.csv +cp -f "${files[@]:(-1)}" config/vendor/dotgovs.csv # Clean up rm -Rf tmp/gsa-data