diff --git a/script/dedupe b/script/dedupe
new file mode 100755
index 00000000..5b23fd1d
--- /dev/null
+++ b/script/dedupe
@@ -0,0 +1,39 @@
+#! /usr/bin/env ruby
+#
+# Reports domains that occur more than once in the list at Gman::list_path
+# but at most once in the upstream GSA govt-urls YAML (SOURCE).
+
+require 'yaml'
+require 'open-uri'
+require './lib/gman'
+require './lib/gman/parser'
+
+SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
+
+# Returns a Hash mapping each entry of +list+ to the number of times it occurs.
+def occurrences(list)
+  list.each_with_object(Hash.new(0)) { |entry, counts| counts[entry] += 1 }
+end
+
+current = Gman::Parser.file_to_array( Gman::list_path )
+domain_hash = Gman::Parser.array_to_hash(current)
+domain_list = domain_hash.flat_map { |_group, domains| domains }
+puts "Current list contains #{domain_list.count} domains..."
+
+# URI#read is provided by open-uri; Kernel#open no longer accepts URLs on Ruby 3+.
+# NOTE(review): YAML.load can instantiate arbitrary objects from this remote
+# document; consider YAML.safe_load.
+source_hash = YAML.load(URI.parse(SOURCE).read)
+source_list = source_hash.flat_map { |_group, domains| domains }
+
+# Count once up front instead of rescanning both lists for every domain.
+local_counts = occurrences(domain_list)
+source_counts = occurrences(source_list)
+
+dupes = domain_list.uniq.select do |domain|
+  local_counts[domain] > 1 && source_counts[domain] <= 1
+end
+
+puts "Found #{dupes.count} dupes!"
+
+puts dupes.inspect
diff --git a/script/vendor-us b/script/vendor-us
index d0f11b01..978378b3 100755
--- a/script/vendor-us
+++ b/script/vendor-us
@@ -22,7 +22,6 @@ require './lib/gman'
 require './lib/gman/parser'
 
 SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
-TXT_FILE = "government-urls-hierarchical-list.txt"
 BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"]
 
 domain_hash = {}
@@ -33,5 +32,6 @@ puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} doma
 domain_hash.each do |group, domains|
   domains.map! { |domain| domain.strip } # Strip trailing slashes
   domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes
+  domains.map! { |domain| domain.downcase } # make lower case
   domains.reject! { |domain| domain.empty? } # Reject empty strings
 end