Skip to content

Commit

Permalink
downcase
Browse files Browse the repository at this point in the history
  • Loading branch information
benbalter committed Jul 23, 2014
1 parent 5bc7dbb commit d50159f
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
31 changes: 31 additions & 0 deletions script/dedupe
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#! /usr/bin/env ruby

require 'yaml'
require 'open-uri'
require './lib/gman'
require './lib/gman/parser'


# Read the current domain list file, regroup it into a hash via the parser,
# and flatten that hash's values into one list of domains so occurrences can
# be counted across every group.
domain_hash = Gman::Parser.array_to_hash(
  Gman::Parser.file_to_array(Gman.list_path)
)
domain_list = domain_hash.flat_map { |_group, domains| domains }
puts "Current list contains #{domain_list.count} domains..."

# URL of the upstream GSA government-urls list, parsed below as YAML.
SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"

# Download the upstream list. URI.open (provided by open-uri) is used because
# the bare Kernel#open no longer accepts URLs on Ruby 3.0+. The block form
# guarantees the underlying IO/Tempfile is closed as soon as it has been read.
source_yaml = URI.open(SOURCE) { |io| io.read }

# NOTE(review): YAML.load on remotely fetched content can deserialize
# arbitrary objects on older Psych versions; consider YAML.safe_load once the
# upstream schema is confirmed to contain only plain hashes/arrays/strings.
source_hash = YAML.load(source_yaml)

# Flatten the parsed hash's values into a single list of upstream entries.
source_list = source_hash.flat_map { |_group, domains| domains }

# Count how often each domain occurs in the current list and in the upstream
# source list once, up front. Calling Array#count inside the loop below would
# rescan both lists for every single domain (quadratic in the list size).
domain_counts = Hash.new(0)
domain_list.each { |domain| domain_counts[domain] += 1 }

source_counts = Hash.new(0)
source_list.each { |domain| source_counts[domain] += 1 }

# A domain is reported as a dupe when it appears more than once in the current
# list while appearing at most once in the upstream source list.
dupes = []
domain_hash.each do |_group, domains|
  domains.each do |domain|
    dupes.push(domain) if domain_counts[domain] > 1 && source_counts[domain] <= 1
  end
end

# Each duplicated domain is visited once per occurrence; report it only once.
dupes.uniq!

puts "Found #{dupes.count} dupes!"

puts dupes.inspect
2 changes: 1 addition & 1 deletion script/vendor-us
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ require './lib/gman'
require './lib/gman/parser'

# URL of the upstream GSA government-urls YAML file.
SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml"
# NOTE(review): not referenced in the visible part of this script — confirm whether it is still used.
TXT_FILE = "government-urls-hierarchical-list.txt"
# NOTE(review): presumably group names to leave out of the vendored list — usage is outside this view; confirm.
BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"]
# Accumulator mapping group => list of domains (iterated that way further down).
domain_hash = {}

Expand All @@ -33,6 +32,7 @@ puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} doma
# Normalize every group's domains in place.
domain_hash.each do |_group, domains|
  domains.map! do |domain|
    domain.strip             # Strip surrounding whitespace
          .gsub(/\/$/, "")   # Strip trailing slash
          .downcase          # Make lower case
  end
  domains.reject! { |domain| domain.empty? } # Reject empty strings
end

Expand Down

0 comments on commit d50159f

Please sign in to comment.