diff --git a/lib/domains.txt b/lib/domains.txt index 15be91b3..578be508 100644 --- a/lib/domains.txt +++ b/lib/domains.txt @@ -160,6 +160,7 @@ ncnhtf.org ncoah.com nconemap.net ncopenbook.gov +ncpanbranch.com ncparks.gov ncports.com ncpublications.com @@ -432,11 +433,8 @@ mil si.edu // US State, County, Local -bouldercounty.org sfmta.org sfcta.org -borough.kenai.ak.us -kcmo.org clevelandmetroparks.com // usagovAK @@ -901,6 +899,7 @@ ci.irwindale.ca.us ci.jackson.ca.us ci.la-verne.ca.us ci.lafayette.ca.us +ci.laguna-hills.ca.us ci.larkspur.ca.us ci.lathrop.ca.us ci.lincoln.ca.us @@ -1055,6 +1054,7 @@ cityoforangecove.com cityoforinda.org cityoforland.com cityoforoville.org +cityofpacifica.org cityofpalmdale.org cityofpalmdesert.org cityofpaloalto.org @@ -1362,6 +1362,7 @@ brushcolo.com burlingtoncolo.com c3gov.com cedaredgecolorado.com +centennialco.gov centennialcolorado.com chaffeecounty.org cherryhillsvillage.com @@ -1378,6 +1379,7 @@ ci.westminster.co.us ci.wheatridge.co.us cityofblackhawk.org cityofcortez.com +cityofenglewood.org cityoffortmorgan.com cityoflafayette.com cityoflonetree.com @@ -1405,6 +1407,7 @@ englewoodgov.org fcgov.com flaglercolorado.com fortlupton.org +fountaincolorado.org fremontco.com friscogov.com fruita.org @@ -2281,7 +2284,6 @@ stormlake.org tamacity.govoffice2.com tamacounty.org tiptoniowa.org -tobaccofreeqc.org traveliowa.com underwoodia.com university-heights.org @@ -2432,13 +2434,16 @@ ci.rockford.il.us cityhpil.com cityofbatavia.net cityofchicago.org +cityofchicagoheights.org cityofdanville.org cityofdekalb.com cityofeastpeoria.com cityofedwardsville.com cityofelgin.org +cityofevanston.org cityofgalena.org cityofharvard.org +cityofharvey.org cityoflakeforest.com cityoflockport.net cityofmacomb.com @@ -2508,6 +2513,7 @@ hillside-il.org historyillinois.org hoffmanestates.com homerglenil.org +il-bradley.civicplus.com il-vernonhills.civicplus.com illinoisepay.com illinoislottery.com @@ -2521,6 +2527,7 @@ lagrangepark.org lake-villa.org lakebluff.org lakevillatownship.org +lakezurich.org lasallecounty.org lemont.il.us lemonttownship.org @@ -2537,7 +2544,6 @@ masoncountyil.org maywood-il.org mchenrytownship.com melrosepark.org -millstadt.org minooka.com mocoil.org mokena.org @@ -2551,6 +2557,7 @@ mountprospect.org mtcarrollil.org mtvernon.com mtzion.com +mundelein-il.org mundelein.org murphysboro.com naperville.il.us @@ -2594,6 +2601,7 @@ roanokeil.org rochesteril.org rockdale-il.com rockfalls61071.com +rockislandcounty.org rocktonvillage.com romeoville.org roselle.il.us @@ -2614,7 +2622,6 @@ swanseail.org taylorville.net tazewell.com tinleypark.org -tobaccofreeqc.org toi.org transitchicago.com tricountyrpc.org @@ -2639,6 +2646,7 @@ villageofbrookfield.com villageofcrete.org villageoffrankfort.com villageofgilberts.com +villageofglencoe.org villageofglenwood.com villageofgrayslake.com villageofhazelcrest.com @@ -2667,6 +2675,7 @@ vniles.com volz.org warrentownship.net watsekacity.com +watsekacity.org waukegantownship.com waukeganweb.net westchester-il.org @@ -2696,7 +2705,6 @@ centertownshiptrustee.com cfcindiana.org cfclouisville.org chestertonin.org -chicagocfc.net churubusco.net ci.valparaiso.in.us cityofanderson.com @@ -3339,7 +3347,6 @@ queenstown-md.com ridgelymd.org risingsunmd.org riverdaleparkmd.info -rockislandcounty.org salisburyfd.com sharpsburgmd.com snowhillmd.com @@ -3364,6 +3371,7 @@ townofindianhead.org townoflaplata.org townofprincessanne.com townofsharptown.org +townofsmithsburg.org townofsomerset.com townofstmichaels.com townofsudlersville.org @@ -3377,7 +3385,6 @@ washingtongrovemd.org westernmarylandcfc.org westgov.com wicomicocounty.org -wmata.com woodsboro.org // usagovME @@ -3588,7 +3595,6 @@ brightontwp.com brookstownship.org brownstown-mi.org brucetwp.org -butmantownship.com byron.org cadillac-mi.net cannontwp.org @@ -3805,6 +3811,7 @@ salem-mi.org sanilaccounty.net saugatuckcity.com saulttribe.com +seekingmichigan.org shelbytwp.org shiawassee.net sjcity.com @@ -3817,7 +3824,6 @@ springfield-twp.us springlakevillage.org state.mi.us stclaircounty.org -stclairshores.net stcmi.com stephenson-mi.com stjosephcountymi.org @@ -4360,7 +4366,6 @@ cityofstjohn.org cityofversailles.org cityofwildwood.com cityofwillard.org -clarencemo.com clarksonvalley.org clarksvillemo.us claycogov.com @@ -4550,10 +4555,12 @@ ci.ocean-springs.ms.us ci.pass-christian.ms.us ci.quitman.ms.us city.jackson.ms.us +cityofamoryms.com cityofbaldwyn.com cityofbatesvillems.com cityofbaysprings.com cityofboonevillems.com +cityofboonevillems.org cityofbrandon.net cityofcarthage.org cityofclevelandms.com @@ -4588,7 +4595,6 @@ greatermscfc.org greenvillems.org greenwoodms.com gulfofmexicoalliance.org -guntownms.com hattiesburgms.com hornlake.org iukams.com @@ -5521,6 +5527,7 @@ andovertwp.org audubonparknj.org avalonboro.org avon-by-the-sea.com +avonbytheseanj.com barnegat.net barringtonboro.com bassriver-nj.org @@ -6019,6 +6026,7 @@ wyckoff-nj.com // usagovNM ashiwi.org bloomfieldnm.com +catroncounty.us ci.alamogordo.nm.us cityofcarlsbadnm.com cityofclovis.org @@ -6030,7 +6038,9 @@ co.cibola.nm.us co.colfax.nm.us co.eddy.nm.us co.mckinley.nm.us +co.otero.nm.us corrales-nm.org +currycounty.org donaanacounty.org fmtn.org grantcountynm.com @@ -6091,7 +6101,6 @@ chickaloon.org chilkatindianvillage.org choctaw.org choctawnation.com -choctawnationflorida.org chukchansi.net cied.org citci.org @@ -7597,6 +7606,7 @@ durham-oregon.us echo-oregon.com elkton-oregon.com fallscity.org +fallscityoregon.gov gduway.org getcollegefunds.org hermiston.or.us @@ -7612,7 +7622,6 @@ lincolncity.org malheurco.org morrowcountyoregon.com multco.us -mwtown.org northbendcity.org northplains.org nwsds.org @@ -7998,6 +8007,7 @@ quakertownboro.com radnor.com rankinborough.com raphotownship.com +readypa.org redlionpa.org reservetwp.com ricetwp.us @@ -9025,6 +9035,7 @@ rowlett.com roysecity.com sanangelocfc.org sanangelotexas.us +sanjacintoriverauthority.com sansabatexas.com schertz.com seagoville.us @@ -9135,6 +9146,7 @@ helpercity.net heneferutah.org herriman.org highlandcity.org +honeyvillecity.com huntsvilletown.com hyrumcity.org intermountaincfc.org @@ -9225,6 +9237,7 @@ woodscross.com wt.govoffice.com // usagovVA +211.getcare.com 211virginia.org albemarle.org alleghanycounty.us @@ -9296,6 +9309,7 @@ farmvilleva.com fcva.us floydcova.org fluvannacounty.org +franklincountyva.gov franklincountyva.org franklinva.com frontroyalva.com @@ -9304,6 +9318,7 @@ gilescounty.org gloucesterva.info govce.net gunstonhall.org +henrico.us honakerva.com hrpdc.org independenceva.com @@ -9315,6 +9330,7 @@ lancova.com lawrencevilleweb.com leesburgva.org lenowisco.org +lexingtonva.gov louisacounty.com louisatown.org manassascity.org @@ -9333,7 +9349,9 @@ onancock.com parksley.org pearisburg.org petersburg-va.org +petersburgva.gov pittgov.org +portofvirginia.com potomaccfc.org princegeorgeva.org pulaskicounty.org @@ -9755,6 +9773,7 @@ gorgecommission.org grandview.wa.us harringtonbiz.com havemilk.com +innovatewashington.org inwcfc.org islandcounty.net kettle-falls.com @@ -9968,6 +9987,7 @@ co.brown.wi.us co.calumet.wi.us co.chippewa.wi.us co.clark.wi.us +co.columbia.wi.us co.dodge.wi.us co.eau-claire.wi.us co.green-lake.wi.us @@ -10320,7 +10340,6 @@ stalbanswv.com state.wv.us summersvillewv.org upshurcounty.org -westernmarylandcfc.org woodcountywv.com wvagriculture.org wvcommerce.org @@ -10338,242 +10357,44 @@ wvtourism.com wvwda.org // usagovWY -angolamuseum.org -atchafalaya.org -battleshipnc.com -berrytownecrafts.com -bikesafenc.com -bmcnc.org carbonwy.com -caswellcenter.org ccgov.net -cfnc.org cheyennecity.org ci.gillette.wy.us ci.laramie.wy.us city-of-torrington.org city-sheridan-wy.com +cityofdouglaswy.com cityofgreenriver.org cityoflaramie.com cityofworland.org co.laramie.wy.us -codofil.org coloradoriverrecovery.org conversecounty.org -correctionenterprises.com -crawfish.org -driving95.com -eatsmartmovemorenc.com -ebrso.org -elearningnc.gov -emspic.org -encsd.net evanstonwy.org -everywomannc.com fremontcountywy.org -gottobenc.com -gottobencfestival.com -groupbenefits.org -healthycarolinians.org hscounty.com -i-85yadkinriver.com -inclusivehealth.org intermountaincfc.org -jennettespier.net -jfkadatc.net -jirdc.org johnsoncountywyoming.org kemmerer.org -la-kidmed.com -la.ngb.army.mil -labenfa.com -labp.com -labswe.org -lachiefs.org -lachiropracticboard.com -lacisd.org -lacourtreporterboard.com -lacpra.org -laddc.org -laeggs.com -lalb.org -lapels.com -laptboard.org laramiecounty.com -lasc.org -laspc.com -lastbdarchs.com -laworks.net -lbedn.org -lbespa.org -lcltfb.org lcwy.org -lma.org -loni.org -louisiana.gov -louisianacda.com -louisianaeconomicdevelopment.com -louisianaseafood.com -louisianataxfree.com -lpb.org -lpgov.org -lrcboard.org -lsba.org -lsbd.org -lsbep.org -lsbes.org -lsbid.org -lsbmt.org -lsbpie.com -lsbpne.com -lsbvm.org -lsbwdd.org -lsli.org -lsp.org lymanwy.com -mattamuskeetlodge.com medicinebow.org -mountainfair.org -murdochcenter.org -museumofthealbemarle.com -myeatsmartmovemore.com -naturalsciences.org -nc-ddc.org -nc-educationlottery.org -nc-sco.com -nc.gov -nc.ngb.army.mil -ncabc.com -ncadfp.org -ncagfairs.org -ncagr.gov -ncair.org -ncapt.tv -ncaquariums.com -ncartmuseum.org -ncatlasrevisited.org -ncatp.org -ncauditor.net -ncbrownfields.org -ncbytrain.org -nccancer.com -nccivilwar150.com -nccoastalmanagement.net -nccoastalreserve.net -nccob.org -nccommerce.com -nccourts.org -nccrimecontrol.org -ncdcr.gov -ncdenr.gov -ncdenr.org -ncdhhs.gov -ncdiabetes.org -ncdjjdp.org -ncdmf.net -ncdmfstore.net -ncdnpe.org -ncdoi.com -ncdoj.gov -ncdot.gov -ncdrought.org -ncdsca.org -ncecho.org -ncesc.com -ncesf.org -ncfacilitymanagement.net -ncfarmfresh.com -ncfarmtoschool.com -ncfhp.org -ncfilm.com -ncfirewise.org -ncfisheries.net -ncfloodmaps.com -ncforeclosurehelp.org -ncforestassessment.com -ncfreshlink.com -ncfriendsofagriculture.org -nchan.org -nchealthyschools.org -nchealthystart.org -nchfa.com -nchistoricsites.org -nchistoryday.org -ncicu.org -ncknows.org -nclabor.com -ncleg.net -nclifetimeincome.org -ncmaritimemuseums.com -ncmarkers.com -ncmedicaidpbm.com -ncminorityhealth.org -ncmotorfleet.com -ncnewbornhearing.org -ncnhp.org -ncnhtf.org -ncoah.com -nconemap.net -ncopenbook.gov -ncparks.gov -ncports.com -ncpublications.com -ncpublichealth.com -ncpublicschools.org -ncradiation.net -ncradon.org -ncrecovery.gov -ncreportcards.org -ncsd.net -ncsicklecellprogram.org -ncstatefair.org -ncstatesurplus.com -ncstormwater.org -ncstrokeregistry.com -ncsymphony.org -nctreasurer.com -ncveterans.com -ncwater.org -ncwaterquality.org -ncwelldriller.org -ncwildlife.org -nczoo.org newcastlewyoming.org -newhirereporting.com -nutritionnc.com -onencnaturally.org parkcounty.us -portgbr.com -qaronline.org rawlins-wyoming.com -readync.org -roanokeisland.com rswy.net -safesurrender.net saratoga.govoffice2.com -savewaternc.org -sehsr.org sheridancounty.com -shpnc.org -startwithyourheart.com -state.la.us -state.lib.la.us -state.nc.us state.wy.us sublettewyo.com sweet.wy.us tetonwyo.org thayne-wy.com townofjackson.com -trsl.org -tryonpalace.org uintacounty.com -visitnc.com -volunteerlouisiana.gov -volunteernc.org -waywelivednc.com -webgate.co.laramie.wy.us -wrightschool.org +uwyo.edu/CES wrightwyoming.com wyomingbusiness.org wyomingtourism.org \ No newline at end of file diff --git a/script/dedupe b/script/dedupe new file mode 100755 index 00000000..5b23fd1d --- /dev/null +++ b/script/dedupe @@ -0,0 +1,31 @@ +#! /usr/bin/env ruby + +require 'yaml' +require 'open-uri' +require './lib/gman' +require './lib/gman/parser' + + +current = Gman::Parser.file_to_array( Gman::list_path ) +domain_hash = Gman::Parser.array_to_hash(current) +domain_list = domain_hash.flat_map { |k,v| v } +puts "Current list contains #{domain_list.count} domains..." + +SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml" +source_hash = YAML.load(open(SOURCE).read) +source_list = source_hash.flat_map { |k,v| v } + +dupes = [] +domain_hash.each do |group,domains| + domains.each do |domain| + if domain_list.count(domain) > 1 && source_list.count(domain) <= 1 + dupes.push(domain) + end + end +end + +dupes.uniq! + +puts "Found #{dupes.count} dupes!" + +puts dupes.inspect diff --git a/script/vendor-us b/script/vendor-us index eb81fa8c..978378b3 100755 --- a/script/vendor-us +++ b/script/vendor-us @@ -14,46 +14,27 @@ # It's also probably a good idea to run `script/ci-build` for good measure require 'rubygems' -require 'fileutils' require 'public_suffix' require 'swot' +require 'yaml' +require 'open-uri' require './lib/gman' require './lib/gman/parser' -TMP_DIR = File.expand_path("../tmp/govt-urls", File.dirname(__FILE__)) -REPO = "https://github.com/GSA-OCSIT/govt-urls" -TXT_FILE = "government-urls-hierarchical-list.txt" -YAML_FILE = "governent-urls.yaml" +SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml" BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"] domain_hash = {} -# set up our working directory -FileUtils.rm_rf TMP_DIR -FileUtils.mkdir_p TMP_DIR -Dir.chdir TMP_DIR - -# Clone down the latest version of the list -system "git clone --depth 1 #{REPO} #{TMP_DIR}" - -# Convert list to public suffix format -domains = File.open(TXT_FILE).read -domains.gsub! /.*_{3,}$/m, "" # strib leading comments -domains.gsub! /^(?! )(.*)$/, "\n//\\1" -domains.gsub! /^ \.\s/, "" -File.open(TXT_FILE, "w") { |file| file.write domains } - -# Parse domains -domains = Gman::Parser.file_to_array(TXT_FILE) -puts "Parsing #{domains.size} domains... normalizing" +domain_hash = YAML.load(open(SOURCE).read) +puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains..." # Normalize ALL THE THINGS -domains.map! { |domain| domain.strip } # Strip trailing slashes -domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes -domains.reject! { |domain| domain.empty? } # Reject empty strings - -# build our hash -domain_hash = Gman::Parser.array_to_hash(domains) -puts "Normalized down to #{domain_hash.size} domains... filtering" +domain_hash.each do |group, domains| + domains.map! { |domain| domain.strip } # Strip trailing slashes + domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes + domains.map! { |domain| domain.downcase } # make lower case + domains.reject! { |domain| domain.empty? } # Reject empty strings +end # filter domain_hash.reject! { |group,domain| BLACKLIST.include?(group) } # Group blacklist @@ -63,7 +44,7 @@ domain_hash.each do |group, domains| domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains end -puts "Filtered down to #{domain_hash.size} domains" +puts "Filtered down to #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains" # Grab existing list current = Gman::Parser.file_to_array( Gman::list_path ) diff --git a/test/test_domains.rb b/test/test_domains.rb index a6aae403..948deac9 100644 --- a/test/test_domains.rb +++ b/test/test_domains.rb @@ -15,7 +15,6 @@ def whitelisted?(domain) should "only contain resolvable domains" do unresolvables = [] Gman.list.each do |entry| - next next if whitelisted? entry.name resolves = Gman::Parser.domain_resolves?(entry.name) unresolvables.push entry.name unless resolves