From 50c44184e1200d739aa677521cb9430422c7a064 Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Wed, 23 Jul 2014 14:11:47 -0400 Subject: [PATCH 1/5] update list and script --- lib/domains.txt | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ script/vendor-us | 41 +++++++++----------------------- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/lib/domains.txt b/lib/domains.txt index 15be91b3..5db082d0 100644 --- a/lib/domains.txt +++ b/lib/domains.txt @@ -31,6 +31,7 @@ loni.org louisiana.gov louisianacda.com louisianaeconomicdevelopment.com +louisianamarinedebris.com louisianaseafood.com louisianataxfree.com lpb.org @@ -55,6 +56,7 @@ trsl.org volunteerlouisiana.gov // NCgov +I-85yadkinriver.com battleshipnc.com berrytownecrafts.com bikesafenc.com @@ -160,6 +162,7 @@ ncnhtf.org ncoah.com nconemap.net ncopenbook.gov +ncpanbranch.com ncparks.gov ncports.com ncpublications.com @@ -440,6 +443,7 @@ kcmo.org clevelandmetroparks.com // usagovAK +USCGAlaska.com afognak.org ahfc.us aidea.org @@ -901,6 +905,7 @@ ci.irwindale.ca.us ci.jackson.ca.us ci.la-verne.ca.us ci.lafayette.ca.us +ci.laguna-hills.ca.us ci.larkspur.ca.us ci.lathrop.ca.us ci.lincoln.ca.us @@ -1055,6 +1060,7 @@ cityoforangecove.com cityoforinda.org cityoforland.com cityoforoville.org +cityofpacifica.org cityofpalmdale.org cityofpalmdesert.org cityofpaloalto.org @@ -1220,6 +1226,7 @@ mccsd.com mcfarlandcity.org menlopark.org modestogov.com +modoccounty.us mojaveindiantribe.com monterey.org montesereno.org @@ -1362,6 +1369,7 @@ brushcolo.com burlingtoncolo.com c3gov.com cedaredgecolorado.com +centennialco.gov centennialcolorado.com chaffeecounty.org cherryhillsvillage.com @@ -1378,6 +1386,7 @@ ci.westminster.co.us ci.wheatridge.co.us cityofblackhawk.org cityofcortez.com +cityofenglewood.org cityoffortmorgan.com cityoflafayette.com cityoflonetree.com @@ -1396,6 +1405,7 @@ coloradonocall.com coloradoriverrecovery.org crgov.com cripplecreekgov.com +deertrailcolorado.org deltacounty.com denvergov.org durangogov.org @@ -1405,6 +1415,7 @@ englewoodgov.org fcgov.com flaglercolorado.com fortlupton.org +fountaincolorado.org fremontco.com friscogov.com fruita.org @@ -1588,6 +1599,7 @@ washington.org wmata.com // usagovDE +DelawareNationalGuard.com cbacfc.org cityofdover.com cityofmilford.com @@ -2070,6 +2082,7 @@ peachtree-city.org pearson-ga.com pembrokega.net pinelakega.com +polkcountygeorgia.us pooler-ga.us putnamcountyga.us rockdalecounty.org @@ -2432,13 +2445,16 @@ ci.rockford.il.us cityhpil.com cityofbatavia.net cityofchicago.org +cityofchicagoheights.org cityofdanville.org cityofdekalb.com cityofeastpeoria.com cityofedwardsville.com cityofelgin.org +cityofevanston.org cityofgalena.org cityofharvard.org +cityofharvey.org cityoflakeforest.com cityoflockport.net cityofmacomb.com @@ -2508,6 +2524,7 @@ hillside-il.org historyillinois.org hoffmanestates.com homerglenil.org +il-bradley.civicplus.com il-vernonhills.civicplus.com illinoisepay.com illinoislottery.com @@ -2521,6 +2538,7 @@ lagrangepark.org lake-villa.org lakebluff.org lakevillatownship.org +lakezurich.org lasallecounty.org lemont.il.us lemonttownship.org @@ -2551,6 +2569,7 @@ mountprospect.org mtcarrollil.org mtvernon.com mtzion.com +mundelein-il.org mundelein.org murphysboro.com naperville.il.us @@ -2594,6 +2613,7 @@ roanokeil.org rochesteril.org rockdale-il.com rockfalls61071.com +rockislandcounty.org rocktonvillage.com romeoville.org roselle.il.us @@ -2639,6 +2659,7 @@ villageofbrookfield.com villageofcrete.org villageoffrankfort.com villageofgilberts.com +villageofglencoe.org villageofglenwood.com villageofgrayslake.com villageofhazelcrest.com @@ -2667,6 +2688,7 @@ vniles.com volz.org warrentownship.net watsekacity.com +watsekacity.org waukegantownship.com waukeganweb.net westchester-il.org @@ -3364,6 +3386,7 @@ townofindianhead.org townoflaplata.org townofprincessanne.com townofsharptown.org +townofsmithsburg.org townofsomerset.com townofstmichaels.com townofsudlersville.org @@ -3387,6 +3410,7 @@ alfredme.us aroostook.me.us arrowsic.org arundelmaine.org +atkinson-me.org berwickmaine.org bethelmaine.org biddefordmaine.org @@ -3805,6 +3829,7 @@ salem-mi.org sanilaccounty.net saugatuckcity.com saulttribe.com +seekingmichigan.org shelbytwp.org shiawassee.net sjcity.com @@ -3854,6 +3879,7 @@ woodhavenmi.org wyandotte.net // usagovMN +MinnesotaNationalGuard.org albanytownship.com alexandriatownship.org ardc.org @@ -4190,6 +4216,7 @@ northmankato.com nwrdc.org oronoco.com oronocotownship.com +ortonville.net oxfordtownship.us parkersprairie.net paynesvillemn.com @@ -4278,6 +4305,7 @@ wyomingmn.org zimmerman.govoffice.com // usagovMO +GoColumbiaMo.com albanymo.net arnoldmo.org ashgrovemo.org @@ -4550,10 +4578,12 @@ ci.ocean-springs.ms.us ci.pass-christian.ms.us ci.quitman.ms.us city.jackson.ms.us +cityofamoryms.com cityofbaldwyn.com cityofbatesvillems.com cityofbaysprings.com cityofboonevillems.com +cityofboonevillems.org cityofbrandon.net cityofcarthage.org cityofclevelandms.com @@ -5314,6 +5344,7 @@ heartofthemidlandscfc.org imperial-ne.com mindennebraska.org ne-ethanol.org +neagrelations.org neatp.org nebraskaartscouncil.org nebraskacorn.org @@ -5521,6 +5552,7 @@ andovertwp.org audubonparknj.org avalonboro.org avon-by-the-sea.com +avonbytheseanj.com barnegat.net barringtonboro.com bassriver-nj.org @@ -6019,6 +6051,7 @@ wyckoff-nj.com // usagovNM ashiwi.org bloomfieldnm.com +catroncounty.us ci.alamogordo.nm.us cityofcarlsbadnm.com cityofclovis.org @@ -6030,7 +6063,9 @@ co.cibola.nm.us co.colfax.nm.us co.eddy.nm.us co.mckinley.nm.us +co.otero.nm.us corrales-nm.org +currycounty.org donaanacounty.org fmtn.org grantcountynm.com @@ -6275,6 +6310,7 @@ accessesmeralda.com bcnv.org carson.org churchillcounty.org +cityoffallon.org cityoffernley.org cityofhenderson.com cityofnorthlasvegas.com @@ -6499,6 +6535,7 @@ hartfordny.com hastingsgov.org henrietta.org herkimercounty.org +heuveltonny.us hewlettharbor.org hillburn.org hillsdaleny.com @@ -6564,6 +6601,7 @@ newyorkfed.org niagaracounty.com niagarafallsusa.org niskayuna.org +norfolkny.us northcastleny.com northelba.org northhempstead.com @@ -6681,6 +6719,7 @@ southbristol.org southbristolny.org srbc.net stamfordny.com +state.ny.us statenislandusa.com steubencony.org stillwaterny.org @@ -7597,6 +7636,7 @@ durham-oregon.us echo-oregon.com elkton-oregon.com fallscity.org +fallscityoregon.gov gduway.org getcollegefunds.org hermiston.or.us @@ -7953,6 +7993,7 @@ oakdaleborough.com oakmontborough.com ohara.pa.us ohiotwp.org +pa.wildlifelicense.com pacast.com pacouncilonthearts.org pacourts.us @@ -7961,6 +8002,7 @@ paehealth.com palmertonborough.com palmertwp.com paradisetownship.com +parkercity.org parkesburg.org parksideboro.com patientsafetyauthority.org @@ -7998,6 +8040,7 @@ quakertownboro.com radnor.com rankinborough.com raphotownship.com +readypa.org redlionpa.org reservetwp.com ricetwp.us @@ -8137,6 +8180,7 @@ whitpaintownship.net wilkinstownship.com williamstwp.org willistown.pa.us +wilmerdingBoro.com wilmerdingboro.com wilsonborough.org windsortwp.com @@ -8465,6 +8509,7 @@ salemsd.com sddot.com sdhda.org sdonecall.com +siouxempirecfc.org siouxfalls.org state.sd.us sullycounty.net @@ -9025,6 +9070,7 @@ rowlett.com roysecity.com sanangelocfc.org sanangelotexas.us +sanjacintoriverauthority.com sansabatexas.com schertz.com seagoville.us @@ -9135,6 +9181,7 @@ helpercity.net heneferutah.org herriman.org highlandcity.org +honeyvillecity.com huntsvilletown.com hyrumcity.org intermountaincfc.org @@ -9225,6 +9272,7 @@ woodscross.com wt.govoffice.com // usagovVA +211.getcare.com 211virginia.org albemarle.org alleghanycounty.us @@ -9296,6 +9344,7 @@ farmvilleva.com fcva.us floydcova.org fluvannacounty.org +franklincountyva.gov franklincountyva.org franklinva.com frontroyalva.com @@ -9304,6 +9353,7 @@ gilescounty.org gloucesterva.info govce.net gunstonhall.org +henrico.us honakerva.com hrpdc.org independenceva.com @@ -9315,6 +9365,7 @@ lancova.com lawrencevilleweb.com leesburgva.org lenowisco.org +lexingtonva.gov louisacounty.com louisatown.org manassascity.org @@ -9333,7 +9384,9 @@ onancock.com parksley.org pearisburg.org petersburg-va.org +petersburgva.gov pittgov.org +portofvirginia.com potomaccfc.org princegeorgeva.org pulaskicounty.org @@ -9424,6 +9477,7 @@ winusvilottery.com // usagovVT 1-800-vermont.com +VermontVacation.com barnetvt.org barrecity.org barretown.org @@ -9501,6 +9555,7 @@ newportvermont.org northernnewenglandcfc.org northherovt.com norwich.vt.us +onioncity.com pantonvt.us peacham.net pittsfieldvt.org @@ -9587,6 +9642,7 @@ bucoda.us cawh.org centralwashingtoncfc.us cfcgive.org +cfcisland.org cfcnps.org ci.bainbridge-isl.wa.us ci.bellevue.wa.us @@ -9755,6 +9811,7 @@ gorgecommission.org grandview.wa.us harringtonbiz.com havemilk.com +innovatewashington.org inwcfc.org islandcounty.net kettle-falls.com @@ -9857,6 +9914,7 @@ wssb.org yakimacounty.us // usagovWI +BrightonWI.com addisonwi.org adellwi.govoffice2.com algomacity.org @@ -9968,6 +10026,7 @@ co.brown.wi.us co.calumet.wi.us co.chippewa.wi.us co.clark.wi.us +co.columbia.wi.us co.dodge.wi.us co.eau-claire.wi.us co.green-lake.wi.us @@ -10045,6 +10104,7 @@ iowacounty.org jeffersonwis.com johnsoncreek-wi.us kenosha.org +kewaskumsausage.com kewauneeco.org kildaretownship.com kohlervillage.org @@ -10353,6 +10413,7 @@ ci.gillette.wy.us ci.laramie.wy.us city-of-torrington.org city-sheridan-wy.com +cityofdouglaswy.com cityofgreenriver.org cityoflaramie.com cityofworland.org diff --git a/script/vendor-us b/script/vendor-us index eb81fa8c..d0f11b01 100755 --- a/script/vendor-us +++ b/script/vendor-us @@ -14,46 +14,27 @@ # It's also probably a good idea to run `script/ci-build` for good measure require 'rubygems' -require 'fileutils' require 'public_suffix' require 'swot' +require 'yaml' +require 'open-uri' require './lib/gman' require './lib/gman/parser' -TMP_DIR = File.expand_path("../tmp/govt-urls", File.dirname(__FILE__)) -REPO = "https://github.com/GSA-OCSIT/govt-urls" +SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml" TXT_FILE = "government-urls-hierarchical-list.txt" -YAML_FILE = "governent-urls.yaml" BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"] domain_hash = {} -# set up our working directory -FileUtils.rm_rf TMP_DIR -FileUtils.mkdir_p TMP_DIR -Dir.chdir TMP_DIR - -# Clone down the latest version of the list -system "git clone --depth 1 #{REPO} #{TMP_DIR}" - -# Convert list to public suffix format -domains = File.open(TXT_FILE).read -domains.gsub! /.*_{3,}$/m, "" # strib leading comments -domains.gsub! /^(?! )(.*)$/, "\n//\\1" -domains.gsub! /^ \.\s/, "" -File.open(TXT_FILE, "w") { |file| file.write domains } - -# Parse domains -domains = Gman::Parser.file_to_array(TXT_FILE) -puts "Parsing #{domains.size} domains... normalizing" +domain_hash = YAML.load(open(SOURCE).read) +puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains..." # Normalize ALL THE THINGS -domains.map! { |domain| domain.strip } # Strip trailing slashes -domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes -domains.reject! { |domain| domain.empty? } # Reject empty strings - -# build our hash -domain_hash = Gman::Parser.array_to_hash(domains) -puts "Normalized down to #{domain_hash.size} domains... filtering" +domain_hash.each do |group, domains| + domains.map! { |domain| domain.strip } # Strip trailing slashes + domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes + domains.reject! { |domain| domain.empty? } # Reject empty strings +end # filter domain_hash.reject! { |group,domain| BLACKLIST.include?(group) } # Group blacklist @@ -63,7 +44,7 @@ domain_hash.each do |group, domains| domains.select! { |domain| PublicSuffix.valid?(domain) } # Validate domain domains.reject! { |domain| Swot::is_academic?(domain) } # Reject academic domains end -puts "Filtered down to #{domain_hash.size} domains" +puts "Filtered down to #{domain_hash.map { |group,domains| domains.count }.inject(:+)} domains" # Grab existing list current = Gman::Parser.file_to_array( Gman::list_path ) From 716e326c8d58104d311d9ceb6979ab0a3cfdb09f Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Wed, 23 Jul 2014 14:54:19 -0400 Subject: [PATCH 2/5] remove unresolveable domains --- lib/domains.txt | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/lib/domains.txt b/lib/domains.txt index 5db082d0..511a41a8 100644 --- a/lib/domains.txt +++ b/lib/domains.txt @@ -31,7 +31,6 @@ loni.org louisiana.gov louisianacda.com louisianaeconomicdevelopment.com -louisianamarinedebris.com louisianaseafood.com louisianataxfree.com lpb.org @@ -1226,7 +1225,6 @@ mccsd.com mcfarlandcity.org menlopark.org modestogov.com -modoccounty.us mojaveindiantribe.com monterey.org montesereno.org @@ -1405,7 +1403,6 @@ coloradonocall.com coloradoriverrecovery.org crgov.com cripplecreekgov.com -deertrailcolorado.org deltacounty.com denvergov.org durangogov.org @@ -2082,7 +2079,6 @@ peachtree-city.org pearson-ga.com pembrokega.net pinelakega.com -polkcountygeorgia.us pooler-ga.us putnamcountyga.us rockdalecounty.org @@ -2555,7 +2551,6 @@ masoncountyil.org maywood-il.org mchenrytownship.com melrosepark.org -millstadt.org minooka.com mocoil.org mokena.org @@ -3410,7 +3405,6 @@ alfredme.us aroostook.me.us arrowsic.org arundelmaine.org -atkinson-me.org berwickmaine.org bethelmaine.org biddefordmaine.org @@ -3612,7 +3606,6 @@ brightontwp.com brookstownship.org brownstown-mi.org brucetwp.org -butmantownship.com byron.org cadillac-mi.net cannontwp.org @@ -3842,7 +3835,6 @@ springfield-twp.us springlakevillage.org state.mi.us stclaircounty.org -stclairshores.net stcmi.com stephenson-mi.com stjosephcountymi.org @@ -4216,7 +4208,6 @@ northmankato.com nwrdc.org oronoco.com oronocotownship.com -ortonville.net oxfordtownship.us parkersprairie.net paynesvillemn.com @@ -4388,7 +4379,6 @@ cityofstjohn.org cityofversailles.org cityofwildwood.com cityofwillard.org -clarencemo.com clarksonvalley.org clarksvillemo.us claycogov.com @@ -4618,7 +4608,6 @@ greatermscfc.org greenvillems.org greenwoodms.com gulfofmexicoalliance.org -guntownms.com hattiesburgms.com hornlake.org iukams.com @@ -5344,7 +5333,6 @@ heartofthemidlandscfc.org imperial-ne.com mindennebraska.org ne-ethanol.org -neagrelations.org neatp.org nebraskaartscouncil.org nebraskacorn.org @@ -6126,7 +6114,6 @@ chickaloon.org chilkatindianvillage.org choctaw.org choctawnation.com -choctawnationflorida.org chukchansi.net cied.org citci.org @@ -6310,7 +6297,6 @@ accessesmeralda.com bcnv.org carson.org churchillcounty.org -cityoffallon.org cityoffernley.org cityofhenderson.com cityofnorthlasvegas.com @@ -6535,7 +6521,6 @@ hartfordny.com hastingsgov.org henrietta.org herkimercounty.org -heuveltonny.us hewlettharbor.org hillburn.org hillsdaleny.com @@ -6601,7 +6586,6 @@ newyorkfed.org niagaracounty.com niagarafallsusa.org niskayuna.org -norfolkny.us northcastleny.com northelba.org northhempstead.com @@ -6719,7 +6703,6 @@ southbristol.org southbristolny.org srbc.net stamfordny.com -state.ny.us statenislandusa.com steubencony.org stillwaterny.org @@ -7993,7 +7976,6 @@ oakdaleborough.com oakmontborough.com ohara.pa.us ohiotwp.org -pa.wildlifelicense.com pacast.com pacouncilonthearts.org pacourts.us @@ -8002,7 +7984,6 @@ paehealth.com palmertonborough.com palmertwp.com paradisetownship.com -parkercity.org parkesburg.org parksideboro.com patientsafetyauthority.org @@ -8509,7 +8490,6 @@ salemsd.com sddot.com sdhda.org sdonecall.com -siouxempirecfc.org siouxfalls.org state.sd.us sullycounty.net @@ -9555,7 +9535,6 @@ newportvermont.org northernnewenglandcfc.org northherovt.com norwich.vt.us -onioncity.com pantonvt.us peacham.net pittsfieldvt.org @@ -9642,7 +9621,6 @@ bucoda.us cawh.org centralwashingtoncfc.us cfcgive.org -cfcisland.org cfcnps.org ci.bainbridge-isl.wa.us ci.bellevue.wa.us @@ -10104,7 +10082,6 @@ iowacounty.org jeffersonwis.com johnsoncreek-wi.us kenosha.org -kewaskumsausage.com kewauneeco.org kildaretownship.com kohlervillage.org @@ -10413,7 +10390,6 @@ ci.gillette.wy.us ci.laramie.wy.us city-of-torrington.org city-sheridan-wy.com -cityofdouglaswy.com cityofgreenriver.org cityoflaramie.com cityofworland.org From f8ace7bd82735faf10e21fe6886036885e18728b Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Wed, 23 Jul 2014 15:21:15 -0400 Subject: [PATCH 3/5] de-dupe --- lib/domains.txt | 220 +----------------------------------------------- 1 file changed, 2 insertions(+), 218 deletions(-) diff --git a/lib/domains.txt b/lib/domains.txt index 511a41a8..578be508 100644 --- a/lib/domains.txt +++ b/lib/domains.txt @@ -55,7 +55,6 @@ trsl.org volunteerlouisiana.gov // NCgov -I-85yadkinriver.com battleshipnc.com berrytownecrafts.com bikesafenc.com @@ -434,15 +433,11 @@ mil si.edu // US State, County, Local -bouldercounty.org sfmta.org sfcta.org -borough.kenai.ak.us -kcmo.org clevelandmetroparks.com // usagovAK -USCGAlaska.com afognak.org ahfc.us aidea.org @@ -1596,7 +1591,6 @@ washington.org wmata.com // usagovDE -DelawareNationalGuard.com cbacfc.org cityofdover.com cityofmilford.com @@ -2290,7 +2284,6 @@ stormlake.org tamacity.govoffice2.com tamacounty.org tiptoniowa.org -tobaccofreeqc.org traveliowa.com underwoodia.com university-heights.org @@ -2629,7 +2622,6 @@ swanseail.org taylorville.net tazewell.com tinleypark.org -tobaccofreeqc.org toi.org transitchicago.com tricountyrpc.org @@ -2713,7 +2705,6 @@ centertownshiptrustee.com cfcindiana.org cfclouisville.org chestertonin.org -chicagocfc.net churubusco.net ci.valparaiso.in.us cityofanderson.com @@ -3356,7 +3347,6 @@ queenstown-md.com ridgelymd.org risingsunmd.org riverdaleparkmd.info -rockislandcounty.org salisburyfd.com sharpsburgmd.com snowhillmd.com @@ -3395,7 +3385,6 @@ washingtongrovemd.org westernmarylandcfc.org westgov.com wicomicocounty.org -wmata.com woodsboro.org // usagovME @@ -3871,7 +3860,6 @@ woodhavenmi.org wyandotte.net // usagovMN -MinnesotaNationalGuard.org albanytownship.com alexandriatownship.org ardc.org @@ -4296,7 +4284,6 @@ wyomingmn.org zimmerman.govoffice.com // usagovMO -GoColumbiaMo.com albanymo.net arnoldmo.org ashgrovemo.org @@ -7635,7 +7622,6 @@ lincolncity.org malheurco.org morrowcountyoregon.com multco.us -mwtown.org northbendcity.org northplains.org nwsds.org @@ -8161,7 +8147,6 @@ whitpaintownship.net wilkinstownship.com williamstwp.org willistown.pa.us -wilmerdingBoro.com wilmerdingboro.com wilsonborough.org windsortwp.com @@ -9457,7 +9442,6 @@ winusvilottery.com // usagovVT 1-800-vermont.com -VermontVacation.com barnetvt.org barrecity.org barretown.org @@ -9892,7 +9876,6 @@ wssb.org yakimacounty.us // usagovWI -BrightonWI.com addisonwi.org adellwi.govoffice2.com algomacity.org @@ -10357,7 +10340,6 @@ stalbanswv.com state.wv.us summersvillewv.org upshurcounty.org -westernmarylandcfc.org woodcountywv.com wvagriculture.org wvcommerce.org @@ -10375,242 +10357,44 @@ wvtourism.com wvwda.org // usagovWY -angolamuseum.org -atchafalaya.org -battleshipnc.com -berrytownecrafts.com -bikesafenc.com -bmcnc.org carbonwy.com -caswellcenter.org ccgov.net -cfnc.org cheyennecity.org ci.gillette.wy.us ci.laramie.wy.us city-of-torrington.org city-sheridan-wy.com +cityofdouglaswy.com cityofgreenriver.org cityoflaramie.com cityofworland.org co.laramie.wy.us -codofil.org coloradoriverrecovery.org conversecounty.org -correctionenterprises.com -crawfish.org -driving95.com -eatsmartmovemorenc.com -ebrso.org -elearningnc.gov -emspic.org -encsd.net evanstonwy.org -everywomannc.com fremontcountywy.org -gottobenc.com -gottobencfestival.com -groupbenefits.org -healthycarolinians.org hscounty.com -i-85yadkinriver.com -inclusivehealth.org intermountaincfc.org -jennettespier.net -jfkadatc.net -jirdc.org johnsoncountywyoming.org kemmerer.org -la-kidmed.com -la.ngb.army.mil -labenfa.com -labp.com -labswe.org -lachiefs.org -lachiropracticboard.com -lacisd.org -lacourtreporterboard.com -lacpra.org -laddc.org -laeggs.com -lalb.org -lapels.com -laptboard.org laramiecounty.com -lasc.org -laspc.com -lastbdarchs.com -laworks.net -lbedn.org -lbespa.org -lcltfb.org lcwy.org -lma.org -loni.org -louisiana.gov -louisianacda.com -louisianaeconomicdevelopment.com -louisianaseafood.com -louisianataxfree.com -lpb.org -lpgov.org -lrcboard.org -lsba.org -lsbd.org -lsbep.org -lsbes.org -lsbid.org -lsbmt.org -lsbpie.com -lsbpne.com -lsbvm.org -lsbwdd.org -lsli.org -lsp.org lymanwy.com -mattamuskeetlodge.com medicinebow.org -mountainfair.org -murdochcenter.org -museumofthealbemarle.com -myeatsmartmovemore.com -naturalsciences.org -nc-ddc.org -nc-educationlottery.org -nc-sco.com -nc.gov -nc.ngb.army.mil -ncabc.com -ncadfp.org -ncagfairs.org -ncagr.gov -ncair.org -ncapt.tv -ncaquariums.com -ncartmuseum.org -ncatlasrevisited.org -ncatp.org -ncauditor.net -ncbrownfields.org -ncbytrain.org -nccancer.com -nccivilwar150.com -nccoastalmanagement.net -nccoastalreserve.net -nccob.org -nccommerce.com -nccourts.org -nccrimecontrol.org -ncdcr.gov -ncdenr.gov -ncdenr.org -ncdhhs.gov -ncdiabetes.org -ncdjjdp.org -ncdmf.net -ncdmfstore.net -ncdnpe.org -ncdoi.com -ncdoj.gov -ncdot.gov -ncdrought.org -ncdsca.org -ncecho.org -ncesc.com -ncesf.org -ncfacilitymanagement.net -ncfarmfresh.com -ncfarmtoschool.com -ncfhp.org -ncfilm.com -ncfirewise.org -ncfisheries.net -ncfloodmaps.com -ncforeclosurehelp.org -ncforestassessment.com -ncfreshlink.com -ncfriendsofagriculture.org -nchan.org -nchealthyschools.org -nchealthystart.org -nchfa.com -nchistoricsites.org -nchistoryday.org -ncicu.org -ncknows.org -nclabor.com -ncleg.net -nclifetimeincome.org -ncmaritimemuseums.com -ncmarkers.com -ncmedicaidpbm.com -ncminorityhealth.org -ncmotorfleet.com -ncnewbornhearing.org -ncnhp.org -ncnhtf.org -ncoah.com -nconemap.net -ncopenbook.gov -ncparks.gov -ncports.com -ncpublications.com -ncpublichealth.com -ncpublicschools.org -ncradiation.net -ncradon.org -ncrecovery.gov -ncreportcards.org -ncsd.net -ncsicklecellprogram.org -ncstatefair.org -ncstatesurplus.com -ncstormwater.org -ncstrokeregistry.com -ncsymphony.org -nctreasurer.com -ncveterans.com -ncwater.org -ncwaterquality.org -ncwelldriller.org -ncwildlife.org -nczoo.org newcastlewyoming.org -newhirereporting.com -nutritionnc.com -onencnaturally.org parkcounty.us -portgbr.com -qaronline.org rawlins-wyoming.com -readync.org -roanokeisland.com rswy.net -safesurrender.net saratoga.govoffice2.com -savewaternc.org -sehsr.org sheridancounty.com -shpnc.org -startwithyourheart.com -state.la.us -state.lib.la.us -state.nc.us state.wy.us sublettewyo.com sweet.wy.us tetonwyo.org thayne-wy.com townofjackson.com -trsl.org -tryonpalace.org uintacounty.com -visitnc.com -volunteerlouisiana.gov -volunteernc.org -waywelivednc.com -webgate.co.laramie.wy.us -wrightschool.org +uwyo.edu/CES wrightwyoming.com wyomingbusiness.org wyomingtourism.org \ No newline at end of file From 5bc7dbb4bf0f56a1a851e8616f75ebe8e65e1d95 Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Wed, 23 Jul 2014 15:21:36 -0400 Subject: [PATCH 4/5] actually check domain validity --- test/test_domains.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_domains.rb b/test/test_domains.rb index a6aae403..948deac9 100644 --- a/test/test_domains.rb +++ b/test/test_domains.rb @@ -15,7 +15,6 @@ def whitelisted?(domain) should "only contain resolvable domains" do unresolvables = [] Gman.list.each do |entry| - next next if whitelisted? entry.name resolves = Gman::Parser.domain_resolves?(entry.name) unresolvables.push entry.name unless resolves From d50159fb4d0311c3baf307562c5f014cd0e7c82f Mon Sep 17 00:00:00 2001 From: Ben Balter Date: Wed, 23 Jul 2014 15:21:42 -0400 Subject: [PATCH 5/5] downcase --- script/dedupe | 31 +++++++++++++++++++++++++++++++ script/vendor-us | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100755 script/dedupe diff --git a/script/dedupe b/script/dedupe new file mode 100755 index 00000000..5b23fd1d --- /dev/null +++ b/script/dedupe @@ -0,0 +1,31 @@ +#! /usr/bin/env ruby + +require 'yaml' +require 'open-uri' +require './lib/gman' +require './lib/gman/parser' + + +current = Gman::Parser.file_to_array( Gman::list_path ) +domain_hash = Gman::Parser.array_to_hash(current) +domain_list = domain_hash.flat_map { |k,v| v } +puts "Current list contains #{domain_list.count} domains..." + +SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml" +source_hash = YAML.load(open(SOURCE).read) +source_list = source_hash.flat_map { |k,v| v } + +dupes = [] +domain_hash.each do |group,domains| + domains.each do |domain| + if domain_list.count(domain) > 1 && source_list.count(domain) <= 1 + dupes.push(domain) + end + end +end + +dupes.uniq! + +puts "Found #{dupes.count} dupes!" + +puts dupes.inspect diff --git a/script/vendor-us b/script/vendor-us index d0f11b01..978378b3 100755 --- a/script/vendor-us +++ b/script/vendor-us @@ -22,7 +22,6 @@ require './lib/gman' require './lib/gman/parser' SOURCE = "https://raw.githubusercontent.com/GSA/govt-urls/master/government-urls.yaml" -TXT_FILE = "government-urls-hierarchical-list.txt" BLACKLIST = ["usagovQUASI", "usagovFED", "usagovPW"] domain_hash = {} @@ -33,6 +32,7 @@ puts "found #{domain_hash.map { |group,domains| domains.count }.inject(:+)} doma domain_hash.each do |group, domains| domains.map! { |domain| domain.strip } # Strip trailing slashes domains.map! { |domain| domain.gsub /\/$/, "" } # Strip trailing slashes + domains.map! { |domain| domain.downcase } # make lower case domains.reject! { |domain| domain.empty? } # Reject empty strings end