diff --git a/DESCRIPTION b/DESCRIPTION index 26fade7..9f534e4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,7 @@ Description: Provides easier interaction with Users can upload data to Socrata portals directly from R. Version: 1.8.0-6 -Date: 2018-10-28 +Date: 2019-01-05 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc Maintainer: "Tom Schenk Jr." Depends: diff --git a/R/RSocrata.R b/R/RSocrata.R index b1b88d9..a37c250 100644 --- a/R/RSocrata.R +++ b/R/RSocrata.R @@ -461,14 +461,14 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email, #' Exports CSVs from Socrata data portals #' -#' Input the URL of a data portal (e.g., "data.cityofchicago.org") and -#' will download all CSV files (no other files supported) and saved in -#' a single directory named after the root URL (e.g., "data.cityofchicago.org/"). -#' Downloaded files are compressed to GZip format and timestamped so the download -#' time is cataloged. The site's data.json file is downloaded as a canonical index -#' of data saved from the website. Users can cross-reference the data.json file -#' by matching the "four-by-four" in data.json with the first 5 letters of GZipped -#' files. +#' Input the base URL of a data portal (e.g., "data.cityofchicago.org") and it +#' will download CSVs, PDFs, Word, Excel, and PowerPoint files contained on +#' the respective data portal into a single directory named after the root URL. +#' Downloaded CSV files are compressed to GZip format and each file timestamped +#' so the download time is cataloged. The site's data.json file is downloaded +#' as a canonical index of data saved from the website. Users can cross-reference +#' the data.json file by matching the "four-by-four" in data.json with the first +#' 5 letters of downloaded files. 
#' @param url - the base URL of a domain (e.g., "data.cityofchicago.org") #' @param app_token - a string; SODA API token used to query the data #' portal \url{http://dev.socrata.com/consumers/getting-started.html} @@ -478,7 +478,7 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email, #' @importFrom jsonlite write_json #' @importFrom utils write.csv #' @export -export.socrata <- function(url, app_token = NULL) { +export.socrata <- function(url, path = getwd(), app_token = NULL) { dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded @@ -497,10 +497,11 @@ export.socrata <- function(url, app_token = NULL) { # Download data downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element - if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages) + mediaType <- ls$distribution[[i]]$mediaType + if(is.null(downloadUrl)) { # Skips if there is no data or file next - } else if(grepl(".csv", downloadUrl)) { # Downloads if it's a CSV - d <- read.socrata(downloadUrl, app_token) + } else if(mediaType[1] == "text/csv") { # Downloads if it's a CSV + d <- RSocrata::read.socrata(downloadUrl, app_token) # Construct the filename output default_format <- "csv" @@ -512,8 +513,10 @@ export.socrata <- function(url, app_token = NULL) { # Write file write.csv(d, file = gzfile(filename)) # Writes g-zipped file + } else if(mediaType == "text/html") { # Skips file if it's an HTML page + next } else { - response <- GET(downloadUrl) # Downloads non-CSVs + response <- httr::GET(downloadUrl) # Downloads non-CSVs (e.g. PDF, Word, etc.) # Construct the filename output if(is.null(response$headers$`content-disposition`)) {