Commit

Merge pull request #89 from ropensci/update-docs
Update docs
jrdnbradford authored Sep 2, 2024
2 parents ffe1f73 + 607ea03 commit a85f7a5
Showing 64 changed files with 143 additions and 185 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -2,7 +2,7 @@ Package: robotstxt
Date: 2024-08-25
Type: Package
Title: A 'robots.txt' Parser and 'Webbot'/'Spider'/'Crawler' Permissions Checker
-Version: 0.7.15
+Version: 0.7.15.9000
Authors@R: c(
person(
"Pedro", "Baltazar", role = c("ctb"),
@@ -47,6 +47,6 @@ Suggests:
Depends:
R (>= 3.0.0)
VignetteBuilder: knitr
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
Encoding: UTF-8
Config/testthat/edition: 3
6 changes: 6 additions & 0 deletions NEWS.md
@@ -1,6 +1,12 @@
NEWS robotstxt
==========================================================================

+0.7.15.9000
+--------------------------------------------------------------------------
+
+- `null_to_default` typo fixed
+- Updates to function documentation
+
0.7.15 | 2024-08-24
--------------------------------------------------------------------------

6 changes: 2 additions & 4 deletions R/as_list.R
@@ -1,6 +1,4 @@
-
-
-#' Method as.list() for class robotstxt_text
+#' Convert robotstxt_text to list
#'
#' @param x class robotstxt_text object to be transformed into list
#' @param ... further arguments (inherited from \code{base::as.list()})
@@ -17,4 +15,4 @@ as.list.robotstxt_text <-
res$request <- attr(x, "request")

res
-}
+}
4 changes: 2 additions & 2 deletions R/fix_url.R
@@ -1,4 +1,4 @@
-#' fix_url
+#' Add http protocal if missing from URL
#'
#'
#' @param url a character string containing a single URL
@@ -10,4 +10,4 @@ fix_url <-
url <- paste0("http://", url)
}
url
-}
+}
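
For orientation, a minimal sketch of the retitled helper. The guard condition is assumed, since the hunk above only shows the paste0() branch that prepends the protocol:

    # hypothetical reconstruction of the internal helper: prepend "http://"
    # when the URL does not already carry a protocol (guard assumed)
    fix_url <- function(url) {
      if (!grepl("://", url, fixed = TRUE)) {
        url <- paste0("http://", url)
      }
      url
    }

    fix_url("example.com")          # "http://example.com"
    fix_url("https://example.com")  # unchanged
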
2 changes: 1 addition & 1 deletion R/get_robotstxt.R
@@ -1,4 +1,4 @@
-#' downloading robots.txt file
+#' Download a robots.txt file
#'
#' @param domain domain from which to download robots.txt file
#' @param warn warn about being unable to download domain/robots.txt because of
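
The retitled downloader is the exported entry point for fetching a robots.txt file. A minimal call (requires network access; the domain is illustrative):

    library(robotstxt)
    rtxt <- get_robotstxt(domain = "wikipedia.org")  # fetches wikipedia.org/robots.txt
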
7 changes: 3 additions & 4 deletions R/get_robotstxt_http_get.R
@@ -1,17 +1,16 @@
-
-#' storage for http request response objects
+#' Storage for http request response objects
#'
#' @rdname get_robotstxt_http_get
#'
#' @export
rt_last_http <- new.env()
rt_last_http$request <- list()

-#' get_robotstxt() worker function to execute HTTP request
+#' Execute HTTP request for get_robotstxt()
#'
#' @param ssl_verifypeer either 1 (default) or 0, if 0 it disables SSL peer verification, which
#' might help with robots.txt file retrieval
-#' @param domain the domain to get tobots.txt. file for
+#' @param domain the domain to get robots.txt. file for
#' @param user_agent the user agent to use for HTTP request header
#'
#' @export
3 changes: 1 addition & 2 deletions R/get_robotstxts.R
@@ -1,5 +1,4 @@
-
-#' function to get multiple robotstxt files
+#' Download multiple robotstxt files
#'
#' @inheritParams get_robotstxt
#' @param use_futures Should future::future_lapply be used for possible
4 changes: 2 additions & 2 deletions R/guess_domain.R
@@ -1,4 +1,4 @@
-#' function guessing domain from path
+#' Guess a domain from path
#' @param x path aka URL from which to infer domain
guess_domain <- function(x){

@@ -23,4 +23,4 @@ guess_domain <- function(x){
return(domain)
}

-}
+}
2 changes: 1 addition & 1 deletion R/http_domain_changed.R
@@ -1,4 +1,4 @@
-#' http_domain_changed
+#' Check if HTTP domain changed
#'
#' @param response an httr response object, e.g. from a call to httr::GET()
#'
4 changes: 2 additions & 2 deletions R/http_subdomain_changed.R
@@ -1,8 +1,8 @@
-#' http_subdomain_changed
+#' Check if HTTP subdomain changed
#'
#' @param response an httr response object, e.g. from a call to httr::GET()
#'
-#' @return logical of length 1 indicating whether or not any domain change
+#' @return logical of length 1 indicating whether or not any subdomain change
#' happened during the HTTP request
#'
#'
2 changes: 1 addition & 1 deletion R/http_was_redirected.R
@@ -1,4 +1,4 @@
-#' http_was_redirected
+#' Check if HTTP redirect occurred
#'
#' @param response an httr response object, e.g. from a call to httr::GET()
#'
6 changes: 2 additions & 4 deletions R/is_suspect_robotstxt.R
@@ -1,6 +1,6 @@
-#' is_suspect_robotstxt
+#' Check if file is valid / parsable robots.txt file
#'
-#' function that checks if file is valid / parsable robots.txt file
+#' Function that checks if file is valid / parsable robots.txt file
#'
#' @param text content of a robots.txt file provides as character vector
#'
@@ -26,5 +26,3 @@ is_suspect_robotstxt <- function(text){
# return default
return(FALSE)
}
-
-
6 changes: 2 additions & 4 deletions R/is_valid_robotstxt.R
@@ -1,6 +1,6 @@
-#' function that checks if file is valid / parsable robots.txt file
+#' Validate if a file is valid / parsable robots.txt file
#'
-#' @param text content of a robots.txt file provides as character vector
+#' @param text content of a robots.txt file provided as character vector
#' @param check_strickt_ascii whether or not to check if content does adhere to the specification of RFC to use plain text aka ASCII
#'
#' @export
@@ -39,5 +39,3 @@ is_valid_robotstxt <- function(text, check_strickt_ascii = FALSE){

)
}
-
-
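
A quick check against in-memory content (the sample text is illustrative; the check_strickt_ascii spelling follows the signature shown above):

    library(robotstxt)
    txt <- "User-agent: *\nDisallow: /private/"
    is_valid_robotstxt(txt)                              # TRUE for parsable content
    is_valid_robotstxt(txt, check_strickt_ascii = TRUE)  # also enforce the plain-text/ASCII rule
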
6 changes: 3 additions & 3 deletions R/null_to_default.R
@@ -1,13 +1,13 @@
-#' null_to_defeault
+#' Return default value if NULL
#'
#' @param x value to check and return
#' @param d value to return in case x is NULL
#'
-null_to_defeault <-
+null_to_default <-
function(x, d){
if ( is.null(x) ){
d
}else{
x
}
-}
+}
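
The renamed helper is fully visible in the hunk above; in use it simply falls back to a default when its input is NULL:

    # usage of the internal helper shown above
    null_to_default(NULL, d = "fallback")  # returns "fallback"
    null_to_default(42, d = "fallback")    # returns 42
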
23 changes: 2 additions & 21 deletions R/parse_robotstxt.R
@@ -1,9 +1,8 @@
-#' function parsing robots.txt
-#' @param txt content of the robots.txt file
+#' Parse a robots.txt file
+#' @param txt content of the robots.txt file
#' @return a named list with useragents, comments, permissions, sitemap
#' @export
parse_robotstxt <- function(txt){
-# return
res <-
list(
useragents = rt_get_useragent(txt),
@@ -21,21 +20,3 @@ parse_robotstxt <- function(txt){
)
return(res)
}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
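
As documented above, the exported parser returns a named list with useragents, comments, permissions, and sitemap entries. A small self-contained example (file content is illustrative):

    library(robotstxt)
    txt <- paste(
      "# example robots.txt",
      "User-agent: *",
      "Disallow: /private/",
      "Sitemap: https://example.com/sitemap.xml",
      sep = "\n"
    )
    res <- parse_robotstxt(txt)
    res$useragents   # "*"
    res$sitemap      # the sitemap entry
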
6 changes: 1 addition & 5 deletions R/parse_url.R
@@ -1,13 +1,9 @@
-
-
-
-#' parse_url
+#' Parse a URL
#'
#' @param url url to parse into its components
#'
#' @return data.frame with columns protocol, domain, path
#'
-#'
#' @keywords internal
#'
#' @examples
4 changes: 2 additions & 2 deletions R/paths_allowed.R
@@ -1,12 +1,12 @@
-#' check if a bot has permissions to access page(s)
+#' Check if a bot has permissions to access page(s)
#'
#'
#' @param domain Domain for which paths should be checked. Defaults to "auto".
#' If set to "auto" function will try to guess the domain by parsing the paths
#' argument. Note however, that these are educated guesses which might utterly
#' fail. To be on the safe side, provide appropriate domains manually.
#' @param bot name of the bot, defaults to "*"
-#' @param paths paths for which to check bot's permission, defaults to "/". Please, note that path to a folder should end with a trailing slash ("/").
+#' @param paths paths for which to check bot's permission, defaults to "/". Please note that path to a folder should end with a trailing slash ("/").
#' @param check_method at the moment only kept for backward compatibility reasons - do not use parameter anymore --> will let the function simply use the default
#' @param robotstxt_list either NULL -- the default -- or a list of character
#' vectors with one vector per path to check
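
A usage sketch of the exported checker (requires network access; domain and paths are illustrative):

    library(robotstxt)
    paths_allowed(
      paths  = c("/", "/images/"),  # folder paths end with a trailing slash
      domain = "wikipedia.org",
      bot    = "*"
    )
    # returns one logical per path
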
4 changes: 1 addition & 3 deletions R/paths_allowed_worker_spiderbar.R
@@ -1,6 +1,4 @@
-
-
-#' paths_allowed_worker spiderbar flavor
+#' Check if a spiderbar bot has permissions to access page(s)
#'
#' @inheritParams paths_allowed
#'
3 changes: 1 addition & 2 deletions R/print_robotstxt.R
@@ -1,4 +1,4 @@
-#' printing robotstxt
+#' Print robotstxt
#' @param x robotstxt instance to be printed
#' @param ... goes down the sink
#' @export
@@ -29,4 +29,3 @@ print.robotstxt <- function(x, ...){
invisible(x)
}

-
7 changes: 3 additions & 4 deletions R/print_robotstxt_text.R
@@ -1,11 +1,10 @@
-
-#' printing robotstxt_text
+#' Print robotstxt's text
#' @param x character vector aka robotstxt$text to be printed
#' @param ... goes down the sink
#' @export
print.robotstxt_text <- function(x, ...){

-# rpint part of the robots.txt file
+# print part of the robots.txt file
cat("[robots.txt]\n--------------------------------------\n\n")
tmp <- unlist(strsplit(x, "\n"))
cat(tmp[seq_len(min(length(tmp), 50))], sep ="\n")
@@ -29,4 +28,4 @@ print.robotstxt_text <- function(x, ...){

# return
invisible(x)
-}
+}
4 changes: 1 addition & 3 deletions R/remove_domain.R
@@ -1,4 +1,4 @@
-#' function to remove domain from path
+#' Remove domain from path
#' @param x path aka URL from which to first infer domain and then remove it
remove_domain <- function(x){
unlist(lapply(
@@ -12,5 +12,3 @@ remove_domain <- function(x){
}
))
}
-
-
6 changes: 3 additions & 3 deletions R/request_handler_handler.R
@@ -1,10 +1,10 @@
-#' request_handler_handler
+#' Handle robotstxt handlers
#'
#' Helper function to handle robotstxt handlers.
#'
#' @param request the request object returned by call to httr::GET()
#' @param handler the handler either a character string entailing various options or a function producing a specific list, see return.
-#' @param res a list a list with elements '[handler names], ...', 'rtxt', and 'cache'
+#' @param res a list with elements '[handler names], ...', 'rtxt', and 'cache'
#' @param info info to add to problems list
#' @param warn if FALSE warnings and messages are suppressed
#'
@@ -72,4 +72,4 @@ request_handler_handler <-

# return
res
-}
+}
5 changes: 3 additions & 2 deletions R/robotstxt.R
@@ -1,11 +1,12 @@
-#' Generate a representations of a robots.txt file
+#' Generate a representation of a robots.txt file
#'
#' The function generates a list that entails data resulting from parsing a robots.txt file
#' as well as a function called check that enables to ask the representation if bot (or
#' particular bots) are allowed to access a resource on the domain.
#'
#' @param domain Domain for which to generate a representation. If text equals to NULL,
#' the function will download the file from server - the default.
+#'
#' @param text If automatic download of the robots.txt is not preferred, the text can be
#' supplied directly.
#' @inheritParams get_robotstxt
@@ -20,7 +21,7 @@
#' @field domain character vector holding domain name for which the robots.txt
#' file is valid; will be set to NA if not supplied on initialization
#'
-#' @field text character vector of text of robots.txt file; either supplied on
+#' @field character vector of text of robots.txt file; either supplied on
#' initialization or automatically downloaded from domain supplied on
#' initialization
#'
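
Putting the description above to work: generate the representation, then query its check function (requires network access; the domain is illustrative):

    library(robotstxt)
    rt <- robotstxt(domain = "wikipedia.org")  # downloads and parses robots.txt
    rt$check(paths = c("/", "/images/"), bot = "*")
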
3 changes: 1 addition & 2 deletions R/rt_cache.R
@@ -1,3 +1,2 @@
-#' get_robotstxt() cache
+#' Get the robotstxt cache
rt_cache <- new.env( parent = emptyenv() )
-
4 changes: 1 addition & 3 deletions R/rt_get_comments.R
@@ -1,5 +1,4 @@
-
-#' extracting comments from robots.txt
+#' Extract comments from robots.txt
#' @param txt content of the robots.txt file
#' @keywords internal
rt_get_comments <- function(txt){
@@ -8,4 +7,3 @@ rt_get_comments <- function(txt){
ccontent <- stringr::str_extract(txt[clines], "#.*")
data.frame(line=clines, comment=ccontent, stringsAsFactors = FALSE)
}
-
3 changes: 1 addition & 2 deletions R/rt_get_fields.R
@@ -1,5 +1,4 @@
-
-#' extracting permissions from robots.txt
+#' Extract permissions from robots.txt
#' @param txt content of the robots.txt file
#' @param regex regular expression specify field
#' @param invert invert selection made via regex?
3 changes: 1 addition & 2 deletions R/rt_get_fields_worker.R
@@ -1,5 +1,4 @@
-
-#' extracting robotstxt fields
+#' Extract robotstxt fields
#' @param txt content of the robots.txt file
#' @param type name or names of the fields to be returned, defaults to all
#' fields
4 changes: 2 additions & 2 deletions R/rt_get_useragent.R
@@ -1,4 +1,4 @@
-#' extracting HTTP useragents from robots.txt
+#' Extract HTTP useragents from robots.txt
#' @param txt content of the robots.txt file
#' @keywords internal
# rt_get_useragent <- function(txt){
@@ -19,4 +19,4 @@ rt_get_useragent <- function(txt){
pattern = stringr::regex("U.*:| |\n", ignore_case = TRUE),
replacement = ""
)
-}
+}
