-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Code from CLiC API R examples encapsulated into a package.
The intention is to make the code more robust and release it. Added function to fetch full texts and some getters and setters for package state.
- Loading branch information
0 parents
commit 33c9f7b
Showing
16 changed files
with
441 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
Encoding: UTF-8 | ||
Package: clicclient | ||
Type: Package | ||
Title: Client for the CLiC API | ||
Version: 0.1-0 | ||
Date: 2018-08-02 | ||
Authors@R: | ||
c(person('Anthony','Hennessey',email='[email protected]',role=c('aut','cre')), | ||
person('Jamie','Lentin',email='[email protected]',role='aut'), | ||
person('Viola','Wiegand',email='[email protected]',role='ctb'), | ||
person('Michaela','Mahlberg',email='[email protected]',role='fnd')) | ||
Description: Fetch data available to the CLiC web interface <http://clic.bham.ac.uk> by directly calling the CLiC API from R. | ||
URL: | ||
License: GPL (>= 3) | ||
Depends: | ||
R (>= 3.2.0), | ||
Imports: | ||
httr, | ||
jsonlite, | ||
data.table (>= 1.9.6) | ||
Suggests: | ||
unittest, | ||
knitr, | ||
rmarkdown | ||
VignetteBuilder: | ||
knitr | ||
BugReports: https://github.com/birmingham-ccr/clicclient/issues | ||
LazyData: yes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
export("clic_clusters") | ||
export("clic_lookup") | ||
export("clic_tokens") | ||
export("clic_texts") | ||
export("clic_set_user_agent") | ||
export("clic_get_user_agent") | ||
|
||
importFrom("utils", "packageVersion", "head", "tail") | ||
|
||
importFrom("data.table", "rbindlist", "setkeyv", ":=", "shift") | ||
importFrom("httr", "modify_url", "GET", "http_error", "http_type", "add_headers", "status_code", "content") | ||
importFrom("jsonlite", "fromJSON") | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Fetch n-gram clusters from the CLiC API 'cluster' endpoint.
#
# - shortname: one or more corpus / book identifiers; each becomes a
#              'corpora=' query parameter
# - length:    cluster length, a single value out of 1, 3, 4 or 5
# - cutoff:    cutoff frequency passed to the API
# - subset:    optional, one of "shortsus", "longsus", "nonquote", "quote"
#
# Returns a data.frame with columns 'cluster' and 'count', ordered by
# decreasing count. Stops if 'length' is not a single allowed value.
clic_clusters <- function(
  shortname,
  length,
  cutoff = 5,
  subset = NULL
) {
  # The argument 'length' shadows base::length() only as a variable; the
  # call length(length) still resolves to the base function.
  if (length(length) != 1L || !length %in% c(1, 3, 4, 5)) {
    stop(paste0(
      "bad length parameter: '", paste(length, collapse = ", "), "'"
    ))
  }
  query <- paste(sprintf("corpora=%s", shortname), collapse = "&")
  # as.integer() lets a value supplied as e.g. "3" pass through "%d".
  query <- sprintf(
    "%s&clusterlength=%d&cutoff=%s", query, as.integer(length), cutoff
  )
  if (!is.null(subset)) {
    subset <- match.arg(subset, c("shortsus", "longsus", "nonquote", "quote"))
    query <- sprintf("%s&subset=%s", query, subset)
  }
  rv <- clic_request(endpoint = "cluster", query = query)
  # With no clusters there is nothing to index by column (presumably the
  # parsed JSON is then an empty list - TODO confirm), so return an empty
  # result of the usual shape instead of failing on rv$data[ , 1].
  if (length(rv$data) == 0L) {
    return(data.frame(
      "cluster" = character(0),
      "count" = integer(0),
      stringsAsFactors = FALSE
    ))
  }
  clusters <- data.frame(
    "cluster" = rv$data[, 1],
    "count" = as.integer(rv$data[, 2]),
    stringsAsFactors = FALSE
  )
  clusters <- clusters[order(clusters$count, decreasing = TRUE), ]
  rownames(clusters) <- NULL
  return(clusters)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# List the texts available in CLiC, one row per text, with a 'corpus'
# column holding the id of the corpus each text belongs to.
# The result is keyed by corpus, author and title.
clic_lookup <- function() {
  response <- clic_request(endpoint = "corpora")
  # Bind 'corpus' locally so R CMD check does not report the column name
  # used in the data.table expression below as an undefined global.
  corpus <- NULL
  texts <- rbindlist(
    response$corpora$children,
    fill = TRUE,
    idcol = "corpus"
  )
  # idcol holds the position of each child table; swap it for the corpus id.
  texts[, corpus := response$corpora$id[corpus]]
  setkeyv(texts, cols = c("corpus", "author", "title"))
  # The trailing [] makes the table print normally after the := update.
  texts[]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Makes the API requests.
# Returns the endpoint specific data structure (the parsed JSON response).
#
# - endpoint: one of "subset", "corpora" or "cluster"; see the inline docs
#             in /server/clic
# - query:    endpoint specific parameters as a querystring, NULL for none
#
# Stops if the HTTP request fails, if the response is not JSON, or if the
# response carries an 'error' entry. A 'warn' entry is raised as an R
# warning and an 'info' entry is emitted as a message.
clic_request <- function(
  endpoint = c("subset", "corpora", "cluster"),
  query = NULL
) {
  endpoint <- match.arg(endpoint)
  uri <- modify_url("",
    scheme = "http",
    hostname = get("HOSTNAME", pos = pkg_vars),
    path = sprintf("/api/%s", endpoint),
    query = if (is.null(query)) "" else query
  )
  # The stored User-Agent already ends with the package version suffix that
  # clic_set_user_agent() appends, so send it unchanged; appending the
  # suffix here as well would duplicate it in the header.
  ua <- get("UA", pos = pkg_vars)
  req <- GET(uri, add_headers("User-Agent" = ua, "Accept" = "application/json"))
  if (http_error(req)) {
    stop(sprintf(
      "Request failed: status %s - URL '%s'", status_code(req), uri
    ))
  }
  # can ignore header so check response
  # https://tools.ietf.org/html/rfc7231#section-5.3.2
  if (http_type(req) != "application/json") {
    stop("API did not return JSON")
  }
  rv <- fromJSON(content(req, as = "text", encoding = "UTF-8"))
  if (!is.null(rv$error)) stop("API returned error: ", rv$error$message)
  # Use R conditions rather than cat() so callers can catch or suppress them
  # and so the text does not run into later console output.
  if (!is.null(rv$warn)) {
    warning("API returned warning: ", rv$warn$message, call. = FALSE)
  }
  if (!is.null(rv$info)) {
    message("API returned info: ", rv$info$message)
  }
  return(rv)
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Private store for package state (the 'UA' and 'HOSTNAME' entries).
# The empty parent stops get() on a key that has not been set yet from
# falling through to an unrelated object of the same name in an enclosing
# environment.
pkg_vars <- new.env(parent = emptyenv())
|
||
# Store the User-Agent string: the supplied text followed by this package's
# name and version in parentheses. Returns the stored string.
clic_set_user_agent <- function(x) {
  version_suffix <- paste0(
    " (clicclient v", packageVersion("clicclient"), ")"
  )
  full_agent <- paste0(x, version_suffix)
  assign("UA", full_agent, envir = pkg_vars)
  full_agent
}
|
||
# Return the User-Agent string currently held in the package state.
clic_get_user_agent <- function() {
  get("UA", envir = pkg_vars)
}
|
||
# NOT exported
# Record the host name used to build API request URLs; returns it unchanged.
clic_set_hostname <- function(x) {
  assign("HOSTNAME", x, envir = pkg_vars)
  x
}
|
||
# Initialise the package state when the namespace is loaded.
# Doing this in .onLoad rather than .onAttach means the defaults are also
# present when functions are called as clicclient::fn() without a prior
# library(clicclient); attaching always loads the namespace first, so
# library() users still get the same defaults.
.onLoad <- function(libname, pkgname) {
  clic_set_user_agent("R CLiC client")
  clic_set_hostname("clic.bham.ac.uk")
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Fetch text from the CLiC API 'subset' endpoint as a single string.
#
# - shortname: one or more corpus / book identifiers; each becomes a
#              'corpora=' query parameter
# - subset:    optional, one of "shortsus", "longsus", "nonquote", "quote"
#
# Returns one string made by pasting all returned tokens together.
clic_texts <- function(
  shortname,
  subset = NULL
) {
  corpora_params <- sprintf("corpora=%s", shortname)
  query <- paste(corpora_params, collapse = "&")
  if (!is.null(subset)) {
    subset <- match.arg(subset, c("shortsus", "longsus", "nonquote", "quote"))
    query <- sprintf("%s&subset=%s", query, subset)
  }
  response <- clic_request(endpoint = "subset", query = query)
  # Take the first entry of every row and drop its trailing element.
  pieces <- unlist(lapply(response$data, function(row) {
    head(row[[1]], -1)
  }))
  # attempt to regain some white space integrity
  is_full_stop <- pieces %in% c(".")
  pieces <- replace(pieces, is_full_stop, ". ")
  paste(pieces, collapse = "")
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Fetch tokens from the CLiC API 'subset' endpoint.
#
# - shortname:   one or more corpus / book identifiers; each becomes a
#                'corpora=' query parameter
# - subset:      optional, one of "shortsus", "longsus", "nonquote", "quote"
# - lowercase:   if TRUE the tokens are lower-cased before being returned
# - punctuation: if TRUE every token is returned; if FALSE only the tokens
#                picked out by the index list that ends each row's first
#                entry
#
# Always returns a plain character vector (zero-length when there is no
# data).
clic_tokens <- function(
  shortname,
  subset = NULL,
  lowercase = TRUE,
  punctuation = FALSE # includes whitespace
) {
  query <- paste(sprintf("corpora=%s", shortname), collapse = "&")
  if (!is.null(subset)) {
    subset <- match.arg(subset, c("shortsus", "longsus", "nonquote", "quote"))
    query <- sprintf("%s&subset=%s", query, subset)
  }
  rv <- clic_request(endpoint = "subset", query = query)
  if (punctuation) {
    # Everything except the trailing element of the row's first entry.
    extract <- function(x) {
      head(x[[1]], -1)
    }
  } else {
    # The trailing element holds 0-based positions, hence the +1 to index
    # into R's 1-based vector.
    extract <- function(x) {
      head(x[[1]], -1)[as.integer(tail(x[[1]], 1)[[1]]) + 1]
    }
  }
  # lapply() keeps one list element per row; sapply() would collapse rows of
  # equal length into a matrix, which unlist() hands back unchanged.
  tokens <- unlist(lapply(rv$data, extract), use.names = FALSE)
  if (is.null(tokens)) {
    tokens <- character(0)
  }
  if (lowercase) {
    tokens <- tolower(tokens)
  }
  return(tokens)
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
Any data available through the [CLiC web interface](http://clic.bham.ac.uk/) is also available by directly calling the *CLiC API*. | ||
The CLiC API returns a JSON representation of the CLiC data, which means that the data can be retrieved directly using any programming language. | ||
This package offers a set of convenience functions for calling the CLiC API directly from R. | ||
|
||
The package is the result of work by the [Centre for Corpus Research](http://www.birmingham.ac.uk/research/activity/corpus/) at the University of Birmingham. | ||
We would be interested to hear about how you use the CLiC API and are always happy to consider CLiC related guest posts for the [CLiC blog](https://blog.bham.ac.uk/clic-dickens/). | ||
To let us know how you are using the CLiC API, to give us feedback, or if you need any help that you cannot find here or through the [CLiC homepage](https://www.birmingham.ac.uk/schools/edacs/departments/englishlanguage/research/projects/clic/) you can contact us at [[email protected]]([email protected]). | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
\name{clic_clusters} | ||
\alias{clic_clusters} | ||
|
||
\title{Fetch clusters} | ||
\description{ | ||
Fetches n-grams using the CLiC API \sQuote{cluster} endpoint. | ||
} | ||
|
||
\usage{ | ||
clic_clusters(shortname, length, cutoff = 5, subset = NULL) | ||
} | ||
|
||
\arguments{ | ||
\item{shortname}{ | ||
Can be any value from the \sQuote{corpus} or \sQuote{shortname} columns | ||
returned by \code{\link{clic_lookup}}. | ||
Can be given as a single string, or a list of strings in which case the | ||
results will be concatenated. | ||
} | ||
\item{length}{ | ||
Cluster length to search for, one of 1/3/4/5 (NB: There is no 2). | ||
} | ||
\item{cutoff}{ | ||
The cutoff frequency; if a cluster occurs fewer times than this then | ||
it is not returned. | ||
} | ||
\item{subset}{ | ||
A string containing any one of \dQuote{shortsus}, \dQuote{longsus}, | ||
\dQuote{nonquote} and \dQuote{quote}. | ||
} | ||
} | ||
|
||
\value{ | ||
Returns a \code{data.frame} of clusters to counts. | ||
} | ||
|
||
\seealso{ | ||
\url{https://github.com/birmingham-ccr/clic/server/clic/cluster.py} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
\name{clic_lookup} | ||
\alias{clic_lookup} | ||
|
||
\title{Fetch listing of CLiC content} | ||
\description{ | ||
Fetches a listing of the texts for each of the available corpora in CLiC. | ||
|
||
} | ||
|
||
\usage{ | ||
clic_lookup() | ||
} | ||
|
||
\value{ | ||
Returns a \code{data.frame} listing the texts for each of the available corpora. | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
\name{clic_set_user_agent} | ||
\alias{clic_set_user_agent} | ||
\alias{clic_get_user_agent} | ||
|
||
\title{Set User-Agent} | ||
\description{ | ||
Set the \sQuote{User-Agent} string for CLiC API requests. | ||
} | ||
|
||
\usage{ | ||
clic_set_user_agent(x) | ||
clic_get_user_agent() | ||
} | ||
|
||
\arguments{ | ||
\item{x}{ | ||
The new \sQuote{User-Agent} string. | ||
} | ||
} | ||
|
||
\details{ | ||
Setting the User-Agent helps us to understand who is using the API. | ||
Ideally set the User-Agent to something that either identifies you | ||
or your application. | ||
|
||
The string you supply will be concatenated with the version details of | ||
this package. The concatenated string is returned. | ||
} | ||
|
||
\value{ | ||
Both functions return the User-Agent string. | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
\name{clic_texts} | ||
\alias{clic_texts} | ||
|
||
\title{Fetch texts} | ||
\description{ | ||
Fetches texts using the CLiC API. | ||
} | ||
|
||
\usage{ | ||
clic_texts(shortname, subset = NULL) | ||
} | ||
|
||
\arguments{ | ||
\item{shortname}{ | ||
Can be any value from the \sQuote{corpus} or \sQuote{shortname} columns | ||
returned by \code{\link{clic_lookup}}. | ||
Can be given as a single string, or a list of strings in which case the | ||
results will be concatenated. | ||
} | ||
\item{subset}{ | ||
A string containing any one of \dQuote{shortsus}, \dQuote{longsus}, | ||
\dQuote{nonquote} and \dQuote{quote}. | ||
} | ||
} | ||
|
||
\value{ | ||
Returns a single string of text. | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
\name{clic_tokens} | ||
\alias{clic_tokens} | ||
|
||
\title{Fetch tokens} | ||
\description{ | ||
Fetches tokens using the CLiC API \sQuote{subset} endpoint. | ||
} | ||
|
||
\usage{ | ||
clic_tokens(shortname, subset = NULL, lowercase = TRUE, punctuation = FALSE) | ||
} | ||
|
||
\arguments{ | ||
\item{shortname}{ | ||
Can be any value from the \sQuote{corpus} or \sQuote{shortname} columns | ||
returned by \code{\link{clic_lookup}}. | ||
Can be given as a single string, or a list of strings in which case the | ||
results will be concatenated. | ||
} | ||
\item{subset}{ | ||
A string containing any one of \dQuote{shortsus}, \dQuote{longsus}, | ||
\dQuote{nonquote} and \dQuote{quote}. | ||
} | ||
\item{lowercase}{ | ||
Boolean indicating if the tokens should be transformed to lower case. | ||
} | ||
\item{punctuation}{ | ||
Boolean indicating if punctuation tokens should be included. | ||
Note that the punctuation tokens may include whitespace. | ||
} | ||
} | ||
|
||
\value{ | ||
Returns a vector of tokens. | ||
} | ||
|
||
\seealso{ | ||
\url{https://github.com/birmingham-ccr/clic/server/clic/subset.py} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
\name{clicclient-package} | ||
\alias{clicclient-package} | ||
\alias{clicclient} | ||
\docType{package} | ||
|
||
\title{Client for the CLiC API.} | ||
\description{ | ||
A set of function wrappers for the CLiC API. | ||
} | ||
\details{ | ||
The CLiC web interface can be found at \url{http://clic.bham.ac.uk}. | ||
CLiC also provides an \sQuote{application programming interface} (API) | ||
that makes the data used by the web interface available in a machine | ||
readable form. | ||
This package offers a set of wrapper functions to facilitate | ||
interaction with the API from R. | ||
|
||
For a list of all documentation use \code{library(help="clicclient")}. | ||
} | ||
|
||
\author{ | ||
Maintainer: Anthony Hennessey <anthony.hennessey@nottingham.ac.uk>. | ||
} |
Oops, something went wrong.