Skip to content

Commit

Permalink
Code from CLiC API R examples encapsulated into a package.
Browse files Browse the repository at this point in the history
The intention is to robustify and release.
Added function to fetch full texts and some getters and setters for package state.
  • Loading branch information
hennesseya committed Aug 3, 2018
0 parents commit 33c9f7b
Show file tree
Hide file tree
Showing 16 changed files with 441 additions and 0 deletions.
28 changes: 28 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Encoding: UTF-8
Package: clicclient
Type: Package
Title: Client for the CLiC API
Version: 0.1-0
Date: 2018-08-02
Authors@R:
c(person('Anthony','Hennessey',email='[email protected]',role=c('aut','cre')),
person('Jamie','Lentin',email='[email protected]',role='aut'),
person('Viola','Wiegand',email='[email protected]',role='ctb'),
person('Michaela','Mahlberg',email='[email protected]',role='fnd'))
Description: Fetch data available to the CLiC web interface <http://clic.bham.ac.uk> by directly calling the CLiC API from R.
URL:
License: GPL (>= 3)
Depends:
R (>= 3.2.0),
Imports:
httr,
jsonlite,
data.table (>= 1.9.6)
Suggests:
unittest,
knitr,
rmarkdown
VignetteBuilder:
knitr
BugReports: https://github.com/birmingham-ccr/clicclient/issues
LazyData: yes
13 changes: 13 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
export("clic_clusters")
export("clic_lookup")
export("clic_tokens")
export("clic_texts")
export("clic_set_user_agent")
export("clic_get_user_agent")

importFrom("utils", "packageVersion", "head", "tail")

importFrom("data.table", "rbindlist", "setkeyv", ":=", "shift")
importFrom("httr", "modify_url", "GET", "http_error", "http_type", "add_headers", "status_code", "content")
importFrom("jsonlite", "fromJSON")

21 changes: 21 additions & 0 deletions R/clic_clusters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Fetches n-gram clusters from the CLiC API 'cluster' endpoint.
#
# - shortname: one or more corpus / text identifiers; each one is sent as
#              its own 'corpora=' query parameter
# - length:    cluster length, a single value out of 1, 3, 4 or 5
# - cutoff:    frequency cutoff, forwarded to the API as 'cutoff'
# - subset:    NULL (no 'subset' parameter is sent) or one of "shortsus",
#              "longsus", "nonquote", "quote"
#
# Returns a data.frame with columns 'cluster' (character) and 'count'
# (integer), sorted by descending count. Stops with "bad length parameter"
# when 'length' is not exactly one of the accepted values.
clic_clusters <- function(
  shortname,
  length,
  cutoff = 5,
  subset = NULL
) {
  # 'length' must be exactly one value: a vector or NULL would otherwise
  # reach `if` with a condition that is not a single TRUE/FALSE.
  # base::length is spelled out because the argument shadows the name.
  if (base::length(length) != 1L || !length %in% c(1, 3, 4, 5)) {
    stop("bad length parameter: '", paste(length, collapse = ", "), "'")
  }
  query <- paste(sprintf("corpora=%s", shortname), collapse = "&")
  query <- sprintf("%s&clusterlength=%d&cutoff=%s", query, length, cutoff)
  if (!is.null(subset)) {
    subset <- match.arg(subset, c("shortsus", "longsus", "nonquote", "quote"))
    query <- sprintf("%s&subset=%s", query, subset)
  }
  rv <- clic_request(endpoint = "cluster", query = query)
  # Nothing came back: return an empty, correctly typed frame instead of
  # failing on the two-dimensional indexing below.
  if (NROW(rv$data) == 0L) {
    return(data.frame(
      cluster = character(0),
      count = integer(0),
      stringsAsFactors = FALSE
    ))
  }
  clusters <- data.frame(
    cluster = rv$data[, 1],
    count = as.integer(rv$data[, 2]),
    stringsAsFactors = FALSE
  )
  clusters <- clusters[order(clusters$count, decreasing = TRUE), ]
  rownames(clusters) <- NULL
  clusters
}
11 changes: 11 additions & 0 deletions R/clic_lookup.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Builds a listing of the texts in every corpus reported by the API
# 'corpora' endpoint. The result is keyed by corpus, author and title.
clic_lookup <- function() {
  # Local placeholder so that R CMD check does not complain about the
  # data.table column reference below - ref: data.table
  corpus <- NULL
  response <- clic_request(endpoint = "corpora")
  listing <- rbindlist(
    response$corpora$children,
    fill = TRUE,
    idcol = "corpus"
  )
  # The id column says which child table a row came from; use it to look
  # up the matching corpus id.
  listing[, corpus := response$corpora$id[corpus]]
  setkeyv(listing, cols = c("corpus", "author", "title"))
  listing[]
}
34 changes: 34 additions & 0 deletions R/clic_request.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Makes the API requests.
# Returns the endpoint specific data structure (the parsed JSON body).
#
# - endpoint: see the inline docs in /server/clic
# - query: endpoint specific parameters as a querystring
#
# Stops when the HTTP request fails, when the response is not JSON, or
# when the parsed body carries an 'error' entry. A 'warn' entry is raised
# as an R warning and an 'info' entry as a message.
#
clic_request <- function(
  endpoint = c("subset", "corpora", "cluster"),
  query = NULL
) {
  endpoint <- match.arg(endpoint)
  uri <- modify_url("",
    scheme = "http",
    hostname = get("HOSTNAME", envir = pkg_vars),
    path = sprintf("/api/%s", endpoint),
    query = if (is.null(query)) "" else query
  )
  # The stored User-Agent already ends with the "(clicclient v...)" suffix
  # that clic_set_user_agent() appends, so it is sent as is; adding the
  # suffix here again would duplicate the version details.
  ua <- get("UA", envir = pkg_vars)
  req <- GET(uri, add_headers("User-Agent" = ua, "Accept" = "application/json"))
  if (http_error(req)) {
    stop(sprintf("Request failed: status %s - URL '%s'", status_code(req), uri))
  }
  # can ignore header so check response
  # https://tools.ietf.org/html/rfc7231#section-5.3.2
  if (http_type(req) != "application/json") {
    stop("API did not return JSON")
  }
  rv <- fromJSON(content(req, as = "text", encoding = "UTF-8"))
  if (!is.null(rv$error)) {
    stop("API returned error: ", rv$error$message)
  }
  # Use the condition system rather than cat() so that callers can catch
  # or suppress these, and so they do not end up on stdout.
  if (!is.null(rv$warn)) {
    warning("API returned warning: ", rv$warn$message, call. = FALSE)
  }
  if (!is.null(rv$info)) {
    message("API returned info: ", rv$info$message)
  }
  rv
}

22 changes: 22 additions & 0 deletions R/clic_set_user_agent.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Package-private state: holds the 'UA' (User-Agent) and 'HOSTNAME' values.
# The empty parent stops lookups in this environment from falling through
# to unrelated objects of the same name when a value has not been set.
pkg_vars <- new.env(parent = emptyenv())

# Stores the User-Agent used for API requests.
# The supplied string is suffixed with this package's version details; the
# combined string is what gets stored and returned.
clic_set_user_agent <- function(x) {
  version_suffix <- paste0(" (clicclient v", packageVersion("clicclient"), ")")
  full_agent <- paste0(x, version_suffix)
  assign("UA", full_agent, envir = pkg_vars)
  full_agent
}

# Returns the User-Agent string currently held in the package state.
clic_get_user_agent <- function() {
  get("UA", envir = pkg_vars)
}

# NOT exported
# Records the API hostname in the package state and hands it back.
clic_set_hostname <- function(x) {
  assign("HOSTNAME", x, envir = pkg_vars)
  x
}

# Initialises the package state with its defaults.
# This runs when the namespace is loaded rather than when the package is
# attached, so the state also exists when functions are called without
# attaching the package, e.g. clicclient::clic_lookup(). Attaching a
# package loads its namespace first, so library(clicclient) still
# triggers it.
.onLoad <- function(libname, pkgname) {
  clic_set_user_agent("R CLiC client")
  clic_set_hostname("clic.bham.ac.uk")
}
21 changes: 21 additions & 0 deletions R/clic_texts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Fetches text from the API 'subset' endpoint and glues it into one string.
#
# - shortname: one or more corpus / text identifiers; each one is sent as
#              its own 'corpora=' query parameter
# - subset:    NULL (no 'subset' parameter is sent) or one of "shortsus",
#              "longsus", "nonquote", "quote"
#
# Returns a single string.
clic_texts <- function(
  shortname,
  subset = NULL
) {
  corpora_params <- sprintf("corpora=%s", shortname)
  query <- paste(corpora_params, collapse = "&")
  if (!is.null(subset)) {
    subset <- match.arg(subset, c("shortsus", "longsus", "nonquote", "quote"))
    query <- sprintf("%s&subset=%s", query, subset)
  }
  response <- clic_request(endpoint = "subset", query = query)

  # For each returned row keep everything in its first entry except the
  # trailing element.
  drop_trailing <- function(row) {
    head(row[[1]], -1)
  }
  pieces <- unlist(lapply(response$data, drop_trailing))

  # attempt to regain some white space integrity: follow every "." token
  # with a space
  is_full_stop <- pieces %in% c(".")
  spaced <- ifelse(is_full_stop, paste0(pieces, " "), pieces)
  paste(spaced, collapse = "")
}

27 changes: 27 additions & 0 deletions R/clic_tokens.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Fetches tokens from the API 'subset' endpoint.
#
# - shortname:   one or more corpus / text identifiers; each one is sent
#                as its own 'corpora=' query parameter
# - subset:      NULL (no 'subset' parameter is sent) or one of
#                "shortsus", "longsus", "nonquote", "quote"
# - lowercase:   if TRUE the tokens are passed through tolower()
# - punctuation: if TRUE every token of a row is kept; if FALSE only the
#                tokens at the positions listed in the row's trailing
#                element are kept
#
# Always returns a character vector (character(0) when no tokens come
# back), regardless of the 'lowercase' setting.
clic_tokens <- function(
  shortname,
  subset = NULL,
  lowercase = TRUE,
  punctuation = FALSE # includes whitespace
) {
  query <- paste(sprintf("corpora=%s", shortname), collapse = "&")
  if (!is.null(subset)) {
    subset <- match.arg(subset, c("shortsus", "longsus", "nonquote", "quote"))
    query <- sprintf("%s&subset=%s", query, subset)
  }
  rv <- clic_request(endpoint = "subset", query = query)

  # The first entry of each row is the row's tokens followed by one
  # trailing element. That trailing element is used as zero-based
  # positions into the tokens (hence the + 1 for R indexing).
  if (punctuation) {
    extract <- function(x) {
      head(x[[1]], -1)
    }
  } else {
    extract <- function(x) {
      head(x[[1]], -1)[as.integer(tail(x[[1]], 1)[[1]]) + 1]
    }
  }

  # lapply rather than sapply: the per-row results are never simplified
  # into a matrix, so the flattened result is always a plain vector.
  tokens <- unlist(lapply(rv$data, extract))

  # unlist() of an empty list is NULL; normalise so the return type does
  # not depend on whether any tokens came back.
  if (is.null(tokens)) {
    tokens <- character(0)
  }
  if (lowercase) {
    tokens <- tolower(tokens)
  }
  tokens
}

8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Any data available through the [CLiC web interface](http://clic.bham.ac.uk/) is also available by directly calling the *CLiC API*.
The CLiC API returns a JSON representation of the CLiC data, which means that the data can be retrieved directly using any programming language.
This package offers a set of convenience functions for calling the CLiC API directly from R.

The package is the result of work by the [Centre for Corpus Research](http://www.birmingham.ac.uk/research/activity/corpus/) at the University of Birmingham.
We would be interested to hear about how you use the CLiC API and are always happy to consider CLiC related guest posts for the [CLiC blog](https://blog.bham.ac.uk/clic-dickens/).
To let us know how you are using the CLiC API, to give us feedback, or if you need any help that you cannot find here or through the [CLiC homepage](https://www.birmingham.ac.uk/schools/edacs/departments/englishlanguage/research/projects/clic/) you can contact us at [[email protected]]([email protected]).

40 changes: 40 additions & 0 deletions man/clic_clusters.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
\name{clic_clusters}
\alias{clic_clusters}

\title{Fetch clusters}
\description{
Fetches n-grams using the CLiC API \sQuote{cluster} endpoint.
}

\usage{
clic_clusters(shortname, length, cutoff = 5, subset = NULL)
}

\arguments{
\item{shortname}{
Can be any value from the \sQuote{corpus} or \sQuote{shortname} columns
returned by \code{\link{clic_lookup}}.
Can be given as a single string, or a list of strings in which case the
results will be concatenated.
}
\item{length}{
Cluster length to search for, one of 1/3/4/5 (NB: There is no 2).
}
\item{cutoff}{
The cutoff frequency; a cluster that occurs fewer times than this
is not returned.
}
\item{subset}{
A string containing any one of \dQuote{shortsus}, \dQuote{longsus},
\dQuote{nonquote} and \dQuote{quote}.
}
}

\value{
Returns a \code{data.frame} of clusters to counts.
}

\seealso{
\url{https://github.com/birmingham-ccr/clic/server/clic/cluster.py}
}

17 changes: 17 additions & 0 deletions man/clic_lookup.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
\name{clic_lookup}
\alias{clic_lookup}

\title{Fetch listing of CLiC content}
\description{
Fetches a listing of the texts for each of the available corpora in CLiC.

}

\usage{
clic_lookup()
}

\value{
Returns a \code{data.table} (which is also a \code{data.frame}) listing the texts for each of the available corpora.
}

33 changes: 33 additions & 0 deletions man/clic_set_user_agent.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
\name{clic_set_user_agent}
\alias{clic_set_user_agent}
\alias{clic_get_user_agent}

\title{Set User-Agent}
\description{
Set the \sQuote{User-Agent} string for CLiC API requests.
}

\usage{
clic_set_user_agent(x)
clic_get_user_agent()
}

\arguments{
\item{x}{
The new \sQuote{User-Agent} string.
}
}

\details{
Setting the User-Agent helps us to understand who is using the API.
Ideally set the User-Agent to something that either identifies you
or your application.

The string you supply will be concatenated with the version details of
this package. The concatenated string is returned.
}

\value{
Both functions return the User-Agent string.
}

29 changes: 29 additions & 0 deletions man/clic_texts.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
\name{clic_texts}
\alias{clic_texts}

\title{Fetch texts}
\description{
Fetches texts using the CLiC API.
}

\usage{
clic_texts(shortname, subset = NULL)
}

\arguments{
\item{shortname}{
Can be any value from the \sQuote{corpus} or \sQuote{shortname} columns
returned by \code{\link{clic_lookup}}.
Can be given as a single string, or a list of strings in which case the
results will be concatenated.
}
\item{subset}{
A string containing any one of \dQuote{shortsus}, \dQuote{longsus},
\dQuote{nonquote} and \dQuote{quote}.
}
}

\value{
Returns a single string of text.
}

40 changes: 40 additions & 0 deletions man/clic_tokens.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
\name{clic_tokens}
\alias{clic_tokens}

\title{Fetch tokens}
\description{
Fetches tokens using the CLiC API \sQuote{subset} endpoint.
}

\usage{
clic_tokens(shortname, subset = NULL, lowercase = TRUE, punctuation = FALSE)
}

\arguments{
\item{shortname}{
Can be any value from the \sQuote{corpus} or \sQuote{shortname} columns
returned by \code{\link{clic_lookup}}.
Can be given as a single string, or a list of strings in which case the
results will be concatenated.
}
\item{subset}{
A string containing any one of \dQuote{shortsus}, \dQuote{longsus},
\dQuote{nonquote} and \dQuote{quote}.
}
\item{lowercase}{
Boolean indicating if the tokens should be transformed to lower case.
}
\item{punctuation}{
Boolean indicating if punctuation tokens should be included.
Note that the punctuation tokens may include whitespace.
}
}

\value{
Returns a vector of tokens.
}

\seealso{
\url{https://github.com/birmingham-ccr/clic/server/clic/subset.py}
}

23 changes: 23 additions & 0 deletions man/clicclient-package.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
\name{clicclient-package}
\alias{clicclient-package}
\alias{clicclient}
\docType{package}

\title{Client for the CLiC API.}
\description{
A set of function wrappers for the CLiC API.
}
\details{
The CLiC web interface can be found at \url{http://clic.bham.ac.uk}.
CLiC also provides an \sQuote{application programming interface} (API)
that makes the data used by the web interface available in a machine
readable form.
This package offers a set of wrapper functions to facilitate
interaction with the API from R.

For a list of all documentation use \code{library(help="clicclient")}.
}

\author{
Maintainer: Anthony Hennessey <anthony.hennessey@nottingham.ac.uk>.
}
Loading

0 comments on commit 33c9f7b

Please sign in to comment.