models.py

"""Data models used to organize data and retrieve information from the database.

Some classes here just organize data that's been collected elsewhere in the
application, but some initialization methods DO reach out to the db to find
additional information.
"""
import math

import config
import db
import helpers

class PaperQueryResponse(object):
  """The object that is serialized and returned to the user after requesting
  a list of papers that match a given set of criteria. Combines the results
  AND the query parameters used to find them; both are included in case a
  user sent query parameters that were out of range (or omitted some altogether);
  that way users will have a way of finding out if they've bumped against a limit.

  """
  def __init__(self, results, query, timeframe, category_filter, metric, current_page, page_size, totalcount, repo):
    """Stores all of the supplied information in memory; the only value
    calculated here is the final page number.

    Arguments:
      - results: A list of SearchResultArticle objects
      - query: A string containing any text-based search parameters
      - timeframe: A string indicating what slice of the available metrics to
          use for the ranking
      - category_filter: A list of strings, each signifying an acceptable
          category for a result to be in.
      - metric: Which metadata (downloads, tweets, etc.) was used when ranking
          the results.
      - current_page: When results are broken down into groups of $page_size elements,
          which one is currently being returned.
      - page_size: How many results to return at one time.
      - totalcount: How many results there are on all pages combined.
      - repo: Which preprint repository was specified in request

    Raises:
      - ZeroDivisionError: If page_size is 0.

    """
    self.results = results
    self.query = query
    self.timeframe = timeframe
    self.category_filter = category_filter
    self.metric = metric
    self.current_page = current_page
    self.page_size = page_size
    self.totalcount = totalcount
    self.repo = repo
    # Pages are zero-indexed, so the last page is (number of pages - 1).
    # With zero results that subtraction would yield -1, which is not a
    # valid page; clamp to 0 so the first page is also reported as the last.
    self.final_page = max(math.ceil(totalcount / page_size) - 1, 0)

  def json(self):
    """Turns the PaperQueryResponse object into a dict that can be more
    easily serialized into JSON.

    Returns:
      - A dict with a "query" entry echoing the parameters that were used
          and a "results" entry holding the json() output of each result.

    """
    return {
      "query": {
        "text_search": self.query,
        "timeframe": self.timeframe,
        "categories": self.category_filter,
        "metric": self.metric,
        "page_size": self.page_size,
        "current_page": self.current_page,
        "final_page": self.final_page,
        "total_results": self.totalcount,
        "repository": self.repo
      },
      "results": [r.json() for r in self.results]
    }

class Author:
  """Stores information about an individual person associated with
  one or more papers."""
  def __init__(self, author_id, name=""):
    """Because the Author class is used in several different areas
    that have different requirements for how detailed the info needs
    to be, little is done in the initialization method. One optional
    parameter is author name, which can be specified in cases where
    we know we won't be fetching any more information. It can be left
    out, but would then require another entire DB call to retrieve it,
    which can slow things down dramatically when building up lists of
    authors.

    Arguments:
      - author_id: The Rxivist ID of the author in question; this is
            used in all further steps to determine which information
            to fetch.
      - name: A string indicating the author's name. Optional.

    """
    self.id = author_id
    self.name = name
    # These are filled in by GetBasicInfo() / GetInfo(). They are given
    # empty defaults here so the attributes exist on every instance, even
    # one that never touches the database.
    self.institution = None
    self.orcid = None
    self.articles = []
    self.ranks = []
    self.has_full_info = False
    self.has_basic_info = False

  def GetInfo(self, connection):
    """Fetches all of the information we have about a particular author.

    Email addresses are NOT fetched here; _find_emails() has to be called
    separately if they are needed.

    Arguments:
      - connection: A database Connection object

    Side effects:
      - self.name: The author's name
      - self.institution: One of the author's institutional affiliations
      - self.orcid: The ORCID universal identifier specified by the author
      - self.articles: A list of AuthorArticle objects associated with the author
      - self.ranks: A list of AuthorRankEntry objects indicating the author's
          position in all-time downloads, relative to all other authors and also
          relative to the other authors of any categories in which the author
          also has at least one paper.
      - self.has_full_info: A boolean indicating all of these values have been fetched.

    Raises:
      - helpers.NotFoundError: If no author has this ID.
      - ValueError: If more than one author has this ID.

    """
    self.name, self.institution, self.orcid = self._find_vitals(connection)
    self.articles = self._find_articles(connection)
    self.ranks = self._find_ranks(connection)
    self.has_full_info = True

  def GetBasicInfo(self, connection):
    """Fetches a more limited subset of the author's information

    Arguments:
      - connection: A database Connection object

    Side effects:
      - self.name: The author's name
      - self.institution: One of the author's institutional affiliations
      - self.orcid: The ORCID universal identifier specified by the author
      - self.has_basic_info: A boolean indicating all of these values have been fetched.

    Raises:
      - helpers.NotFoundError: If no author has this ID.
      - ValueError: If more than one author has this ID.

    """
    self.name, self.institution, self.orcid = self._find_vitals(connection)
    self.has_basic_info = True

  def json(self):
    """Returns a dict of information we have about the author, based on
    how much has been fetched to this point.

    Returns:
      - A dict that always has "id" and "name"; "institution" and "orcid"
          are added once basic or full info has been fetched, and "articles"
          and "ranks" are added only once full info has been fetched.

    """
    resp = {
      "id": self.id,
      "name": self.name
    }
    if self.has_full_info or self.has_basic_info:
      resp["institution"] = self.institution
      resp["orcid"] = self.orcid
    if self.has_full_info:
      resp["articles"] = [x.json() for x in self.articles]
      resp["ranks"] = [x.json() for x in self.ranks]
    return resp

  def _find_vitals(self, connection):
    """Retrieves any identifying information about an author.

    Arguments:
      - connection: A database Connection object

    Returns:
      - name: A string with the author's name
      - institution: A string indicating one of the author's institutional
          affiliations, or None if the stored value is an empty string
      - orcid: The ORCID universal identifier specified by the author, or
          None if the stored value is an empty string

    Raises:
      - helpers.NotFoundError: If no author has this ID.
      - ValueError: If more than one author has this ID.

    """
    authorq = connection.read("SELECT name, institution, orcid FROM authors WHERE id = %s;", (self.id,))
    if not authorq:
      raise helpers.NotFoundError(self.id)
    if len(authorq) > 1:
      raise ValueError(f"Multiple authors found with id {self.id}")
    name, institution, orcid = authorq[0]
    # Empty strings are reported as None so that consumers see a missing
    # value rather than a blank one.
    if institution == "":
      institution = None
    if orcid == "":
      orcid = None
    return name, institution, orcid

  def _find_articles(self, connection):
    """Retrieves basic information about any articles for which the individual
    is listed as one of the authors.

    Arguments:
      - connection: A database Connection object

    Returns:
      - A list of AuthorArticle objects with basic info and rankings for each
          paper, in the order returned by the query (all-time downloads,
          descending)

    """
    sql = """
      SELECT articles.id
      FROM articles
      LEFT JOIN article_authors ON articles.id=article_authors.article
      LEFT JOIN alltime_ranks ON articles.id=alltime_ranks.article
      WHERE article_authors.author=%s ORDER BY alltime_ranks.downloads DESC
    """
    # NOTE: each AuthorArticle is handed the connection and looks up its
    # own details, so this issues further queries per article.
    return [AuthorArticle(row[0], connection) for row in connection.read(sql, (self.id,))]

  def _find_emails(self, connection):
    """Retrieves any email addresses that have been found on bioRxiv associated with
    the author.

    Arguments:
      - connection: A database Connection object

    Returns:
      - A list of strings, one per row in author_emails for this author

    """
    emailq = connection.read("SELECT email FROM author_emails WHERE author=%s;", (self.id,))
    return [entry[0] for entry in emailq]

  def _find_ranks(self, connection):
    """Retrieves pre-calculated ranking information about an author's position relative to others.

    Arguments:
      - connection: A database Connection object

    Returns:
      - A list of AuthorRankEntry objects: the "alltime" entry first (if
          exactly one was found), followed by one entry per category.

    """
    ranks = []
    downloadsq = connection.read("SELECT rank, tie, downloads FROM author_ranks WHERE author=%s;", (self.id,))
    # The overall ranking is only reported when it is unambiguous, i.e.
    # exactly one row came back for this author.
    if len(downloadsq) == 1:
      rank, tie, downloads = downloadsq[0]
      ranks.append(AuthorRankEntry(rank, tie, downloads, "alltime"))

    categoryq = connection.read("SELECT rank, tie, downloads, category FROM author_ranks_category WHERE author = %s;", (self.id,))
    ranks.extend(AuthorRankEntry(cat[0], cat[1], cat[2], cat[3]) for cat in categoryq)

    return ranks

class DateEntry(object):
  """Stores paper publication date info: the numeric month and year as
  supplied, plus the month's name as produced by helpers.num_to_month()."""
  def __init__(self, month, year):
    """Records the date and looks up the month's name.

    Arguments:
      - month: The month, in the form accepted by helpers.num_to_month()
      - year: The year

    """
    self.month, self.year = month, year
    self.monthname = helpers.num_to_month(self.month)

class ArticleRankEntry(object):
  """Holds one paper's standing within a single corpus: its rank, whether
  that rank is tied, and the download count behind it."""
  def __init__(self, rank=0, tie=False, downloads=0):
    """Stores the supplied values unchanged.

    Arguments:
      - rank: The paper's rank within the corpus
      - tie: Whether the rank is shared with another paper
      - downloads: The download count associated with the rank

    """
    self.rank, self.tie, self.downloads = rank, tie, downloads

  def json(self):
    """Returns the entry as a dict with "downloads", "rank" and "tie" keys."""
    return {field: getattr(self, field) for field in ("downloads", "rank", "tie")}

class AuthorRankEntry(object):
  """Holds one author's standing within a single corpus: the rank, whether
  it is tied, the download count behind it, and the category it applies to."""
  def __init__(self, rank=0, tie=False, downloads=0, category=""):
    """Stores the supplied values unchanged.

    Arguments:
      - rank: The author's rank within the corpus
      - tie: Whether the rank is shared with another author
      - downloads: The download count associated with the rank
      - category: A label for the corpus the rank applies to

    """
    self.category = category
    self.tie = tie
    self.rank = rank
    self.downloads = downloads

  def json(self):
    """Returns the entry as a dict with "downloads", "rank", "tie" and
    "category" keys."""
    return dict(
      downloads=self.downloads,
      rank=self.rank,
      tie=self.tie,
      category=self.category,
    )

class ArticleRanks(object):
  """Stores information about all of an individual article's
  rankings: One for each available timeframe, and one reflecting the
  all-time ranking of the article relative only to other articles in
  the same category.

  """
  def __init__(self, article_id, connection):
    """Retrieves all required ranking information for a single article.

    Arguments:
      - article_id: The Rxivist ID of the article in question
      - connection: A database Connection object

    Raises:
      - helpers.NotFoundError: If no article has this ID.

    """
    sql = """
      SELECT alltime_ranks.rank, ytd_ranks.rank,
        month_ranks.rank, category_ranks.rank, articles.collection,
        alltime_ranks.downloads, ytd_ranks.downloads, month_ranks.downloads
      FROM articles
      LEFT JOIN alltime_ranks ON articles.id=alltime_ranks.article
      LEFT JOIN ytd_ranks ON articles.id=ytd_ranks.article
      LEFT JOIN month_ranks ON articles.id=month_ranks.article
      LEFT JOIN category_ranks ON articles.id=category_ranks.article
      WHERE articles.id=%s
    """
    rows = connection.read(sql, (article_id,))
    # Report an unknown article the same way the other article classes in
    # this module do, instead of failing with an IndexError on rows[0].
    if len(rows) == 0:
      raise helpers.NotFoundError(article_id)
    sql_entry = rows[0]

    alltime_downloads = sql_entry[5]
    # Ties are not looked up by this query, so every entry is created
    # with tie=False.
    self.alltime = ArticleRankEntry(sql_entry[0], False, alltime_downloads)
    self.ytd = ArticleRankEntry(sql_entry[1], False, sql_entry[6])
    self.lastmonth = ArticleRankEntry(sql_entry[2], False, sql_entry[7])
    # The category ranking is paired with the all-time download count.
    self.collection = ArticleRankEntry(sql_entry[3], False, alltime_downloads)

  def json(self):
    """Returns a dict with one serialized rank entry per corpus, keyed
    "alltime", "ytd", "lastmonth" and "category"."""
    return {
      "alltime": self.alltime.json(),
      "ytd": self.ytd.json(),
      "lastmonth": self.lastmonth.json(),
      "category": self.collection.json(),
    }

class Article:
  """Base class for the different formats in which articles
  are presented throughout the site. Provides the database lookups
  (authors, traffic) that only need the article's ID.

  """
  def __init__(self, a_id=None):
    """Stores the article's ID.

    Arguments:
      - a_id: The ID of the article. Optional.

    """
    self.id = a_id

  def get_authors(self, connection):
    """Fetches information about the paper's authors.

    Arguments:
      - connection: a database connection object.

    Side effects:
      - self.authors: A list of Author objects (ID and name only) associated
          with the article, in the order returned by the query. Empty if
          no authors were found.

    """
    self.authors = []
    author_sql = "SELECT authors.id, authors.name FROM article_authors as aa INNER JOIN authors ON authors.id=aa.author WHERE aa.article=%s ORDER BY aa.id;"
    for author_id, author_name in connection.read(author_sql, (self.id,)):
      self.authors.append(Author(author_id, author_name))

  def GetTraffic(self, connection):
    """Fetches the month-by-month traffic recorded for the paper.

    Arguments:
      - connection: a database connection object.

    Side effects:
      - self.traffic: A list of TrafficEntry objects, one per row found,
          ordered from the earliest month to the latest.

    """
    traffic_sql = "SELECT month, year, pdf, abstract FROM article_traffic WHERE article_traffic.article=%s ORDER BY year ASC, month ASC;"
    monthly = []
    for row in connection.read(traffic_sql, (self.id,)):
      monthly.append(TrafficEntry(row))
    self.traffic = monthly

class TrafficEntry(object):
  """Stores the bioRxiv traffic information for a single month."""
  def __init__(self, sql_entry):
    """Copies the first four fields of a database row onto the entry.

    Arguments:
      - sql_entry: An indexable row whose positions 0-3 hold the month,
          the year, the value stored as downloads and the value stored
          as views, in that order.

    """
    self.month, self.year, self.downloads, self.views = (
      sql_entry[0],
      sql_entry[1],
      sql_entry[2],
      sql_entry[3],
    )

class SearchResultArticle(Article):
  """An article as displayed on the main results page."""
  def __init__(self, sql_entry, connection):
    """Organizes all the known information about a single article.

    Arguments:
      - sql_entry: The results of the large query built up in the
          endpoints.paper_query() function. Positions 0-8 are read, in
          the order: metric value, id, url, title, abstract, collection,
          posted, doi, repo.
      - connection: A database Connection object, used to look up the
          article's authors.

    """
    # NOTE: the first column, stored as "downloads", can be "downloads"
    # OR "tweet count".
    columns = (
      "downloads", "id", "url", "title", "abstract",
      "collection", "posted", "doi", "repo",
    )
    for position, attribute in enumerate(columns):
      setattr(self, attribute, sql_entry[position])
    self.get_authors(connection)

    # Articles without a category are reported as "unknown".
    if self.collection is None:
      self.collection = "unknown"

  def json(self):
    """Returns the article as a dict ready for JSON serialization. The
    posting date is formatted as YYYY-MM-DD, or an empty string when
    there is none."""
    if self.posted is not None:
      first_posted = self.posted.strftime('%Y-%m-%d')
    else:
      first_posted = ""
    author_list = [x.json() for x in self.authors]
    return {
      "id": self.id,
      "metric": self.downloads,
      "title": self.title,
      "url": f"{config.host}/v1/papers/{self.id}",
      "biorxiv_url": self.url,
      "doi": self.doi,
      "category": self.collection,
      "first_posted": first_posted,
      "abstract": self.abstract,
      "authors": author_list,
      "repo": self.repo
    }

class SearchResultAuthor(object):
  """An author, as returned by the author rankings endpoint."""
  def __init__(self, id, name, rank, downloads, tie):
    """Stores the author's identity and wraps the ranking values in an
    AuthorRankEntry.

    Arguments:
      - id: The author's ID
      - name: The author's name
      - rank: The author's rank
      - downloads: The download count associated with the rank
      - tie: Whether the rank is shared with another author

    """
    self.id = id
    self.name = name
    self.rank = AuthorRankEntry(rank=rank, tie=tie, downloads=downloads)

  def json(self):
    """Returns a flat dict with the author's id and name alongside the
    rank, downloads and tie values of the stored rank entry."""
    entry = self.rank
    resp = {"id": self.id, "name": self.name}
    resp["rank"] = entry.rank
    resp["downloads"] = entry.downloads
    resp["tie"] = entry.tie
    return resp

class ArticleDetails(Article):
  """Article info as returned by the article details endpoint."""
  def __init__(self, article_id, connection):
    """Retrieves all required information for a single article: its own
    record, any publication record, its rankings, and its authors along
    with each author's basic info.

    Arguments:
      - article_id: The Rxivist ID of the article in question
      - connection: A database Connection object

    Raises:
      - helpers.NotFoundError: If no article has this ID.

    """
    sql = """
    SELECT a.url, a.title, a.collection, a.posted, a.doi,
      a.abstract, p.publication, p.doi, a.repo
      FROM articles a
      LEFT JOIN article_publications AS p ON a.id=p.article
      WHERE a.id=%s;
    """
    rows = connection.read(sql, (article_id,))
    if len(rows) < 1:
      raise helpers.NotFoundError(article_id)
    # Only the first row is used.
    row = rows[0]

    self.id = article_id
    self.url = row[0]
    self.title = row[1]
    # Articles without a category are reported as "unknown".
    self.collection = "unknown" if row[2] is None else row[2]
    self.posted = row[3]
    self.doi = row[4]
    self.abstract = row[5]
    self.publication = row[6]
    self.pub_doi = row[7]
    self.repo = row[8]

    self.ranks = ArticleRanks(self.id, connection)
    self.get_authors(connection)
    # get_authors() only fills in IDs and names; fetch the remaining
    # basic details for each author.
    for author in self.authors:
      author.GetBasicInfo(connection)

  def json(self):
    """Returns the article as a dict ready for JSON serialization. The
    posting date is formatted as YYYY-MM-DD (or an empty string when
    there is none), and "publication" is an empty dict when the article
    has no publication DOI."""
    if self.posted is None:
      first_posted = ""
    else:
      first_posted = self.posted.strftime('%Y-%m-%d')

    if self.pub_doi is None:
      publication = {}
    else:
      publication = {
        "journal": self.publication,
        "doi": self.pub_doi
      }

    return {
      "id": self.id,
      "doi": self.doi,
      "first_posted": first_posted,
      "repo": self.repo,
      "biorxiv_url": self.url,
      "url": f"{config.host}/v1/papers/{self.id}",
      "title": self.title,
      "category": self.collection,
      "abstract": self.abstract,
      "authors": [x.json() for x in self.authors],
      "ranks": self.ranks.json(),
      "publication": publication
    }

class AuthorArticle(Article):
  """Article info returned by the author details endpoint.
  Less data than ArticleDetails class.

  """
  def __init__(self, article_id, connection):
    """Retrieves the article's own record and its rankings.

    Arguments:
      - article_id: The Rxivist ID of the article in question
      - connection: A database Connection object

    Raises:
      - helpers.NotFoundError: If no article has this ID.

    """
    rows = connection.read(
      "SELECT url, title, collection, posted, doi FROM articles WHERE articles.id=%s",
      (article_id,)
    )
    if len(rows) < 1:
      raise helpers.NotFoundError(article_id)
    row = rows[0]

    self.id = article_id
    self.url = row[0]
    self.title = row[1]
    # Articles without a category are reported as "unknown".
    self.collection = "unknown" if row[2] is None else row[2]
    self.posted = row[3]
    self.doi = row[4]
    self.ranks = ArticleRanks(self.id, connection)

  def json(self):
    """Returns the article as a dict ready for JSON serialization; the
    posting date is stored on the object but not included."""
    api_url = f"{config.host}/v1/papers/{self.id}"
    resp = {"id": self.id, "doi": self.doi}
    resp["biorxiv_url"] = self.url
    resp["url"] = api_url
    resp["title"] = self.title
    resp["category"] = self.collection
    resp["ranks"] = self.ranks.json()
    return resp