diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..b2f5601
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,35 @@
+# Builds the package and uploads it to PyPI.
+name: Release Pypi package
+
+on:
+  # NOTE(review): GitHub does not fire `created` for draft releases;
+  # `published` is the usual choice for upload workflows - confirm intent.
+  release:
+    types: [created]
+  # Allows a manual run from the Actions tab.
+  workflow_dispatch:
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      # The v2 actions target a Node.js runtime GitHub has retired.
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build twine
+      - name: Build and publish
+        env:
+          # Twine reads its credentials from these variables.
+          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+        run: |
+          # `python -m build` replaces the deprecated
+          # `setup.py sdist bdist_wheel`; it writes sdist and wheel to dist/.
+          python -m build
+          twine upload --repository pypi dist/*
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..e234a5a
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,36 @@
+# Runs the tox test suite on pushes and pull requests targeting main.
+name: Test Pypi package
+
+on:
+  push:
+    branches:
+      - main
+    # README-only changes do not need a test run.
+    paths-ignore:
+      - README.md
+  pull_request:
+    branches:
+      - main
+    paths-ignore:
+      - README.md
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so the version is always read as a string, never a number.
+        python: ["3.10.11"]
+
+    steps:
+      # The v2 actions target a Node.js runtime GitHub has retired.
+      - uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Install Tox and any other packages
+        run: pip install tox
+      - name: Run Tox
+        # Run tox using the version of Python in `PATH`
+        run: tox -e py
diff --git a/Burrows Delta Walkthrough.ipynb b/Burrows Delta Walkthrough.ipynb
new file mode 100644
index 0000000..cc083e8
--- /dev/null
+++ b/Burrows Delta Walkthrough.ipynb
@@ -0,0 +1,579 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Walkthrough for using the Fast Stylometry model for Burrows' Delta\n",
+ "\n",
+ "By [Thomas Wood](https://freelancedatascientist.net), [Fast Data Science](https://fastdatascience.com)\n",
+ "\n",
+ "Burrows' Delta is an algorithm for comparing the similarity of the writing styles of documents, known as [forensic stylometry](https://fastdatascience.com/how-you-can-identify-the-author-of-a-document/).\n",
+ "\n",
+ "* [A useful explanation of the maths and thinking behind Burrows' Delta and how it works](https://programminghistorian.org/en/lessons/introduction-to-stylometry-with-python#third-stylometric-test-john-burrows-delta-method-advanced)\n",
+ "\n",
+ "Demonstration of Burrows' Delta on a small corpus downloaded from Project Gutenberg.\n",
+ "\n",
+ "We will test the Burrows' Delta code on two \"unknown\" texts: Sense and Sensibility by Jane Austen, and Villette by Charlotte Bronte. Both authors are in our training corpus.\n",
+ "\n",
+ "This notebook demonstrates how to use the library to calculate the Burrows' Delta value of six candidate authors, and also to calculate the probability that each one is the author of the mystery text."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from faststylometry import Corpus\n",
+ "\n",
+ "from faststylometry import load_corpus_from_folder\n",
+ "from faststylometry import tokenise_remove_pronouns_en\n",
+ "from faststylometry import calculate_burrows_delta\n",
+ "from faststylometry import predict_proba, calibrate, get_calibration_curve"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_corpus = load_corpus_from_folder(\"faststylometry/data/train\")\n",
+ "\n",
+ "train_corpus.tokenise(tokenise_remove_pronouns_en)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Load two books by \"unknown\" authors to test the algorithm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load Sense and Sensibility, written by Jane Austen (marked as \"janedoe\")\n",
+ "# and Villette, written by Charlotte Bronte (marked as \"currerbell\", Bronte's real pseudonym)\n",
+ "\n",
+ "test_corpus = load_corpus_from_folder(\"faststylometry/data/test\", pattern=None)\n",
+ "# You can set pattern to a string value to just load a subset of the corpus.\n",
+ "\n",
+ "test_corpus.tokenise(tokenise_remove_pronouns_en)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Calculate Burrows' Delta for both unknown texts against each candidate author"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " currerbell - villette | \n",
+ " janedoe - sense_and_sensibility | \n",
+ "
\n",
+ " \n",
+ " author | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " austen | \n",
+ " 0.997936 | \n",
+ " 0.444582 | \n",
+ "
\n",
+ " \n",
+ " bronte | \n",
+ " 0.521358 | \n",
+ " 0.933160 | \n",
+ "
\n",
+ " \n",
+ " carroll | \n",
+ " 1.116466 | \n",
+ " 1.433247 | \n",
+ "
\n",
+ " \n",
+ " conan_doyle | \n",
+ " 0.867025 | \n",
+ " 1.094766 | \n",
+ "
\n",
+ " \n",
+ " dickens | \n",
+ " 0.800223 | \n",
+ " 1.050542 | \n",
+ "
\n",
+ " \n",
+ " swift | \n",
+ " 1.480868 | \n",
+ " 1.565499 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " currerbell - villette janedoe - sense_and_sensibility\n",
+ "author \n",
+ "austen 0.997936 0.444582\n",
+ "bronte 0.521358 0.933160\n",
+ "carroll 1.116466 1.433247\n",
+ "conan_doyle 0.867025 1.094766\n",
+ "dickens 0.800223 1.050542\n",
+ "swift 1.480868 1.565499"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "calculate_burrows_delta(train_corpus, test_corpus, vocab_size = 50)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Calibrate the model and calculate the probability of each candidate in the training set being the author"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "calibrate(train_corpus)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " currerbell - villette | \n",
+ " janedoe - sense_and_sensibility | \n",
+ "
\n",
+ " \n",
+ " author | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " austen | \n",
+ " 0.324233 | \n",
+ " 0.808401 | \n",
+ "
\n",
+ " \n",
+ " bronte | \n",
+ " 0.757315 | \n",
+ " 0.382278 | \n",
+ "
\n",
+ " \n",
+ " carroll | \n",
+ " 0.231463 | \n",
+ " 0.079831 | \n",
+ "
\n",
+ " \n",
+ " conan_doyle | \n",
+ " 0.445207 | \n",
+ " 0.246974 | \n",
+ "
\n",
+ " \n",
+ " dickens | \n",
+ " 0.510598 | \n",
+ " 0.280685 | \n",
+ "
\n",
+ " \n",
+ " swift | \n",
+ " 0.067123 | \n",
+ " 0.049068 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " currerbell - villette janedoe - sense_and_sensibility\n",
+ "author \n",
+ "austen 0.324233 0.808401\n",
+ "bronte 0.757315 0.382278\n",
+ "carroll 0.231463 0.079831\n",
+ "conan_doyle 0.445207 0.246974\n",
+ "dickens 0.510598 0.280685\n",
+ "swift 0.067123 0.049068"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "predict_proba(train_corpus, test_corpus)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Plot the calibration curve\n",
+ "\n",
+ "We have used Scikit Learn's Logistic Regression to calculate the calibration curve of the model. You could also use your own calibration curve."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "x_values = np.arange(0, 3, 0.1)\n",
+ "\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "''"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "