diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2324b64..c5793174 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed a bug in access metrics function and callbacks that use it
 - Fixed bug where schedulers were called before optimisers with newer versions of pytorch
 - Fixed a bug where the csv logger closed the file too early
+- Fixed compat with pytorch > 1.1.0 versioning
+- Fixed typos in doc strings
+- Fixes for tests where pytorch >2 Tensors were causing issues with mocks
 
 ## [0.5.3] - 2020-01-31
 ### Added
diff --git a/requirements.txt b/requirements.txt
index a86d2708..08962a4d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ mock
 Pillow
 matplotlib
 torchvision
-pycm;python_version>="3.5"
\ No newline at end of file
+pycm;python_version>="3.5"
+packaging~=23.1
\ No newline at end of file
diff --git a/tests/callbacks/test_cutout.py b/tests/callbacks/test_cutout.py
index 0b8ba6ac..2d46c590 100644
--- a/tests/callbacks/test_cutout.py
+++ b/tests/callbacks/test_cutout.py
@@ -115,7 +115,7 @@ def test_cutmix_targets(self):
         self.assertTrue(((state[torchbearer.TARGET] - target).abs() < 0.00001).all())
 
     def test_target(self):
-        mixup = CutMix(-0.1, classes=2, mixup_loss=True)
+        mixup = CutMix(0.1, classes=2, mixup_loss=True)
 
         X = torch.rand(2, 3, 100, 100)
         Y_true = torch.Tensor([0., 1.])
diff --git a/tests/callbacks/test_torch_scheduler.py b/tests/callbacks/test_torch_scheduler.py
index f7d552d9..2ce70eaa 100644
--- a/tests/callbacks/test_torch_scheduler.py
+++ b/tests/callbacks/test_torch_scheduler.py
@@ -3,6 +3,7 @@ import warnings
 
 import torchbearer
+from torchbearer.bases import _pytorch_version_lt
 from torchbearer.callbacks import TorchScheduler, LambdaLR, StepLR, MultiStepLR, ExponentialLR, CosineAnnealingLR,\
     ReduceLROnPlateau, CyclicLR
 
 
@@ -383,10 +384,8 @@ def test_lambda_lr(self, lr_mock):
 
 
 class TestCyclicLR(TestCase):
    def test_lambda_lr(self):
-        from distutils.version import LooseVersion
         import torch
-        version = torch.__version__ if str(torch.__version__) is torch.__version__ else "0.4.0"
-        if LooseVersion(version) > LooseVersion("1.0.0"):  # CyclicLR is implemented
+        if not _pytorch_version_lt("1.0.0"):  # CyclicLR is implemented
             with patch('torch.optim.lr_scheduler.CyclicLR') as lr_mock:
                 state = {torchbearer.OPTIMIZER: 'optimizer', torchbearer.EPOCH: 0, torchbearer.MODEL: Mock()}
diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py
index 87fe15aa..5e5f7060 100644
--- a/tests/test_end_to_end.py
+++ b/tests/test_end_to_end.py
@@ -60,9 +60,9 @@ def test_basic_opt(self):
 
     def test_callbacks(self):
         from torch.utils.data import TensorDataset
-        traingen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))
-        valgen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))
-        testgen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))
+        traingen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1, 1))
+        valgen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1, 1))
+        testgen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1, 1))
 
         model = torch.nn.Linear(3, 1)
         optim = torch.optim.SGD(model.parameters(), lr=0.01)
diff --git a/tests/test_trial.py b/tests/test_trial.py
index 4951302e..75671987 100644
--- a/tests/test_trial.py
+++ b/tests/test_trial.py
@@ -2750,8 +2750,8 @@ def test_func(self_inner):
         self.assertTrue(c_inj.call_args[0][0] == test_callback)
 
     def test_deep_to_tensor(self):
-        base_tensor = torch.Tensor([1])
-        tensor = MagicMock(spec=base_tensor)
+        tensor = torch.Tensor([1])
+        tensor.to = Mock()
 
         new_dtype = torch.float16
         new_device = 'cuda:1'
@@ -2760,9 +2760,8 @@ def test_deep_to_tensor(self):
         self.assertTrue(tensor.to.call_args[0][1] == new_dtype)
 
     def test_deep_to_tensor_int_dtype(self):
-        base_tensor = torch.Tensor([1])
-        tensor = MagicMock(spec=base_tensor)
-        tensor.dtype = torch.uint8
+        tensor = torch.tensor([1], dtype=torch.uint8)
+        tensor.to = Mock()
 
         new_device = 'cuda:1'
         new_dtype = torch.uint8
@@ -2771,9 +2770,10 @@ def test_deep_to_tensor_int_dtype(self):
         self.assertTrue(len(tensor.to.call_args[0]) == 1)
 
     def test_deep_to_list(self):
-        base_tensor = torch.Tensor([1])
-        tensor_1 = MagicMock(spec=base_tensor)
-        tensor_2 = MagicMock(spec=base_tensor)
+        tensor_1 = torch.Tensor([1])
+        tensor_1.to = Mock()
+        tensor_2 = torch.Tensor([1])
+        tensor_2.to = Mock()
         tensors = [tensor_1, tensor_2]
         new_dtype = torch.float16
         new_device = 'cuda:1'
diff --git a/torchbearer/bases.py b/torchbearer/bases.py
index d52c042d..7e70c99d 100644
--- a/torchbearer/bases.py
+++ b/torchbearer/bases.py
@@ -1,12 +1,14 @@
-from distutils.version import LooseVersion
 import functools
 import traceback
 import warnings
 
 import torch
+from packaging import version
+
 import torchbearer
 
 import sys
+
 if sys.version_info[0] < 3:
     def set_doc(inner, doc):
         return None  # Not simple to do in Python 2.7 so we can leave it for now, just build docs with Python 3+
@@ -15,14 +17,21 @@ def set_doc(inner, doc):
         inner.__doc__ = doc
 
 
+def _pytorch_version_lt(version_string):
+    ver = torch.__version__ if 'TorchVersion' in str(type(torch.__version__)) or str(
+        torch.__version__) is torch.__version__ else "0.4.0"
+
+    return version.parse(ver) < version.parse(version_string)
+
+
 class no_grad(torch.no_grad):
     """ Context-manager and decorator that disables gradient calculation. See `torch.autograd.no_grad `_
     """
+
     def __init__(self):
         super(no_grad, self).__init__()
-        version = torch.__version__ if str(torch.__version__) is torch.__version__ else "0.4.1"
-        if LooseVersion(version) < LooseVersion("0.4.1"):  # No grad is not a decorator
+        if _pytorch_version_lt("0.4.1"):  # No grad is not a decorator
             _patch_call(self, self.call)
 
     def call(self, func):
@@ -38,6 +47,7 @@ def _patch_call(instance, func):
     class _(type(instance)):
         def __call__(self, *arg, **kwarg):
             return func(*arg, **kwarg)
+
     instance.__class__ = _
 
 
@@ -45,10 +55,10 @@ class enable_grad(torch.enable_grad):
     """ Context-manager and decorator that enables gradient calculation. See `torch.autograd.enable_grad `_
     """
+
    def __init__(self):
         super(enable_grad, self).__init__()
-        version = torch.__version__ if str(torch.__version__) is torch.__version__ else "0.4.1"
-        if LooseVersion(version) < LooseVersion("0.4.1"):  # Enable grad is not a decorator
+        if _pytorch_version_lt("0.4.1"):  # Enable grad is not a decorator
             _patch_call(self, self.call)
 
     def call(self, func):
@@ -361,6 +371,7 @@ def base_closure(x, model, y_pred, y_true, crit, loss, opt):
     Returns:
         function: Standard closure function
     """
+
     def closure(state):
         # Zero grads
         state[opt].zero_grad()
@@ -382,11 +393,12 @@ def closure(state):
         state[loss].backward(**state[torchbearer.BACKWARD_ARGS])
 
         state[torchbearer.CALLBACK_LIST].on_backward(state)
+
     return closure
 
 
 standard_closure = lambda: base_closure(torchbearer.X, torchbearer.MODEL, torchbearer.Y_PRED, torchbearer.Y_TRUE,
-                                torchbearer.CRITERION, torchbearer.LOSS, torchbearer.OPTIMIZER)
+                                        torchbearer.CRITERION, torchbearer.LOSS, torchbearer.OPTIMIZER)
 
 
 def apex_closure():
@@ -404,7 +416,8 @@ def _apex_closure(state):
         try:
             state[torchbearer.LOSS] = state[torchbearer.CRITERION](state)
         except TypeError:
-            loss_function_params = _get_param_list(state[torchbearer.Y_PRED]) + _get_param_list(state[torchbearer.Y_TRUE])
+            loss_function_params = _get_param_list(state[torchbearer.Y_PRED]) + _get_param_list(
+                state[torchbearer.Y_TRUE])
             state[torchbearer.LOSS] = state[torchbearer.CRITERION](*loss_function_params)
 
         state[torchbearer.CALLBACK_LIST].on_criterion(state)
@@ -414,6 +427,7 @@ def _apex_closure(state):
             scaled_loss.backward(**state[torchbearer.BACKWARD_ARGS])
 
         state[torchbearer.CALLBACK_LIST].on_backward(state)
+
     return _apex_closure
 
 
@@ -427,6 +441,7 @@ def cite(bibtex):
     Returns:
         The decorator
     """
+
     def decorator(inner):
         doc = inner.__doc__.split('\n')
         i = 0
@@ -448,6 +463,7 @@ def decorator(clazz):
         doc.insert(i, to_insert)
         set_doc(inner, '\n'.join(doc))
         return inner
+
     return decorator
 
 
diff --git a/torchbearer/callbacks/gradient_clipping.py b/torchbearer/callbacks/gradient_clipping.py
index bc402a86..b5f6fcb1 100644
--- a/torchbearer/callbacks/gradient_clipping.py
+++ b/torchbearer/callbacks/gradient_clipping.py
@@ -6,7 +6,7 @@ class GradientNormClipping(Callback):
-    """GradientNormClipping callback, which uses 'torch.nn.utils.clip_grad_norm\_' to clip the gradient norms to the
+    """GradientNormClipping callback, which uses 'torch.nn.utils.clip_grad_norm_' to clip the gradient norms to the
     given value. If params is None they will be retrieved from state.
 
     Example: ::
@@ -57,7 +57,7 @@ def on_backward(self, state):
 
 
 class GradientClipping(Callback):
-    """GradientClipping callback, which uses 'torch.nn.utils.clip_grad_value\_' to clip the gradients of the given
+    """GradientClipping callback, which uses 'torch.nn.utils.clip_grad_value_' to clip the gradients of the given
     parameters to the given value. If params is None they will be retrieved from state.
 
     Example: ::
diff --git a/torchbearer/callbacks/manifold_mixup.py b/torchbearer/callbacks/manifold_mixup.py
index 764935f9..2a00d133 100644
--- a/torchbearer/callbacks/manifold_mixup.py
+++ b/torchbearer/callbacks/manifold_mixup.py
@@ -35,7 +35,8 @@ def __init__(self, alpha=1.0, lam=RANDOM):
         self._mixup_layers = None
         self.alpha = alpha
         self.lam = lam
-        self.distrib = Beta(self.alpha, self.alpha)
+        if alpha > 0:
+            self.distrib = Beta(self.alpha, self.alpha)
         self.layer_names = []
         self.depth = 0
         self._layer_filter = []
diff --git a/torchbearer/callbacks/mixup.py b/torchbearer/callbacks/mixup.py
index 883fbc25..af8c1847 100644
--- a/torchbearer/callbacks/mixup.py
+++ b/torchbearer/callbacks/mixup.py
@@ -78,7 +78,8 @@ def __init__(self, alpha=1.0, lam=RANDOM):
         super(Mixup, self).__init__()
         self.alpha = alpha
         self.lam = lam
-        self.distrib = Beta(self.alpha, self.alpha)
+        if alpha > 0:
+            self.distrib = Beta(self.alpha, self.alpha)
 
     @staticmethod
     def mixup_loss(state):
diff --git a/torchbearer/callbacks/tensor_board.py b/torchbearer/callbacks/tensor_board.py
index 173b043a..fa70ad35 100644
--- a/torchbearer/callbacks/tensor_board.py
+++ b/torchbearer/callbacks/tensor_board.py
@@ -96,7 +96,7 @@ def close_writer(log_dir, logger):
     if log_dir in __writers__:
         __writers__[log_dir]['references'].discard(logger)
 
-        if len(__writers__[log_dir]['references']) is 0:
+        if len(__writers__[log_dir]['references']) == 0:
             if 'writer' in __writers__[log_dir]:
                 __writers__[log_dir]['writer'].close()
diff --git a/torchbearer/callbacks/torch_scheduler.py b/torchbearer/callbacks/torch_scheduler.py
index f1b7cb5a..2ed4caf6 100644
--- a/torchbearer/callbacks/torch_scheduler.py
+++ b/torchbearer/callbacks/torch_scheduler.py
@@ -1,12 +1,12 @@
 import functools
-
-import torchbearer
-from torchbearer.callbacks import Callback
-from torchbearer.bases import get_metric
+import warnings
 
 import torch
+from packaging import version
 
-import warnings
+import torchbearer
+from torchbearer.bases import get_metric, _pytorch_version_lt
+from torchbearer.callbacks import Callback
 
 
 class TorchScheduler(Callback):
@@ -16,9 +16,7 @@ def __init__(self, scheduler_builder, monitor=None, step_on_batch=False):
         self._scheduler = None
         self._step_on_batch = step_on_batch
 
-        from distutils.version import LooseVersion
-        version = torch.__version__ if str(torch.__version__) is torch.__version__ else "0.4.0"
-        self._newstyle = LooseVersion(version) > LooseVersion("1.1.0")
+        self._newstyle = not _pytorch_version_lt("1.1.0")
 
     def _step(self, state, current=None):
         if state[torchbearer.MODEL].training is False:
@@ -245,9 +243,7 @@ class CyclicLR(TorchScheduler):
     def __init__(self, base_lr, max_lr, monitor='val_loss', step_size_up=2000, step_size_down=None, mode='triangular',
                  gamma=1., scale_fn=None, scale_mode='cycle', cycle_momentum=True, base_momentum=0.8,
                  max_momentum=0.9, step_on_batch=False):
-        from distutils.version import LooseVersion
-        version = torch.__version__ if str(torch.__version__) is torch.__version__ else "0.4.0"
-        if LooseVersion(version) > LooseVersion("1.0.0"):  # CyclicLR is implemented
+        if not _pytorch_version_lt("1.0.0"):  # CyclicLR is implemented
             super(CyclicLR, self).__init__(functools.partial(torch.optim.lr_scheduler.CyclicLR,
                                                              base_lr=base_lr, max_lr=max_lr,
                                                              step_size_up=step_size_up, step_size_down=step_size_down,
                                                              mode=mode, gamma=gamma,
diff --git a/torchbearer/metrics/decorators.py b/torchbearer/metrics/decorators.py
index 2eb1f412..e23b7380 100644
--- a/torchbearer/metrics/decorators.py
+++ b/torchbearer/metrics/decorators.py
@@ -248,7 +248,7 @@ def decorator(clazz):
 def running_mean(clazz=None, batch_size=50, step_size=10, dim=None):
     """The :func:`running_mean` decorator is used to add a :class:`.RunningMean` to the :class:`.MetricTree`. If the
     inner class is not a :class:`.MetricTree` then one will be created. The :class:`.RunningMean` will be wrapped in a
-    :class:`.ToDict` (with 'running\_' prepended to the name) for simplicity.
+    :class:`.ToDict` (with 'running_' prepended to the name) for simplicity.
 
     .. note::
        The decorator function does not need to be called if not desired, both: `@running_mean` and `@running_mean()`
diff --git a/torchbearer/metrics/wrappers.py b/torchbearer/metrics/wrappers.py
index dbfb1b57..351c794b 100644
--- a/torchbearer/metrics/wrappers.py
+++ b/torchbearer/metrics/wrappers.py
@@ -12,7 +12,7 @@ class ToDict(AdvancedMetric):
     """The :class:`ToDict` class is an :class:`.AdvancedMetric` which will put output from the inner :class:`.Metric` in
-    a dict (mapping metric name to value) before returning. When in `eval` mode, 'val\_' will be prepended to the metric
+    a dict (mapping metric name to value) before returning. When in `eval` mode, 'val_' will be prepended to the metric
     name.
 
     Example: ::