From d54d2e2ee79730fe0d26e0cd2cfc0c4073724c3f Mon Sep 17 00:00:00 2001 From: teddygroves Date: Mon, 18 Mar 2024 18:00:14 +0100 Subject: [PATCH] Fix some wording in week3.qmd --- docs/metropolis-hastings.html | 44 ++++++------- docs/search.json | 2 +- docs/week3.html | 121 +++++++++++++++++----------------- materials/week3.qmd | 19 +++--- 4 files changed, 95 insertions(+), 91 deletions(-) diff --git a/docs/metropolis-hastings.html b/docs/metropolis-hastings.html index 84a7582..c685a98 100644 --- a/docs/metropolis-hastings.html +++ b/docs/metropolis-hastings.html @@ -5006,7 +5006,7 @@

Markov Chains

For example:

The transitions can be measured as discrete time steps with the following matrix representation

-
+
import numpy as np
 T = np.matrix([[0.1, 0.1, 0.8], [0.5, 0.1, 0.4], [0.5, 0.2, 0.3]])
 print(T)
@@ -5017,7 +5017,7 @@

Markov Chains

Given an initial starting position

-
+
v0 = np.matrix([0.1, 0.4, 0.5])
 print(v0)
@@ -5025,7 +5025,7 @@

Markov Chains

We can simulate the probabilities of the next step given the transition matrix.

-
+
v1 = v0*T
 print(v1)
@@ -5033,7 +5033,7 @@

Markov Chains

Following this again we can simulate the states after two steps

-
+
v2 = v1*T
 print(v2)
@@ -5041,21 +5041,21 @@

Markov Chains

There’s a convenient way to calculate the next step given the starting condition.

-
+
print(v0*T**2)
[[0.316 0.139 0.545]]

What happens if we continue doing this for a long time?

-
+
print(v0*T**100)
[[0.35714286 0.14935065 0.49350649]]

And how does this change when taking the next step?

-
+
print(v0*T**101)
[[0.35714286 0.14935065 0.49350649]]
@@ -5120,7 +5120,7 @@

Define probability density function

-
+
import numpy as np
 from scipy.stats import norm
 
@@ -5130,21 +5130,21 @@ 

Define

Define proposal distribution

-
+
def proposal(x):
   return norm.rvs(x, 1, 1)[0]

Initialise sampler

-
+
current = 0.0
 samples = [current]

Sample from distribution

-
+
for i in range(10000):
     prop = proposal(current)
     accept_reject = prob(prop)/prob(current)
@@ -5160,21 +5160,21 @@ 

Sample from distr

Plot distribution

-
+
import matplotlib.pyplot as plt
 plt.hist(samples)
-
(array([   6.,   45.,  377., 1127., 1581., 1315.,  503.,  105.,    9.,
-           2.]),
- array([0.        , 0.43380605, 0.8676121 , 1.30141815, 1.7352242 ,
-        2.16903025, 2.6028363 , 3.03664235, 3.4704484 , 3.90425445,
-        4.3380605 ]),
+
(array([   4.,    7.,  110.,  545., 1309., 1525., 1033.,  329.,   55.,
+           4.]),
+ array([-0.34331577,  0.09770966,  0.5387351 ,  0.97976053,  1.42078597,
+         1.8618114 ,  2.30283684,  2.74386228,  3.18488771,  3.62591315,
+         4.06693858]),
  <BarContainer object of 10 artists>)
-

+

@@ -5182,13 +5182,13 @@

Plot distribution

Trace plot

-
+
draw = [draw for draw, _ in enumerate(samples)]
 plt.plot(draw, samples)
-

+

@@ -5198,12 +5198,12 @@

Trace plot

Part 2: determining mean and standard deviation from data

I suggest using logs due to numerical issues. Here’s an example function which you can use to evaluate the probability of the data.

-
+
def eval_prob(data, mu, sigma):
     return np.log(norm.pdf(data,mu,sigma)).sum()

Here’s also a multivariate random number generator to generate proposals.

-
+
def proposal_multi(mu, sigma):
     mean = [mu, sigma]
     cov = [[0.2, 0], [0, 0.2]]  # diagonal covariance
diff --git a/docs/search.json b/docs/search.json
index 54825a7..3b9536b 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -367,7 +367,7 @@
     "href": "metropolis-hastings.html#coding-the-metropolis-hastings-in-practice",
     "title": "Metropolis Hastings",
     "section": "Coding the Metropolis Hastings in practice",
-    "text": "Coding the Metropolis Hastings in practice\n\nPart 1: sampling from a normal distribution\nGiven a \\(p(x) = N(2, 0.5)\\) how would draw samples using the Metropolis-Hastings algorithm?\n\nChoose proposal value\nEvaluate probability ratio\nAccept-Reject\nincrement step\n\n\nDefine probability density function\n\nimport numpy as np\nfrom scipy.stats import norm\n\ndef prob(x):\n  return norm.pdf(x, 2, 0.5)\n\n\n\nDefine proposal distribution\n\ndef proposal(x):\n  return norm.rvs(x, 1, 1)[0]\n\n\n\nInitialise sampler\n\ncurrent = 0.0\nsamples = [current]\n\n\n\nSample from distribution\n\nfor i in range(10000):\n    prop = proposal(current)\n    accept_reject = prob(prop)/prob(current)\n    if accept_reject > 1:\n        samples.append(prop)\n        current = prop\n    else:\n        cutoff = np.random.rand(1)[0]\n        if accept_reject > cutoff:\n            samples.append(prop)\n            current = prop\n\n\n\nPlot distribution\n\nimport matplotlib.pyplot as plt\nplt.hist(samples)\n\n(array([   6.,   45.,  377., 1127., 1581., 1315.,  503.,  105.,    9.,\n           2.]),\n array([0.        , 0.43380605, 0.8676121 , 1.30141815, 1.7352242 ,\n        2.16903025, 2.6028363 , 3.03664235, 3.4704484 , 3.90425445,\n        4.3380605 ]),\n <BarContainer object of 10 artists>)\n\n\n\n\n\n\n\n\n\n\n\nTrace plot\n\ndraw = [draw for draw, _ in enumerate(samples)]\nplt.plot(draw, samples)\n\n\n\n\n\n\n\n\n\n\n\nPart 2: determining mean and standard deviation from data\nI suggest using logs due to numerical issues. Here’s an example function which you can use to evaluate the probability of the data.\n\ndef eval_prob(data, mu, sigma):\n    return np.log(norm.pdf(data,mu,sigma)).sum()\n\nHere’s also a multivariate random number generator to generate proposals.\n\ndef proposal_multi(mu, sigma):\n    mean = [mu, sigma]\n    cov = [[0.2, 0], [0, 0.2]]  # diagonal covariance\n    return np.random.multivariate_normal(mean, cov, 1)[0]\n\nHere is how you’d call the proposal function\nprop_mu, prop_sigma = proposal_multi(current_mu, current_sigma)\nthe accept_reject probability should also be updated to account for the log-probability\naccept_reject = np.exp(prob_prop - prob_current)\nYou should sample a 95% interval including a \\(\\mu = 5\\) and a \\(\\sigma = 0.2\\). This may be difficult at first to sample and I would recommend initialising at these values.",
+    "text": "Coding the Metropolis Hastings in practice\n\nPart 1: sampling from a normal distribution\nGiven a \\(p(x) = N(2, 0.5)\\) how would draw samples using the Metropolis-Hastings algorithm?\n\nChoose proposal value\nEvaluate probability ratio\nAccept-Reject\nincrement step\n\n\nDefine probability density function\n\nimport numpy as np\nfrom scipy.stats import norm\n\ndef prob(x):\n  return norm.pdf(x, 2, 0.5)\n\n\n\nDefine proposal distribution\n\ndef proposal(x):\n  return norm.rvs(x, 1, 1)[0]\n\n\n\nInitialise sampler\n\ncurrent = 0.0\nsamples = [current]\n\n\n\nSample from distribution\n\nfor i in range(10000):\n    prop = proposal(current)\n    accept_reject = prob(prop)/prob(current)\n    if accept_reject > 1:\n        samples.append(prop)\n        current = prop\n    else:\n        cutoff = np.random.rand(1)[0]\n        if accept_reject > cutoff:\n            samples.append(prop)\n            current = prop\n\n\n\nPlot distribution\n\nimport matplotlib.pyplot as plt\nplt.hist(samples)\n\n(array([   4.,    7.,  110.,  545., 1309., 1525., 1033.,  329.,   55.,\n           4.]),\n array([-0.34331577,  0.09770966,  0.5387351 ,  0.97976053,  1.42078597,\n         1.8618114 ,  2.30283684,  2.74386228,  3.18488771,  3.62591315,\n         4.06693858]),\n <BarContainer object of 10 artists>)\n\n\n\n\n\n\n\n\n\n\n\nTrace plot\n\ndraw = [draw for draw, _ in enumerate(samples)]\nplt.plot(draw, samples)\n\n\n\n\n\n\n\n\n\n\n\nPart 2: determining mean and standard deviation from data\nI suggest using logs due to numerical issues. Here’s an example function which you can use to evaluate the probability of the data.\n\ndef eval_prob(data, mu, sigma):\n    return np.log(norm.pdf(data,mu,sigma)).sum()\n\nHere’s also a multivariate random number generator to generate proposals.\n\ndef proposal_multi(mu, sigma):\n    mean = [mu, sigma]\n    cov = [[0.2, 0], [0, 0.2]]  # diagonal covariance\n    return np.random.multivariate_normal(mean, cov, 1)[0]\n\nHere is how you’d call the proposal function\nprop_mu, prop_sigma = proposal_multi(current_mu, current_sigma)\nthe accept_reject probability should also be updated to account for the log-probability\naccept_reject = np.exp(prob_prop - prob_current)\nYou should sample a 95% interval including a \\(\\mu = 5\\) and a \\(\\sigma = 0.2\\). This may be difficult at first to sample and I would recommend initialising at these values.",
     "crumbs": [
       "Course materials",
       "Metropolis Hastings"
diff --git a/docs/week3.html b/docs/week3.html
index 508b52c..b10820e 100644
--- a/docs/week3.html
+++ b/docs/week3.html
@@ -5147,7 +5147,7 @@ 

Example

We’ll go through some diagnostics using arviz.

Step one is to load some data. Rather than going through a whole modelling workflow, we’ll just take one of the example MCMC outputs that arviz provides via the function load_arviz_data.

This particular MCMC output has to do with measurements of soil radioactivity in the USA. You can read more about it here.

-
+
import arviz as az
 import numpy as np  # numpy is for numerical programming
 import xarray as xr  # xarray is for manipulating n dimensional tables; arviz uses it a lot! 
@@ -5163,8 +5163,8 @@ 

Example

  • - - + +
      @@ -5491,7 +5491,7 @@

      Example

      inference_library: pymc3 inference_library_version: 3.9.2 sampling_time: 18.096983432769775 - tuning_steps: 1000
  • created_at :
    2020-07-24T18:15:12.191355
    arviz_version :
    0.9.0
    inference_library :
    pymc3
    inference_library_version :
    3.9.2
    sampling_time :
    18.096983432769775
    tuning_steps :
    1000

  • - - + +
      @@ -5848,20 +5848,20 @@

      Example

      created_at: 2020-07-24T18:15:12.449843 arviz_version: 0.9.0 inference_library: pymc3 - inference_library_version: 3.9.2
  • created_at :
    2020-07-24T18:15:12.449843
    arviz_version :
    0.9.0
    inference_library :
    pymc3
    inference_library_version :
    3.9.2

  • - - + +
      @@ -6179,20 +6179,20 @@

      Example

      created_at: 2020-07-24T18:15:12.448264 arviz_version: 0.9.0 inference_library: pymc3 - inference_library_version: 3.9.2
  • created_at :
    2020-07-24T18:15:12.448264
    arviz_version :
    0.9.0
    inference_library :
    pymc3
    inference_library_version :
    3.9.2

  • - - + +
      @@ -6520,17 +6520,17 @@

      Example

      inference_library: pymc3 inference_library_version: 3.9.2 sampling_time: 18.096983432769775 - tuning_steps: 1000
  • created_at :
    2020-07-24T18:15:12.197697
    arviz_version :
    0.9.0
    inference_library :
    pymc3
    inference_library_version :
    3.9.2
    sampling_time :
    18.096983432769775
    tuning_steps :
    1000

  • - - + +
      @@ -6857,7 +6857,7 @@

      Example

      created_at: 2020-07-24T18:15:12.454586 arviz_version: 0.9.0 inference_library: pymc3 - inference_library_version: 3.9.2
    • g_coef
      PandasIndex
      PandasIndex(Index(['intercept', 'slope'], dtype='object', name='g_coef'))
  • created_at :
    2020-07-24T18:15:12.454586
    arviz_version :
    0.9.0
    inference_library :
    pymc3
    inference_library_version :
    3.9.2

  • - - + +
      @@ -7214,20 +7214,20 @@

      Example

      created_at: 2020-07-24T18:15:12.457652 arviz_version: 0.9.0 inference_library: pymc3 - inference_library_version: 3.9.2
  • created_at :
    2020-07-24T18:15:12.457652
    arviz_version :
    0.9.0
    inference_library :
    pymc3
    inference_library_version :
    3.9.2

  • - - + +
      @@ -7543,17 +7543,17 @@

      Example

      created_at: 2020-07-24T18:15:12.458415 arviz_version: 0.9.0 inference_library: pymc3 - inference_library_version: 3.9.2
  • created_at :
    2020-07-24T18:15:12.458415
    arviz_version :
    0.9.0
    inference_library :
    pymc3
    inference_library_version :
    3.9.2

  • - - + +
      @@ -7872,7 +7872,7 @@

      Example

      created_at: 2020-07-24T18:15:12.459832 arviz_version: 0.9.0 inference_library: pymc3 - inference_library_version: 3.9.2
  • created_at :
    2020-07-24T18:15:12.459832
    arviz_version :
    0.9.0
    inference_library :
    pymc3
    inference_library_version :
    3.9.2

  • @@ -8197,7 +8197,7 @@

    Example

    For example, if you click on the dropdown arrows above you will see that the group posterior contains a variable called a_county that has three dimensions called chain, draw and County. There are 85 counties and the first one is labelled 'AITKIN'.

    The function az.summary lets us look at some useful summary statistics, including \(\hat{R}\), divergent transitions and MCSE.

    The variable lp, which you can find in the group sample_stats is the model’s total log probability density. It’s not very meaningful on its own, but is useful for judging overall convergence. diverging counts the number of divergent transitions.

    -
    +
    az.summary(idata.sample_stats, var_names=["lp", "diverging"])
    /Users/tedgro/repos/biosustain/bayesian_statistics_for_computational_biology/.venv/lib/python3.12/site-packages/arviz/stats/diagnostics.py:592: RuntimeWarning: invalid value encountered in scalar divide
    @@ -8255,8 +8255,9 @@ 

    Example

    +

    In this case there were no post-warmup diverging transitions, and the \(\hat{R}\) statistic for the lp variable is pretty close to 1: great!

    Sometimes it’s useful to summarise individual parameters. This can be done by pointing az.summary at the group where the parameters of interest live. In this case the group is called posterior.

    -
    +
    az.summary(idata.posterior, var_names=["sigma", "g"])
    @@ -8322,9 +8323,8 @@

    Example

    -

    In this case there were no post-warmup diverging transitions, and the \(\hat{R}\) statistic for the lp variable is pretty close to 1: great!

    Now we can start evaluating the model. First we check to see whether replicated measurements from the model’s posterior predictive distribution broadly agree with the observed measurements, using the arviz function plot_lm:

    -
    +
    az.style.use("arviz-doc")
     az.plot_lm(
         y=idata.observed_data["y"],
    @@ -8336,13 +8336,14 @@ 

    Example

    -

    +

    -

    The function az.loo performs approximate leave-one-out cross validation, which can be useful for evaluating how well the model might make predictions. Watch out for the warning column, which can tell you if the approximation is likely to be incorrect. It’s usually a good idea to set the pointwise argument to True, as this allows for more detailed analysis at the per-observation level.

    -
    +

    The function az.loo can quickly estimate a model’s out of sample log likelihood (which we saw above is a nice default loss function), allowing a nice numerical comparison between models.

    +

    Watch out for the warning column, which can tell you if the estimation is likely to be incorrect. It’s usually a good idea to set the pointwise argument to True, as this allows for more detailed analysis at the per-observation level.

    +
    az.loo(idata, var_name="y", pointwise=True)
    Computed from 2000 posterior samples and 919 observations log-likelihood matrix.
    @@ -8361,7 +8362,7 @@ 

    Example

    The function az.compare is useful for comparing different out of sample log likelihood estimates.

    -
    +
    idata.log_likelihood["fake"] = xr.DataArray(
         # generate some fake log likelihoods
         np.random.normal(0, 2, [4, 500, 919]),
    @@ -8410,21 +8411,21 @@ 

    Example

    -1027.176982 26.817161 0.000000 -0.985377 +0.985145 28.848998 -0.000000 +0.00000 False log fake 1 --1799.350872 -3621.756304 -772.173891 -0.014623 -3.523104 -28.906533 +-1804.264639 +3634.492107 +777.087658 +0.014855 +3.580520 +28.99616 True log @@ -8436,7 +8437,7 @@

    Example

    The function az.plot_compare shows these results on a nice graph:

    -
    +
    az.plot_compare(comparison);
    /Users/tedgro/repos/biosustain/bayesian_statistics_for_computational_biology/.venv/lib/python3.12/site-packages/arviz/plots/backends/matplotlib/compareplot.py:87: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
    @@ -8445,7 +8446,7 @@ 

    Example

    -

    +

    diff --git a/materials/week3.qmd b/materials/week3.qmd index ff90a5a..6b24027 100644 --- a/materials/week3.qmd +++ b/materials/week3.qmd @@ -175,6 +175,9 @@ divergent transitions. az.summary(idata.sample_stats, var_names=["lp", "diverging"]) ``` +In this case there were no post-warmup diverging transitions, and the $\hat{R}$ +statistic for the `lp` variable is pretty close to 1: great! + Sometimes it's useful to summarise individual parameters. This can be done by pointing `az.summary` at the group where the parameters of interest live. In this case the group is called `posterior`. @@ -183,9 +186,6 @@ this case the group is called `posterior`. az.summary(idata.posterior, var_names=["sigma", "g"]) ``` -In this case there were no post-warmup diverging transitions, and the $\hat{R}$ -statistic for the `lp` variable is pretty close to 1: great! - Now we can start evaluating the model. First we check to see whether replicated measurements from the model's posterior predictive distribution broadly agree with the observed measurements, using the arviz function [`plot_lm`](https://python.arviz.org/en/latest/api/generated/arviz.plot_lm.html#arviz.plot_lm): @@ -201,11 +201,14 @@ az.plot_lm( ); ``` -The function `az.loo` performs approximate leave-one-out cross validation, which -can be useful for evaluating how well the model might make predictions. Watch -out for the `warning` column, which can tell you if the approximation is likely -to be incorrect. It's usually a good idea to set the `pointwise` argument to -`True`, as this allows for more detailed analysis at the per-observation level. +The function `az.loo` can quickly estimate a model's out of sample log +likelihood (which we saw above is a nice default loss function), allowing a nice +numerical comparison between models. + +Watch out for the `warning` column, which can tell you if the estimation is +likely to be incorrect. It's usually a good idea to set the `pointwise` argument +to `True`, as this allows for more detailed analysis at the per-observation +level. ```{python} az.loo(idata, var_name="y", pointwise=True)