In a fully Bayesian approach the parameters of a model are treated as random variables. In this way the Bayesian framework handles the uncertainty about the values of the parameters.
%matplotlib inline
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import Image
Image(filename='./thumbtack.jpg', width=200, height=200)
To simulate the tossing behaviour of the thumbtack we need the Bernoulli distribution.
$\theta$ is the probability of success, i.e. $X=1$; here: the thumbtack lands heads with $\theta=0.3$:
$$ P(X=1) = \theta \\ P(X=0) = 1 - \theta $$
$n$ iid (independent and identically distributed) Bernoulli trials can be simulated by:
import scipy.stats
scipy.stats.bernoulli.rvs(0.3, size=10)
With $\theta$ the success probability of a single Bernoulli trial, the probability mass function (pmf) of $k$ successes in $n$ iid trials is given by
$$ P( X = k \mid \theta) = {n \choose k} \theta^k (1-\theta)^{n-k} $$
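As a quick sanity check we can evaluate this formula directly and compare it with scipy.stats.binom.pmf; the values $n=10$, $k=3$, $\theta=0.3$ are arbitrary example values:
from scipy.special import comb
n, k, theta = 10, 3, 0.3  # example values
# pmf evaluated via the formula above ...
print(comb(n, k) * theta**k * (1 - theta)**(n - k))
# ... and via scipy; both give ~0.2668
print(scipy.stats.binom.pmf(k, n, theta))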
def plot_binom(theta, n):
    p = theta
    binom = scipy.stats.binom
    fig, ax = plt.subplots(1, 1)
    # plot the pmf over the central (0.001, 0.999) quantile range
    x = np.arange(binom.ppf(0.001, n, p),
                  binom.ppf(0.999, n, p))
    ax.plot(x, binom.pmf(x, n, p), 'bo', ms=8, label='binom pmf')
    ax.set_xlabel("k")
    ax.set_ylabel("p(k)")
    ax.vlines(x, 0, binom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)
plot_binom(theta=0.3, n=10)
Probability density function (pdf) of the Beta distribution:
$$ \begin{align} Beta(\alpha, \beta) = p(\theta \mid \alpha,\beta) & = \frac{\theta^{\alpha-1}(1-\theta)^{\beta-1}}{\int_0^1 u^{\alpha-1} (1-u)^{\beta-1}\, du} \end{align} $$
The denominator is the Beta function $B(\alpha,\beta) = \int_0^1 u^{\alpha-1} (1-u)^{\beta-1}\, du$; it normalizes the pdf so that it integrates to one.
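As an illustrative check (the hyperparameters $\alpha=10$, $\beta=40$ match the plot below), numerical integration of the unnormalized density reproduces scipy.special.beta:
import scipy.integrate, scipy.special
a, b = 10., 40.  # example hyperparameters
# integrate the unnormalized density over [0, 1]
norm, _ = scipy.integrate.quad(lambda u: u**(a-1) * (1-u)**(b-1), 0., 1.)
print(norm, scipy.special.beta(a, b))  # both equal B(10, 40)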
alpha=10.; beta=40.
plt.figure(figsize=(5,5))
plt.title("Beta(%0.1f,%0.1f)"%(alpha,beta))
r = np.arange(0.,1.01,.01)
plt.xlabel("$\\theta$")
plt.ylabel("beta")
plt.plot(r, scipy.stats.beta.pdf(r, alpha, beta), 'r-', lw=5, alpha=0.6, label='beta pdf')
plt.show()
Bayes Rule for the parameters $\vec \theta$ given the observation $\mathcal D$ (data):
$$ p(\vec \theta | \mathcal D) = \frac{p({\mathcal D \mid \vec \theta}) p(\vec \theta)}{p(\mathcal D)} $$
In Bayesian probability theory, if the posterior distributions $p(\theta | \mathcal D)$ are in the same family as the prior probability distribution $p(\theta)$, the prior and posterior are then called conjugate distributions, and the prior is called a conjugate prior for the likelihood function.
Example: The Beta distribution is the conjugate prior if the likelihood function is the Binomial distribution.
Definition: A statistic is sufficient for a parameter if it contains all the information in the data that is relevant for estimating that parameter.
Sufficient statistics for the Binomial (example "tossing of the thumbtack"): $k, n$
$$ p(\theta \mid \mathcal D) = \frac{p(\mathcal D \mid \theta) p(\theta)}{p(\mathcal D)} $$ Note: the denominator is independent of $\theta$ and normalizes the distribution.
with the sufficient statistics of $\mathcal D$, the number of successes $X=k$ and the number of trials $n$:
$$ p(\theta \mid \mathcal D) \propto \theta^k (1-\theta)^{n-k} \theta^{\alpha-1}(1-\theta)^{\beta-1} = \theta^{k+\alpha-1}(1-\theta)^{n-k+\beta-1} $$ Reading off the exponents, the posterior is again a Beta distribution: $p(\theta \mid \mathcal D) = Beta(\alpha+k, \beta+n-k)$.
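We can check this conjugate update numerically with a simple grid approximation (an illustrative sketch; the prior hyperparameters and sufficient statistics are example values):
a, b = 2., 6.    # example prior hyperparameters
k, n = 20, 100   # example sufficient statistics
grid = np.linspace(0.001, 0.999, 999)
# unnormalized posterior: likelihood * prior, then normalize numerically
unnorm = scipy.stats.binom.pmf(k, n, grid) * scipy.stats.beta.pdf(grid, a, b)
posterior = unnorm / (unnorm.sum() * (grid[1] - grid[0]))
exact = scipy.stats.beta.pdf(grid, a + k, b + n - k)  # conjugate result
print(np.max(np.abs(posterior - exact)))  # should be close to zero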
def plot_prior_posterior_beta(alpha, beta, k, n):
    r = np.arange(0., 1.01, .01)
    plt.figure(figsize=(10,5))
    ax = plt.subplot(121)
    plt.title("Prior Beta(%0.1f,%0.1f)" % (alpha, beta))
    ax.plot(r, scipy.stats.beta.pdf(r, alpha, beta), 'r-', lw=2, alpha=0.6, label='beta pdf')
    ax.set_xlabel("$\\theta$")
    ax.set_ylabel("prior probability density")
    ax = plt.subplot(122)
    # conjugate update of the Beta parameters
    a = alpha + k; b = beta + n - k
    plt.title("Posterior Beta(%0.1f,%0.1f)" % (a, b))
    ax.plot(r, scipy.stats.beta.pdf(r, a, b), 'b-', lw=2, alpha=0.6, label='beta pdf')
    ax.set_xlabel("$\\theta$")
    ax.set_ylabel("posterior probability density")
    plt.show()
plot_prior_posterior_beta(alpha=1., beta=1., k=100, n=300)
The prior is our knowledge of the machine. The machine produces a thumbtack. Without any data (throwing the thumbtack) our knowledge about $\theta$ is given by our knowledge of the machine alone.
From the observed data (obtained by throwing the thumbtack) we can compute the likelihood of the data.
With Bayes' rule we can combine our prior knowledge with the knowledge we gained from the data (the likelihood).
# without any observation of a produced thumbtack: k=0, n=0
# with machine parameters: alpha=2., beta=6.
# taking one produced thumbtack and throwing it 100 times
# with 20 positive observations:
plot_prior_posterior_beta(alpha=2., beta=6., k=20, n=100)
s = 0.9
figsize(11*s, 9*s)
def plot_thumtack_flip(theta_unk=0.3):
    # Beta distribution is the conjugate prior of the binomial distribution
    dist = scipy.stats.beta
    n_trials = [0, 1, 2, 3, 4, 5, 8, 15, 50, 500]
    data = scipy.stats.bernoulli.rvs(theta_unk, size=n_trials[-1])
    x = np.linspace(0, 1, 100)
    for k, N in enumerate(n_trials):
        sx = plt.subplot(len(n_trials) // 2, 2, k + 1)
        #plt.xlabel("$\\theta$, probability of heads") \
        #    if k in [0, len(n_trials) - 1] else None
        plt.setp(sx.get_yticklabels(), visible=False)
        heads = data[:N].sum()
        # posterior after N tosses under a uniform Beta(1,1) prior
        y = dist.pdf(x, 1 + heads, 1 + N - heads)
        plt.plot(x, y, label="observe %d tosses,\n %d heads" % (N, heads))
        plt.fill_between(x, 0, y, color="#348ABD", alpha=0.4)
        plt.vlines(theta_unk, 0, 4, color="k", linestyles="--", lw=1)
        leg = plt.legend()
        leg.get_frame().set_alpha(0.4)
        plt.autoscale(tight=True)
    plt.suptitle("Bayesian updating of posterior probabilities of $\\theta$",
                 y=1.02,
                 fontsize=14)
    plt.tight_layout()
plot_thumtack_flip()
Here we choose $X$ to be Beta distributed (conjugate prior to the Binomial distribution):
$$ X \sim Beta(\alpha=1,\beta=6) $$
Such a variable is called stochastic in the terminology of pymc.
alpha=1; beta=6;
import pymc3
def get_samples(alpha, beta, num_samples):
    with pymc3.Model() as model:
        b = pymc3.Beta('b', alpha=alpha, beta=beta)
        tr = pymc3.sample(num_samples)
    return tr.get_values('b')
def plot_sample_of_beta_dist(alpha, beta, num_samples):
    samples = get_samples(alpha, beta, num_samples)
    plt.figure(figsize=(10,5))
    ax = plt.subplot(121)
    ax.hist(samples, bins=int(np.sqrt(num_samples)), density=True)
    ax.set_xlim(0., 1.)
    ax.set_ylim(0., 7.)
    plt.title("Sample-Histogram Beta(%0.1f,%0.1f)" % (alpha, beta))
    ax = plt.subplot(122)
    plt.title("Exact distribution Beta(%0.1f,%0.1f)" % (alpha, beta))
    r = np.arange(0., 1.01, .01)
    ax.set_xlim(0., 1.)
    ax.set_ylim(0., 7.)
    ax.plot(r, scipy.stats.beta.pdf(r, alpha, beta), 'r-', lw=5, alpha=0.6, label='beta pdf')
    plt.show()
plot_sample_of_beta_dist(alpha=1., beta=6., num_samples=10000)
Let's assume we have some data from iid Bernoulli trials and we want to estimate the probability of observing a 1.
# simulate some observations
theta_unk = 0.3  # unknown theta; that's the parameter we want to estimate
nb_data = n = 40
data = scipy.stats.bernoulli.rvs(theta_unk, size=nb_data)
print("Data:", data)
k = data.sum()
print("Sufficient statistics:\n\t number of trials:", n, "\n\t number of positive outcomes:", k)
theta_ml = float(k)/float(n)
print("Maximum likelihood estimate of theta:", theta_ml)
We choose the Beta distribution as prior with $\alpha=1$ and $\beta=1$:
theta = pymc3.Beta('theta', alpha=1, beta=1)
If we wanted to put more prior knowledge into our model, we could use a Beta distribution with different parameters $\alpha, \beta$, e.g.:
theta = pymc3.Beta('theta', alpha=4., beta=8.)
Our observed variable is a Bernoulli random variable.
For a pymc3 Bernoulli variable we can use another pymc3 variable as the value for p: our theta variable:
pymc3.Bernoulli('bernoulli',p=theta, observed=data)
# with Bernoulli distribution
def create_model_pymc3(data):
    with pymc3.Model() as model:
        theta = pymc3.Beta('theta', alpha=1, beta=1)
        bernoulli = pymc3.Bernoulli('bernoulli', p=theta, observed=data)
    return model
Alternatively we can use the sufficient statistics of the data $\mathcal D$, i.e. $(k, n)$.
Our observed variable is then a Binomial random variable:
pymc3.Binomial('binomial', len(data), p=theta, observed=np.sum(data))
# with Binomial distribution
def create_model_pymc3_(data):
    with pymc3.Model() as model:
        theta = pymc3.Beta('theta', alpha=1, beta=1)
        # sufficient statistics
        n = len(data)
        k = np.sum(data)
        binomial = pymc3.Binomial('binomial', n, p=theta, observed=k)
    return model
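Since $(k, n)$ are sufficient statistics of $\mathcal D$, both model variants define the same posterior over theta. As a purely illustrative check we can sample from both and compare the posterior means, which should agree up to Monte Carlo error:
# illustrative check: both parameterizations yield (approximately) the same posterior
with create_model_pymc3(data):
    trace_bern = pymc3.sample(5000)
with create_model_pymc3_(data):
    trace_binom = pymc3.sample(5000)
print(trace_bern.get_values('theta').mean(), trace_binom.get_values('theta').mean())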
model = create_model_pymc3(data)
map_estimate = pymc3.find_MAP(model=model)
map_estimate
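With the uniform Beta(1,1) prior the posterior is proportional to the likelihood, so the MAP estimate should agree with the maximum likelihood estimate $k/n$ computed above:
# quick check: MAP under a flat prior ~ maximum likelihood estimate
print(map_estimate['theta'], theta_ml)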
with model:
    # draw 50000 posterior samples
    p_trace = pymc3.sample(50000)
# discard the first 10000 samples as burn-in
theta_trace = p_trace['theta'][10000:]
plt.figure(figsize=(14,5))
ax = plt.subplot(121)
hist, bins = np.histogram(theta_trace, bins=np.arange(0., 1.01, 0.01), density=True)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
ax.bar(center, hist, align='center', width=width)
plt.xlim([0., 1.0])
plt.ylim([0., 6.0])
ax = plt.subplot(122)
ax.plot(bins, scipy.stats.beta.pdf(bins, k+1, n-k+1), 'r-', lw=5, alpha=0.6, label='beta pdf')
plt.xlim([0., 1.0])
plt.ylim([0., 6.0])
plt.legend()
plt.show()
sample_size=30
def get_traces_pymc3(sample_size, theta_unk=.3):
    observed_data = scipy.stats.bernoulli.rvs(theta_unk, size=sample_size)
    model_pymc3 = create_model_pymc3(observed_data)
    with model_pymc3:
        # obtain starting values via MAP
        start = pymc3.find_MAP()
        # draw 20000 posterior samples
        trace = pymc3.sample(20000, start=start)
    return trace.get_values('theta'), observed_data
Comparing pymc3's MCMC approximation with the exact (conjugate) solution:
s = 0.8
figsize(11*s, 9*s)
n_samples = [20,50,100,500,5000]
def get_theta_histograms():
    histograms = dict()
    data = dict()
    bins = np.arange(0., 1.01, 0.01)
    for i, N in enumerate(n_samples):
        prior_trace, observed_data = get_traces_pymc3(N, theta_unk=theta_unk)
        hist, bins = np.histogram(prior_trace, bins=bins)
        histograms[N] = hist
        data[N] = observed_data
    return histograms, bins, data
histograms, bins, data = get_theta_histograms()
def plot_theta_histograms(histograms, bins, data):
    x = np.linspace(0, 1, 100)
    theta_unk = 0.3
    plt.figure(figsize=(10, 16))
    for i, N in enumerate(n_samples):
        # left column: MCMC sample histogram
        ax = plt.subplot(len(n_samples), 2, (i*2) + 1)
        plt.setp(ax.get_yticklabels(), visible=False)
        hist = histograms[N]
        width = 0.7 * (bins[1] - bins[0])
        center = (bins[:-1] + bins[1:]) / 2
        ax.bar(center, hist, align='center', width=width)
        plt.xlim([0., 1.0])
        ax.set_xlabel("$\\theta$")
        ax.set_ylabel("$p(\\theta)$")
        # right column: exact Beta posterior
        ax = plt.subplot(len(n_samples), 2, (i*2) + 2)
        plt.setp(ax.get_yticklabels(), visible=False)
        observed_data = data[N]
        k = observed_data.sum(); n = len(observed_data)
        ax.plot(bins, scipy.stats.beta.pdf(bins, k+1, n-k+1), 'r-', lw=2, alpha=0.6, label='beta pdf')
        ax.set_xlabel("$\\theta$")
        ax.set_ylabel("$p(\\theta)$")
    plt.suptitle("MCMC sample histograms and exact solution for different $n$")
    plt.show()
plot_theta_histograms(histograms, bins, data)