```python
# gaussian_diffusion.py
def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        beta_start = scale * 0.0001
        beta_end = scale * 0.02
        return np.linspace(
            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
        )
    elif schedule_name == "cosine":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
```
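For reference, the `betas_for_alpha_bar` helper called by the cosine branch converts a cumulative-product schedule $\overline{\alpha}(t)$ into per-step betas. A sketch consistent with how it is called above (the `max_beta` clipping default is an assumption):

```python
import numpy as np

def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    # beta_t = 1 - alpha_bar(t) / alpha_bar(t-1), clipped to avoid
    # singularities at the end of the schedule.
    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas)
```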
Here `alphas_cumprod` is $\overline{\alpha}_t$, `alphas_cumprod_prev` is $\overline{\alpha}_{t-1}$, and `alphas_cumprod_next` is $\overline{\alpha}_{t+1}$.
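These arrays are precomputed from the betas in the constructor; a sketch consistent with the definitions above (excerpt-style, inside `__init__`):

```python
# Cumulative products of alpha_t = 1 - beta_t along the diffusion chain.
alphas = 1.0 - betas
self.alphas_cumprod = np.cumprod(alphas, axis=0)
# Shifted copies: alpha_bar_{t-1} (with alpha_bar_0 = 1) and alpha_bar_{t+1}.
self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
```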
```python
# log calculation clipped because the posterior variance is 0 at the
# beginning of the diffusion chain.
self.posterior_log_variance_clipped = np.log(
    np.append(self.posterior_variance[1], self.posterior_variance[1:])
)
```
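The clipping replaces the zero variance at $t=0$ with the $t=1$ value before taking the log. The `posterior_variance` array itself is $\widetilde{\beta}_t$; a sketch matching that definition (excerpt-style, inside `__init__`):

```python
# posterior variance: beta_tilde_t = beta_t * (1 - alpha_bar_{t-1}) / (1 - alpha_bar_t)
self.posterior_variance = (
    betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
)
```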
The posterior mean is

$$\widetilde{\mu}(X_t, X_0) = \frac{\sqrt{\overline{\alpha}_{t-1}}\,\beta_t}{1-\overline{\alpha}_t} X_0 + \frac{\sqrt{\alpha_t}\,(1-\overline{\alpha}_{t-1})}{1-\overline{\alpha}_{t}} X_t$$

where the coefficient of $X_0$ corresponds to `posterior_mean_coef1` and the coefficient of $X_t$ corresponds to `posterior_mean_coef2`.
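A sketch of how these two coefficients are precomputed, matching the formula above (excerpt-style, inside `__init__`):

```python
# coefficient of X_0 in the posterior mean
self.posterior_mean_coef1 = (
    betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
)
# coefficient of X_t in the posterior mean
self.posterior_mean_coef2 = (
    (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
)
```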
Passing in `(x_start, t)` yields the mean and variance of
$$q(X_t \mid X_0) = N\!\left(X_t;\ \sqrt{\overline{\alpha}_t}\,X_0,\ (1-\overline{\alpha}_t)I\right)$$
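A sketch of this computation as it appears in `q_mean_variance`, assuming the precomputed schedule arrays and the `_extract_into_tensor` gather helper from the same file:

```python
def q_mean_variance(self, x_start, t):
    # mean = sqrt(alpha_bar_t) * x_0,  variance = (1 - alpha_bar_t) * I
    mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
    variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
    log_variance = _extract_into_tensor(
        self.log_one_minus_alphas_cumprod, t, x_start.shape
    )
    return mean, variance, log_variance
```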
The posterior mean and variance:
$$\widetilde{\mu}(X_t, X_0) = \frac{\sqrt{\overline{\alpha}_{t-1}}\,\beta_t}{1-\overline{\alpha}_t} X_0 + \frac{\sqrt{\alpha_t}\,(1-\overline{\alpha}_{t-1})}{1-\overline{\alpha}_{t}} X_t$$
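A sketch of `q_posterior_mean_variance`, which evaluates this posterior with the precomputed coefficients (asserts omitted):

```python
def q_posterior_mean_variance(self, x_start, x_t, t):
    # posterior mean = coef1 * x_0 + coef2 * x_t
    posterior_mean = (
        _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
        + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
    )
    posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
    posterior_log_variance_clipped = _extract_into_tensor(
        self.posterior_log_variance_clipped, t, x_t.shape
    )
    return posterior_mean, posterior_variance, posterior_log_variance_clipped
```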
(1) Learned variance. With `ModelVarType.LEARNED`, the model output is used directly as the log-variance:

```python
if self.model_var_type == ModelVarType.LEARNED:
    model_log_variance = model_var_values
    model_variance = th.exp(model_log_variance)
```
Improved DDPM can instead predict a value within a range (`LEARNED_RANGE`), i.e., it predicts the $v$ in the following expression:
$$\Sigma_{\theta}(X_t, t) = \exp\!\left(v \log \beta_t + (1-v) \log \widetilde{\beta}_t\right)$$
Since

$$\widetilde{\beta}_t = \frac{1-\overline{\alpha}_{t-1}}{1-\overline{\alpha}_t}\,\beta_t$$

and $1-\overline{\alpha}_{t-1} < 1-\overline{\alpha}_t$, it follows that $\widetilde{\beta}_t < \beta_t$.
```python
# The model_var_values is [-1, 1] for [min_var, max_var].
frac = (model_var_values + 1) / 2
```
The log-variance is then interpolated according to the formula

$$\Sigma_{\theta}(X_t, t) = \exp\!\left(v \log \beta_t + (1-v) \log \widetilde{\beta}_t\right)$$
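A sketch of this interpolation as it appears in `p_mean_variance` for the `LEARNED_RANGE` case, with $\log\widetilde{\beta}_t$ as the minimum and $\log\beta_t$ as the maximum:

```python
min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
# The model_var_values is [-1, 1] for [min_var, max_var].
frac = (model_var_values + 1) / 2
model_log_variance = frac * max_log + (1 - frac) * min_log
model_variance = th.exp(model_log_variance)
```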
(2) Non-learned variance. DDPM uses $\beta_t$, while IDDPM supports two choices: $\beta_t$ or $\widetilde{\beta}_t$. The larger of the two, $\beta_t$:
```python
ModelVarType.FIXED_LARGE: (
    # for fixedlarge, we set the initial (log-)variance like so
    # to get a better decoder log likelihood.
    np.append(self.posterior_variance[1], self.betas[1:]),
    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
),
```
Next comes the mean. Recall the posterior mean:

$$\widetilde{\mu}(X_t, X_0) = \frac{\sqrt{\overline{\alpha}_{t-1}}\,\beta_t}{1-\overline{\alpha}_t} X_0 + \frac{\sqrt{\alpha_t}\,(1-\overline{\alpha}_{t-1})}{1-\overline{\alpha}_{t}} X_t$$
When the model predicts $X_{t-1}$, this formula is inverted to compute $X_0$:
$$\widetilde{\mu}(X_t, X_0) = \frac{\sqrt{\overline{\alpha}_{t-1}}\,\beta_t}{1-\overline{\alpha}_t} X_0 + \frac{\sqrt{\alpha_t}\,(1-\overline{\alpha}_{t-1})}{1-\overline{\alpha}_{t}} X_t$$
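A sketch of `_predict_xstart_from_xprev`, which performs exactly this inversion using the precomputed coefficients:

```python
def _predict_xstart_from_xprev(self, x_t, t, xprev):
    # Solve mu = coef1 * x_0 + coef2 * x_t for x_0:
    #   x_0 = (xprev - coef2 * x_t) / coef1
    return (
        _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
        - _extract_into_tensor(
            self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
        ) * x_t
    )
```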
When the model instead predicts the noise $\epsilon$, the forward process $X_t = \sqrt{\overline{\alpha}_t}\,X_0 + \sqrt{1-\overline{\alpha}_t}\,\epsilon$ gives

$$X_0 = \frac{1}{\sqrt{\overline{\alpha}_t}}\left(X_t - \sqrt{1-\overline{\alpha}_t}\,\epsilon\right)$$

Simplifying the above formula:
$$X_0 = \frac{1}{\sqrt{\overline{\alpha}_t}}\,X_t - \sqrt{\frac{1}{\overline{\alpha}_t}-1}\;\epsilon$$
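This simplified form maps directly onto `_predict_xstart_from_eps`, assuming the precomputed `sqrt_recip_alphas_cumprod` and `sqrt_recipm1_alphas_cumprod` arrays; a sketch:

```python
def _predict_xstart_from_eps(self, x_t, t, eps):
    # x_0 = sqrt(1 / alpha_bar_t) * x_t - sqrt(1 / alpha_bar_t - 1) * eps
    assert x_t.shape == eps.shape
    return (
        _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
        - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
    )
```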
```python
for i in indices:
    t = th.tensor([i] * shape[0], device=device)
    with th.no_grad():
        out = self.p_sample(
            model,
            img,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        yield out
        img = out["sample"]
```
```python
out = self.p_mean_variance(
    model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
)
```
Compute the KL divergence between the two Gaussian distributions:
$$L_{t-1} = D_{KL}\!\left(q(X_{t-1} \mid X_t, X_0)~\|~p_\theta(X_{t-1} \mid X_t)\right)$$
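The closed-form Gaussian KL is computed by the `normal_kl` helper. A minimal sketch of the formula it evaluates (the real helper also broadcasts between tensors and Python scalars):

```python
def normal_kl(mean1, logvar1, mean2, logvar2):
    # KL(N(mean1, var1) || N(mean2, var2)) for diagonal Gaussians, in nats.
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + th.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
    )
```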
```python
def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a Gaussian distribution discretizing to a
    given image.

    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    centered_x = x - means
    inv_stdv = th.exp(-log_scales)
    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
    cdf_plus = approx_standard_normal_cdf(plus_in)
    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
    cdf_min = approx_standard_normal_cdf(min_in)
    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
    cdf_delta = cdf_plus - cdf_min
    log_probs = th.where(
        x < -0.999,
        log_cdf_plus,
        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
    )
    assert log_probs.shape == x.shape
    return log_probs
```
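The `approx_standard_normal_cdf` it calls is a fast tanh-based approximation of the standard normal CDF; a sketch consistent with its use above:

```python
def approx_standard_normal_cdf(x):
    # Tanh approximation of Phi(x); accurate enough for likelihood evaluation.
    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
```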
`training_losses`

If the loss type is KL (or rescaled KL):

```python
if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
```
```python
if self.loss_type == LossType.RESCALED_MSE:
    # Divide by 1000 for equivalence with initial implementation.
    # Without a factor of 1/1000, the VB term hurts the MSE term.
    terms["vb"] *= self.num_timesteps / 1000.0
```
Next, it depends on which target the model predicts: it can be the mean and variance of $X_{t-1}$, it can be $X_0$, or it can be the noise $\epsilon$.
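These three targets correspond to the `ModelMeanType` enum in the same file; a sketch:

```python
import enum

class ModelMeanType(enum.Enum):
    """Which quantity the model predicts."""
    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
    START_X = enum.auto()     # the model predicts x_0
    EPSILON = enum.auto()     # the model predicts the noise epsilon
```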