
Better initialisation to avoid divergence of smoothing parameter #22

Open · mfasiolo opened this issue Jul 21, 2017 · 2 comments

@mfasiolo (Owner)

library(qgam)
require(RhpcBLASctl); blas_set_num_threads(1) # optional

data("UKload")

qu <- 0.05
form <- NetDemand ~ s(wM, k = 20, bs = 'cr') + s(wM_s95, k = 20, bs = 'cr') + 
  s(Posan, bs = 'ad', k = 30, xt = list("bs" = "cc")) + Dow + s(Trend, k = 4) + NetDemand.48 + Holy

# To step through the optimisation: library(debug); mtrace(tuneLearn),
# then mtrace(newton) once inside, after "for( ii in 1:nt )".

tic <- proc.time()
set.seed(41241)
sigSeq <- seq(3, 8, length.out = 16)
closs1 <- tuneLearn(form = form, data = UKload, err = 0.15,
                    lsig = sigSeq, qu = qu, control = list("K" = 2))
proc.time() - tic
# check(closs1)

# At the first iteration you will see

# Initial smoothing parameters
7.571046   9.556191   2.441944 -19.327599 -12.368925   5.065998   4.854984  -2.279645 

# After one iteration the log smoothing parameters (lsp) get to 
-1.630176   1.364197 -12.974725 -19.327599 -12.840063 -11.929365  -9.659406 -11.917420

# with gradient
143.748873878   6.825546046   0.367822900   0.005996531  -0.019127832  -0.226958093  -0.786281196  13.154424694

# Hessian: very concave along the second variable!
[,1]          [,2]         [,3]         [,4]          [,5]          [,6]          [,7]          [,8]
[1,]  2.458789e+01  8.412941e+00 -0.011584408 8.621974e-05  0.0029556391 -0.0265641062 -0.1539408394 -0.3165983797
[2,]  8.412941e+00 -5.877177e+00 -0.001159426 3.529038e-06 -0.0001722144  0.0004438109 -0.0186866835 -0.1314088233
[3,] -1.158441e-02 -1.159426e-03  1.095528270 3.340906e-03  0.4568092625  0.0038351018  0.4755544119  0.0046309245
[4,]  8.621974e-05  3.529038e-06  0.003340906 6.018966e-03  0.0047966777  0.0002868956  0.0006440296  0.0000427432
[5,]  2.955639e-03 -1.722144e-04  0.456809262 4.796678e-03  1.9886273956  0.5466783024  0.0944302443  0.0183346497
[6,] -2.656411e-02  4.438109e-04  0.003835102 2.868956e-04  0.5466783024  0.3882664949  0.3151293611  0.0581675198
[7,] -1.539408e-01 -1.868668e-02  0.475554412 6.440296e-04  0.0944302443  0.3151293611  3.0543169197  0.2696595281
[8,] -3.165984e-01 -1.314088e-01  0.004630925 4.274320e-05  0.0183346497  0.0581675198  0.2696595281  7.4095616814

# Eigenvalues (these are the eigenvalues of the reduced 6x6 Hessian hess1 built below;
# their sum matches its trace)
26.7637733  7.4210911  3.1796840  0.9925382  0.3475061 -8.0462096

# The Hessian is not positive definite, hence the following code (from mgcv's newton)
# flips the negative eigenvalue.
grad <- c(143.7488739, 6.8255460, 0.3678229, 0.005996531, -0.019127832, -0.2269581, -0.7862812, 13.1544247)

# Gradient restricted to the six unconverged components (the 4th and 5th, which have
# near-zero gradients, are dropped):
grad1 <- c(143.7488739, 6.8255460, 0.3678229, -0.2269581, -0.7862812, 13.1544247) 

hess1 <- matrix(c(24.58788695,  8.4129414625, -0.011584408, -0.0265641062, -0.15394084, -0.316598380,
           8.41294146, -5.8771772964, -0.001159426,  0.0004438109, -0.01868668, -0.131408823,
           -0.01158441, -0.0011594263,  1.095528270,  0.0038351018,  0.47555441,  0.004630925,
           -0.02656411,  0.0004438109,  0.003835102,  0.3882664949,  0.31512936,  0.058167520,
           -0.15394084, -0.0186866835,  0.475554412,  0.3151293611,  3.05431692,  0.269659528,
           -0.31659838, -0.1314088233,  0.004630925,  0.0581675198,  0.26965953,  7.409561681), 6, 6)
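
# Consistency check: the eigenvalues of hess1 match the values printed above,
# confirming they refer to this reduced 6x6 Hessian.
eigen(hess1, symmetric = TRUE)$values
# 26.7637733  7.4210911  3.1796840  0.9925382  0.3475061 -8.0462096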

## get the trial step ...
eh <- eigen(hess1,symmetric=TRUE)
d <- eh$values;U <- eh$vectors
## set eigen-values to their absolute value - heuristically appealing
## as it avoids very long steps being proposed for indefinite components,
## unlike setting -ve e.v.s to very small +ve constant...
ind <- d < 0
pdef <- if (sum(ind)>0) FALSE else TRUE ## is it positive definite? 
d[ind] <- -d[ind] ## see Gill Murray and Wright p107/8
low.d <- max(d)*.Machine$double.eps^.7
ind <- d < low.d
if (sum(ind)>0) pdef <- FALSE ## not certain it is positive definite
d[ind] <- low.d 
ind <- d != 0
d[ind] <- 1/d[ind]

# This produces the following step: the second component is going in the wrong direction 
# (increasing, rather than decreasing)
-drop(U%*%(d*(t(U)%*%grad1))) # (modified) Newton direction
# -6.0315446  2.2039636 -0.4772446  0.4242607  0.2126593 -2.0296309

# This move ends up lowering the (negative) REML quite a lot. Probably because the first
# update (-6.03) corresponds to precipitation, while the problematic update (2.20)
# corresponds to smoothed precipitation. Maybe they are quite correlated and get traded
# off against each other?
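
# Quick sanity check (using U, d and grad1 from the block above): compare the sign of
# the modified Newton step with the steepest-descent direction -grad1. Only the second
# component goes uphill.
step <- -drop(U %*% (d * (t(U) %*% grad1)))
sign(step) == sign(-grad1)
# [1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE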

# Now the smoothing parameters are
[1]  -6.630176   3.191227 -13.370349 -19.327599 -12.840063 -11.577664  -9.483116 -13.599933

# While the old were
-1.630176   1.364197 -12.974725 -19.327599 -12.840063 -11.929365  -9.659406 -11.917420

# The gradient is now
[1]  4.024835175  0.001849281  0.146872205  0.004788172  0.031598356  0.118494166 -0.287925519  3.607569377

# and the hessian 
[,1]          [,2]         [,3]         [,4]          [,5]          [,6]          [,7]          [,8]
[1,] 3.3560882713  8.002191e-04 4.697608e-03 4.038460e-05  7.289772e-03  1.344963e-02  6.638055e-02  4.888544e-02
[2,] 0.0008002191 -1.842555e-03 6.647676e-06 3.294869e-08 -1.114174e-05 -7.135799e-05 -3.043383e-05 -3.971898e-06
[3,] 0.0046976082  6.647676e-06 5.926385e-01 3.131827e-03  4.160236e-01  3.672006e-03  3.751701e-01  3.499639e-04
[4,] 0.0000403846  3.294869e-08 3.131827e-03 4.818144e-03  5.765253e-03  3.440575e-04  8.994476e-04  9.959019e-06
[5,] 0.0072897719 -1.114174e-05 4.160236e-01 5.765253e-03  2.007663e+00  5.485817e-01  8.480139e-02  2.516710e-03
[6,] 0.0134496271 -7.135799e-05 3.672006e-03 3.440575e-04  5.485817e-01  8.691455e-01  2.940997e-01  1.705993e-02
[7,] 0.0663805549 -3.043383e-05 3.751701e-01 8.994476e-04  8.480139e-02  2.940997e-01  3.495033e+00  6.174737e-02
[8,] 0.0488854368 -3.971898e-06 3.499639e-04 9.959019e-06  2.516710e-03  1.705993e-02  6.174737e-02  3.995061e+00

# So the objective is fairly flat and negative definite along the 2nd parameter. Still,
# grad[2] has the right sign. Now this check
uconv.ind <- uconv.ind & abs(grad)>max(abs(grad))*.001 
# gives
[1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# Hence the second lsp is stuck where it is.
# The next iteration is similar (similar gradient and Hessian): the 2nd lsp fails the
# uconv.ind test again and remains stuck.
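
# Illustrative check (not mgcv code): plugging the gradient printed above into the same
# test reproduces the FALSE in position 2.
grad2 <- c(4.024835175, 0.001849281, 0.146872205, 0.004788172,
           0.031598356, 0.118494166, -0.287925519, 3.607569377)
abs(grad2) > max(abs(grad2)) * .001
# [1]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE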

# In the end convergence is achieved, i.e. the following is FALSE
abs(old.score-score)>score.scale*conv.tol
# where the gradient is 
[1] 5.936495e-06 2.892744e-03 5.867093e-04 5.995174e-04 1.633661e-03 2.032578e-04 5.398356e-04 1.390044e-04
# and score.scale = 19303.56. Notice that here
score.scale <- abs(log(b$scale.est)) + abs(score)
# but log(b$scale.est) = 0, which might explain why convergence is slower when the response is normalized.
# Maybe we should set scale.est to the learning rate sigma?
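
# Back-of-the-envelope check, assuming mgcv's default conv.tol = 1e-7 (from gam.control):
19303.56 * 1e-7
# [1] 0.001930356
# so a score change below ~1.9e-3 stops the iteration, even though some gradient entries
# above are still of that order.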
# The final hessian is 

[,1]          [,2]         [,3]         [,4]          [,5]          [,6]          [,7]          [,8]
[1,] 9.906413e-01  9.528050e-05 2.017215e-03 7.067377e-06  2.865816e-02  3.390677e-02  9.211675e-02  2.808279e-02
[2,] 9.528050e-05 -2.851802e-03 2.235050e-06 3.550656e-09 -2.025883e-05 -9.484876e-05 -6.078402e-05 -5.010377e-05
[3,] 2.017215e-03  2.235050e-06 8.990583e-02 2.509880e-04  2.426490e-01  7.070839e-04  1.842829e-01  2.548583e-05
[4,] 7.067377e-06  3.550656e-09 2.509880e-04 6.004118e-04  1.184680e-03  3.313392e-05  2.237754e-04  3.348968e-07
[5,] 2.865816e-02 -2.025883e-05 2.426490e-01 1.184680e-03  2.828701e+00  4.866797e-01  1.712816e-01  6.120458e-04
[6,] 3.390677e-02 -9.484876e-05 7.070839e-04 3.313392e-05  4.866797e-01  4.255299e-01  2.428750e-01  2.172774e-03
[7,] 9.211675e-02 -6.078402e-05 1.842829e-01 2.237754e-04  1.712816e-01  2.428750e-01  4.008821e+00  1.476809e-02
[8,] 2.808279e-02 -5.010377e-05 2.548583e-05 3.348968e-07  6.120458e-04  2.172774e-03  1.476809e-02  9.706237e-01
mfasiolo added the bug label Jul 21, 2017

@mfasiolo (Owner, Author)

mgcv uses the method in Section 6.7 of Simon Wood's book (2nd ed) to initialize the sp. Several fixes in qgam and mgcv are needed:

  1. the log smoothing parameters from gausFit cannot possibly be used to initialize the sp of elf, because they are on different scales;
  2. elf should not initialize mustart to y, otherwise the total curvature ( sum(w) ) in mgcv:::initial.sp is much higher than it should be, so the smoothing parameters are over-estimated by initial.sp (which balances trace(X^TWX) against trace(S));
  3. in initial.sp, this line
    w <- .5 * as.numeric(family$Dd(y,mustart,theta,weights)$Dmu2*family$mu.eta(family$linkfun(mustart))^2)
    means that the EXPECTED weights are used. These are on average much larger than the actual weights, because the model is misspecified. Hence the initial guess for the sp is too high (see the sketch below).
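
As a minimal sketch of the balancing heuristic behind points 2 and 3 (illustrative only, not mgcv:::initial.sp's actual code; X, S and w are hypothetical inputs for a single smooth term):

# Balance the total curvature trace(X'WX) against the penalty trace(S):
# inflated working weights w (points 2 and 3) directly inflate the initial sp.
init_sp <- function(X, S, w) {
  XtWX <- crossprod(sqrt(w) * X)   # trace(X'WX) grows with sum(w)
  mean(diag(XtWX)) / mean(diag(S))
}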

@mfasiolo (Owner, Author)

Points 1 and 2 are hopefully addressed in these commits:

f87b537
164e927

but point 3 needs to be addressed in mgcv.

mfasiolo removed the bug label Sep 25, 2017