Machine Learning Ex5.2 - Regularized Logistic Regression

Exercise 5.2 improves the Logistic Regression implementation done in Exercise 4 by adding a regularization parameter that reduces the problem of over-fitting. We will again be using Newton’s Method.

With implementation in R.

Data

Here’s what the data we want to fit looks like:

google.spreadsheet <- function (key) {
  library(RCurl)

  tt <- getForm("https://spreadsheets.google.com/spreadsheet/pub",
                hl ="en_GB",
                key = key,
                single = "true", gid ="0",
                output = "csv",
                # follow redirects and skip ssl certificate validation
                .opts = list(followlocation = TRUE, verbose = TRUE,
                             ssl.verifypeer = FALSE))

  read.csv(textConnection(tt), header = TRUE)
}

# load the data
mydata = google.spreadsheet("0AnypY27pPCJydHZPN2pFbkZGd1RKeU81OFY3ZHJldWc")
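
If the Google Spreadsheet is not reachable, any local CSV with the same u, v and y columns works just as well (the file name below is only a placeholder, not part of the exercise):

# alternative: load from a local csv with columns u, v, y
# mydata = read.csv("ex5-data.csv", header = TRUE)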

# plot the data
plot(mydata$u[mydata$y == 0], mydata$v[mydata$y == 0], xlab="u", ylab="v")
points(mydata$u[mydata$y == 1], mydata$v[mydata$y == 1], col="blue", pch=3)
legend("topright", c("y=0","y=1"), pch=c(1, 3), col=c("black", "blue"), bty="n")

/assets/images/ml-ex52-data.png

The idea of “fitting” is to create a mathematical model that separates the circles from the crosses in the plot above by learning from the existing data. That model then lets us predict, for new u and v values, the probability of being a cross.

Theory

The hypothesis is:

h_\theta(x) = g(\theta^T x) = \frac{1}{1 + e^{-\theta^T x}}

Regularization is all about loosening up a too-tight fit, avoiding over-fitting and thus obtaining a more generalized fit that is more likely to work well on new data (when making predictions).

For that we define the cost function with an added regularization term that blunts the fit, like so:

J(\theta) = \frac{1}{m} \sum_{i=1}^m \left[ -y^{(i)} \log\left(h_\theta(x^{(i)})\right) - (1 - y^{(i)}) \log\left(1 - h_\theta(x^{(i)})\right) \right] + \frac{\lambda}{2m} \sum_{j=1}^n \theta_j^2

lambda is called the regularization parameter.

The iterative update of \theta using Newton’s method is defined as:

\theta^{(t+1)} = \theta^{(t)} - H^{-1} \nabla_{\theta}J

And the gradient and the Hessian are defined as:

\nabla_{\theta}J = \frac{1}{m} \sum_{i=1}^m \left( h_\theta(x^{(i)}) - y^{(i)} \right) x^{(i)} + \frac{\lambda}{m} \theta

H = \frac{1}{m} \sum_{i=1}^m \left[ h_\theta(x^{(i)}) \left(1 - h_\theta(x^{(i)})\right) x^{(i)} (x^{(i)})^T \right] + \frac{\lambda}{m} \begin{bmatrix} 0 & & & \\ & 1 & & \\ & & \ddots & \\ & & & 1 \end{bmatrix}

Note that the intercept term \theta_0 is not regularized: its entry is dropped from the \frac{\lambda}{m}\theta term of the gradient, and the top-left entry of the regularization matrix in the Hessian is 0. The code below zeroes the first element accordingly.
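
For the implementation it is convenient to write these in matrix form, with X the m x n design matrix (this is the form the R code below computes):

\nabla_{\theta}J = \frac{1}{m} X^T \left( h_\theta(X) - y \right) + \frac{\lambda}{m} \theta

H = \frac{1}{m} X^T \, \mathrm{diag}\!\left( h_\theta(X) (1 - h_\theta(X)) \right) X + \frac{\lambda}{m} \, \mathrm{diag}(0, 1, \ldots, 1)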

Implementation

Let’s first define the functions above, with the added regularization parameter:

# sigmoid function
g = function (z) {
  return (1 / (1 + exp(-z)))
} # plot(g(c(1,2,3,4,5,6)), type="l")

# build high order feature vector
# from 2 features, for a given degree
hi.features = function (f1,f2,deg) {
  ma = matrix(rep(1,length(f1)))
  for (i in 1:deg) {
    for (j in 0:i)
      ma = cbind(ma, f1^(i-j) * f2^j)
  }
  return(ma)
} # hi.features(c(1,2), c(3,4),2)
# creates: 1 u v u^2 uv v^2 ...

# hypothesis
h = function (x,th) {
  return(g(x %*% th))
} # h(x,th)

# gradient of J
grad = function (x,y,th,m,la) {
  G = (la/m * th)
  G[1,] = 0 # the intercept theta_0 is not regularized
  return((1/m * t(x) %*% (h(x,th) - y)) + G)
} # grad(x,y,th,m,la)

# hessian
H = function (x,y,th,m,la) {
  n = length(th)
  L = la/m * diag(n)
  L[1,] = 0 # the intercept theta_0 is not regularized
  hx = h(x,th)
  # sum_i h_i (1 - h_i) x_i x_i^T, computed as X^T diag(w) X
  return((1/m * t(x) %*% (x * as.vector(hx * (1 - hx)))) + L)
} # H(x,y,th,m,la)

# cost function
J = function (x,y,th,m,la) {
  pt = th
  pt[1] = 0
  A = (la/(2*m))* t(pt) %*% pt
  return((1/m * sum(-y * log(h(x,th)) - (1 - y) * log(1 - h(x,th)))) + A)
} # J(x,y,th,m,la)
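
As a quick sanity check of hi.features: a degree-6 mapping of 2 features should produce 1 + 2 + ... + 7 = 28 columns (including the intercept column):

# expect an m x 28 matrix for degree 6
dim(hi.features(mydata$u, mydata$v, 6))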

Now we can run the Newton iterations, first for lambda=1:

# setup variables
m = length(mydata$u) # samples
x = hi.features(mydata$u, mydata$v,6)
n = ncol(x) # features
y = matrix(mydata$y, ncol=1)

# lambda = 1
# use the cost function to check it works
th1 = matrix(0,n)
la = 1
jiter = array(0,c(15,1))
for (i in 1:15) {
  jiter[i] = J(x,y,th1,m,la)
  th1 = th1 - solve(H(x,y,th1,m,la)) %*% grad(x,y,th1,m,la) 
}

Validate that it is converging properly by plotting the cost J against the number of iterations.

# check that the cost decreases at each iteration
plot(jiter, xlab="iterations", ylab="cost J")

/assets/images/ml-ex52-j.png

It converges well and fast, as is typical of Newton’s method.
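
The 15 iterations above are just a fixed budget. A minimal sketch of an alternative stopping rule, ending when the decrease in J becomes negligible (the tolerance and iteration cap are arbitrary choices, not part of the exercise):

# stop when the cost J barely changes between iterations
th = matrix(0,n)
la = 1
j.old = Inf
for (i in 1:50) {
  j.new = as.numeric(J(x,y,th,m,la))
  if (abs(j.old - j.new) < 1e-10) break
  j.old = j.new
  th = th - solve(H(x,y,th,m,la)) %*% grad(x,y,th,m,la)
}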

And now we run the same iterations for lambda=0 and lambda=10, to compare the fits later:

# lambda = 0
th0 = matrix(0,n)
la = 0
for (i in 1:15) {
  th0 = th0 - solve(H(x,y,th0,m,la)) %*% grad(x,y,th0,m,la) 
}

# lambda = 10
th10 = matrix(0,n)
la = 10
for (i in 1:15) {
  th10 = th10 - solve(H(x,y,th10,m,la)) %*% grad(x,y,th10,m,la) 
}
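
Since the same Newton loop is repeated three times, it could be wrapped in a small helper; a sketch (newton.fit is a made-up name, not part of the original exercise):

# run a fixed number of Newton steps for a given lambda
newton.fit = function (x, y, la, iters = 15) {
  th = matrix(0, ncol(x))
  for (i in 1:iters) {
    th = th - solve(H(x,y,th,nrow(x),la)) %*% grad(x,y,th,nrow(x),la)
  }
  return(th)
} # e.g. th10 = newton.fit(x, y, 10)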

Finally, calculate the decision boundary, i.e. the curve where \theta^T x = 0 (equivalently h_\theta(x) = 0.5), and visualize it:

# calculate the decision boundary line
# by evaluating theta^T x over a grid of points
u = seq(-1, 1.2, len=200)
v = seq(-1, 1.2, len=200)
z0 = matrix(0, length(u), length(v))
z1 = matrix(0, length(u), length(v))
z10 = matrix(0, length(u), length(v))
for (i in 1:length(u)) {
  for (j in 1:length(v)) {
    z0[i,j]  = hi.features(u[i],v[j],6) %*% th0
    z1[i,j]  = hi.features(u[i],v[j],6) %*% th1
    z10[i,j] = hi.features(u[i],v[j],6) %*% th10
  }
}

# plots: draw only the zero level contour, i.e. the decision boundary
contour(u, v, z0, levels=0, drawlabels=FALSE, xlab="u", ylab="v", col="black", lty=2)
contour(u, v, z1, levels=0, drawlabels=FALSE, col="red", lty=2, add=TRUE)
contour(u, v, z10, levels=0, drawlabels=FALSE, col="green3", lty=2, add=TRUE)
points(mydata$u[mydata$y == 0], mydata$v[mydata$y == 0])
points(mydata$u[mydata$y == 1], mydata$v[mydata$y == 1], col="blue", pch=3)
legend("topright", c(expression(lambda==0), expression(lambda==1), expression(lambda==10)),
       lty=2, col=c("black", "red", "green3"), bty="n")

/assets/images/ml-ex52-fit.png

See that the black line (lambda=0) fits the crosses most tightly, and as lambda increases the boundary becomes looser (more generalized), which typically makes it a better predictor for new, unseen data.
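
As a quick usage example, the fitted parameters can score a new point; for an arbitrary example point u=0.2, v=0.2, the probability of being a cross (y=1) under the lambda=1 model is:

# predicted probability of y=1 for a new (u, v) point
h(hi.features(0.2, 0.2, 6), th1)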
