11.2 Logistic regression

Logistic regression is an example of a binary classifier, where the output takes one of two values 0 or 1 for each data point. We call the two values classes.

Formulation as an optimization problem

Define the sigmoid function

$$S(x) = \frac{1}{1+\exp(-x)}.$$

Next, given an observation $x \in \mathbb{R}^d$ and a weight vector $\theta \in \mathbb{R}^d$ we set

$$h_\theta(x) = S(\theta^T x) = \frac{1}{1+\exp(-\theta^T x)}.$$

The weight vector $\theta$ is part of the setup of the classifier. The expression $h_\theta(x)$ is interpreted as the probability that $x$ belongs to class 1. When asked to classify $x$ the returned answer is

$$x \mapsto \begin{cases} 1 & \text{if } h_\theta(x) \geq 1/2, \\ 0 & \text{if } h_\theta(x) < 1/2. \end{cases}$$

When training a logistic regression algorithm we are given a sequence of training examples $x_i$, each labelled with its class $y_i \in \{0,1\}$, and we seek to find the weights $\theta$ which maximize the likelihood function

$$\prod_i h_\theta(x_i)^{y_i} \, (1-h_\theta(x_i))^{1-y_i}.$$

Of course every single $y_i$ equals 0 or 1, so just one factor appears in the product for each training data point. By taking logarithms we can define the logistic loss function:

$$J(\theta) = -\sum_{i:\,y_i=1} \log(h_\theta(x_i)) \;-\; \sum_{i:\,y_i=0} \log(1-h_\theta(x_i)).$$

The training problem with regularization (a standard technique to prevent overfitting) is now equivalent to

$$\min_\theta \; J(\theta) + \lambda \|\theta\|_2.$$

This can equivalently be phrased as

$$\begin{array}{llr}
\text{minimize}   & \sum_i t_i + \lambda r & \\
\text{subject to} & t_i \geq -\log(h_\theta(x_i)) = \log(1+\exp(-\theta^T x_i)) & \text{if } y_i = 1, \\
                  & t_i \geq -\log(1-h_\theta(x_i)) = \log(1+\exp(\theta^T x_i)) & \text{if } y_i = 0, \\
                  & r \geq \|\theta\|_2. &
\end{array} \tag{11.13}$$

Implementation

As can be seen from (11.13) the key point is to implement the softplus bound $t \geq \log(1+e^u)$, which is the simplest example of a log-sum-exp constraint for two terms. Here $t$ is a scalar variable and $u$ will be an affine expression of the form $\pm\theta^T x_i$. This is equivalent to

$$\exp(u-t) + \exp(-t) \leq 1$$

and further to

$$\begin{array}{l}
(z_1, 1, u-t) \in K_{\mathrm{exp}} \quad (z_1 \geq \exp(u-t)), \\
(z_2, 1, -t) \in K_{\mathrm{exp}} \quad (z_2 \geq \exp(-t)), \\
z_1 + z_2 \leq 1.
\end{array} \tag{11.14}$$

This formulation can be entered using affine conic constraints (see Sec. 6.2 (From Linear to Conic Optimization)).

Listing 11.8 Implementation of $t \geq \log(1+e^u)$ as in (11.14). Click here to download.
"""
Adds ACCs for t_i >= log ( 1 + exp((1-2*y[i]) * theta' * X[i]) )
Adds auxiliary variables, AFE rows and constraints
"""
function softplus(task :: Mosek.Task, d::Int, n::Int, theta::Int, t::Int, X::Matrix{Float64}, y::Vector{Bool})
    nvar = getnumvar(task)
    ncon = getnumcon(task)
    nafe = getnumafe(task)
    z1       = nvar
    z2       = z1+n
    zcon     = ncon
    thetaafe = nafe
    tafe     = thetaafe+n
    z1afe    = tafe+n
    z2afe    = z1afe+n

    appendvars(task,2*n)   # z1, z2
    appendcons(task,n)     # z1 + z2 = 1
    appendafes(task,4*n)   #theta * X[i] - t[i], -t[i], z1[i], z2[i]

    # Linear constraints
    let subi = Vector{Int32}(undef,2*n),
        subj = Vector{Int32}(undef,2*n),
        aval = Vector{Float64}(undef,2*n),
        k = 1

        for i in 1:n
            # z1 + z2 = 1
            subi[k] = zcon+i;  subj[k] = z1+i;  aval[k] = 1;  k += 1
            subi[k] = zcon+i;  subj[k] = z2+i;  aval[k] = 1;  k += 1
        end

        putaijlist(task,subi, subj, aval)
    end
    putconboundsliceconst(task,zcon+1, zcon+n+1, MSK_BK_FX, 1, 1)
    putvarboundsliceconst(task,nvar+1, nvar+2*n+1, MSK_BK_FR, -Inf, Inf)

    # Affine conic expressions
    let afeidx = Vector{Int64}(undef,d*n+4*n),
        varidx = Vector{Int32}(undef,d*n+4*n),
        fval   = Vector{Float64}(undef,d*n+4*n),
        k = 1
        # Thetas
        for i in 1:n
            for j in 1:d
                afeidx[k] = thetaafe + i; varidx[k] = theta + j
                fval[k]   = if y[i] X[i,j]*(-1.0) else X[i,j] end
                k += 1
            end
        end

        # -t[i]
        for i in 1:n
            afeidx[k] = thetaafe + i; varidx[k] = t + i; fval[k] = -1; k += 1
            afeidx[k] = tafe + i;     varidx[k] = t + i; fval[k] = -1; k += 1
        end

        # z1, z2
        for i in 1:n
            afeidx[k] = z1afe + i; varidx[k] = z1 + i; fval[k] = 1; k += 1
            afeidx[k] = z2afe + i; varidx[k] = z2 + i; fval[k] = 1; k += 1
        end

        # Add the expressions
        putafefentrylist(task,afeidx, varidx, fval)
    end

    # Add a single row with the constant expression "1.0"

    let oneafe = getnumafe(task)+1,
        # Add an exponential cone domain
        expdomain = appendprimalexpconedomain(task)

        appendafes(task,1)
        putafeg(task,oneafe,1.0)

        # Conic constraints
        for i in 1:n
            appendacc(task,expdomain, [z1afe+i, oneafe, thetaafe+i], nothing)
            appendacc(task,expdomain, [z2afe+i, oneafe, tafe+i],     nothing)
        end
    end
end # softplus

Once we have this subroutine, it is easy to implement a function that builds the regularized loss function model (11.13).

Listing 11.9 Implementation of (11.13). Click here to download.
"""
Model logistic regression (regularized with full 2-norm of theta)
X - n x d matrix of data points
y - length n vector classifying training points
lamb - regularization parameter
"""
function logisticRegression(X    :: Matrix{Float64},
                            y    :: Vector{Bool},
                            lamb :: Float64)
    (n,d) = size(X)

    maketask() do task
        # Use remote server: putoptserverhost(task,"http://solve.mosek.com:30080")
        # Variables [r; theta; t]
        nvar = 1+d+n
        appendvars(task,nvar)
        putvarboundsliceconst(task,1, nvar+1, MSK_BK_FR, -Inf, Inf)

        r     = 0
        theta = r+1
        t     = theta+d

        # Objective lambda*r + sum(t)
        putcj(task,r+1,lamb)
        for i in 1:n
            putcj(task,t+i, 1.0)
        end

        # Softplus function constraints
        softplus(task, d, n, theta, t, X, y)

        # Regularization
        # Append a sequence of linear expressions (r, theta) to F
        numafe = getnumafe(task)
        appendafes(task,1+d)
        putafefentry(task,numafe+1, r+1, 1.0)

        for i in 1:d
            putafefentry(task,numafe+i+1, theta+i, 1.0)
        end

        # Add the constraint
        appendaccseq(task,appendquadraticconedomain(task,1+d), numafe+1, zeros(d+1))

        # Solution
        optimize(task)

        getxxslice(task,MSK_SOL_ITR, theta+1, theta+d+1)
    end
end # logisticRegression

Example: 2D dataset fitting

In the next figure we apply logistic regression to the training set of 2D points taken from the example ex2data2.txt. The two-dimensional dataset was converted into a feature vector $x \in \mathbb{R}^{28}$ using monomial coordinates of degrees at most 6.

_images/logistic-regression.png

Fig. 11.2 Logistic regression example with none, medium and strong regularization (small, medium, large λ). Without regularization we get obvious overfitting.