11.2 Logistic regression

Logistic regression is an example of a binary classifier, where the output takes one two values 0 or 1 for each data point. We call the two values classes.

Formulation as an optimization problem

Define the sigmoid function

S(x)=11+exp(x).

Next, given an observation xRd and a weights θRd we set

hθ(x)=S(θTx)=11+exp(θTx).

The weights vector θ is part of the setup of the classifier. The expression hθ(x) is interpreted as the probability that x belongs to class 1. When asked to classify x the returned answer is

x{1hθ(x)1/2,0hθ(x)<1/2.

When training a logistic regression algorithm we are given a sequence of training examples xi, each labelled with its class yi{0,1} and we seek to find the weights θ which maximize the likelihood function

ihθ(xi)yi(1hθ(xi))1yi.

Of course every single yi equals 0 or 1, so just one factor appears in the product for each training data point. By taking logarithms we can define the logistic loss function:

J(θ)=i:yi=1log(hθ(xi))i:yi=0log(1hθ(xi)).

The training problem with regularization (a standard technique to prevent overfitting) is now equivalent to

minθJ(θ)+λθ2.

This can equivalently be phrased as

(11.13)minimizeiti+λrsubject totilog(hθ(x))=log(1+exp(θTxi))if yi=1,tilog(1hθ(x))=log(1+exp(θTxi))if yi=0,rθ2.

Implementation

As can be seen from (11.13) the key point is to implement the softplus bound tlog(1+eu), which is the simplest example of a log-sum-exp constraint for two terms. Here t is a scalar variable and u will be the affine expression of the form ±θTxi. This is equivalent to

exp(ut)+exp(t)1

and further to

(11.14)(z1,1,ut)Kexp(z1exp(ut)),(z2,1,t)Kexp(z2exp(t)),z1+z21.

This formulation can be entered using affine conic constraints (see Sec. 6.2 (From Linear to Conic Optimization)).

Listing 11.8 Implementation of tlog(1+eu) as in (11.14). Click here to download.
#[allow(non_snake_case)]
fn softplus(task : & mut TaskCB, d : i32, n : i32, theta : i32, t : i32, X : &[f64], Y : &[bool]) -> Result<(),String> {
    let nvar = task.get_num_var()?;
    let ncon = task.get_num_con()?;
    let nafe = task.get_num_afe()?;
    task.append_vars(2*n)?;   // z1, z2
    task.append_cons(n)?;     // z1 + z2 = 1
    task.append_afes(4*n as i64)?;   //theta * X[i] - t[i], -t[i], z1[i], z2[i]
    let z1 = nvar;
    let z2 = nvar+n;
    let zcon = ncon;
    let thetaafe = nafe;
    let tafe = nafe+n   as i64;
    let z1afe = tafe+n  as i64;
    let z2afe = z1afe+n as i64;

    // Linear constraints
    {
        let mut subi = vec![0i32; 2*n as usize];
        let mut subj = vec![0i32; 2*n as usize];
        let mut aval = vec![0.0;  2*n as usize];

        for i in 0..n {
            task.put_var_name(z1+i,format!("z1[{}]",i).as_str())?;
            task.put_var_name(z2+i,format!("z2[{}]",i).as_str())?;
        }
        for ((i,&zx),si, sj, av) in izip!(iproduct!(0..n,&[z1,z2]),
                                          subi.iter_mut(),
                                          subj.iter_mut(),
                                          aval.iter_mut()) {
            *si = zcon+i;
            *sj = zx+i as i32;
            *av = 1.0;
        }

        task.put_aij_list(subi.as_slice(), subj.as_slice(), aval.as_slice())?;
        task.put_con_bound_slice_const(zcon, zcon+n, Boundkey::FX, 1.0, 1.0)?;
        task.put_var_bound_slice_const(nvar, nvar+2*n, Boundkey::FR, -INF, INF)?;
    }
    // Affine conic expressions
    let mut afeidx = vec![0i64; (d*n+4*n) as usize];
    let mut varidx = vec![0i32; (d*n+4*n) as usize];
    let mut fval   = vec![0.0;  (d*n+4*n) as usize];

    // Thetas
    let mut k : usize = 0;
    for ((i,j),afei,vari) in izip!(iproduct!(0..n,0..d),
                                   & mut afeidx[k..k+(n*d) as usize],
                                   & mut varidx[k..k+(n*d) as usize]) {
        *afei = thetaafe + i as i64;
        *vari = theta    + j;
    }

    for ((&yi,_j),&xij,fv) in izip!(iproduct!(Y,0..d), X, & mut fval[k..k+(n*d) as usize]) {
        *fv = (if yi {-1.0} else {1.0}) * xij;
    }
    k += (n*d) as usize;

    for fv in fval[k..k+(2*n) as usize].iter_mut() { *fv = -1.0; }
    for (i,afei,vari) in izip!(0..n,
                               &mut afeidx[k..k+(2*n) as usize].iter_mut().step_by(2),
                               &mut varidx[k..k+(2*n) as usize].iter_mut().step_by(2)) {
        *afei = thetaafe+i as i64; *vari = t+i;
    }
    for (i,afei,vari) in izip!(0..n,
                               &mut afeidx[k+1..k+(2*n) as usize].iter_mut().step_by(2),
                               &mut varidx[k+1..k+(2*n) as usize].iter_mut().step_by(2)) {
        *afei = tafe+i as i64; *vari = t+i;
    }
    k += (n*2) as usize;

    for fv in fval[k..k+(2*n) as usize].iter_mut() { *fv = 1.0; }
    for (i,afei,vari) in izip!(0..n,
                               &mut afeidx[k..k+(2*n) as usize].iter_mut().step_by(2),
                               &mut varidx[k..k+(2*n) as usize].iter_mut().step_by(2)) {
        *afei = z1afe+i as i64; *vari = z1+i;
    }
    for (i,afei,vari) in izip!(0..n,
                               &mut afeidx[k+1..k+(2*n) as usize].iter_mut().step_by(2),
                               &mut varidx[k+1..k+(2*n) as usize].iter_mut().step_by(2)) {
        *afei = z2afe+i as i64; *vari = z2+i;
    }


    // Add the expressions
    task.put_afe_f_entry_list(afeidx.as_slice(), varidx.as_slice(), fval.as_slice())?;

    {
        // Add a single row with the constant expression "1.0"
        let oneafe = task.get_num_afe()?;
        task.append_afes(1)?;
        task.put_afe_g(oneafe, 1.0)?;

        // Add an exponential cone domain
        let dom = task.append_primal_exp_cone_domain()?;

        // Conic constraints
        let zeros = &[0.0,0.0,0.0];
        let mut acci = task.get_num_acc()?;
        for i in 0..n as i64 {
            task.append_acc(dom, &[z1afe+i, oneafe, thetaafe+i], zeros)?;
            task.append_acc(dom, &[z2afe+i, oneafe, tafe+i],     zeros)?;
            task.put_acc_name(acci,format!("z1:theta[{}]",i).as_str())?;
            task.put_acc_name(acci+1,format!("z2:t[{}]",i).as_str())?;

            acci += 2;
        }
    }

    Ok(())
}

Once we have this subroutine, it is easy to implement a function that builds the regularized loss function model (11.13).

Listing 11.9 Implementation of (11.13). Click here to download.
#[allow(non_snake_case)]
fn logistic_regression(X : &[f64],
                       Y : &[bool],
                       lamb : f64) -> Result<Vec<f64>,String> {
    let n = Y.len() as i32;
    let d = (X.len()/Y.len()) as i32;  // num samples, dimension

    /* Create the optimization task. */
    let mut task = match Task::new() {
        Some(e) => e,
        None => return Err("Failed to create task".to_string()),
        }.with_callbacks();
    task.put_stream_callback(Streamtype::LOG, |msg| print!("{}",msg))?;

    // Variables [r; theta; t]
    let nvar : i32 = 1+d+n;
    task.append_vars(nvar)?;
    task.put_var_bound_slice_const(0, nvar, Boundkey::FR, -INF, INF)?;
    let (r,theta,t) = (0i32,1i32,1+d);
    task.put_var_name(r,"r")?;
    for j in 0..d { task.put_var_name(theta+j,format!("theta[{}]",j).as_str())?; }
    for j in 0..n { task.put_var_name(t+j,format!("t[{}]",j).as_str())?; }

    // Objective lambda*r + sum(t)
    task.put_c_j(r, lamb)?;
    for i in 0..n { task.put_c_j(t+i, 1.0)?; }
    task.put_obj_sense(Objsense::MINIMIZE)?;

    // Softplus function constraints
    softplus(& mut task, d, n, theta, t, X, Y)?;

    // Regularization
    // Append a sequence of linear expressions (r, theta) to F
    let numafe = task.get_num_afe()?;
    task.append_afes(1+d as i64)?;
    task.put_afe_f_entry(numafe, r, 1.0)?;
    for i in 0..d {
        task.put_afe_f_entry(numafe + i as i64 + 1, theta + i, 1.0)?;
    }

    // Add the constraint
    {
        let dom = task.append_quadratic_cone_domain((1+d) as i64)?;
        task.append_acc_seq(dom,
                            numafe,
                            vec![0.0; 1+d as usize].as_slice())?;
    }
    // Solution
    task.write_data("logistic.ptf")?;
    task.optimize()?;

    let mut xx = vec![0.0; d as usize];
    task.get_xx_slice(Soltype::ITR, theta, theta+d as i32,xx.as_mut_slice())?;
    Ok(xx)
}

Example: 2D dataset fitting

In the next figure we apply logistic regression to the training set of 2D points taken from the example ex2data2.txt . The two-dimensional dataset was converted into a feature vector xR28 using monomial coordinates of degrees at most 6.

_images/logistic-regression.png

Fig. 11.2 Logistic regression example with none, medium and strong regularization (small, medium, large λ). Without regularization we get obvious overfitting.