Training a binary classifier on a data set

Description

This code trains a binary classifier model on a data set read from STDIN and writes the resulting model to STDOUT. It assumes that the features in the data set are represented by integer identifiers.

/*
 *      A sample program for training a binary classifier.
 *
 * Copyright (c) 2008,2009 Naoaki Okazaki
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the names of the authors nor the names of its contributors
 *       may be used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* $Id: train_binary_batch.cpp 180 2009-09-25 08:51:06Z naoaki $ */

#include <iostream>
#include <string>
#include <vector>

#include <classias/classias.h>
#include <classias/classify/linear/binary.h>
#include <classias/train/pegasos.h>
#include <classias/train/online_scheduler.h>

#include "strsplit.h"   // necessary for strsplit() and get_id_value().

// Training algorithm used by this sample, assembled in three steps:
//   1. the classifier: a linear binary model with the hinge loss function
//      (classias::classify::linear_binary_hinge) over classias::weight_vector;
//   2. the learning algorithm: Pegasos (classias::train::pegasos_binary);
//   3. the online scheduler (classias::train::online_scheduler_binary)
//      parameterized with the data set type, classias::bdata.
typedef classias::classify::linear_binary_hinge<classias::weight_vector>
    hinge_model_type;
typedef classias::train::pegasos_binary<hinge_model_type>
    pegasos_type;
typedef classias::train::online_scheduler_binary<classias::bdata, pegasos_type>
    trainer_type;

// This is an example of training with Averaged Perceptron.
// #include <classias/train/averaged_perceptron.h>
// typedef classias::train::online_scheduler_binary<
//     classias::bdata,
//     classias::train::averaged_perceptron_binary<
//         classias::classify::linear_binary<classias::weight_vector>
//         >
//     >
//     trainer_type;
//
// This is an example of training with L-BFGS. Do not forget to link this
// sample program with libLBFGS.
// #include <classias/train/lbfgs.h>
// typedef classias::train::lbfgs_logistic_binary<classias::bsdata>
//     trainer_type;
//
// See /frontend/train/binary.cpp for the examples of other algorithms.

// Entry point of the sample.
//
// Reads a data set from STDIN (one instance per line), trains a binary
// classifier with trainer_type, and writes the model to STDOUT as one
// "<feature-id>\t<weight>" line per feature identifier in [0, max_fid].
// The progress report of the trainer is written to STDERR.
//
// Input format, as handled below: the first field of a line is the label
// (the literal "-1" is a negative instance; any other string is positive);
// each remaining field is handed to get_id_value() with ':' as the separator
// to obtain an integer feature identifier and its value. Lines with fewer
// than two fields (e.g. blank lines) are ignored.
//
// Command-line arguments are not used. Always returns 0.
int main(int argc, char *argv[])
{
    int max_fid = -1;
    classias::bdata data;
    std::istream& is = std::cin;
    std::ostream& os = std::cout;
    std::ostream& es = std::cerr;

    // Read a data set from STDIN. Testing the result of getline() (instead of
    // checking eof() afterwards) keeps a final line that lacks a trailing
    // newline, and also terminates the loop if the stream fails without
    // reaching EOF.
    std::string line;
    while (std::getline(is, line)) {
        // Split the line into fields with space characters.
        std::vector<std::string> fields;
        strsplit(fields, line);

        // The line must have at least a label and a feature.
        if (fields.size() < 2) {
            continue;
        }

        // Create a new instance in the data set.
        classias::binstance& inst = data.new_element();
        // Set the boolean label for the instance.
        inst.set_label(fields[0] != "-1");

        // Loop over the rest of fields.
        for (size_t i = 1;i < fields.size();++i) {
            // Split the field into a feature identifier and value.
            int fid = 0;
            double value = 0.0;
            get_id_value(fields[i], fid, value, ':');

            // Store the maximum number of feature identifiers.
            if (max_fid < fid) {
                max_fid = fid;
            }

            // Append the feature to the instance.
            inst.append(fid, value);
        }
    }

    // Do not forget to set the number of features in the data set.
    // Identifiers start at 0, hence the +1.
    data.set_num_features(max_fid+1);

    // Create an instance of the training algorithm.
    trainer_type tr;

    // Set some parameters for the training algorithm.
    tr.params().set("c", 1.0);
    tr.params().set("max_iterations", 100);

    // Start training; progress report will be shown in es (STDERR).
    tr.train(data, es);

    // Output the model.
    const classias::weight_vector& w = tr.model();
    for (int i = 0;i <= max_fid;++i) {
        // Feature ID and its weight. '\n' is used rather than std::endl to
        // avoid flushing STDOUT once per feature; the stream is flushed at
        // program exit.
        os << i << '\t' << w[i] << '\n';
    }

    return 0;
}


Copyright (c) 2002-2009 by Naoaki Okazaki
Mon Dec 28 23:41:02 2009