%
% A simple OCTAVE/MATLAB implementation of the SoftDoubleMaxMinOver
% learning algorithm (SDMMO) for support vector classification.
%
%Input:
%   X : training samples (one sample per column)
%   y : class information (in {-1,1})
%   C : hardness of classification (default: 10^15 i.e., hard-margin)
%   K_func: kernel function to be used
%   K_params: hyper-parameters of kernel function
%   epsilon : precision for duality gap (default 10^-2)
%
%Output:
% alpha : weights of support vectors
% b     : threshold
% SVs   : support vectors
%
%function [alpha, b, SVs] = sdmmo (X, y, C, K_func, K_params, epsilon)
%
% Copyright (C) 2009 Kai Labusch
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or (at your option) any later version.
%
function [alpha, b, SVs] = sdmmo (X, y, C, K_func, K_params, epsilon)

if ~exist('C','var')
    C = 10^15; % perform hard-margin training by default
end

if ~exist('epsilon','var')
    epsilon = 10^-2; % default threshold for duality gap
end

if ~exist('K_func','var') || strcmp('linear', K_func) == 1
    K_func = @LinK; % default kernel
    disp('linear kernel');
end

if ischar(K_func) && strcmp('rbf', K_func) == 1
    K_func = @RbfK; % default kernel
    
    if ~exist('K_params','var')
        K_params = 1;
    end
    
    disp('rbf kernel');
    
elseif ischar(K_func) && strcmp('polynom', K_func) == 1
    K_func = @PolyK;
    
    if ~exist('K_params','var')
        K_params = [ 2 0 ];
    end
    
else
    K_func = @LinK;

    if ~exist('K_params','var')
        K_params = [];
    end
end

t_max= 100000; % maximal number of learning iterations

I_p = find(y== 1); % indices of class  1
I_n = find(y==-1); % indices of class -1

alpha = zeros(1,size(X,2)); % weights for the kernel values

K = feval(K_func, K_params, X); % compute kernel matrix

for t=0:t_max
    
    c = sum(diag(y.*alpha)*K, 1) + y.*alpha/C; % compute current classification values
    s = y.*c; 
    
    [h_min_p, x_min_p] = min(s(y== 1)); % find sample of class 1 that is closest to the current hyperplane
    [h_min_n, x_min_n] = min(s(y==-1)); % find sample of class -1 that is closest to the current hyperplane
        
    x_min_p = I_p(x_min_p); % get absolute index of the selected samples
    x_min_n = I_n(x_min_n);
        
    SV_p = I_p(alpha(y==  1) > 0); % find current support vectors that belong to class 1
    SV_n = I_n(alpha(y== -1) > 0); % find current support vectors that belong to class -1
    
    [h_max_p, x_max_p] = max(s(SV_p)); % find support vector of class 1 that is most distant to classification plane
    [h_max_n, x_max_n] = max(s(SV_n)); % find support vector of class -1 that is most distant to classification plane
    
    x_max_p = SV_p(x_max_p); % get absolute index of the selected support vectors
    x_max_n = SV_n(x_max_n);    
    
    if mod(t,1000)==0 % evaluate duality gap every 100 iterations       
        
       b = 0.5*(h_min_p - h_min_n); % compute threshold
        
       if duality_gap(y, alpha,b, K, C, y.*(c - b)) <= epsilon % if duality gap is below epsilon ... stop
           break;
       end
    end
    
    if alpha(x_max_p)  > 0 % decrease alpha values of support vectors that were selected to be forgotten
        alpha(x_max_p) = alpha(x_max_p) - 1;
    end
    if alpha(x_max_n)  > 0
        alpha(x_max_n) = alpha(x_max_n) - 1;
    end
    
    alpha(x_min_p) = alpha(x_min_p) + 2; % increase alpha values of those samples that were selected for learning
    alpha(x_min_n) = alpha(x_min_n) + 2;
end

b     = 0.5*(h_min_p - h_min_n); % compute threshold
SVs   = X(:, alpha > 0);         % select final support vectors
y     = y(alpha > 0);
alpha = alpha(alpha > 0);        % select weights of support vectors
alpha = alpha.*y;

return;

function K = LinK(params, X) %#ok<INUSL>
K=X'*X; % linear kernel
return

function K = RbfK(params, X)
D=X - X; % Gaussian kernel
K = exp(-params*(D'*D));
return

function K = PolyK(params, X)

degree = params(1); % polynomial kernel
o      = params(2);

K = (X'*X + o).^degree;
return


function gap = duality_gap(y, alpha, b, K, C, soft_margin)
    
    factor = min(soft_margin); % scale classifcation such that the margin of the closest correct classified sample is 1
    
    alpha         = alpha / factor;
    b             = b     / factor;
    
    hard_margin = y.*(sum(diag(alpha.*y)*K,1) - b);

    sq_norm_hard = sum(sum((alpha'*alpha).*(y'*y).*K));
    sq_norm_soft = sum(sum((alpha'*alpha).*(y'*y).*(K+diag(ones(1,length(alpha))/C))));
    
    dual = sum(alpha) - 0.5*sq_norm_soft; % compute current value of the dual
    
    xi = 1 - hard_margin(hard_margin < 1); % compute current value of the slack-variables
    
    primal = 0.5*sq_norm_hard + 0.5*C*sum(xi.^2); % compute current value of the primal
    
    gap = (primal - dual)/(primal+1);
    
return;


