%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Matlab script to perform cross validation on a model generated for
% numeric data from agriculture
%
% purpose: data mining with SVM/MLP/RBF/RegTree
%
% requires:
% - statistics toolbox for cvpartition (from Matlab2008a)
% - neural network toolbox
% - readColData script (see link below)
% - SVMTorch (compiled version, of course)
% - data structure used for the results (cv_results)
%
% new:
% - comparison of SVM, MLP, RBF, RegTree
% - fixed SVM and fixed NNET parameters
% - included normalization (reversing model results)
% -----
% Georg Russ (Ru{\ss})
% russ@iws.cs.uni-magdeburg.de
% 2008-10-16
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Preparation steps, workspace

% clean workspace (clearvars instead of clear all: keeps compiled
% functions and breakpoints intact, available since R2008a)
clearvars;

% set clock to measure total script runtime (see etime at the bottom)
clock_start = clock;

% seed random for reproducible results
%rand('seed',1);

% change paths to wherever your data is located and readable to matlab
% uses script readColData from
% http://web.cecs.pdx.edu/~gerry/MATLAB/plotting/loadingPlotData.html#colHeadings
[label, id_column, data] = readColData('sorted_all', 10, 10, 1);
label = transpose(cellstr(label));

%% data-specific stuff
% generate three data sets for the nnet to play with
% one:   N1, Yield2003, EM38                            -- target: Yield2004
% two:   N1, Yield2003, EM38, N2, REIP32                -- target: Yield2004
% three: N1, Yield2003, EM38, N2, REIP32, N3, REIP49    -- target: Yield2004
% works by eliminating the respective columns from the 'data' matrix above
% (single multi-column deletion is equivalent to the former one-by-one
% descending deletions)
set_1 = data;
set_1(:, [2 3 4 5 7]) = [];
set_2 = data;
set_2(:, [3 5 7]) = [];
% only set_3 is actually used here
set_3 = data;
set_3(:, 7) = [];

% delete "ID" label, then the label of the data column removed for set_3
label(:, 1) = [];
label(:, 7) = [];
% labels of input columns
label_inputs = label(:, 1:7);
% target label
label_target = label(:, 8);

Size_set_3 = size(set_3);

%% modeling stage

% partitioning parameters for cross validation
k = 10;              % k-fold holdout cross validation
p = 1 / k;           % fraction of the data held out as test set
num_trials = 5;      % number of repeated holdout runs (was hard-coded 5)

% normalize data (requires double transposition of data matrix)
[set_3_norm, PS] = mapstd(transpose(set_3));
% uncomment this for using normalized values
%set_3 = transpose(set_3_norm);

% where to store the results; regtree error vectors are preallocated
% instead of being grown with vertcat inside the loop
modelresults = struct('j', [], 'mae_svm', [], 'rmse_svm', [], ...
    'mae_mlp', [], 'rmse_mlp', []);
modelresults.mae_rbf = [];
modelresults.rmse_rbf = [];
modelresults.mae_regtree = zeros(num_trials, 1);
modelresults.rmse_regtree = zeros(num_trials, 1);

% trial run numbers (formerly collected via vertcat per iteration)
j_res = transpose(1:num_trials);

for j = 1:num_trials

    % generate data partition (cvpartition from matlab's statistics toolbox)
    % size of holdout (test) set: 1/k
    % (yields indices of data set)
    cvdata = cvpartition(Size_set_3(1, 1), 'Holdout', p);

    % structured object to store actual, predicted and error values
    % (for each model)
    regtreeresults = struct('actual', [], 'prediction', [], ...
        'abserr', [], 'squerr', []);

    % split into training and test sets; last column is the target
    TrainSet = struct();
    TrainSet.all = set_3(cvdata.training, :);
    TrainSet.size = size(TrainSet.all);
    TrainSet.examples = TrainSet.all(:, 1:TrainSet.size(1, 2) - 1);
    TrainSet.targets = TrainSet.all(:, TrainSet.size(1, 2));

    TestSet = struct();
    TestSet.all = set_3(cvdata.test, :);
    TestSet.size = size(TestSet.all);
    TestSet.examples = TestSet.all(:, 1:TestSet.size(1, 2) - 1);
    TestSet.targets = TestSet.all(:, TestSet.size(1, 2));

    % ----------------------------------------
    % begin regression tree stuff

    % construct tree on the training partition
    tree = classregtree(TrainSet.examples, TrainSet.targets, ...
        'names', label_inputs);
    % predict the held-out examples (eval is the classregtree method here)
    regtreeresults.prediction = eval(tree, TestSet.examples);
    regtreeresults.actual = TestSet.targets;

    % un-normalize regtree results to original ranges (only needed when
    % the normalized data set is enabled above)
    %regtreeresults.prediction = (regtreeresults.prediction - PS.ymean) * PS.xstd(8)/PS.ystd + PS.xmean(8);
    %regtreeresults.actual = (regtreeresults.actual - PS.ymean) * PS.xstd(8)/PS.ystd + PS.xmean(8);

    % end regression tree stuff
    % ----------------------------------------

    % ----------------------------------------
    % collect error values
    % generate error measures and store them inside the respective
    % results struct
    regtreeresults.abserr = abs(regtreeresults.actual - regtreeresults.prediction);
    regtreeresults.squerr = (regtreeresults.abserr).^2;

    % calculate mae and rmse; report per-trial progress explicitly
    % (formerly implicit via missing semicolons)
    regtreemae = mean(regtreeresults.abserr);
    regtreermse = sqrt(mean(regtreeresults.squerr));
    fprintf('trial %d: regtree mae = %g, rmse = %g\n', ...
        j, regtreemae, regtreermse);

    % store mae and rmse at the preallocated slot
    modelresults.mae_regtree(j) = regtreemae;
    modelresults.rmse_regtree(j) = regtreermse;

end

%% plot mae and rmse against j (trial run number)

e = figure;
plot(j_res, modelresults.mae_regtree, '-.b');
legend('mae regtree');
title('mean absolute error of regression tree');
xlabel('trial #');
ylabel('error');

h = figure;
plot(j_res, modelresults.rmse_regtree, '-.b');
legend('rmse regtree');
title('root mean squared error of regression tree');
xlabel('trial #');
ylabel('error');

% corr = figure;
% plot(modelresults.rmse_mlp,modelresults.rmse_svm,'.');
% hold on;
% title('rmse mlp vs. rmse svm');
% xlabel('rmse mlp');
% ylabel('rmse svm');

%% get script stats
duration = etime(clock, clock_start);
fprintf('total script runtime: %.2f s\n', duration);