%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Matlab script to perform cross validation on a model generated for
% numeric data from agriculture
%
% purpose: data mining with SVMs vs. an MLP (neural network)
%
% requires:
% - statistics toolbox for cvpartition (from Matlab2008a)
% - neural network toolbox
% - readColData script (see link below)
% - SVMTorch (compiled version, of course)
% - data structure used for the results (cv_results)
%
% new:
% - comparison of SVM result vs. NNet result
% - fixed SVM and fixed NNET parameters
% -----
% Georg Ru{\ss}
% russ@iws.cs.uni-magdeburg.de
% 2008-10-08
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Preparation steps, workspace

% clean workspace
clear all;

% set clock (used for the script-duration statistic at the very end)
clock_start = clock;

% seed random for reproducible results
%rand('seed',1);

% change paths to wherever your data is located and readable to matlab
% uses script readColData from
% http://web.cecs.pdx.edu/~gerry/MATLAB/plotting/loadingPlotData.html#colHeadings
[label,id_column,data] = readColData('sorted_all',10,10,1);

%% data-specific stuff
% generate three data sets for the nnet to play with
% one:   N1, Yield2003, EM38                         -- target: Yield2004
% two:   N1, Yield2003, EM38, N2, REIP32             -- target: Yield2004
% three: N1, Yield2003, EM38, N2, REIP32, N3, REIP49 -- target: Yield2004
% works by eliminating the respective columns from the 'data' matrix above
% NOTE: columns are deleted in descending index order so that earlier
% deletions do not shift the indices of later ones
set_1 = data;
set_1(:,7) = [];
set_1(:,5) = [];
set_1(:,4) = [];
set_1(:,3) = [];
set_1(:,2) = [];

set_2 = data;
set_2(:,7) = [];
set_2(:,5) = [];
set_2(:,3) = [];

% only set_3 is actually used here
set_3 = data;
set_3(:,7) = [];
Size_set_3 = size(set_3);

%% modeling stage

% partitioning parameters for cross validation
k = 10;   % k-fold holdout cross validation
p = 1/k;  % fraction of the data held out as the test set

% where to store the results across all runs
modelresults = struct('j',[],'mae_svm',[],'rmse_svm',[],'mae_mlp',[],'rmse_mlp',[]);
j_res = [];

for j = 1:20
    % generate data partition (cvpartition from matlab's statistics toolbox)
    % size of holdout (test) set: fraction p (= 1/k) of the data
    % (yields indices of data set)
    cvdata = cvpartition(Size_set_3(1,1),'Holdout',p);

    % structured objects to store actual, predicted and error values
    % (one per model)
    svmresults = struct('actual',[],'prediction',[],'abserr',[],'squerr',[]);
    mlpresults = struct('actual',[],'prediction',[],'abserr',[],'squerr',[]);

    j_res = vertcat(j_res,j);

    % split set_3 into training and test parts; the last column of set_3
    % is the prediction target, all others are the input attributes
    TrainSet = struct();
    TrainSet.all = set_3(cvdata.training,:);
    TrainSet.size = size(TrainSet.all);
    TrainSet.examples = TrainSet.all(:,1:TrainSet.size(1,2)-1);
    TrainSet.targets = TrainSet.all(:,TrainSet.size(1,2));

    TestSet = struct();
    TestSet.all = set_3(cvdata.test,:);
    TestSet.size = size(TestSet.all);
    TestSet.examples = TestSet.all(:,1:TestSet.size(1,2)-1);
    TestSet.targets = TestSet.all(:,TestSet.size(1,2));

    % --------------------------------------
    % begin SVMTorch stuff

    % write training data to file; the first line holds [rows cols],
    % presumably the example-count/dimension header SVMTorch expects
    % -- TODO confirm against the SVMTorch file-format docs
    dlmwrite('svm-train',TrainSet.size,' ');
    dlmwrite('svm-train',TrainSet.all,'-append','delimiter',' ');

    % write test data to file
    dlmwrite('svm-test',TestSet.size,' ');
    dlmwrite('svm-test',TestSet.all,'-append','delimiter',' ');

    % run svmtorch as the model
    % requires writing a script file first ...
    fid = fopen('svmtorch.sh','w');
    fprintf(fid,'%s\n%s%u%s\n','#! /bin/bash','SVMTorch -rm -t 2 -eps 0.3 -std ',12,' svm-train svm-model');
    fclose(fid);
    % ... and making it executable ...
    ! chmod 700 svmtorch.sh
    % ... before running it:
    % generate model
    ! ./svmtorch.sh
    % test model and store results in file 'svm-results'
    ! SVMTest -oa svm-results svm-model svm-test
    % remove model
    ! rm svm-model

    % read in model output on test values (from file)
    svmres_read = dlmread('svm-results');
    % append values
    svmresults.prediction = vertcat(svmresults.prediction,svmres_read);
    svmresults.actual = vertcat(svmresults.actual,TestSet.targets);

    % end SVMTorch stuff
    % ----------------------------------------

    % ----------------------------------------
    % begin MLP stuff
    % newff/train/sim expect one example per COLUMN, hence the transposes;
    % [10] = one hidden layer with 10 neurons
    mlp = newff(transpose(TrainSet.examples),transpose(TrainSet.targets),[10]);
    mlp.trainParam.min_grad = 0.001;
    mlp.trainParam.epochs = 10000;
    mlp.trainParam.lr = 0.25;
    mlp.trainParam.showWindow = false;
    mlp.trainParam.showCommandLine = true;
    mlp.trainParam.show = 50;
    [mlptrained, mlptrainrecord] = train(mlp,transpose(TrainSet.examples),transpose(TrainSet.targets));
    mlpresults.prediction = transpose(sim(mlptrained,transpose(TestSet.examples)));
    mlpresults.actual = TestSet.targets;
    % end MLP stuff
    % ----------------------------------------

    % ----------------------------------------
    % collect error values
    % generate error measures and store them inside the respective results struct
    svmresults.abserr = abs(svmresults.actual - svmresults.prediction);
    svmresults.squerr = (svmresults.abserr).^2;
    mlpresults.abserr = abs(mlpresults.actual - mlpresults.prediction);
    mlpresults.squerr = (mlpresults.abserr).^2;

    % calculate mae and rmse
    % (deliberately without semicolons: displays per-run progress)
    svmmae = mean(svmresults.abserr)
    svmrmse = sqrt(mean(svmresults.squerr))
    mlpmae = mean(mlpresults.abserr)
    mlprmse = sqrt(mean(mlpresults.squerr))

    % store mae and rmse for this particular run
    modelresults.mae_mlp = vertcat(modelresults.mae_mlp, mlpmae);
    modelresults.rmse_mlp = vertcat(modelresults.rmse_mlp, mlprmse);
    modelresults.mae_svm = vertcat(modelresults.mae_svm, svmmae);
    modelresults.rmse_svm = vertcat(modelresults.rmse_svm, svmrmse);
end

% wait for a keypress before plotting
pause;

%% plot mae and rmse against j (trial run number, simply collected)
clf;
e = figure;
plot(j_res,modelresults.mae_mlp,'-.b');
hold on;
plot(j_res,modelresults.mae_svm,'-r')
legend('mae mlp','mae svm');
title('mean absolute error of MLP vs SVM');
xlabel('step');
ylabel('error');
hold on;

h = figure;
plot(j_res,modelresults.rmse_mlp,'-.b');
hold on;
plot(j_res,modelresults.rmse_svm,'-r')
legend('rmse mlp','rmse svm');
title('root mean squared error of MLP vs SVM');
xlabel('step');
ylabel('error');

%% get script stats
% total wall-clock runtime in seconds (displayed deliberately)
duration = etime(clock, clock_start)