%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Matlab script to perform cross validation on numeric data from agricult.
% purpose: data mining with ANNs/MLPs
% see http://blog.georgruss.de/?p=62
% -----
% script to generate plots and determine optimal network
% for agriculture data, accompanies a paper submitted for review
% at ICDM2008 http://www.data-mining-forum.de/icdm2008.php
% -----
% Georg Ru\ss
% russ@iws.cs.uni-magdeburg.de
% 2008-01-18
% -----
% Outline:
%   1. load the data matrix via readColData
%   2. derive three feature sets (set_1..set_3) by deleting columns
%   3. repeatedly create and train one network per feature set and record
%      sqrt(test performance of the last training epoch) for each run
%   4. plot the recorded values of all runs for the three feature sets
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Preparation steps, workspace
% clean workspace
clear all;

% remember the start time so the total run time can be reported at the end
clock_start = clock;

% seed random for reproducible results
% (seeded once, before the loop: every iteration therefore continues the
% same random stream instead of restarting it)
rand('seed',1);
%pause;

% change paths to wherever your data is located and readable to matlab
% uses script readColData from
% http://web.cecs.pdx.edu/~gerry/MATLAB/plotting/loadingPlotData.html#colHeadings
[label,id_column,data] = readColData('sorted_all',10,10,1);

%% data specific stuff
% generate three data sets for the nnet to play with
% one:   N1, Yield2003, EM38                        -- target: Yield2004
% two:   N1, Yield2003, EM38, N2, REIP32            -- target: Yield2004
% three: N1, Yield2003, EM38, N2, REIP32, N3, REIP49 -- target: Yield2004
% works by eliminating the respective columns from the 'data' matrix above.
% All column indices below refer to positions in the unmodified 'data'
% matrix; the listed columns are removed in a single step.
set_1 = data;
set_1(:,[2 3 4 5 7]) = [];

set_2 = data;
set_2(:,[3 5 7]) = [];

set_3 = data;
set_3(:,7) = [];

%% store dimensions of data for possible later loops
%[number_of_examples_cv,number_of_attributes_kv] = size(crossv);
size_set_1 = size(set_1);
size_set_2 = size(set_2);
size_set_3 = size(set_3);

%% Prepare the data for the neural net
% put input attributes and target values into suitable matrices:
% every column but the last is an input attribute, the last column is the
% target. The matrices are transposed so that each example becomes one
% column, which is the layout passed to newff/train below.
set_1_examples = set_1(:,1:end-1).';
set_1_targets  = set_1(:,end).';

set_2_examples = set_2(:,1:end-1).';
set_2_targets  = set_2(:,end).';

set_3_examples = set_3(:,1:end-1).';
set_3_targets  = set_3(:,end).';

%% Experiment parameters
% max size of first hidden layer
% (only referenced by the disabled layer-size search, see commented code)
max_size_first_layer = 32;
% max size of second hidden layer
max_size_second_layer = 32;

% sizes of the two hidden layers used for every network in this script
hidden_layer_sizes = [16 16];

% number of iterations (independent create-and-train runs per data set)
number_of_iterations = 250;

% one row per iteration, one column per data set
nets_perf_collection = zeros(number_of_iterations,3);
%net_1_perf_collection = zeros(max_size_first_layer,max_size_second_layer);
%net_2_perf_collection = zeros(max_size_first_layer,max_size_second_layer);
%net_3_perf_collection = zeros(max_size_first_layer,max_size_second_layer);

%% create and train the networks
% The order "create net 1, train net 1, create net 2, ..." is kept on
% purpose: network creation and training both draw from the seeded random
% stream, so reordering these steps would change the results.
% NOTE(review): newff's default training function is trainlm, which has no
% learning-rate parameter, so trainParam.lr presumably has no effect here --
% confirm before relying on it.
for i = 1:number_of_iterations
    %for j = 2:max_size_second_layer;

    % %%%%%%%%%%%%%
    % network for first data set
    net_1 = newff(set_1_examples,set_1_targets,hidden_layer_sizes);
    net_1.trainParam.min_grad = 0.001;
    net_1.trainParam.epochs = 100000;
    net_1.trainParam.lr = 0.5;
    net_1.trainParam.show = NaN;       % NaN: no command-line progress output
    net_1.divideFcn = 'dividerand';    % random train/validation/test split
    % training step
    [net_1_trained,net_1_tr] = train(net_1,set_1_examples,set_1_targets);
    % determine mse on test data (from network training):
    % last entry of the test-performance record
    net_1_perf = net_1_tr.tperf(end);
    nets_perf_collection(i,1) = sqrt(net_1_perf);

    % %%%%%%%%%%%%%
    % network for second data set
    net_2 = newff(set_2_examples,set_2_targets,hidden_layer_sizes);
    net_2.trainParam.min_grad = 0.001;
    net_2.trainParam.epochs = 100000;
    net_2.trainParam.lr = 0.5;
    net_2.trainParam.show = NaN;
    net_2.divideFcn = 'dividerand';
    % training step
    [net_2_trained,net_2_tr] = train(net_2,set_2_examples,set_2_targets);
    % determine mse on test data (from network training)
    net_2_perf = net_2_tr.tperf(end);
    nets_perf_collection(i,2) = sqrt(net_2_perf);

    % %%%%%%%%%%%%%
    % network for third data set
    net_3 = newff(set_3_examples,set_3_targets,hidden_layer_sizes);
    net_3.trainParam.min_grad = 0.001;
    net_3.trainParam.epochs = 100000;
    net_3.trainParam.lr = 0.5;
    net_3.trainParam.show = NaN;
    net_3.divideFcn = 'dividerand';
    % network training
    [net_3_trained,net_3_tr] = train(net_3,set_3_examples,set_3_targets);
    % determine mse on test data (from network training)
    net_3_perf = net_3_tr.tperf(end);
    nets_perf_collection(i,3) = sqrt(net_3_perf);

    % no semicolon on purpose: echoes the iteration counter as progress
    i
    % j
    % end
end

%% Plot the collected results
clf;
nets_perf_collection_1 = nets_perf_collection(:,1);
nets_perf_collection_2 = nets_perf_collection(:,2);
nets_perf_collection_3 = nets_perf_collection(:,3);

plot(nets_perf_collection_1,'-or');
hold on;
plot(nets_perf_collection_2,'-xb');
plot(nets_perf_collection_3,'-+g');
% NOTE(review): the plotted values are sqrt(test performance); the title
% calls them "Absolute Error" -- confirm the intended wording.
title('Absolute Error on Different Data Sets');
% 'Location','NorthEast' is the upper right-hand corner, i.e. the same
% placement as the legacy numeric position argument 1.
h = legend('Error FT1','Error FT2','Error FT3','Location','NorthEast');
set(h,'Interpreter','none');

% generate difference plots manually from the following data
%diff_net_1_net_2 = net_1_perf_collection - net_2_perf_collection;
%diff_net_2_net_3 = net_2_perf_collection - net_3_perf_collection;
%diff_net_1_net_3 = net_1_perf_collection - net_3_perf_collection;

% total run time in seconds (no semicolon: printed to the command window)
duration = etime(clock, clock_start)