% Perform the parametric polynomial regression with validation approaches
% Author: Chia-Feng Lu, 2020.06.03
clear, close all

%% load Auto data:
% readmatrix is used instead of xlsread: xlsread is no longer recommended by
% MathWorks and, in 'basic' mode (any system without Excel for Windows), it is
% limited to Excel-format workbooks. readmatrix returns NaN for fields that
% are not numeric, so missing entries can be detected with isnan below.
% NOTE(review): the header line is expected to be detected and skipped
% automatically - confirm against the local copy of Auto.csv.
num=readmatrix('Auto.csv');
mpg=num(:,1); % gas mileage in miles per gallon (1st column of the file)
horsepower=num(:,4); % horsepower (4th column of the file)

% Remove every observation that lacks either variable; a NaN in the predictor
% or in the response would make polyfit return NaN coefficients.
excludeind=isnan(horsepower) | isnan(mpg); % logical mask of incomplete observations
mpg(excludeind)=[];
horsepower(excludeind)=[];

%% perform polynomial regression
% Fit polynomials of degree 1, 2 and 5 to mpg as a function of horsepower and
% compute the coefficient of determination R^2 = 1 - RSS/TSS of each fit.
%
% polyfit is asked for its third output mu = [mean(x); std(x)], which makes it
% fit on the centered and scaled predictor (x-mu(1))/mu(2). This keeps the
% higher-degree fits well conditioned. As a consequence the coefficient
% vectors p_d1, p_d2 and p_d5 refer to the scaled predictor and have to be
% evaluated with polyval(p,x,[],mu).
% mu depends only on horsepower, so the three calls return the same value.
TSS=sum((mpg-mean(mpg)).^2); % total sum of squares (does not depend on the model)

[p_d1,~,mu]=polyfit(horsepower,mpg,1); % linear
RSS=sum((polyval(p_d1,horsepower,[],mu)-mpg).^2); % residual sum of squares
R_square_d1=1-RSS/TSS;

[p_d2,~,mu]=polyfit(horsepower,mpg,2); % Degree 2
RSS=sum((polyval(p_d2,horsepower,[],mu)-mpg).^2); % residual sum of squares
R_square_d2=1-RSS/TSS;

[p_d5,~,mu]=polyfit(horsepower,mpg,5); % Degree 5
RSS=sum((polyval(p_d5,horsepower,[],mu)-mpg).^2); % residual sum of squares
R_square_d5=1-RSS/TSS;

% Scatter plot of the observations with the three fitted curves on top
figure,
scatter(horsepower,mpg), hold on
x=sort(horsepower); % sorted abscissa so that each curve is drawn left to right
plot(x,polyval(p_d1,x,[],mu),'k-','linewidth',2)
plot(x,polyval(p_d2,x,[],mu),'r-','linewidth',2)
plot(x,polyval(p_d5,x,[],mu),'g-','linewidth',2)
xlabel('horsepower'), ylabel('mpg')
legend('Observations',...
             ['Linear, R^2=' num2str(R_square_d1)],...
             ['Degree 2, R^2=' num2str(R_square_d2)],...
             ['Degree 5, R^2=' num2str(R_square_d5)])
         
%% Validation-set approach
% Fit polynomials of degree 1..maxDegree on a random half of the data and
% measure the mean squared error (MSE) of each fit on the other half.
rng(0) % fixed seed so that the random split is reproducible
maxDegree=7; % highest polynomial degree to evaluate
p=cell(1,maxDegree); % p{dg}: coefficients of the degree-dg fit (scaled predictor)
MSE=zeros(1,maxDegree); % MSE(dg): validation error of the degree-dg fit
% randomly hold out 50% data for validation
C = cvpartition(length(mpg),'HoldOut',0.5); 
trainIdx=training(C); % logical mask of the training observations
testIdx=test(C); % logical mask of the held-out observations
for dg=1:maxDegree % the degree of polynomial regression
    % Build the regression model using only the training set. The third
    % output mu = [mean; std] of the training predictor makes polyfit work on
    % the centered and scaled predictor, which keeps high degrees well
    % conditioned; mu is the same for every degree because the training set
    % does not change.
    [p{dg},~,mu]=polyfit(horsepower(trainIdx),mpg(trainIdx),dg);
    % Calculate mean squared error on the validation set (same scaling mu)
    MSE(dg)=mean((polyval(p{dg},horsepower(testIdx),[],mu)-mpg(testIdx)).^2);
end

figure, plot(1:maxDegree,MSE,'b.-','markersize',16)
xlabel('Degree of polynomial regression')
ylabel('Mean squared error')
ylim([14 28])
grid on

%% K-fold cross validation
% For each of the K folds, fit polynomials of degree 1..maxDegree on the
% remaining folds and measure the mean squared error (MSE) on the held-out
% fold. One curve is drawn per fold, plus the average over folds in red.
rng(0) % fixed seed so that the random partition is reproducible
K=5; % number of folds
maxDegree=7; % highest polynomial degree to evaluate
p=cell(maxDegree,K); % p{dg,i}: coefficients of the degree-dg fit for fold i (scaled predictor)
mu=cell(1,K); % mu{i}: [mean; std] of the training predictor for fold i
MSE=zeros(maxDegree,K); % MSE(dg,i): validation error of degree dg on fold i

% randomly partition data to K-fold for cross validation
C = cvpartition(length(mpg),'Kfold',K); 
figure, hold on
for i=1:K
    trainIdx=training(C,i); % logical mask of the training observations of fold i
    testIdx=test(C,i); % logical mask of the held-out observations of fold i
    for dg=1:maxDegree % the degree of polynomial regression
        % Build the regression model using only the training set. Requesting
        % mu makes polyfit center and scale the predictor, which keeps the
        % high-degree fits well conditioned.
        [p{dg,i},~,mu{i}]=polyfit(horsepower(trainIdx),mpg(trainIdx),dg);
        % Calculate mean squared error on the validation set (same scaling)
        MSE(dg,i)=mean((polyval(p{dg,i},horsepower(testIdx),[],mu{i})-mpg(testIdx)).^2);
    end
    plot(1:maxDegree,MSE(:,i),'.-','markersize',16)
end 
% average of the fold-wise errors for each degree
plot(1:maxDegree,mean(MSE,2),'r.-','markersize',16,'linewidth',2)
xlabel('Degree of polynomial regression')
ylabel('Mean squared error')
ylim([14 28])
grid on
box on

%% Leave-one-out cross validation (LOOCV)
% Each observation is held out once: polynomials of degree 1..maxDegree are
% fitted on all other observations and the squared prediction error on the
% held-out observation is recorded. One curve is drawn per held-out
% observation, plus the average over all of them in red.
rng(0)
maxDegree=7; % highest polynomial degree to evaluate

% partition the data so that every test set holds exactly one observation
C = cvpartition(length(mpg),'leaveout'); 
nSets=C.NumTestSets; % number of train/test splits
p=cell(maxDegree,nSets); % p{dg,i}: coefficients of the degree-dg fit for split i (scaled predictor)
mu=cell(1,nSets); % mu{i}: [mean; std] of the training predictor for split i
MSE=zeros(maxDegree,nSets); % MSE(dg,i): squared error of degree dg on split i
figure, hold on
for i=1:nSets
    trainIdx=training(C,i); % logical mask of the training observations of split i
    testIdx=test(C,i); % logical mask selecting the held-out observation
    for dg=1:maxDegree % the degree of polynomial regression
        % Build the regression model using only the training set. Requesting
        % mu makes polyfit center and scale the predictor, which keeps the
        % high-degree fits well conditioned.
        [p{dg,i},~,mu{i}]=polyfit(horsepower(trainIdx),mpg(trainIdx),dg);
        % Calculate mean squared error on the validation set (same scaling)
        MSE(dg,i)=mean((polyval(p{dg,i},horsepower(testIdx),[],mu{i})-mpg(testIdx)).^2);
    end
    plot(1:maxDegree,MSE(:,i),'.-','markersize',16)
end 
% average of the per-observation errors for each degree
plot(1:maxDegree,mean(MSE,2),'r.-','markersize',16,'linewidth',2)
xlabel('Degree of polynomial regression')
ylabel('Mean squared error')
% ylim([14 28])
grid on
box on





         