##Discriminate lab 2 - R script ##Setup your R work session setwd('c:/work/stats/projects/2008/discriminate/') library(boot) library(energy) library(MASS) library(vegan) library(nortest) source('c:/work/R/scripts/biostats.R') turtle<-read.csv('byturtle.csv',header=TRUE) y<-turtle[,6:30] grp<-turtle[,3] table(grp) ##Evaluate the model assumptions #Equal Covariance Matrices cov.test(y,grp,method='bartlett') cov.test(y,grp,method='fligner') box.plots(turtle,'openwater:distupland',by='turtle') grp.sizes<-table(grp) #group sample sizes y.sort<-y[order(grp),] #sorted data matrix eqdist.etest(y.sort,sizes=grp.sizes,distance=FALSE,R=999) y.std<-data.stand(y,method='standardize',margin='column',plot=FALSE) y.eucl<-data.dist(y.std,method='euclidean') dist.plots(y.eucl,grp) #Multivariate Normality norm.test(y,grp,method='ad') hist.plots(turtle,'openwater:distupland',by='turtle') mvnorm.etest(y[grp==0,]) mvnorm.etest(y[grp==1,]) dist.plots(y.eucl,grp) #Singularities and Multicollinearity as.dist(cor(y)) summary(aov(y$edgefine~grp)) summary(aov(y$edgecoarse~grp)) y<-y[,-c(17,18,20,22)] names(y) #Linearity pairs(y[1:10]) ##Transform and/or standardize data as appropriate ##Stepwise selection of variables y.mat<-as.matrix(y) summary(lm(y.mat~grp)) summary(manova(y.mat[,c(1,4)]~grp),test='Wilks') summary(manova(y.mat[,c(1,6)]~grp),test='Wilks') summary(manova(y.mat[,c(1,4,6,7,9,10,15,18,21)]~grp),test='Wilks') y<-y[,c(1,4,6,7,9,10,15,18,21)] names(y) y.glm<-glm(grp~.,family='binomial',data=y) y.step<-stepAIC(y.glm,scope=list(upper=~.,lower=~1)) ##Conduct the discriminant analysis #linear discriminant analysis y.lda<-lda(y,grouping=grp) y.lda y.lda<-lda(y,grouping=grp,prior=c(.2,.8)) #singular values (eigenvalues) and proportion of trace y.lda$svd y.lda$svd^2/sum(y.lda$svd^2) #canonical scores and canonical correlation y.lda.pred<-predict(y.lda) y.lda.pred scores<-y.lda.pred$x summary(lm(scores~grp)) #plots of canonical scores x<-as.data.frame(cbind(grp,scores)) box.plots(x,var='LD1',by='grp',notch=TRUE,varwidth=TRUE) hist.plots(x,var='LD1',by='grp') #eqscplot(scores[,c(1,2)],type='n',xlab='LD1',ylab='LD2') #text(scores[,c(1,2)],col=grp,labels=grp) – if grp is numeric #text(scores[,c(1,2)],col=c('blue','red')[grp],labels=c('a','b')[grp]) – if grp is character or desire specific labels #classification accuracy y.table<-table(grp,y.lda.pred$class) y.table sum(diag(y.table))/sum(y.table) cohen.kappa(y.table) tau(y.table,prior=y.lda$prior) #quadratic discriminant analysis y.qda<-qda(y,grouping=grp) y.qda y.qda$ldet #qda classification accuracy - resubstitution y.qda.pred<-predict(y.qda) y.table<-table(grp,y.qda.pred$class) y.table sum(diag(y.table))/sum(y.table) cohen.kappa(y.table) #qda classification accuracy - jackknife cross validation y.qda.pred<-predict(y.qda,method='looCV') y.table<-table(grp,y.qda.pred$class) y.table sum(diag(y.table))/sum(y.table) cohen.kappa(y.table) #raw canonical coefficients y.lda$scaling #structure coefficients (canonical loadings) lda.structure(scores,y) ##Validating the discriminant analysis ran.split(y,grp,prop=.5) y.qda<-qda(calibrate,grouping=grp.cal) #can change priors here y.qda.pred<-predict(y.qda,newdata=validate) y.table<-table(grp.val,y.qda.pred$class) y.table sum(diag(y.table))/sum(y.table) cohen.kappa(y.table) class.monte(y,grouping=grp,prop=.5,type='qda') #if priors proportional class.monte(y,grouping=grp,prop=.5,type='qda',prior=c(.2,.8)) #if priors specified ##Final evaluation of assumptions cov.test(scores,grp,method='bartlett') norm.test(scores,grp,method='ad') scores.grp<-as.data.frame(cbind(scores,grp)) qqnorm.plots(scores.grp,var='LD1',by='grp') uv.outliers(scores.grp,var='LD1',by='grp')