R Scripts
x=read.csv(file.choose(),header=T) #import data set
#CORRELATION PLOT (sthda)
library(lattice) #make sure this is installed
my_cols <- c("dark red", "dark green", "dark blue") #choose group colors
# load panel functions
panel.hist <- function(s, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(s, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="grey")
}
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
pairs(x[2:8],pch = 19,
col = my_cols[x$Class],diag.panel=panel.hist,main="Wheat Kernels",
upper.panel=panel.cor)
mtext("Wheat Varieties: blue= Rosa; green= Kama; red= Canadian", 1, line=3.7,cex=.8) # add text to plot
#PCA (Coghlan 2010)
library(ggplot2)
standardise=as.data.frame(scale(x[2:8])) #standardize data first
x.Class=x[,1]
x.pca=prcomp(standardise)
summary(x.pca)
screeplot(x.pca,type="lines") #To decide which PC’s to retain
(x.pca$sdev)^2 #Variance
x.pca$rotation[,1] #loadings for PC 1
x.pca$rotation[,2] #loadings for PC 2
head(x.pca$x) #values of PC's
#PLOT PCA (R-Bloggers)
require(devtools)
library(devtools)
install_github("ggbiplot", "vqv")
library(ggbiplot)
g <- ggbiplot(x.pca, obs.scale = 1, var.scale = 1,
groups = x.Class, ellipse = TRUE,
circle = TRUE, cex=1)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
#LINEAR DISCRIMINANT ANALYSIS (Coghlan 2010)
library(MASS)
x=read.csv(file.choose(),header=T) #seed1.csv (training data set)
y=read.csv(file.choose(),header=T) #seed2.csv (prediction data set)
# Wheat variety discriminant analysis
r3 = lda(formula = Class ~ .,data = x) # Use “~ .” to select all variables
plda = predict(object = r3, newdata = y) # predict function (allocation method below)
plda$x #LD function values
plda$class # Classification result
y$Class # Actual varieties
#PLOT LDA (stackoverflow)
ggplotLDAPrep <- function(x){
if (!is.null(Terms <- x$terms)) {
data <- model.frame(x)
X <- model.matrix(delete.response(Terms), data)
g <- model.response(data)
xint <- match("(Intercept)", colnames(X), nomatch = 0L)
if (xint > 0L)
X <- X[, -xint, drop = FALSE]
}
means <- colMeans(x$means)
X <- scale(X, center = means, scale = FALSE) %*% x$scaling
rtrn <- as.data.frame(cbind(X,labels=as.character(g)))
rtrn <- data.frame(X,labels=as.character(g))
return(rtrn)
}
ldaobject <- lda(Class ~ ., data=x) #select which data to plot (whole or half of data set)
fitGraph <- ggplotLDAPrep(ldaobject)
ggplot(fitGraph, aes(LD1,LD2, color=labels))+
geom_point() +
stat_ellipse(aes(x=LD1, y=LD2, fill = labels), alpha = 0.2, geom = "polygon")
#LDA HISTOGRAMS (Coghlan 2010)
ldahist(data=seed.lda.values$x[,1],g=x$Class) #first ld function
ldahist(data=seed.lda.values$x[,2],g=x$Class) #second ld function
# LDA METHOD 2 ~ ALLOCATION RULES (Coghlan 2010)
seed.lda=lda(x$Class~x$A + x$W + x$L + x$P + x$C + x$AC + x$LG)
#You can choose which variables to use instead of using “~ .”
#Use half of data set to find midpoints to use for the other half
seed.lda #loadings
seed.lda.values=predict(seed.lda, x[2:8])
seed.lda.values$x #gives values of discriminant functions
#FIND MEAN OF LDA VALUES FOR EACH VARIETY (Coghlan 2010)
printMeanAndSdByGroup <- function(variables,groupvariable)
{
# find the names of the variables
variablenames <- c(names(groupvariable),names(as.data.frame(variables)))
# within each group, find the mean of each variable
groupvariable <- groupvariable[,1] # ensures groupvariable is not a list
means <- aggregate(as.matrix(variables) ~ groupvariable, FUN = mean)
names(means) <- variablenames
print(paste("Means:"))
print(means)
# within each group, find the standard deviation of each variable:
sds <- aggregate(as.matrix(variables) ~ groupvariable, FUN = sd)
names(sds) <- variablenames
print(paste("Standard deviations:"))
print(sds)
# within each group, find the number of samples:
samplesizes <- aggregate(as.matrix(variables) ~ groupvariable, FUN = length)
names(samplesizes) <- variablenames
print(paste("Sample sizes:"))
print(samplesizes)
}
printMeanAndSdByGroup(seed.lda.values$x, x[1]) #use means to find mp's to make allocaion rules
#CORRELATION PLOT (sthda)
library(lattice) #make sure this is installed
my_cols <- c("dark red", "dark green", "dark blue") #choose group colors
# load panel functions
panel.hist <- function(s, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(s, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="grey")
}
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
pairs(x[2:8],pch = 19,
col = my_cols[x$Class],diag.panel=panel.hist,main="Wheat Kernels",
upper.panel=panel.cor)
mtext("Wheat Varieties: blue= Rosa; green= Kama; red= Canadian", 1, line=3.7,cex=.8) # add text to plot
#PCA (Coghlan 2010)
library(ggplot2)
standardise=as.data.frame(scale(x[2:8])) #standardize data first
x.Class=x[,1]
x.pca=prcomp(standardise)
summary(x.pca)
screeplot(x.pca,type="lines") #To decide which PC’s to retain
(x.pca$sdev)^2 #Variance
x.pca$rotation[,1] #loadings for PC 1
x.pca$rotation[,2] #loadings for PC 2
head(x.pca$x) #values of PC's
#PLOT PCA (R-Bloggers)
require(devtools)
library(devtools)
install_github("ggbiplot", "vqv")
library(ggbiplot)
g <- ggbiplot(x.pca, obs.scale = 1, var.scale = 1,
groups = x.Class, ellipse = TRUE,
circle = TRUE, cex=1)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
#LINEAR DISCRIMINANT ANALYSIS (Coghlan 2010)
library(MASS)
x=read.csv(file.choose(),header=T) #seed1.csv (training data set)
y=read.csv(file.choose(),header=T) #seed2.csv (prediction data set)
# Wheat variety discriminant analysis
r3 = lda(formula = Class ~ .,data = x) # Use “~ .” to select all variables
plda = predict(object = r3, newdata = y) # predict function (allocation method below)
plda$x #LD function values
plda$class # Classification result
y$Class # Actual varieties
#PLOT LDA (stackoverflow)
ggplotLDAPrep <- function(x){
if (!is.null(Terms <- x$terms)) {
data <- model.frame(x)
X <- model.matrix(delete.response(Terms), data)
g <- model.response(data)
xint <- match("(Intercept)", colnames(X), nomatch = 0L)
if (xint > 0L)
X <- X[, -xint, drop = FALSE]
}
means <- colMeans(x$means)
X <- scale(X, center = means, scale = FALSE) %*% x$scaling
rtrn <- as.data.frame(cbind(X,labels=as.character(g)))
rtrn <- data.frame(X,labels=as.character(g))
return(rtrn)
}
ldaobject <- lda(Class ~ ., data=x) #select which data to plot (whole or half of data set)
fitGraph <- ggplotLDAPrep(ldaobject)
ggplot(fitGraph, aes(LD1,LD2, color=labels))+
geom_point() +
stat_ellipse(aes(x=LD1, y=LD2, fill = labels), alpha = 0.2, geom = "polygon")
#LDA HISTOGRAMS (Coghlan 2010)
ldahist(data=seed.lda.values$x[,1],g=x$Class) #first ld function
ldahist(data=seed.lda.values$x[,2],g=x$Class) #second ld function
# LDA METHOD 2 ~ ALLOCATION RULES (Coghlan 2010)
seed.lda=lda(x$Class~x$A + x$W + x$L + x$P + x$C + x$AC + x$LG)
#You can choose which variables to use instead of using “~ .”
#Use half of data set to find midpoints to use for the other half
seed.lda #loadings
seed.lda.values=predict(seed.lda, x[2:8])
seed.lda.values$x #gives values of discriminant functions
#FIND MEAN OF LDA VALUES FOR EACH VARIETY (Coghlan 2010)
printMeanAndSdByGroup <- function(variables,groupvariable)
{
# find the names of the variables
variablenames <- c(names(groupvariable),names(as.data.frame(variables)))
# within each group, find the mean of each variable
groupvariable <- groupvariable[,1] # ensures groupvariable is not a list
means <- aggregate(as.matrix(variables) ~ groupvariable, FUN = mean)
names(means) <- variablenames
print(paste("Means:"))
print(means)
# within each group, find the standard deviation of each variable:
sds <- aggregate(as.matrix(variables) ~ groupvariable, FUN = sd)
names(sds) <- variablenames
print(paste("Standard deviations:"))
print(sds)
# within each group, find the number of samples:
samplesizes <- aggregate(as.matrix(variables) ~ groupvariable, FUN = length)
names(samplesizes) <- variablenames
print(paste("Sample sizes:"))
print(samplesizes)
}
printMeanAndSdByGroup(seed.lda.values$x, x[1]) #use means to find mp's to make allocaion rules