Skip to content.

bioconductor.org

Bioconductor is an open source and open development software project
for the analysis and comprehension of genomic data.

Sections

lab5.R

##load up required libraries

library(cluster) library(Biobase) library(annotate) library(genefilter) library(golubEsets)

############################ ## data processing ###########################

data(golubTrain) golubTrain data(golubTest) golubTest

LS<-exprs(golubTrain) cl<-golubTrain$ALL.AML TS<-exprs(golubTest) clts<-golubTest$ALL.AML

##mmfilt is in the library

LS[LS<100]<-100 TS[TS<100]<-100 LS[LS>16000]<-16000 TS[TS>16000]<-16000

mmfilt <- function(r=5, d=500, na.rm=TRUE) { function(x) { minval <- min(x, na.rm=na.rm) maxval <- max(x, na.rm=na.rm) (maxval/minval > 5) && (maxval-minval > 500) } }

mmfun <- mmfilt()

ffun <- filterfun(mmfun)

good <- genefilter(cbind(LS, TS), ffun )

sum(good) ##I get 3571

LSsub <- log10(LS[good,]) TSsub <- log10(TS[good,]) LTsub <- cbind(LSsub, TSsub)

############################## ## Annotate : optional ##############################

##probably want to do a ttest filter here ##but we'll leave that to you

ll<- read.annotation("hgu68ll") #locus link gname <- row.names(LSsub)[50] llname<-get(gname, env=ll) locuslink(llname) ##open your browser to the right thing

chroms <- read.annotation("hgu68Chrom")

whichC <- multiget(row.names(LSsub[1:100,]), env=chroms, iffail=NA) cc <-as.numeric(unlist(whichC)) hist(cc)

################################### ## Clustering #################################

gf <- gapFilter(550, 800, .1) ff <- filterfun(gf) xx <- 10^LSsub ##to get back to original units good <- genefilter(xx, gf) sum(good) # should be 1093

LS2 <- LSsub[good,]

library(mva) clust.res<-hclust(as.dist(1-cor(LS2)), method = "complete") plot.hclust(clust.res, labels=cl,hang = 0.1)

clust.res<-hclust(dist(t(LS2)), method = "average") plot.hclust(clust.res, labels=cl,hang = 0.1)

pamLS <- pam(as.dist(1-cor(LS2)), 3, diss=TRUE) plot(pamLS)

####################################### ## classification ##################################

library(class)

ALL <- c(as.character(golubTrain$ALL), as.character(golubTest$ALL)) BT <- c(as.character(golubTrain$T.B), as.character(golubTest$T.B)) BTs <- ifelse(BT=="B-cell", "B", "T") BTs <- ifelse(BT=="T-cell", "T", BTs) BTs <- ifelse(BT=="NA", "", BTs) newALL <- factor(paste(ALL, BTs, sep="")) nALL.train <- newALL[1:38] nALL.test <-newALL[39:72]

##now do an Anova filter af <- Anova(nALL.train, p=0.0000001) ffA <- filterfun(af) wh.ALL <- genefilter(LSsub, af) #use only training data sum(wh.ALL) #how many

##how does it work on the test set pred.test <- knn(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train, k=5) table(pred.test, nALL.test)

##how does it work on the training set pred.tr <- knn(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train, k=5) table(pred.tr, nALL.train)

## ##use knn and cross-validation to choose the best k!

knn.cvk<-function(LS, cl, k=1:5, l=0, prob=FALSE, use.all=TRUE) { classes<-unique(cl) cv.err<-rep(0,length(k)) cl.pred<-matrix(NA, nrow(LS), length(k)) for(j in (1:length(k))) { cl.pred[,j]<-knn.cv(LS, cl, k[j], l, prob, use.all) cv.err[j]<-sum(cl!=cl.pred[,j]) } k0<-k[which.min(cv.err)] return(k=k0,pred=classes[cl.pred[,which.min(cv.err)]]) }

whichone <- knn.cvk(t(LSsub[wh.ALL,]), nALL.train)

##Tree based classifiers

library(rpart)

dorpart <- function(LS, TS, cl) { LS <- as.data.frame(LS) TS <- as.data.frame(TS) dimnames(TS)[[2]] <- dimnames(LS)[[2]] <- paste("V", 1:ncol(LS), sep = "") fit <- rpart(factor(cl) ~ ., data = LS) predict.rpart(fit, TS, type = "class") }

all.rp <- dorpart(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train) table(all.rp, nALL.test)

##Discriminant Analysis

library(MASS)

dolda <- function(LS, TS, cl, prior, pmethod="plug-in") { ndata <- data.frame(LS) ndata$cl <- cl if( missing(prior) ) mn <- lda(cl~., data=ndata) else mn <- lda(cl~., data=ndata, prior=prior) pdata<-data.frame(TS) preds <- predict(mn, pdata, method=pmethod) rval <- list(lda = mn, preds=preds) class(rval) <- "mylda" rval }

lda.test <- dolda(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train) table(lda.test$preds$class, nALL.test)

##how does it work on the training set ##not very meaningful lda.pred.tr <- dolda(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train) table(lda.pred.tr$preds$class, nALL.train)

News
2009-10-26

BioC 2.5, consisting of 352 packages and designed to work with R 2.10.z, was released today.

2009-01-07

R, the open source platform used by Bioconductor, featured in a series of articles in the New York Times.