## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"--------------------
BiocStyle::latex(relative.path = TRUE)

## ----knitr, echo=FALSE, results="hide"-------------------------------------
library("knitr")
opts_chunk$set(tidy=FALSE,
               dev="png",
               fig.show="hide",
               # fig.width=4, fig.height=4.5,
               fig.width=10, fig.height=8,
               fig.pos="tbh",
               cache=TRUE,
               message=FALSE)

## ----chu_citation, dev="pdf"-----------------------------------------------
citation("DaMiRseq")

## ----chu_1-----------------------------------------------------------------
library(DaMiRseq)

## only for example:
# rawdata.path <- system.file(package = "DaMiRseq", "extdata")
# setwd(rawdata.path)
# filecounts <- list.files(rawdata.path, full.names = TRUE)[2]
# filecovariates <- list.files(rawdata.path, full.names = TRUE)[1]
# count_data <- read.delim(filecounts)
# covariate_data <- read.delim(filecovariates, stringsAsFactors = TRUE)
# SE <- DaMiR.makeSE(count_data, covariate_data)

## ----chu_2-----------------------------------------------------------------
data(SE)
assay(SE)[1:5, c(1:5, 21:25)]
colData(SE)

## ----chu_4-----------------------------------------------------------------
data_norm <- DaMiR.normalization(SE, minCounts=10, fSample=0.7, hyper = "no")

## ----chu_5-----------------------------------------------------------------
data_norm <- DaMiR.normalization(SE, minCounts=10, fSample=0.7,
                                 hyper = "yes", th.cv=3)
print(data_norm)
assay(data_norm)[c(1:5), c(1:5, 21:25)]

## ----chu_6, eval=FALSE-----------------------------------------------------
# # Time Difference, using VST or rlog for normalization:
# #
# # data_norm <- DaMiR.normalization(SE, minCounts=10, fSample=0.7, th.cv=3)
# # VST: about 80 seconds
# #
# # data_norm <- DaMiR.normalization(SE, minCounts=10, fSample=0.7, th.cv=3,
# #                                  type="rlog")
# # rlog: about 8890 seconds (i.e. 2 hours and 28 minutes!)
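## The chunk above (chu_6) only quotes the VST vs. rlog timings as comments.
## A minimal sketch to reproduce the comparison on your own machine is given
## below; it assumes the same 'SE' object loaded by data(SE) and simply wraps
## each normalization call in base R's system.time(). The 80 s / 8890 s
## figures above are from the original run and will vary with hardware and
## dataset size, which is why the calls are left commented out.
# vst_time  <- system.time(
#   DaMiR.normalization(SE, minCounts=10, fSample=0.7, th.cv=3)
# )
# rlog_time <- system.time(
#   DaMiR.normalization(SE, minCounts=10, fSample=0.7, th.cv=3, type="rlog")
# )
# vst_time["elapsed"]; rlog_time["elapsed"]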
## ----chu_7-----------------------------------------------------------------
data_filt <- DaMiR.sampleFilt(data_norm, th.corr=0.9)
dim(data_filt)

## ----chu_8, dev="pdf"------------------------------------------------------
sv <- DaMiR.SV(data_filt)

## ----chu_10, dev="pdf"-----------------------------------------------------
data_adjust <- DaMiR.SVadjust(data_filt, sv, n.sv=4)
assay(data_adjust[c(1:5), c(1:5, 21:25)])

## ----chu_11, dev="pdf"-----------------------------------------------------
# After gene filtering and normalization
DaMiR.Allplot(data_filt, colData(data_filt))

## ----chu_12, dev="pdf"-----------------------------------------------------
# After sample filtering and sv adjusting
DaMiR.Allplot(data_adjust, colData(data_adjust))

## ----chu_export, dev="pdf", eval=FALSE--------------------------------------
# outputfile <- "DataNormalized.txt"
# write.table(assay(data_norm), file = outputfile, quote = FALSE, sep = "\t")

## ----chu_13-----------------------------------------------------------------
set.seed(12345)
data_clean <- DaMiR.transpose(assay(data_adjust))
df <- colData(data_adjust)
data_reduced <- DaMiR.FSelect(data_clean, df, th.corr=0.4)

## ----chu_14, dev="pdf"-------------------------------------------------------
data_reduced <- DaMiR.FReduct(data_reduced$data)
DaMiR.MDSplot(data_reduced, df)

## ----chu_15, dev="pdf"-------------------------------------------------------
# Rank genes by importance:
df.importance <- DaMiR.FSort(data_reduced, df)
head(df.importance)

## ----chu_16, dev="pdf"-------------------------------------------------------
# Select Best Predictors:
selected_features <- DaMiR.FBest(data_reduced, ranking=df.importance,
                                 n.pred = 5)
selected_features$predictors

# Dendrogram and heatmap:
DaMiR.Clustplot(selected_features$data, df)

## ----chu_17, dev="pdf"-------------------------------------------------------
Classification_res <- DaMiR.EnsembleLearning(selected_features$data,
                                             classes=df$class,
                                             fSample.tr = 0.5,
                                             fSample.tr.w = 0.5,
                                             iter = 30)

## ----chu_17bis1, dev="pdf"---------------------------------------------------
# Dataset for prediction
set.seed(10101)
nSampl_cl1 <- 5
nSampl_cl2 <- 5

## May create unbalanced Learning and Test sets
# idx_test <- sample(1:ncol(data_adjust), 10)

# Create balanced Learning and Test sets
idx_test_cl1 <- sample(1:(ncol(data_adjust)/2), nSampl_cl1)
idx_test_cl2 <- sample(1:(ncol(data_adjust)/2), nSampl_cl2) + ncol(data_adjust)/2
idx_test <- c(idx_test_cl1, idx_test_cl2)

Test_set <- data_adjust[, idx_test, drop=FALSE]
Learning_set <- data_adjust[, -idx_test, drop=FALSE]

## ----chu_17bis12, dev="pdf"--------------------------------------------------
# Training and Test into a 'nfold' Cross Validation
nfold <- 3
cv_sample <- c(rep(seq_len(nfold), each=ncol(Learning_set)/(2*nfold)),
               rep(seq_len(nfold), each=ncol(Learning_set)/(2*nfold)))

# Variables initialization
cv_models <- list()
cv_predictors <- list()
res_df <- data.frame(matrix(nrow = nfold, ncol = 7))
colnames(res_df) <- c("Accuracy", "N.predictors", "MCC",
                      "sensitivity", "Specificity", "PPV", "NPV")
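## The fold labels in 'cv_sample' above assume that Learning_set is balanced
## and that its columns are ordered by class (all class-1 samples first, then
## all class-2 samples), as produced by the balanced split in chu_17bis1.
## An optional sanity check is sketched below; it uses only base R and objects
## already defined, and should show every fold containing both classes.
# table(cv_sample, colData(Learning_set)$class)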
## ----chu_17bis2, dev="pdf", results="hide"-----------------------------------
for (cv_fold in seq_len(nfold)){

  # Create Training and Validation Sets
  idx_cv <- which(cv_sample != cv_fold)
  TR_set <- Learning_set[, idx_cv, drop=FALSE]
  Val_set <- Learning_set[, -idx_cv, drop=FALSE]

  #### Feature selection
  data_reduced <- DaMiR.FSelect(t(assay(TR_set)),
                                as.data.frame(colData(TR_set)),
                                th.corr=0.4)
  data_reduced <- DaMiR.FReduct(data_reduced$data, th.corr = 0.9)
  df_importance <- DaMiR.FSort(data_reduced,
                               as.data.frame(colData(TR_set)))
  selected_features <- DaMiR.FBest(data_reduced,
                                   ranking=df_importance,
                                   autoselect = "yes")

  # update datasets
  TR_set <- TR_set[selected_features$predictors, , drop=FALSE]
  Val_set <- Val_set[selected_features$predictors, , drop=FALSE]

  ### Model building
  ensl_model <- DaMiR.EnsL_Train(TR_set, cl_type = c("RF", "LR"))
  # Store all trained models
  cv_models[[cv_fold]] <- ensl_model

  ### Model testing
  res_Val <- DaMiR.EnsL_Test(Val_set, EnsL_model = ensl_model)

  # Store all ML results
  res_df[cv_fold,1] <- res_Val$accuracy[1] # Accuracy
  res_df[cv_fold,2] <- length(res_Val$predictors) # N. of predictors
  res_df[cv_fold,3] <- res_Val$MCC[1]
  res_df[cv_fold,4] <- res_Val$sensitivity[1]
  res_df[cv_fold,5] <- res_Val$Specificty[1]
  res_df[cv_fold,6] <- res_Val$PPV[1]
  res_df[cv_fold,7] <- res_Val$NPV[1]

  cv_predictors[[cv_fold]] <- res_Val$predictors
}

## ----chu_17bis3, dev="pdf"---------------------------------------------------
# Model Selection
res_df[,1:5]
idx_best_model <- DaMiR.ModelSelect(res_df,
                                    type.sel = "mode",
                                    npred.sel = "min")

## ----chu_17bis4, dev="pdf"---------------------------------------------------
# Prediction on the independent test set
res_predict <- DaMiR.EnsL_Predict(Test_set,
                                  bestModel = cv_models[[idx_best_model]])

# Predictors
cv_predictors[[idx_best_model]]

# Prediction assessment for Ensemble learning
id_classifier <- 1 # Ensemble Learning
table(colData(Test_set)$class, res_predict[,id_classifier])

# Prediction assessment for Logistic regression
id_classifier <- 3 # Logistic regression
table(colData(Test_set)$class, res_predict[,id_classifier])

## ----chu_17ter1, dev="pdf"---------------------------------------------------
data(SE)

# create Independent test set and Learning set (raw counts)
idx_test <- c(18,19,39,40)
Ind_Test_set <- SE[, idx_test, drop=FALSE]
Learning_set <- SE[, -idx_test, drop=FALSE]

# DaMiRseq pipeline on Learning Set
data_norm <- DaMiR.normalization(Learning_set, minCounts=10, fSample=0.7,
                                 hyper = "yes", th.cv=3)
sv <- DaMiR.SV(data_norm)
data_adjust <- DaMiR.SVadjust(data_norm, sv, n.sv=4)

## ----chu_17ter2, dev="pdf"---------------------------------------------------
# remove non-expressed genes from Learning_set and Ind_Test_set
expr_LearningSet <- Learning_set[rownames(data_norm)]
expr_Ind_Test_set <- Ind_Test_set[rownames(data_norm)]

# Independent test set Normalization
norm_ind_ts <- DaMiR.iTSnorm(expr_LearningSet,
                             expr_Ind_Test_set,
                             normtype = "vst",
                             method = "precise")

# Independent test set batch Adjusting
adj_norm_ind_ts <- DaMiR.iTSadjust(data_adjust, norm_ind_ts)

## ----chu_17ter3, dev="pdf"---------------------------------------------------
# Prediction on independent test set
prediction <- DaMiR.EnsL_Predict(t(adj_norm_ind_ts),
                                 bestModel = cv_models[[idx_best_model]])
prediction

# confusion matrix for the Ensemble Learner
table(colData(Ind_Test_set)$class, prediction[,1])

## ----chu_18, dev="pdf"--------------------------------------------------------
## Feature Selection
set.seed(12345)
data_clean_2 <- DaMiR.transpose(assay(data_filt))
df_2 <- colData(data_filt)

data_reduced_2 <- DaMiR.FSelect(data_clean_2, df_2, th.corr=0.4)
data_reduced_2 <- DaMiR.FReduct(data_reduced_2$data)
df.importance_2 <- DaMiR.FSort(data_reduced_2, df_2)
head(df.importance_2)

selected_features_2 <- DaMiR.FBest(data_reduced_2, ranking=df.importance_2,
                                   n.pred=5)
selected_features_2$predictors

## Classification
Classification_res_2 <- DaMiR.EnsembleLearning(selected_features_2$data,
                                               classes=df_2$class,
                                               fSample.tr = 0.5,
                                               fSample.tr.w = 0.5,
                                               iter = 30)
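## Optional: persist the final results to disk. The chunk below is a minimal
## sketch, not part of the original pipeline; the file names are arbitrary
## examples. It writes the adjusted expression matrix and the predictors
## selected in the last run to tab-delimited files using base R only.
# write.table(assay(data_adjust), file = "DataAdjusted.txt",
#             quote = FALSE, sep = "\t")
# write.table(selected_features_2$predictors, file = "SelectedFeatures.txt",
#             quote = FALSE, sep = "\t", row.names = FALSE)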
## ----sessInfo, results="asis", echo=FALSE------------------------------------
toLatex(sessionInfo())