## ----setup, echo=FALSE--------------------------------------------------- knitr::opts_chunk$set(cache=TRUE) ## ----echo=TRUE, eval=FALSE----------------------------------------------- ## fname <- file.choose() ## "ALLphenoData.tsv" ## stopifnot(file.exists(fname)) ## pdata <- read.delim(fname) ## ----echo=FALSE---------------------------------------------------------- fname <- system.file(package="IntermediateR1", "extdata", "ALLphenoData.tsv") stopifnot(file.exists(fname)) pdata <- read.delim(fname) ## ----ALL-properties------------------------------------------------------ class(pdata) colnames(pdata) dim(pdata) head(pdata) summary(pdata$sex) summary(pdata$cyto.normal) ## ----ALL-subset---------------------------------------------------------- pdata[1:5, 3:4] pdata[1:5, ] head(pdata[, 3:5]) tail(pdata[, 3:5], 3) head(pdata$age) head(pdata$sex) head(pdata[pdata$age > 21,]) ## ----ALL-subset-NA------------------------------------------------------- idx <- pdata$sex == "F" & pdata$age > 40 table(idx) dim(pdata[idx,]) ## ----ALL-BCR/ABL-subset-------------------------------------------------- bcrabl <- pdata[pdata$mol.biol %in% c("BCR/ABL", "NEG"),] ## ----ALL-BCR/ABL-drop-unused--------------------------------------------- bcrabl$mol.biol <- factor(bcrabl$mol.biol) ## ----ALL-BT-------------------------------------------------------------- levels(bcrabl$BT) ## ----ALL-BT-recode------------------------------------------------------- table(bcrabl$BT) levels(bcrabl$BT) <- substring(levels(bcrabl$BT), 1, 1) table(bcrabl$BT) ## ----ALL-BCR/ABL-BT------------------------------------------------------ xtabs(~ BT + mol.biol, bcrabl) ## ----ALL-aggregate------------------------------------------------------- aggregate(age ~ mol.biol + sex, bcrabl, mean) ## ----ALL-age------------------------------------------------------------- t.test(age ~ mol.biol, bcrabl) boxplot(age ~ mol.biol, bcrabl) ## ----echo=TRUE, eval=FALSE----------------------------------------------- ## fname <- file.choose() ## BRFSS-subset.csv ## stopifnot(file.exists(fname)) ## brfss <- read.csv(fname) ## ----echo=FALSE---------------------------------------------------------- fname <- system.file(package="CSAMA2014Lab1", "extdata", "BRFSS-subset.csv") stopifnot(file.exists(fname)) brfss <- read.csv(fname) ## ----brfss-simple-plot--------------------------------------------------- plot(sqrt(Weight) ~ Height, brfss, main="All Years, Both Sexes") ## ----brfss-subset-------------------------------------------------------- brfss2010 <- brfss[brfss$Year == "2010", ] ## ----brfss-pair-plot----------------------------------------------------- opar <- par(mfcol=c(1, 2)) plot(sqrt(Weight) ~ Height, brfss2010[brfss2010$Sex == "Female", ], main="2010, Female") plot(sqrt(Weight) ~ Height, brfss2010[brfss2010$Sex == "Male", ], main="2010, Male") par(mfcol=c(1, 1)) ## ----lattice------------------------------------------------------------- library(lattice) ## ----lattice-conditioning------------------------------------------------ xyplot(sqrt(Weight) ~ Height | Sex, brfss2010) ## ----lattice-density----------------------------------------------------- densityplot(~sqrt(Weight), brfss2010, group=Sex, plot.points=FALSE) ## ----lattice-bwplot------------------------------------------------------ bwplot(sqrt(Weight) ~ factor(Year) | Sex, brfss) ## ----lattice-violin------------------------------------------------------ bwplot(sqrt(Weight) ~ factor(Year) | Sex, brfss, panel=panel.violin) ## ----lattice-panel------------------------------------------------------- xyplot(sqrt(Weight) ~ Height|Sex, brfss2010, panel = function(x, y, ...) { panel.xyplot(x, y, ...) panel.points(median(x, na.rm=TRUE), median(y, na.rm=TRUE), cex=2, pch=20, col="red") }, type=c("p", "g", "r"), lwd=2, col.line="red", xlim=c(120, 210)) ## ----iranges------------------------------------------------------------- library(IRanges) ir <- IRanges(start=c(10, 20, 30), width=5) ir ## ----iranges-flank------------------------------------------------------- flank(ir, 3) ## ----iranges-class------------------------------------------------------- class(ir) getClassDef(class(ir)) ## ----iranges-flank-method, eval=FALSE------------------------------------ ## ?"flank,Ranges-method" ## ----granges------------------------------------------------------------- library(GenomicRanges) gr <- GRanges(c("chr1", "chr1", "chr2"), ir, strand=c("+", "-", "+")) gr ## ----granges-flank------------------------------------------------------- flank(gr, 3) ## ----granges-class------------------------------------------------------- class(gr) getClassDef(class(gr)) ## ----granges-flank-method, eval=FALSE------------------------------------ ## ?"flank,GenomicRanges-method" ## ----granges-methods, eval=FALSE----------------------------------------- ## showMethods(class="GRanges", where=search()) ## ----granges-man-and-vignettes, eval=FALSE------------------------------- ## help(package="GenomicRanges") ## vignette(package="GenomicRanges") ## vignette(package="GenomicRanges", "GenomicRangesHOWTOs") ## ----require------------------------------------------------------------- library(GenomicRanges) ## ----help, eval=FALSE---------------------------------------------------- ## help(package="GenomicRanges") ## vignette(package="GenomicRanges") ## vignette(package="GenomicRanges", "GenomicRangesHOWTOs") ## ?GRanges ## ----vectorize----------------------------------------------------------- x <- 1:10 log(x) ## NOT for (i in seq_along) x[i] <- log(x[i]) ## ----pre-allocate-------------------------------------------------------- result <- numeric(10) result[1] <- runif(1) for (i in 2:length(result)) result[i] <- runif(1) * result[i - 1] result ## ----inefficient--------------------------------------------------------- f0 <- function(n, a=2) { ## stopifnot(is.integer(n) && (length(n) == 1) && ## !is.na(n) && (n > 0)) result <- numeric() for (i in seq_len(n)) result[[i]] <- a * log(i) result } ## ----system-time--------------------------------------------------------- system.time(f0(10000)) n <- 1000 * seq(1, 20, 2) t <- sapply(n, function(i) system.time(f0(i))[[3]]) plot(t ~ n, type="b") ## ----correct-init-------------------------------------------------------- n <- 10000 system.time(expected <- f0(n)) head(expected) ## ----hoist--------------------------------------------------------------- f1 <- function(n, a=2) { result <- numeric() for (i in seq_len(n)) result[[i]] <- log(i) a * result } identical(expected, f1(n)) library(microbenchmark) microbenchmark(f0(n), f1(n), times=5) ## ----preallocate-and-fill------------------------------------------------ f2 <- function(n, a=2) { result <- numeric(n) for (i in seq_len(n)) result[[i]] <- log(i) a * result } identical(expected, f2(n)) microbenchmark(f0(n), f2(n), times=5) ## ----use-apply----------------------------------------------------------- f3 <- function(n, a=2) a * sapply(seq_len(n), log) identical(expected, f3(n)) microbenchmark(f0(n), f2(n), f3(n), times=10) ## ----use-vectorize------------------------------------------------------- f4 <- function(n, a=2) a * log(seq_len(n)) identical(expected, f4(n)) microbenchmark(f0(n), f3(n), f4(n), times=10) ## ----use-compiler-------------------------------------------------------- library(compiler) f2c <- cmpfun(f2) n <- 10000 identical(f2(n), f2c(n)) microbenchmark(f2(n), f2c(n), times=10) ## ----vectorized-scale---------------------------------------------------- n <- 100000 * seq(1, 20, 2) # 100x larger than f0 t <- sapply(n, function(i) system.time(f4(i))[[3]]) plot(t ~ n, type="b") ## ----algo-init----------------------------------------------------------- vec <- c(seq(-100,-1,length.out=1e6), rep(0,20), seq(1,100,length.out=1e6)) ## ----algo-scan----------------------------------------------------------- f0 <- function(v) sum(v < 0) ## ----algo-scan-time------------------------------------------------------ N <- seq(1, 11, 2) * 1e6 Time <- sapply(N, function(n) { v <- sort(rnorm(n)) system.time(f0(v))[[3]] }) plot(Time ~ N, type="b") ## ----algo-binary--------------------------------------------------------- f3 <- function(x, threshold=0) { imin <- 1L imax <- length(x) while (imax >= imin) { imid <- as.integer(imin + (imax - imin) / 2) if (x[imid] >= threshold) imax <- imid - 1L else imin <- imid + 1L } imax } ## ----algo-binary-perform------------------------------------------------- ## identity identical(f0((-2):2), f3((-2):2)) identical(f0(2:4), f3(2:4)) identical(f0(-(4:2)), f3(-(4:2))) identical(f0(vec), f3(vec)) ## scale N <- 10^(1:7) Time <- sapply(N, function(n) { v <- sort(rnorm(n)) system.time(f3(v))[[3]] }) plot(Time ~ N, type="b") ## ----algo-relative------------------------------------------------------- ## relative time library(microbenchmark) microbenchmark(f0(vec), f3(vec)) library(compiler) f3c <- cmpfun(f3) microbenchmark(f3(vec), f3c(vec)) ## ----findInterval-------------------------------------------------------- f4 <- function(v, query=0) findInterval(query - .Machine$double.eps, v) identical(f0(vec), f4(vec)) microbenchmark(f0(vec), f3(vec), f4(vec)) ## ----findInterval-several------------------------------------------------ threshold <- rnorm(10000) identical(sapply(threshold, f3, x=vec), f4(vec, threshold)) microbenchmark(sapply(x, f3), f4(vec, x))