## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( message = FALSE, error = FALSE, warn = FALSE, collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(data.table) library(ggplot2) library(CellBarcode) ## ----eval=FALSE--------------------------------------------------------------- # system.file("extdata", "mef_test_data", package="CellBarcode") ## ----smaples------------------------------------------------------------------ example_data <- system.file("extdata", "mef_test_data", package = "CellBarcode") fq_files <- dir(example_data, "fastq.gz", full=TRUE) # prepare metadata for the samples metadata <- stringr::str_split_fixed(basename(fq_files), "_", 10)[, c(4, 6)] metadata <- as.data.frame(metadata) sample_name <- apply(metadata, 1, paste, collapse = "_") colnames(metadata) = c("cell_number", "replication") # metadata should has the row names consistent to the sample names rownames(metadata) = sample_name metadata ## ----installation Bioconducotr, eval=FALSE------------------------------------ # if(!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install("CellBarcode") ## ----installation devel, eval=FALSE------------------------------------------- # # install.packages("remotes") # remotes::install_github("wenjie1991/CellBarcode") ## ----basic_workflow----------------------------------------------------------- # install.packages("stringr") library(CellBarcode) library(magrittr) # The example data is the mix of MEF lines with known barcodes # 2000 reads for each file have been sampled for this test dataset # extract UMI barcode with regular expression bc_obj <- bc_extract( fq_files, # fastq file pattern = "([ACGT]{12})CTCGAGGTCATCGAAGTATC([ACGT]+)CCGTAGCAAGCTCGAGAGTAGACCTACT", pattern_type = c("UMI" = 1, "barcode" = 2), sample_name = sample_name, metadata = metadata ) bc_obj # sample subset operation, select technical repeats 'mixa' bc_sub = bc_subset(bc_obj, sample=replication == "mixa") bc_sub # filter the barcode, UMI barcode amplicon >= 2 & UMI counts >= 2 bc_sub <- bc_cure_umi(bc_sub, depth = 2) %>% bc_cure_depth(depth = 2) # select barcodes with a white list bc_2df(bc_sub) bc_sub[c("AAGTCCAGTACTATCGTACTA", "AAGTCCAGTACTGTAGCTACTA"), ] # export the barcode counts to data.frame head(bc_2df(bc_sub)) # export the barcode counts to matrix head(bc_2matrix(bc_sub)) ## ----quality_control_1-------------------------------------------------------- qc_noFilter <- bc_seq_qc(fq_files) qc_noFilter bc_names(qc_noFilter) class(qc_noFilter) ## ----qc_figure_set1----------------------------------------------------------- bc_plot_seqQc(qc_noFilter) ## ----qc_figure_single1-------------------------------------------------------- qc_noFilter[1] class(qc_noFilter[1]) bc_plot_seqQc(qc_noFilter[1]) ## ----filter_low_quality_seq--------------------------------------------------- # TODO: output the filtering percentage # TODO: Trimming fq_filter <- bc_seq_filter( fq_files, min_average_quality = 30, min_read_length = 60, sample_name = sample_name) fq_filter bc_plot_seqQc(bc_seq_qc(fq_filter)) ## ----------------------------------------------------------------------------- pattern <- "([ACGA]{12})CTCGAGGTCATCGAAGTATC([ACGT]+)CCGTAGCAAGCTCGAGAGTAGACCTACT" pattern_type <- c("UMI" = 1, "barcode" = 2) ## ----extract_barcode_no_UMI--------------------------------------------------- pattern <- "CTCGAGGTCATCGAAGTATC([ACGT]+)CCGTAGCAAGCTCGAGAGTAGACCTACT" bc_obj <- bc_extract( fq_filter, sample_name = sample_name, pattern = pattern, pattern_type = c(barcode = 1)) bc_obj names(bc_messyBc(bc_obj)[[1]]) ## ----extract_barcode_with_UMI------------------------------------------------- pattern <- "([ACGA]{12})CTCGAGGTCATCGAAGTATC([ACGT]+)CCGTAGCAAGCTCGAGAGTAGACCTACT" bc_obj_umi <- bc_extract( fq_filter, sample_name = sample_name, pattern = pattern, maxLDist = 0, pattern_type = c(UMI = 1, barcode = 2)) class(bc_obj_umi) bc_obj_umi ## ----fig.width=5, fig.height=5------------------------------------------------ # select two samples from bc_obj_umi bc_obj_umi_sub <- bc_obj_umi[, c("781_mixa", "781_mixb")] # get the metadata matrix (d <- bc_meta(bc_obj_umi_sub)) # use the row name of the metadata, which contains the sample names d$sample_name <- rownames(d) d$barcode_read_count / d$raw_read_count # visualize ggplot(d) + aes(x=sample_name, y=barcode_read_count / raw_read_count) + geom_bar(stat="identity") ## ----------------------------------------------------------------------------- # Access messyBc slot head(bc_messyBc(bc_obj_umi)[[1]], n=2) # return a data.frame head(bc_messyBc(bc_obj_umi, isList=FALSE), n=2) # Access cleanBc slot # return a data.frame head(bc_cleanBc(bc_obj_umi, isList=FALSE), n=2) ## ----------------------------------------------------------------------------- bc_obj_umi_sub <- bc_obj_umi[, c("781_mixa", "781_mixb")] bc_names(bc_obj_umi_sub) ## ----------------------------------------------------------------------------- bc_meta(bc_obj_umi_sub)$rep <- c("a", "b") bc_meta(bc_obj_umi_sub) ## ----------------------------------------------------------------------------- bc_subset(bc_obj_umi_sub, sample = rep == "a") ## ----correct_barcodde_with_UMI------------------------------------------------ # Filter the barcodes with UMI-barcode tag >= 1, # and treat UMI as absolute unique and do "fish" bc_obj_umi_sub <- bc_cure_umi( bc_obj_umi_sub, depth = 1, isUniqueUMI = TRUE, doFish = TRUE) bc_obj_umi_sub ## ----correct_barcodde_with_count---------------------------------------------- # Apply the barcode sequence depth with depth >= 3 # With isUpdate = FLASE, the data in `messyBc` slot of bc_obj_umi_sub # will be used for depth filtering. The UMI information will be discarded, # the identical barcodes in different UMI-barcode tags are merged before # performing the sequence depth filtering. bc_obj_umi_sub <- bc_cure_depth(bc_obj_umi_sub, depth = 3, isUpdate = FALSE) bc_obj_umi_sub # Apply the UMI count filter, keep barcode >= 3 UMI # The `bc_cure_umi` function applies the filtering on the UMI-barcode tags, # and create a `cleanBc` slot in the return BarcodeObj object. Then, # the `bc_cure_depth` with `isUpdate` argument TRUE will apply the filtering # on the UMI counts in `cleanBc` and updated the `cleanBc`. bc_obj_umi_sub <- bc_cure_umi( bc_obj_umi_sub, depth = 1, isUniqueUMI = TRUE, doFish = TRUE) bc_obj_umi_sub bc_obj_umi_sub <- bc_cure_depth(bc_obj_umi_sub, depth = 3, isUpdate = TRUE) bc_obj_umi_sub ## ----correct_barcodde_clustering---------------------------------------------- # Do the clustering and merging the least abundant barcodes to the similar # abundant ones bc_cure_cluster(bc_obj_umi_sub) ## ----fig.width=5-------------------------------------------------------------- bc_plot_single(bc_obj_umi_sub) ## ----fig.width=5-------------------------------------------------------------- # re-do the filtering using depth threshold 0 to include all barcodes. bc_obj_umi_sub_neo <- bc_cure_depth(bc_obj_umi_sub, depth=0, isUpdate=FALSE) # you can use the count_marks argument to display the cutoff points in the figure # and the highlight argument to highlight specific barcodes. bc_plot_single(bc_obj_umi_sub_neo, count_marks=10, highlight= c("AAGTCCAGTACTATCGTACTA", "AAGTCCAGTACTGTAGCTACTA")) ## ----fig.height=7------------------------------------------------------------- # create a new BarcodeObj for following visualization # use depth as 0 to include all the barcodes. bc_obj_umi_neo <- bc_cure_depth(bc_obj_umi[, 1:4], depth=0) # you can set the count_marks to display the cutoff point # and highlight specific barcodes dots by highlight bc_plot_mutual(bc_obj_umi_neo, count_marks=c(10, 20, 30, 40), highlight= c("AAGTCCAGTACTATCGTACTA", "AAGTCCAGTACTGTAGCTACTA")) ## ----fig.width=5-------------------------------------------------------------- # create a new BarcodeObj for following visualization # use depth as 0 to include all the barcodes. bc_obj_umi_neo <- bc_cure_depth(bc_obj_umi[, 1:4], depth=0) # 2d scatters plot with x axis of sample_x and y axis of sample_y # sample_x, and sample_y can be the sample name or sample index bc_plot_pair( bc_obj_umi_neo, sample_x = c("50000_mixa"), sample_y = c("50000_mixb", "12500_mixa", "195_mixb"), count_marks_x = 10, count_marks_y = c(10, 20, 30), highlight= c("AAGTCCAGTACTATCGTACTA", "AAGTCCAGTACTGTAGCTACTA") ) ## ----------------------------------------------------------------------------- bc_names(bc_obj_umi_sub) ## ----------------------------------------------------------------------------- bc_2df(bc_obj_umi_sub) ## ----------------------------------------------------------------------------- bc_2dt(bc_obj_umi_sub) ## ----misc--------------------------------------------------------------------- bc_2matrix(bc_obj_umi_sub) ## ----------------------------------------------------------------------------- data(bc_obj) # Join two samples with different barcodes bc_obj["AGAG", "test1"] + bc_obj["AAAG", "test1"] # Join two samples with shared barcodes bc_obj_join <- bc_obj["AGAG", "test1"] + bc_obj["AGAG", "test1"] bc_obj_join # In this case, the shared barcodes are not merged. # Applying bc_cure_depth() to merge them. bc_cure_depth(bc_obj_join) # Remove barcodes bc_obj - "AAAG" # Select barcodes in white list bc_obj * "AAAG" ## ----------------------------------------------------------------------------- bc_2df(bc_obj_umi_sub[bc_barcodes(bc_obj_umi_sub)[1], "781_mixa"]) ## 1. Use `bc_barcodes` to pull out all the barcodes in two ## samples, and choose the fist barcode. ## 2. Select the barcode got in step 1, and the sample named "781_mixa". ## 3. Convert the BarcodeObj object to a data.frame. ## ----------------------------------------------------------------------------- sessionInfo()