library(Seurat)
library(plyr)
library(ggplot2)
library(DropletUtils)
library(celda)
library(SingleCellExperiment)
library(scater)
library(scran)
library(scDblFinder)
library(viridis)
library(MASS)
library(patchwork)get_density <- function(x, y, ...) { # function from https://slowkow.com/notes/ggplot2-color-by-density/
dens <- MASS::kde2d(x, y, ...)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix, iy)
return(dens$z[ii])
}
fancy_scientific <- function(l) { # function from https://stackoverflow.com/a/24241954
# turn in to character string in scientific notation
l <- format(l, scientific = TRUE)
# quote the part before the exponent to keep all the digits
l <- gsub("^(.*)e", "'\\1'e", l)
# turn the 'e+' into plotmath format
l <- gsub("e", "%*%10^", l)
# return this as an expression
parse(text=l)
}
setwd("~/000_GitHub/ibd-bcn_single_cell")
source('source/functions_scrnaseq.R')
source('source/colors.R')Metadata of the samples inside the code.
metadata <- data.frame(
stringsAsFactors = FALSE,
sample = c("SC_002",
"SC_004","SC_005","SC_008","SC_010","SC_011",
"SC_013","SC_014","SC_015","SC_016","SC_017",
"SC_018","SC_019","SC_021","TOF001W0","TOF002W0",
"TOF005W0","TOF010W0"),
sample_name = factor(c("HC 1", "CD 1", "CD 2", "UC 1", "UC 2", "HC 2",
"CD 3", "CD 4", "HC 3", "HC 4", "HC 5", "HC 6",
"CD 5", "CD 6", "UC 3", "UC 4", "UC 5", "UC 6"),
levels = c("HC 1", "HC 2", "HC 3", "HC 4",
"HC 5", "HC 6", "CD 1", "CD 2",
"CD 3", "CD 4", "CD 5", "CD 6",
"UC 1", "UC 2", "UC 3", "UC 4",
"UC 5", "UC 6")),
Health = factor(c("HC", "CDa", "CDa","UCa","UCa","HC","CDa","CDa",
"HC","HC","HC","HC","CDa","CDa","UCa","UCa","UCa",
"UCa"), levels = c("HC", "CDa", "UCa")),
Health_2 = factor(c("HC","IBD",
"IBD","IBD","IBD","HC","IBD","IBD","HC","HC",
"HC","HC","IBD","IBD","IBD","IBD","IBD",
"IBD"), levels = c("HC", "IBD"))
)This code consists of a loop joining the data. First loads the
sample, adds the sample’s metadata m4, and calculates the
doublets and MT%. Puts each sample in an empty list
list_data, and after the loop, merges the samples of the
list using Seurat’s function merge into the new
seudata object.
setwd('~/data_Albas/DATA/') # Dir where we have a folder per sample, with the 3 files from
# outs/filtered_feature_bc_matrix from cellranger output.
list_files <- list.files() # folder names
list_data <- list() # empty list
for(i in list_files){
# read the files
sce2 <- Read10X(i)
# feature filtering
keep_feature <- rowSums(sce2 > 0) > 0
sce2 <- sce2[keep_feature, ]
# doublet finding
sce2 <- scDblFinder(sce2, verbose=FALSE)
# metadata of sample
m4 <- data.frame('sample' = rep(i, ncol(sce2)),
'doublet' = sce2$scDblFinder.class,
'Health' = rep(metadata[metadata$sample == i, 'Health'][1], ncol(sce2)),
'sample_name' = rep(metadata[metadata$sample == i, 'sample_name'][1], ncol(sce2)),
'Health_2' = rep(metadata[metadata$sample == i, 'Health_2'][1], ncol(sce2)))
# identify sample per colname
colnames(sce2) <- paste(i, colnames(sce2), sep='_')
rownames(m4) <- colnames(sce2)
# seurat of the sample
data <- CreateSeuratObject(
counts(sce2),
min.features = 100,
project = i,
assay = "RNA",
meta.data = m4
)
# calcules the percent.mt per sample
data[["percent.mt"]] <- PercentageFeatureSet(object = data, pattern = "^MT-")
# add each sample into the empty list
list_data[[i]] <- data
}
seudata <- list_data[[1]]
for(i in 2:length(list_files)){
seudata <- merge(seudata, list_data[[i]])
}Let’s check the MT% per cell and the number of genes per cell in each sample.
VlnPlot(seudata, features = c('percent.mt', 'nFeature_RNA'),
group.by = 'sample_name', pt.size = 0) &
theme(text = element_text(family = 'Helvetica', size = 10),
axis.text.x = element_text(angle = 90, vjust = 0.5,
family = 'Helvetica', size = 10),
axis.title.x = element_blank(),
axis.text.y = element_text(family = 'Helvetica', size = 10)) &
scale_fill_manual(values = c('#71ad96','#5fb392','#71bda0',
'#77c9a9','#82ceb1','#9debcc',
'#5d5c84','#525174','#6f6f9b',
'#7b7ba3','#8787ab','#9f9fbc',
'#bc9338','#c79d43','#cba552',
'#d0ad62','#d5b572','#dec691'))
Let’s check the distribution of the cells considering the total number of counts and the total number of genes per cell. First plot is colored by doublet. Second plot is colored by sample_name.
meta <- seudata@meta.data
meta <- meta[order(meta$doublet, decreasing = T),]
ggplot(meta) +
geom_point(aes(nCount_RNA, nFeature_RNA, color = doublet), size=0.2) +
theme_classic() +
scale_color_manual(values = c('doublet' = '#F6AE2DCC',
'singlet' = '#86BBD860'))+
theme(plot.title = element_text(size = 25))+
scale_x_continuous(labels=fancy_scientific) +
theme(text = element_text(family = 'Helvetica', size = 10),
axis.text.x = element_text(family = 'Helvetica', size = 10),
axis.text.y = element_text(family = 'Helvetica', size = 10),
legend.key.size = unit(5,"bigpts"), legend.title = element_blank())+
guides(color = guide_legend(override.aes = list(size = 1.2)))
meta_dot <- seudata@meta.data[sample(1:nrow(seudata@meta.data)), ]
sum(is.na(meta_dot$sample_name))## [1] 0
ggplot(meta_dot) +
geom_point(aes(x = nCount_RNA, y = nFeature_RNA, color = sample_name),
size = 0.2) +
theme_classic()+
scale_x_continuous(labels=fancy_scientific) +
scale_y_continuous(labels=fancy_scientific) +
scale_color_manual(values = c('#71ad9680','#5fb39280','#71bda080',
'#77c9a980','#82ceb180','#9debcc80',
'#5d5c8480','#52517480','#6f6f9b80',
'#7b7ba380','#8787ab80','#9f9fbc80',
'#bc933880','#c79d4380','#cba55280',
'#d0ad6280','#d5b57280','#dec69180'))+
guides(color = guide_legend(override.aes = list(size = 1.2), ncol=3))
We first filter out the cells that are considered doublets and let’s recheck the distribution of the cells after filtering.
Tables showing the number of cells classified as singlet and doublet.
The table shows the global numbers. The barplot shows singlets/doublets
per sample. In the code, we filter out the cells classified as doublet
into a new object seudataf.
ta <- as.data.frame(table(seudata$doublet))
colnames(ta) <- c('category', 'n_cells')
knitr::kable(ta)| category | n_cells |
|---|---|
| doublet | 3333 |
| singlet | 54221 |
ta2 <- as.data.frame(table(seudata$sample_name, seudata$doublet))
colnames(ta2) <- c('sample','doublet','n_cells')
ta2$sample <- factor(ta2$sample, levels = c("HC 1", "HC 2", "HC 3", "HC 4",
"HC 5", "HC 6", "CD 1", "CD 2",
"CD 3", "CD 4", "CD 5", "CD 6",
"UC 1", "UC 2", "UC 3", "UC 4",
"UC 5", "UC 6"))
ggplot(data=ta2, aes(x=sample, y=n_cells, fill=doublet)) +
geom_bar(stat="identity", position=position_dodge())+
geom_text(aes(label=n_cells), vjust=1.6, color="black",
position = position_dodge(0.9), size=2.5)+
scale_fill_manual(values = c('doublet' = '#F6AE2DCC',
'singlet' = '#86BBD860'))+
labs(y = 'number of cells') +
theme_classic()+
theme(axis.text.x = element_text(angle=90, vjust=0.5),
legend.title = element_blank(),
axis.title.x= element_blank())
seudataf <- seudata[,seudata$doublet == 'singlet']Distribution of cells in the scatter plot (ncount/nfeatures) without the doublets.
meta_dot <- seudataf@meta.data[sample(1:nrow(seudataf@meta.data)), ]
ggplot(meta_dot) +
geom_point(aes(x = nCount_RNA, y = nFeature_RNA, color = sample_name),
size = 0.2) +
theme_classic()+
scale_x_continuous(labels=fancy_scientific) +
scale_y_continuous(labels=fancy_scientific) +
scale_color_manual(values = c('#71ad9680','#5fb39280','#71bda080',
'#77c9a980','#82ceb180','#9debcc80',
'#5d5c8480','#52517480','#6f6f9b80',
'#7b7ba380','#8787ab80','#9f9fbc80',
'#bc933880','#c79d4380','#cba55280',
'#d0ad6280','#d5b57280','#dec69180'))+
guides(color = guide_legend(override.aes = list(size = 1.2), ncol=3))
Plasma cells have more counts than other cells with the same number of features.
seudataf2 <- seudataf
meta_dot2 <- FetchData(seudataf2, vars = c('nCount_RNA', 'nFeature_RNA', 'DERL3'))
meta_dot2$DERL3_scaled <- scale(meta_dot2$DERL3)[,1]
meta_dot2 <- meta_dot2[order(meta_dot2$DERL3_scaled),]
ggplot(meta_dot2) +
geom_point(aes(x = nCount_RNA, y = nFeature_RNA, color = DERL3_scaled),
size = 0.2) +
theme_classic()+
scale_color_distiller("scaled/nDERL3",
limits = c(min(meta_dot2$DERL3_scaled, na.rm = T),
max(meta_dot2$DERL3_scaled, na.rm=T)),
na.value = "white", palette = "YlOrRd", direction = 1) +
scale_x_continuous(labels=fancy_scientific) +
scale_y_continuous(labels=fancy_scientific) 
rm(seudataf2, meta_dot2)Distribution of cells in scatter plot (nfeatures/percent.mt) to check where are the majority of our cells. We can see the dots colored by density using the function get_density shown in [Load extra functions and sources].
meta <- seudataf@meta.data
meta$density <- get_density(meta$percent.mt, meta$nFeature_RNA, n = 100)
ggplot(meta) +
geom_point(aes(percent.mt, nFeature_RNA, color = density), size = 0.2) +
scale_color_viridis() + theme_classic() +
theme(text = element_text( size = 12),
axis.title = element_text( size = 12),
legend.text = element_blank())
Most cells are located between 0 and 25% MT. Nevertheless, let’s check the higher % of MT in more detail.
meta2 <- seudata@meta.data[seudata@meta.data$percent.mt > 25,]
meta2$density <- get_density(meta2$percent.mt, meta2$nFeature_RNA, n = 100)
ggplot(meta2) +
geom_vline(xintercept = 65, color = '#7A7A7A', linetype="dashed") +
geom_vline(xintercept = 95, color = '#7A7A7A', linetype="dashed") +
geom_point(aes(percent.mt, nFeature_RNA, color = density), size = 0.2) +
scale_color_viridis() + theme_classic() +
theme(plot.title = element_text(size = 25))+
theme(text = element_text( size = 12),
axis.text.x = element_text( size = 10),
axis.text.y = element_text( size = 10),
legend.text = element_blank())
There are two main densities in the plot, one between 25 and 65, and
the second around 95%.
We keep all cells <65%MT and get rid of the density around 95%, which
will bring no information to the data set.
seudataf <- seudataf[,seudataf$percent.mt < 65]
seudataf## An object of class Seurat
## 28287 features across 51056 samples within 1 assay
## Active assay: RNA (28287 features, 0 variable features)
ta3 <- as.data.frame(table(seudataf$sample_name, seudataf$Health))
colnames(ta3) <- c('sample','Health','n_cells')
ta3 <- ta3[ta3$n_cells != 0,]
ta3$sample <- factor(ta3$sample, levels = c("HC 1", "HC 2", "HC 3", "HC 4",
"HC 5", "HC 6", "CD 1", "CD 2",
"CD 3", "CD 4", "CD 5", "CD 6",
"UC 1", "UC 2", "UC 3", "UC 4",
"UC 5", "UC 6"))
ggplot(data=ta3, aes(x=sample, y=n_cells, fill=Health)) +
geom_bar(stat="identity", position=position_dodge())+
geom_text(aes(label=n_cells), vjust=1.6, color="black",
position = position_dodge(0.9), size=2.5)+
scale_fill_manual(values = colors_health_alpha)+
labs(y = 'number of cells after MT filter') +
theme_classic()+
theme(axis.text.x = element_text(angle=90, vjust=0.5),
legend.title = element_blank(),
axis.title.x= element_blank())
counts <- seudataf@assays$RNA@counts
pp <- which(Matrix::rowSums(counts)==0)
length(pp)## [1] 278
xx <-setdiff(rownames(seudataf), names(pp))
seudataf <- subset(seudataf, features = xx)
seudataf## An object of class Seurat
## 28009 features across 51056 samples within 1 assay
## Active assay: RNA (28009 features, 0 variable features)
setwd('~/000_GitHub/ibd-bcn_single_cell/')
saveRDS(seudataf, file = 'Analysis of our data/02_Samples_Together/filtered_data_together.RDS')sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /opt/R/4.1.2/lib/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8
## [5] LC_MONETARY=C.UTF-8 LC_MESSAGES=C LC_PAPER=C.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Matrix_1.4-0 nnet_7.3-17 matchSCore2_0.1.0 harmony_0.1.0
## [5] Rcpp_1.0.9 rmarkdown_2.18 pandoc_0.1.0 readxl_1.3.1
## [9] magick_2.7.3 data.table_1.14.2 BiocParallel_1.28.3 RColorBrewer_1.1-3
## [13] ggrepel_0.9.1 ggrastr_1.0.1 usethis_2.1.5 clustree_0.4.4
## [17] ggraph_2.0.5 readr_2.1.2 dplyr_1.0.10 cowplot_1.1.1
## [21] reshape_0.8.8 formulaic_0.0.8 patchwork_1.1.2 MASS_7.3-55
## [25] viridis_0.6.2 viridisLite_0.4.1 scDblFinder_1.8.0 scran_1.22.1
## [29] scater_1.22.0 scuttle_1.4.0 celda_1.10.0 beepr_1.3
## [33] DropletUtils_1.14.2 SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0 Biobase_2.54.0
## [37] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1 IRanges_2.28.0 S4Vectors_0.32.4
## [41] BiocGenerics_0.40.0 MatrixGenerics_1.6.0 matrixStats_0.62.0 ggplot2_3.3.6
## [45] plyr_1.8.7 sp_1.5-0 SeuratObject_4.1.1 Seurat_4.1.0.9007
##
## loaded via a namespace (and not attached):
## [1] rsvd_1.0.5 ica_1.0-3 corpcor_1.6.10 assertive.properties_0.0-4
## [5] foreach_1.5.2 lmtest_0.9-40 rprojroot_2.0.3 crayon_1.5.2
## [9] spatstat.core_2.4-4 rhdf5filters_1.6.0 backports_1.4.1 nlme_3.1-155
## [13] rlang_1.0.6 XVector_0.34.0 ROCR_1.0-11 irlba_2.3.5
## [17] SparseM_1.81 limma_3.50.3 xgboost_1.5.0.2 rjson_0.2.21
## [21] bit64_4.0.5 glue_1.6.2 sctransform_0.3.4 parallel_4.1.2
## [25] vipor_0.4.5 spatstat.sparse_2.1-1 AnnotationDbi_1.56.2 spatstat.geom_2.4-0
## [29] tidyselect_1.1.2 fitdistrplus_1.1-8 tidyr_1.2.1 assertive.types_0.0-3
## [33] zoo_1.8-10 org.Mm.eg.db_3.14.0 xtable_1.8-4 magrittr_2.0.3
## [37] evaluate_0.18 cli_3.4.0 zlibbioc_1.40.0 rstudioapi_0.13
## [41] miniUI_0.1.1.1 bslib_0.4.1 rpart_4.1.16 RcppEigen_0.3.3.9.2
## [45] shiny_1.7.3 BiocSingular_1.10.0 xfun_0.34 clue_0.3-60
## [49] cluster_2.1.2 tidygraph_1.2.1 KEGGREST_1.34.0 tibble_3.1.8
## [53] listenv_0.8.0 Biostrings_2.62.0 png_0.1-7 future_1.28.0
## [57] withr_2.5.0 bitops_1.0-7 ggforce_0.3.3 cellranger_1.1.0
## [61] assertive.base_0.0-9 dqrng_0.3.0 pillar_1.8.1 GlobalOptions_0.1.2
## [65] cachem_1.0.6 fs_1.5.2 GetoptLong_1.0.5 DelayedMatrixStats_1.16.0
## [69] vctrs_0.4.1 ellipsis_0.3.2 generics_0.1.3 tools_4.1.2
## [73] beeswarm_0.4.0 munsell_0.5.0 tweenr_1.0.2 DelayedArray_0.20.0
## [77] fastmap_1.1.0 compiler_4.1.2 pkgload_1.2.4 abind_1.4-5
## [81] httpuv_1.6.6 plotly_4.10.0 rgeos_0.5-9 GenomeInfoDbData_1.2.7
## [85] gridExtra_2.3 enrichR_3.0 edgeR_3.36.0 lattice_0.20-45
## [89] deldir_1.0-6 utf8_1.2.2 later_1.3.0 jsonlite_1.8.3
## [93] multipanelfigure_2.1.2 scales_1.2.1 graph_1.72.0 ScaledMatrix_1.2.0
## [97] pbapply_1.5-0 sparseMatrixStats_1.6.0 lazyeval_0.2.2 promises_1.2.0.1
## [101] doParallel_1.0.17 R.utils_2.12.0 goftest_1.2-3 checkmate_2.0.0
## [105] spatstat.utils_2.3-1 reticulate_1.26 textshaping_0.3.6 statmod_1.4.36
## [109] Rtsne_0.16 uwot_0.1.14 igraph_1.3.4 HDF5Array_1.22.1
## [113] survival_3.2-13 rsconnect_0.8.25 yaml_2.3.6 systemfonts_1.0.4
## [117] htmltools_0.5.3 memoise_2.0.1 locfit_1.5-9.6 graphlayouts_0.8.0
## [121] digest_0.6.30 assertthat_0.2.1 rappdirs_0.3.3 mime_0.12
## [125] RSQLite_2.2.17 future.apply_1.9.1 blob_1.2.3 R.oo_1.25.0
## [129] ragg_1.2.1 splines_4.1.2 labeling_0.4.2 Rhdf5lib_1.16.0
## [133] RCurl_1.98-1.8 assertive.numbers_0.0-2 hms_1.1.1 rhdf5_2.38.1
## [137] colorspace_2.0-3 ggbeeswarm_0.6.0 shape_1.4.6 assertive.files_0.0-2
## [141] sass_0.4.2 RANN_2.6.1 circlize_0.4.14 audio_0.1-10
## [145] fansi_1.0.3 tzdb_0.2.0 brio_1.1.3 parallelly_1.32.1
## [149] R6_2.5.1 grid_4.1.2 ggridges_0.5.3 lifecycle_1.0.3
## [153] formatR_1.12 bluster_1.4.0 curl_4.3.2 jquerylib_0.1.4
## [157] leiden_0.4.3 testthat_3.1.2 desc_1.4.0 RcppAnnoy_0.0.19
## [161] org.Hs.eg.db_3.14.0 iterators_1.0.14 stringr_1.4.1 topGO_2.46.0
## [165] htmlwidgets_1.5.4 beachmat_2.10.0 polyclip_1.10-0 purrr_0.3.4
## [169] gridGraphics_0.5-1 ComplexHeatmap_2.10.0 mgcv_1.8-38 globals_0.16.1
## [173] spatstat.random_2.2-0 progressr_0.11.0 codetools_0.2-18 GO.db_3.14.0
## [177] metapod_1.2.0 MCMCprecision_0.4.0 R.methodsS3_1.8.2 gtable_0.3.1
## [181] DBI_1.1.3 highr_0.9 tensor_1.5 httr_1.4.4
## [185] KernSmooth_2.23-20 vroom_1.5.7 stringi_1.7.8 reshape2_1.4.4
## [189] farver_2.1.1 combinat_0.0-8 BiocNeighbors_1.12.0 scattermore_0.8
## [193] bit_4.0.4 spatstat.data_2.2-0 pkgconfig_2.0.3 corrplot_0.92
## [197] knitr_1.40