Install required R packages
# # Install the remotes package
# if (!requireNamespace("remotes", quietly = TRUE)) {
# install.packages("remotes")
# }
# # Install Seurat
# if (!requireNamespace("Seurat", quietly = TRUE)) {
# remotes::install_github("satijalab/seurat", "seurat5", quiet = TRUE)
# }
# # Install BiocManager
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# # Install SingleR package
# if (!require("hdf5r", quietly = TRUE)){
# BiocManager::install("hdf5r")
# }
# # Install SingleR package
# if (!require("presto", quietly = TRUE)){
# remotes::install_github("immunogenomics/presto")
# }
# # Install SingleR package
# if (!require("SingleR", quietly = TRUE)){
# BiocManager::install("SingleR")
# }
# if (!require("celldex", quietly = TRUE)){
# BiocManager::install("celldex")
# }
# if (!require("SingleCellExperiment", quietly = TRUE)){
# BiocManager::install("SingleCellExperiment")
# }
# if (!require("scater", quietly = TRUE)){
# BiocManager::install("scater")
# }
## Installing the R packages could take around 51 minutes
## To speed up this process, you can download the R lib files
## saved from a working Google Colab session
## https://drive.google.com/file/d/1EQvZnsV6P0eNjbW0hwYhz0P5z0iH3bsL/view?usp=drive_link
system("gdown 1EQvZnsV6P0eNjbW0hwYhz0P5z0iH3bsL")
system("md5sum R_lib4scRNA.tar.gz", intern = TRUE)
'5898c04fca5e680710cd6728ef9b1422 R_lib4scRNA.tar.gz'
## required by scater package
system("apt-get install libx11-dev libcairo2-dev") #, intern = TRUE)
system("tar zxvf R_lib4scRNA.tar.gz")
.libPaths(c("/content/usr/local/lib/R/site-library", .libPaths()))
.libPaths()
- '/content/usr/local/lib/R/site-library'
- '/usr/local/lib/R/site-library'
- '/usr/lib/R/site-library'
- '/usr/lib/R/library'
Load required R packages
library(Seurat)
library(dplyr)
library(SingleR)
library(celldex)
library(scater)
library(SingleCellExperiment)
list.files()
- 'R_lib4scRNA.tar.gz'
- 'sample_data'
- 'usr'
# https://drive.google.com/file/d/1-CvmcLvKMYW-OcLuGfFuGQMK2b5_VMFk/view?usp=drive_link
# Download "filtered_feature_bc_matrix.h5"
# Output of cellranger
system("gdown 1-CvmcLvKMYW-OcLuGfFuGQMK2b5_VMFk")
system("md5sum filtered_feature_bc_matrix.h5", intern = TRUE)
'360fc0760ebb9e6dd253d808a427b20d filtered_feature_bc_matrix.h5'
count_mtx_scrna <- Read10X_h5("filtered_feature_bc_matrix.h5")
# If you have the filtered_feature_bc_matrix/ folder, you can use
# Read10X to create 'count_mtx_scrna'
# system("mkdir filtered_feature_bc_matrix/; mv filtered_feature_bc_matrix.zip filtered_feature_bc_matrix")
# system("cd filtered_feature_bc_matrix; unzip filtered_feature_bc_matrix.zip")
# count_mtx_scrna <- Read10X("filtered_feature_bc_matrix/")
class(count_mtx_scrna)
'dgCMatrix'
The dgCMatrix class is a specific data structure in R's Matrix package, designed to store sparse matrices in a memory-efficient format. Sparse matrices are those with many zeros, making them ideal for high-dimensional data in applications like bioinformatics, where gene expression matrices often contain a lot of zeroes.
Why Use dgCMatrix?
- Memory Efficiency: Storing only non-zero values saves memory, especially in high-dimensional matrices.
- Computational Speed: Some operations on sparse matrices can be faster, as computations are limited to non-zero entries.
print(format(object.size(count_mtx_scrna), units = "MB"))
[1] "168.7 Mb"
# Check a few genes in the first 20 cells
count_mtx_scrna[c("CD3D", "TCL1A", "MS4A1"), 100:140]
[[ suppressing 41 column names ‘AACCATGCACTCAAGT-1’, ‘AACCATGGTAGCTTGT-1’, ‘AACCATGTCAATCCGA-1’ ... ]]
3 x 41 sparse Matrix of class "dgCMatrix"
CD3D 8 1 . . . 3 . 2 10 . 3 . . . 2 7 1 . . . 1 1 . 8 . . 2 4 . . . 12 11 .
TCL1A . . . . . . 6 . . . . . . . . . . . . . . . 10 . . . . . . . . . . .
MS4A1 . . . . . . 9 . . 16 . . . . . . . . . . . . 5 . . . . . . . . . . .
CD3D . 5 . . . 3 .
TCL1A . . . . . . .
MS4A1 20 . . . 39 . .
# non-normalized da# Initialize the Seurat object with the raw count matrix
pbmc <- CreateSeuratObject(counts = count_mtx_scrna,
project = "pbmc5k",
min.cells = 3, min.features = 200)
pbmc
An object of class Seurat
24785 features across 4884 samples within 1 assay
Active assay: RNA (24785 features, 0 variable features)
1 layer present: counts
Understand Seurat object
Seurat slots https://github.com/satijalab/seurat/wiki/seurat
str(pbmc)
Formal class 'Seurat' [package "SeuratObject"] with 13 slots
..@ assays :List of 1
.. ..$ RNA:Formal class 'Assay5' [package "SeuratObject"] with 8 slots
.. .. .. ..@ layers :List of 1
.. .. .. .. ..$ counts:Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
.. .. .. .. .. .. ..@ i : int [1:14449622] 6 17 42 62 79 83 85 94 100 109 ...
.. .. .. .. .. .. ..@ p : int [1:4885] 0 3378 5344 5581 8581 10897 13921 16902 20299 22669 ...
.. .. .. .. .. .. ..@ Dim : int [1:2] 24785 4884
.. .. .. .. .. .. ..@ Dimnames:List of 2
.. .. .. .. .. .. .. ..$ : NULL
.. .. .. .. .. .. .. ..$ : NULL
.. .. .. .. .. .. ..@ x : num [1:14449622] 1 1 4 1 1 2 1 1 1 1 ...
.. .. .. .. .. .. ..@ factors : list()
.. .. .. ..@ cells :Formal class 'LogMap' [package "SeuratObject"] with 1 slot
.. .. .. .. .. ..@ .Data: logi [1:4884, 1] TRUE TRUE TRUE TRUE TRUE TRUE ...
.. .. .. .. .. .. ..- attr(*, "dimnames")=List of 2
.. .. .. .. .. .. .. ..$ : chr [1:4884] "AAACCCATCAGATGCT-1" "AAACGAAAGTGCTACT-1" "AAACGAAGTCGTAATC-1" "AAACGAAGTTGCCAAT-1" ...
.. .. .. .. .. .. .. ..$ : chr "counts"
.. .. .. .. .. ..$ dim : int [1:2] 4884 1
.. .. .. .. .. ..$ dimnames:List of 2
.. .. .. .. .. .. ..$ : chr [1:4884] "AAACCCATCAGATGCT-1" "AAACGAAAGTGCTACT-1" "AAACGAAGTCGTAATC-1" "AAACGAAGTTGCCAAT-1" ...
.. .. .. .. .. .. ..$ : chr "counts"
.. .. .. ..@ features :Formal class 'LogMap' [package "SeuratObject"] with 1 slot
.. .. .. .. .. ..@ .Data: logi [1:24785, 1] TRUE TRUE TRUE TRUE TRUE TRUE ...
.. .. .. .. .. .. ..- attr(*, "dimnames")=List of 2
.. .. .. .. .. .. .. ..$ : chr [1:24785] "ENSG00000238009" "ENSG00000241860" "ENSG00000290385" "ENSG00000291215" ...
.. .. .. .. .. .. .. ..$ : chr "counts"
.. .. .. .. .. ..$ dim : int [1:2] 24785 1
.. .. .. .. .. ..$ dimnames:List of 2
.. .. .. .. .. .. ..$ : chr [1:24785] "ENSG00000238009" "ENSG00000241860" "ENSG00000290385" "ENSG00000291215" ...
.. .. .. .. .. .. ..$ : chr "counts"
.. .. .. ..@ default : int 1
.. .. .. ..@ assay.orig: chr(0)
.. .. .. ..@ meta.data :'data.frame': 24785 obs. of 0 variables
.. .. .. ..@ misc :List of 1
.. .. .. .. ..$ calcN: logi TRUE
.. .. .. ..@ key : chr "rna_"
..@ meta.data :'data.frame': 4884 obs. of 3 variables:
.. ..$ orig.ident : Factor w/ 1 level "pbmc5k": 1 1 1 1 1 1 1 1 1 1 ...
.. ..$ nCount_RNA : num [1:4884] 11578 5655 14728 10903 6174 ...
.. ..$ nFeature_RNA: int [1:4884] 3378 1966 237 3000 2316 3024 2981 3397 2370 2811 ...
..@ active.assay: chr "RNA"
..@ active.ident: Factor w/ 1 level "pbmc5k": 1 1 1 1 1 1 1 1 1 1 ...
.. ..- attr(*, "names")= chr [1:4884] "AAACCCATCAGATGCT-1" "AAACGAAAGTGCTACT-1" "AAACGAAGTCGTAATC-1" "AAACGAAGTTGCCAAT-1" ...
..@ graphs : list()
..@ neighbors : list()
..@ reductions : list()
..@ images : list()
..@ project.name: chr "pbmc5k"
..@ misc : list()
..@ version :Classes 'package_version', 'numeric_version' hidden list of 1
.. ..$ : int [1:3] 5 0 2
..@ commands : list()
..@ tools : list()
slotNames(pbmc)
- 'assays'
- 'meta.data'
- 'active.assay'
- 'active.ident'
- 'graphs'
- 'neighbors'
- 'reductions'
- 'images'
- 'project.name'
- 'misc'
- 'version'
- 'commands'
- 'tools'
Access Seurat object
pbmc@active.assay
'RNA'
class(pbmc@meta.data)
head(pbmc@meta.data, 4)
'data.frame'
orig.ident | nCount_RNA | nFeature_RNA | |
---|---|---|---|
<fct> | <dbl> | <int> | |
AAACCCATCAGATGCT-1 | pbmc5k | 11578 | 3378 |
AAACGAAAGTGCTACT-1 | pbmc5k | 5655 | 1966 |
AAACGAAGTCGTAATC-1 | pbmc5k | 14728 | 237 |
AAACGAAGTTGCCAAT-1 | pbmc5k | 10903 | 3000 |
colSums(pbmc@assays$RNA$counts)[1:3]
- AAACCCATCAGATGCT-1
- 11578
- AAACGAAAGTGCTACT-1
- 5655
- AAACGAAGTCGTAATC-1
- 14728
Layers(pbmc)
- 'counts'
- 'data'
- 'scale.data'
Data preprocessing
# Use $ operator to add columns to object metadata.
pbmc$percent.mt <- PercentageFeatureSet(pbmc,
pattern = "^MT-")
colnames(pbmc@meta.data)
- 'orig.ident'
- 'nCount_RNA'
- 'nFeature_RNA'
- 'percent.mt'
head(pbmc, 4)
orig.ident | nCount_RNA | nFeature_RNA | percent.mt | RNA_snn_res.0.1 | seurat_clusters | singleR_hpca | singleR_blueprint | |
---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <int> | <dbl> | <fct> | <fct> | <chr> | <chr> | |
AAACCCATCAGATGCT-1 | pbmc5k | 11578 | 3378 | 3.62756953 | 1 | 1 | T_cells | CD4+ T-cells |
AAACGAAAGTGCTACT-1 | pbmc5k | 5655 | 1966 | 4.29708223 | 0 | 0 | T_cells | CD4+ T-cells |
AAACGAAGTCGTAATC-1 | pbmc5k | 14728 | 237 | 0.02715915 | 6 | 6 | Erythroblast | Erythrocytes |
AAACGAAGTTGCCAAT-1 | pbmc5k | 10903 | 3000 | 4.89773457 | 5 | 5 | T_cells | CD8+ T-cells |
# Use violin plot to visualize QC metrics
VlnPlot(pbmc,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3)
Warning message:
“Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.”
How to read the Violin Plot
-
Shape: Each violin plot shows the distribution of values for each feature across the cells in your dataset. The shape of the plot indicates the density of cells with particular values for that feature.
Wider sections indicate more cells with those values. Narrow sections indicate fewer cells with those values.
-
Vertical Axis: Represents the range of values for each feature. For instance:
nFeature_RNA
and nCount_RNA
: Higher values suggest more gene diversity and RNA content, respectively.
percent.mt
: Higher values indicate higher mitochondrial content, which may point to stressed or dying cells.
- Horizontal Axis (Groups): If your dataset is separated into clusters or groups (e.g., cell types or conditions), each group will have its own violin, allowing you to compare distributions between groups.
How to interpret QC plot
nFeature_RNA
: The number of unique features (genes) detected per cell.
Extremely high values could suggest potential doublets (two cells mistakenly captured as one), as two cells would have more unique genes combined.
Low number of detected genes - potential ambient mRNA (not real cells)
nCount_RNA
: The total number of RNA molecules (or unique molecular identifiers, UMIs) detected per cell.
Higher counts generally indicate higher RNA content, but they could also result from cell doublets. Cells with very low nCount_RNA might represent poor-quality cells with low RNA capture, while very high counts may also suggest doublets.
percent.mt
: The percentage of reads mapping to mitochondrial genes.
High mitochondrial content often indicates cell stress or apoptosis, as damaged cells tend to release mitochondrial RNA.
Filtering cells with high percent.mt
values is common to exclude potentially dying cells.
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# Load necessary libraries
library(Seurat)
library(ggplot2)
# Define the function to calculate median and MAD values
calculate_thresholds <- function(seurat_obj) {
# Extract relevant columns
nFeature_values <- seurat_obj@meta.data$nFeature_RNA
nCount_values <- seurat_obj@meta.data$nCount_RNA
percent_mt_values <- seurat_obj@meta.data$percent.mt
# Calculate medians and MADs
nFeature_median <- median(nFeature_values, na.rm = TRUE)
nFeature_mad <- mad(nFeature_values, constant = 1, na.rm = TRUE)
nCount_median <- median(nCount_values, na.rm = TRUE)
nCount_mad <- mad(nCount_values, constant = 1, na.rm = TRUE)
percent_mt_median <- median(percent_mt_values, na.rm = TRUE)
percent_mt_mad <- mad(percent_mt_values, constant = 1, na.rm = TRUE)
# Calculate thresholds for horizontal lines
thresholds <- list(
nFeature_upper = nFeature_median + 4 * nFeature_mad,
nFeature_lower = nFeature_median - 4 * nFeature_mad,
nCount_upper = nCount_median + 4 * nCount_mad,
nCount_lower = nCount_median - 4 * nCount_mad,
percent_mt_upper = percent_mt_median + 4 * percent_mt_mad
)
return(thresholds)
}
# Calculate thresholds
thresholds <- calculate_thresholds(pbmc)
thresholds
- $nFeature_upper
- 5243.5
- $nFeature_lower
- 583.5
- $nCount_upper
- 19044
- $nCount_lower
- -1224
- $percent_mt_upper
- 8.44783722411371
vplot1 <- VlnPlot(pbmc, features = c("nFeature_RNA"), ncol = 2) +
geom_hline(yintercept = thresholds$nFeature_upper,
color = "blue", linetype = "solid") +
geom_hline(yintercept = thresholds$nFeature_lower,
color = "blue", linetype = "solid") +
theme(legend.position="none")
vplot2 <- VlnPlot(pbmc, features = c("percent.mt"), ncol = 2) +
geom_hline(yintercept = thresholds$percent_mt_upper,
color = "blue", linetype = "solid") +
theme(legend.position="none")
vplot1 + vplot2
Warning message:
“Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.”
Warning message:
“Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.”
Filter out potential doublets, empty droplets and dying cells
pbmc <- subset(pbmc,
subset = nFeature_RNA > thresholds$nCount_lower &
nFeature_RNA < thresholds$nFeature_upper &
percent.mt < thresholds$percent_mt_upper)
# Use violin plot to visualize QC metrics after QC
VlnPlot(pbmc,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3)
Instead of using an arbitrary number, you can also use statistical algorithm to predict doublets and empty droplets to filter the cells, such as DoubletFinder
and EmptyDrops
.
Normalization and Scaling of the data
Normalization
After removing unwanted cells from the dataset, the next step is to normalize the data. By default, a global-scaling normalization method “LogNormalize” that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. In Seurat v5, Normalized values are stored in pbmc[["RNA"]]$data
.
pbmc <- NormalizeData(pbmc) # normalization.method = "LogNormalize", scale.factor = 10000
Normalizing layer: counts
While this method of normalization is standard and widely used in scRNA-seq analysis, global-scaling relies on an assumption that each cell originally contains the same number of RNA molecules.
Next, we identify a subset of features that show high variation across cells in the dataset—meaning they are highly expressed in some cells and lowly expressed in others. Prior work, including our own, has shown that focusing on these variable genes in downstream analyses can enhance the detection of biological signals in single-cell datasets.
The approach used in Seurat improves upon previous versions by directly modeling the inherent mean-variance relationship in single-cell data. This method is implemented in the FindVariableFeatures() function, which, by default, selects 2,000 variable features per dataset. These features will then be used in downstream analyses, such as PCA.
pbmc <- FindVariableFeatures(pbmc,
selection.method = "vst",
nfeatures = 2000)
Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
options(repr.plot.width=10, repr.plot.height= 6)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
When using repel, set xnudge and ynudge to 0 for optimal results
Warning message in scale_x_log10():
“[1m[22m[32mlog-10[39m transformation introduced infinite values.”
Warning message in scale_x_log10():
“[1m[22m[32mlog-10[39m transformation introduced infinite values.”
Scaling the data
Next, we apply a linear transformation (scaling
) that is a standard pre-processing step prior to dimensional reduction techniques like PCA. The ScaleData()
function:
Shifts the expression of each gene, so that the mean expression across cells is 0 Scales the expression of each gene, so that the variance across cells is 1
This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate
The results of this are stored in pbmc[["RNA"]]$scale.data
By default, only variable features are scaled. You can specify the features argument to scale additional features.
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
Centering and scaling data matrix
Perform linear dimensional reduction
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
PC_ 1
Positive: CD247, IL32, IL7R, RORA, CAMK4, LTB, INPP4B, STAT4, BCL2, ANK3
ZEB1, LEF1, TRBC1, CARD11, THEMIS, BACH2, MLLT3, RNF125, RASGRF2, NR3C2
NELL2, PDE3B, LINC01934, ENSG00000290067, PRKCA, TAFA1, PYHIN1, CTSW, CSGALNACT1, SAMD3
Negative: LYZ, FCN1, IRAK3, SLC8A1, CLEC7A, PLXDC2, IFI30, S100A9, SPI1, CYBB
MNDA, LRMDA, FGL2, VCAN, CTSS, RBM47, CSF3R, MCTP1, NCF2, TYMP
CYRIA, CST3, HCK, SLC11A1, WDFY3, S100A8, MS4A6A, MPEG1, LST1, CSTA
PC_ 2
Positive: CD247, S100A4, STAT4, NKG7, CST7, CTSW, GZMA, SYTL3, RNF125, SAMD3
NCALD, MYO1F, MYBL1, KLRD1, PLCB1, TGFBR3, PRF1, GNLY, RAP1GAP2, RORA
CCL5, HOPX, FGFBP2, YES1, PYHIN1, FNDC3B, GNG2, SYNE1, KLRF1, SPON2
Negative: BANK1, MS4A1, CD79A, FCRL1, PAX5, IGHM, AFF3, LINC00926, NIBAN3, EBF1
IGHD, BLK, CD22, OSBPL10, HLA-DQA1, COL19A1, GNG7, KHDRBS2, RUBCNL, TNFRSF13C
COBLL1, RALGPS2, TCL1A, BCL11A, CDK14, CD79B, PLEKHG1, HLA-DQB1, IGKC, BLNK
PC_ 3
Positive: TUBB1, GP9, GP1BB, PF4, CAVIN2, GNG11, NRGN, PPBP, RGS18, PRKAR2B
H2AC6, ACRBP, PTCRA, TMEM40, TREML1, CLU, LEF1, GPX1, CMTM5, SMANTIS
MPIG6B, CAMK4, MPP1, SPARC, ENSG00000289621, ITGB3, MYL9, MYL4, ITGA2B, F13A1
Negative: NKG7, CST7, GNLY, PRF1, KLRD1, GZMA, KLRF1, MCTP2, GZMB, FGFBP2
HOPX, SPON2, C1orf21, TGFBR3, VAV3, MYBL1, CTSW, SYNE1, NCALD, IL2RB
SAMD3, GNG2, BNC2, CEP78, YES1, RAP1GAP2, PDGFD, LINC02384, CARD11, CLIC3
PC_ 4
Positive: CAMK4, INPP4B, IL7R, LEF1, PRKCA, PDE3B, MAML2, LTB, ANK3, PLCL1
BCL2, CDC14A, THEMIS, FHIT, NELL2, VIM, ENSG00000290067, MLLT3, TSHZ2, NR3C2
IL32, CMTM8, ENSG00000249806, ZEB1, SESN3, CSGALNACT1, TAFA1, LEF1-AS1, SLC16A10, LDLRAD4
Negative: GP1BB, GP9, TUBB1, PF4, CAVIN2, GNG11, PPBP, H2AC6, PTCRA, NRGN
ACRBP, TMEM40, PRKAR2B, RGS18, TREML1, MPIG6B, SMANTIS, CMTM5, CLU, SPARC
ITGA2B, ITGB3, ENSG00000289621, MYL9, CAPN1-AS1, MYL4, ENSG00000288758, DAB2, PDGFA-DT, CTTN
PC_ 5
Positive: CDKN1C, HES4, FCGR3A, PELATON, CSF1R, IFITM3, SIGLEC10, TCF7L2, ZNF703, MS4A7
UICLM, ENSG00000287682, NEURL1, RHOC, FMNL2, CKB, FTL, CALHM6, HMOX1, BATF3
ACTB, MYOF, CCDC26, IFITM2, PAPSS2, RRAS, LST1, VMO1, SERPINA1, LRRC25
Negative: LINC02458, AKAP12, CA8, ENSG00000250696, SLC24A3, HDC, IL3RA, EPAS1, ENPP3, OSBPL1A
TRPM6, CCR3, CSF2RB, SEMA3C, THSD7A, ATP10D, DACH1, CRPPA, ATP8B4, TMEM164
ABHD5, CLC, CR1, ITGB8, LIN7A, TAFA2, MBOAT2, GATA2, DAPK2, GCSAML
You have several useful ways to visualize both cells and features that define the PCA, including VizDimReduction()
, DimPlot()
, and DimHeatmap()
.
DimPlot(pbmc, reduction = "pca") + NoLegend()
DimHeatmap()
draws a heatmap focusing on a principal component. Both cells and genes are sorted by their principal component scores
DimHeatmap(pbmc, dims = 1:3, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 20:22, cells = 500, balanced = TRUE)
Determine the ‘dimensionality’ of the dataset
The elbow plot is a useful tool for determining the number of principal components (PCs) needed to capture the majority of variation in the data. It displays the standard deviation of each PC, with the "elbow" point typically serving as the threshold for selecting the most informative PCs. However, identifying the exact location of the elbow can be somewhat subjective.
ElbowPlot(pbmc, ndims = 50)
# Determine the percentage of variation associated with each PC
pct_var <- pbmc[["pca"]]@stdev / sum(pbmc[["pca"]]@stdev) * 100
# Calculate cumulative percentages for each PC
cumu_pct <- cumsum(pct_var)
# Identify the first PC where cumulative percentage exceeds 90% and individual variance is less than 5%
pc_number <- which(cumu_pct > 90 & pct_var < 5)[1]
pc_number
41
Cluster the cells
Seurat embeds cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected quasi-cliques
or communities
.
Seurat first constructs a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). This step is performed using the FindNeighbors()
function, and takes as input the previously defined dimensionality of the dataset.
To cluster the cells, Seurat next applies modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function. The FindClusters() function implements this procedure, and contains a resolution parameter that sets the granularity
of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 1 typically returns good results for single-cell datasets of around 5k cells. Optimal resolution often increases for larger datasets. The clusters can be found using the Idents()
function.
pbmc <- FindNeighbors(pbmc, dims = 1:pc_number)
pbmc <- FindClusters(pbmc, resolution = 0.1)
Computing nearest neighbor graph
Computing SNN
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 4559
Number of edges: 184776
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9713
Number of communities: 10
Elapsed time: 0 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
- AAACCCATCAGATGCT-1
- 1
- AAACGAAAGTGCTACT-1
- 0
- AAACGAAGTCGTAATC-1
- 6
- AAACGAAGTTGCCAAT-1
- 5
- AAACGAATCCGAGGCT-1
- 4
Levels:
- '0'
- '1'
- '2'
- '3'
- '4'
- '5'
- '6'
- '7'
- '8'
- '9'
Run non-linear dimensional reduction (UMAP/tSNE)
To visualize and explore these datasets, Seurat offers several non-linear dimensional reduction techniques, such as tSNE and UMAP.
The goal of tSNE/UMAP is to learn underlying structure in the dataset, in order to place similar cells together in low-dimensional space. Therefore, cells that are grouped together within graph-based clusters determined above should co-localize on these dimension reduction plots.
pbmc <- RunUMAP(pbmc, dims = 1:pc_number)
01:21:37 UMAP embedding parameters a = 0.9922 b = 1.112
01:21:37 Read 4559 rows and found 41 numeric columns
01:21:37 Using Annoy for neighbor search, n_neighbors = 30
01:21:38 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|
01:21:38 Writing NN index file to temp file /tmp/RtmpwNVWV1/file10a4d98f0d9
01:21:38 Searching Annoy index using 1 thread, search_k = 3000
01:21:40 Annoy recall = 100%
01:21:41 Commencing smooth kNN distance calibration using 1 thread
with target n_neighbors = 30
01:21:43 Found 2 connected components,
falling back to 'spca' initialization with init_sdev = 1
01:21:43 Using 'irlba' for PCA
01:21:43 PCA: 2 components explained 46.09% variance
01:21:43 Scaling init to sdev = 1
01:21:43 Commencing optimization for 500 epochs, with 188320 positive edges
01:21:49 Optimization finished
DimPlot(pbmc, reduction = "umap")
Finding differentially expressed features (cluster biomarkers)
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
pbmc.markers %>%
group_by(cluster) %>%
dplyr::filter(avg_log2FC > 1)
Calculating cluster 0
Calculating cluster 1
Calculating cluster 2
Calculating cluster 3
Calculating cluster 4
Calculating cluster 5
Calculating cluster 6
Calculating cluster 7
Calculating cluster 8
Calculating cluster 9
p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | cluster | gene |
---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <fct> | <chr> |
0.000000e+00 | 2.193938 | 0.974 | 0.426 | 0.000000e+00 | 0 | IL32 |
3.852209e-275 | 1.679326 | 0.951 | 0.362 | 9.547700e-271 | 0 | CD3D |
9.596093e-260 | 1.803885 | 0.888 | 0.352 | 2.378392e-255 | 0 | TRAC |
4.781243e-252 | 1.573425 | 0.959 | 0.532 | 1.185031e-247 | 0 | SYNE2 |
3.690404e-235 | 1.512388 | 0.917 | 0.348 | 9.146665e-231 | 0 | CD3G |
9.967907e-232 | 1.809323 | 0.910 | 0.485 | 2.470546e-227 | 0 | RORA |
1.117947e-227 | 1.378699 | 0.951 | 0.380 | 2.770831e-223 | 0 | CD3E |
1.039791e-208 | 2.253821 | 0.681 | 0.233 | 2.577123e-204 | 0 | NIBAN1 |
2.027326e-187 | 1.595996 | 0.810 | 0.357 | 5.024728e-183 | 0 | CD2 |
8.928063e-181 | 1.512822 | 0.846 | 0.383 | 2.212820e-176 | 0 | IL7R |
2.768362e-179 | 2.583397 | 0.463 | 0.091 | 6.861385e-175 | 0 | PRDM1 |
2.609892e-166 | 1.825963 | 0.777 | 0.389 | 6.468617e-162 | 0 | PPP1R16B |
8.007361e-166 | 3.888615 | 0.310 | 0.031 | 1.984624e-161 | 0 | TRGC2 |
1.861258e-163 | 1.694587 | 0.717 | 0.296 | 4.613128e-159 | 0 | CD6 |
1.403718e-158 | 1.072541 | 0.969 | 0.807 | 3.479115e-154 | 0 | PPP2R5C |
9.430895e-158 | 2.200303 | 0.525 | 0.147 | 2.337447e-153 | 0 | CD5 |
5.325382e-153 | 1.314341 | 0.835 | 0.433 | 1.319896e-148 | 0 | SPOCK2 |
5.541697e-151 | 1.501378 | 0.757 | 0.363 | 1.373510e-146 | 0 | GPRIN3 |
1.783366e-148 | 1.524168 | 0.815 | 0.492 | 4.420072e-144 | 0 | LRRC8C |
5.511823e-146 | 1.130197 | 0.937 | 0.772 | 1.366105e-141 | 0 | EML4 |
4.406921e-145 | 1.807049 | 0.821 | 0.535 | 1.092255e-140 | 0 | TNFAIP3 |
8.596339e-145 | 1.322956 | 0.800 | 0.391 | 2.130603e-140 | 0 | LIME1 |
1.601435e-143 | 1.301753 | 0.861 | 0.505 | 3.969156e-139 | 0 | ATXN1 |
3.308730e-140 | 4.011786 | 0.283 | 0.033 | 8.200689e-136 | 0 | GZMK |
1.881698e-138 | 1.586776 | 0.770 | 0.413 | 4.663787e-134 | 0 | PHACTR2 |
8.271921e-138 | 1.486293 | 0.791 | 0.479 | 2.050196e-133 | 0 | TTC39C |
8.346144e-135 | 3.699470 | 0.245 | 0.020 | 2.068592e-130 | 0 | MIAT |
2.291535e-126 | 1.420132 | 0.725 | 0.348 | 5.679570e-122 | 0 | ANK3 |
2.583870e-126 | 2.075113 | 0.477 | 0.151 | 6.404121e-122 | 0 | PBX4 |
3.042390e-125 | 1.333502 | 0.694 | 0.316 | 7.540564e-121 | 0 | OPTN |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
0.009116629 | 3.631680 | 0.021 | 0.002 | 1 | 9 | H4C1 |
0.009116629 | 3.608558 | 0.021 | 0.002 | 1 | 9 | ENSG00000289291 |
0.009116629 | 3.605235 | 0.021 | 0.002 | 1 | 9 | VAMP1-AS1 |
0.009116629 | 3.591495 | 0.021 | 0.002 | 1 | 9 | ENSG00000249328 |
0.009116629 | 3.567020 | 0.021 | 0.002 | 1 | 9 | ERICH2-DT |
0.009116629 | 3.550000 | 0.021 | 0.002 | 1 | 9 | NLRP10 |
0.009116629 | 3.518027 | 0.021 | 0.002 | 1 | 9 | ENSG00000270087 |
0.009116629 | 3.496166 | 0.021 | 0.002 | 1 | 9 | ENSG00000253593 |
0.009116629 | 3.393207 | 0.021 | 0.002 | 1 | 9 | ADAM11 |
0.009116629 | 3.272524 | 0.021 | 0.002 | 1 | 9 | ENSG00000228150 |
0.009116629 | 3.266281 | 0.021 | 0.002 | 1 | 9 | LINC03065 |
0.009116629 | 3.221456 | 0.021 | 0.002 | 1 | 9 | STARD13-AS |
0.009116629 | 3.109640 | 0.021 | 0.002 | 1 | 9 | FGGY-DT |
0.009116629 | 2.116185 | 0.021 | 0.002 | 1 | 9 | ENSG00000285751 |
0.009353183 | 1.486986 | 0.146 | 0.057 | 1 | 9 | TRPM2 |
0.009560927 | 1.198107 | 0.083 | 0.024 | 1 | 9 | SYCP3 |
0.009563781 | 1.367524 | 0.062 | 0.015 | 1 | 9 | MAP7 |
0.009595402 | 1.283987 | 0.083 | 0.024 | 1 | 9 | PPP1R13L |
0.009686754 | 2.584749 | 0.042 | 0.008 | 1 | 9 | PRRG2 |
0.009706714 | 2.407445 | 0.042 | 0.008 | 1 | 9 | LINC02185 |
0.009746744 | 2.473951 | 0.042 | 0.008 | 1 | 9 | LINC02901 |
0.009766814 | 2.300928 | 0.042 | 0.008 | 1 | 9 | WNK3 |
0.009766814 | 1.640083 | 0.042 | 0.008 | 1 | 9 | CALCRL-AS1 |
0.009786921 | 2.434878 | 0.042 | 0.008 | 1 | 9 | ENSG00000272112 |
0.009786922 | 2.438294 | 0.042 | 0.008 | 1 | 9 | ENSG00000277589 |
0.009847464 | 2.262934 | 0.042 | 0.008 | 1 | 9 | PCDH15 |
0.009847464 | 2.064514 | 0.042 | 0.008 | 1 | 9 | CIBAR1 |
0.009888011 | 2.173570 | 0.042 | 0.008 | 1 | 9 | SEZ6 |
0.009908340 | 1.746860 | 0.042 | 0.008 | 1 | 9 | RTKN |
0.009993542 | 1.340809 | 0.146 | 0.059 | 1 | 9 | ENSG00000287100 |
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3))
head(cluster5.markers, n = 5)
p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | |
---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
PTPRK | 4.438261e-196 | 5.477436 | 0.594 | 0.020 | 1.100023e-191 |
NRCAM | 9.537544e-196 | 7.815745 | 0.496 | 0.005 | 2.363880e-191 |
AIF1 | 1.738908e-192 | 4.287096 | 0.804 | 0.077 | 4.309884e-188 |
LINC02446 | 3.824707e-176 | 4.782107 | 0.638 | 0.039 | 9.479537e-172 |
NELL2 | 7.816637e-150 | 3.058779 | 0.969 | 0.213 | 1.937354e-145 |
VlnPlot(pbmc, features = c("PTPRK", "NRCAM"))
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
Warning message:
“[1m[22mThe `slot` argument of `VlnPlot()` is deprecated as of Seurat 5.0.0.
[36mℹ[39m Please use the `layer` argument instead.”
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
pbmc.markers %>%
group_by(cluster) %>%
dplyr::filter(avg_log2FC > 1) %>%
slice_head(n = 10) %>%
ungroup() -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
Cell type annotation using SingleR
library(SingleCellExperiment )
sce <- as.SingleCellExperiment(pbmc)
sce <- scater::logNormCounts(sce)
sce
class: SingleCellExperiment
dim: 24785 4559
metadata(0):
assays(3): counts logcounts scaledata
rownames(24785): ENSG00000238009 ENSG00000241860 ... ENSG00000276345
ENSG00000278817
rowData names(0):
colnames(4559): AAACCCATCAGATGCT-1 AAACGAAAGTGCTACT-1 ...
TTTGTTGTCGAAGTGG-1 TTTGTTGTCGCATAGT-1
colData names(10): orig.ident nCount_RNA ... ident sizeFactor
reducedDimNames(2): PCA UMAP
mainExpName: RNA
altExpNames(0):
# Download and cache the normalized expression values of the data
# stored in the Human Primary Cell Atlas. The data will be
# downloaded from ExperimentHub, returning a SummarizedExperiment
# object for further use.
hpca <- HumanPrimaryCellAtlasData()
# Obtain human bulk RNA-seq data from Blueprint and ENCODE
blueprint <- BlueprintEncodeData()
pred.hpca <- SingleR(test = sce,
ref = hpca, labels = hpca$label.main)
tab_hpca <- table(pred.hpca$pruned.labels)
write.csv(sort(tab_hpca, decreasing=TRUE), 'pbmc_annotations_HPCA_general.csv', row.names=FALSE)
tab_hpca
B_cell BM DC Endothelial_cells
451 30 1 1
Erythroblast GMP HSC_-G-CSF HSC_CD34+
60 2 11 1
Monocyte Neutrophils NK_cell Platelets
759 39 677 93
Pre-B_cell_CD34- T_cells
2 2350
Each row of the output DataFrame contains prediction results for a single cell. Labels are shown before (labels) and after pruning (pruned.labels), along with the associated scores.
head(pred.hpca)
DataFrame with 6 rows and 4 columns
scores labels delta.next
<matrix> <character> <numeric>
AAACCCATCAGATGCT-1 0.1409902:0.3257687:0.281000:... T_cells 0.0708860
AAACGAAAGTGCTACT-1 0.1407268:0.3072562:0.264148:... T_cells 0.6026772
AAACGAAGTCGTAATC-1 0.0604399:0.0725122:0.184863:... Erythroblast 0.1268946
AAACGAAGTTGCCAAT-1 0.1585486:0.3228307:0.278787:... T_cells 0.6492489
AAACGAATCCGAGGCT-1 0.1166524:0.3565152:0.277855:... B_cell 0.0505309
AAACGAATCGAACGCC-1 0.1437411:0.3427680:0.299201:... NK_cell 0.3155681
pruned.labels
<character>
AAACCCATCAGATGCT-1 T_cells
AAACGAAAGTGCTACT-1 T_cells
AAACGAAGTCGTAATC-1 Erythroblast
AAACGAAGTTGCCAAT-1 T_cells
AAACGAATCCGAGGCT-1 B_cell
AAACGAATCGAACGCC-1 NK_cell
pred.blueprint <- SingleR(test = sce,
ref = blueprint, labels = blueprint$label.main)
tab_blueprint <- table(pred.blueprint$pruned.labels)
write.csv(sort(tab_blueprint, decreasing=TRUE), 'pbmc_annotations_BlueprintENCODE_general.csv', row.names=FALSE)
tab_blueprint
B-cells CD4+ T-cells CD8+ T-cells DC
455 1749 576 3
Endothelial cells Eosinophils Erythrocytes HSC
1 50 128 4
Monocytes Neutrophils NK cells
768 3 692
head(pred.blueprint)
DataFrame with 6 rows and 4 columns
scores labels delta.next
<matrix> <character> <numeric>
AAACCCATCAGATGCT-1 0.2145648:0.1136181:0.437872:... CD4+ T-cells 0.0512150
AAACGAAAGTGCTACT-1 0.2311086:0.1664737:0.393066:... CD4+ T-cells 0.3283657
AAACGAAGTCGTAATC-1 0.0977069:0.0728724:0.100641:... Erythrocytes 0.0757261
AAACGAAGTTGCCAAT-1 0.2289000:0.1565938:0.423090:... CD8+ T-cells 0.0620951
AAACGAATCCGAGGCT-1 0.2353403:0.1291495:0.500443:... B-cells 0.1276654
AAACGAATCGAACGCC-1 0.2308036:0.1288775:0.417440:... NK cells 0.1486502
pruned.labels
<character>
AAACCCATCAGATGCT-1 CD4+ T-cells
AAACGAAAGTGCTACT-1 CD4+ T-cells
AAACGAAGTCGTAATC-1 Erythrocytes
AAACGAAGTTGCCAAT-1 CD8+ T-cells
AAACGAATCCGAGGCT-1 B-cells
AAACGAATCGAACGCC-1 NK cells
pbmc$singleR_hpca = pred.hpca$pruned.labels
pbmc$singleR_blueprint = pred.blueprint$pruned.labels
Idents(pbmc) = pbmc$singleR_hpca
DimPlot(pbmc, reduction = "umap",
label = TRUE, pt.size = 0.5,
repel = TRUE) + NoLegend()
# Change back to cluster snn_res.0.1
Idents(pbmc) = pbmc$`RNA_snn_res.0.1`
Idents(pbmc) = pbmc$singleR_blueprint
DimPlot(pbmc, reduction = "umap",
label = TRUE, pt.size = 0.5,
repel = TRUE) + NoLegend()
# Change back to cluster snn_res.0.1
Idents(pbmc) = pbmc$`RNA_snn_res.0.1`
Manual annotation
Markers | Cell Type |
---|---|
IL7R, CCR7 | Naive CD4+ T |
CD14, LYZ | CD14+ Mono |
IL7R, S100A4 | Memory CD4+ |
MS4A1 | B |
CD8A | CD8+ T |
FCGR3A, MS4A7 | FCGR3A+ Mono |
GNLY, NKG7 | NK |
FCER1A, CST3 | DC |
PPBP | Platelet |
table(Idents(pbmc))
0 1 2 3 4 5 6 7 8 9
1215 938 761 662 465 224 98 94 54 48
FeaturePlot(pbmc, features = c("IL7R", "CD14", "CCR7", "S100A4", "MS4A1"))
DimPlot(pbmc, reduction = "umap",
label = TRUE, pt.size = 0.5,
repel = TRUE) + NoLegend()
pbmc = RenameIdents(pbmc, "2"="CD14+ Mono")
pbmc = RenameIdents(pbmc, "1"="Naive CD4+ T")
pbmc = RenameIdents(pbmc, "0"="Memory CD4+")
### Add more manual annotation based on your checking
DimPlot(pbmc, reduction = "umap",
label = TRUE, pt.size = 0.5,
repel = TRUE) + NoLegend()
Save the Seurat object
saveRDS(pbmc, file = "Seurat_object_pbmc_final.rds")
remotes::install_github("10xGenomics/loupeR")
loupeR::setup()
Downloading GitHub repo 10xGenomics/loupeR@HEAD
[36m──[39m [36mR CMD build[39m [36m─────────────────────────────────────────────────────────────────[39m
* checking for file ‘/tmp/RtmpwNVWV1/remotes10a53185aac/10XGenomics-loupeR-a169417/DESCRIPTION’ ... OK
* preparing ‘loupeR’:
* checking DESCRIPTION meta-information ... OK
* checking for LF line-endings in source and make files and shell scripts
* checking for empty or unneeded directories
* building ‘loupeR_1.1.2.tar.gz’
Installing package into ‘/content/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
Warning message in fun(libname, pkgname):
“Please call `loupeR::setup()` to install the Louper executable and to agree to the EULA before continuing”
Installing Executable
2024/11/18 22:56:23 Downloading executable
The LoupeR executable is subject to the 10x End User Software License, available at:
https://10xgen.com/EULA
Do you accept the End-User License Agreement
(y/yes or n/no): yes
EULA
library(loupeR)
create_loupe_from_seurat(pbmc, output_name = "Seurat_object_pbmc_cloupe")
2024/11/18 22:57:18 extracting matrix, clusters, and projections
2024/11/18 22:57:18 selected assay: RNA
2024/11/18 22:57:18 selected clusters: active_cluster orig.ident RNA_snn_res.0.1 seurat_clusters singleR_hpca singleR_blueprint
2024/11/18 22:57:18 selected projections: umap
2024/11/18 22:57:18 validating count matrix
2024/11/18 22:57:19 validating clusters
2024/11/18 22:57:19 validating projections
2024/11/18 22:57:19 creating temporary hdf5 file: /tmp/RtmpwNVWV1/file10aaeba1e8.h5
2024/11/18 22:57:23 invoking louper executable
2024/11/18 22:57:23 running command: "/root/.local/share/R/loupeR/louper create --input='/tmp/RtmpwNVWV1/file10aaeba1e8.h5' --output='Seurat_object_pbmc_cloupe.cloupe'"
Reference
https://monashbioinformaticsplatform.github.io/Single-Cell-Workshop/pbmc3k_tutorial.html
https://bioinformatics.ccr.cancer.gov/docs/getting-started-with-scrna-seq/IntroToR_Seurat/
https://hbctraining.github.io/scRNA-seq/lessons/elbow_plot_metric.html