CSsingle: A Unified Tool for Robust Decomposition of Bulk and Spatial Transcriptomic Data Across Diverse Single-Cell References

Example 1: Cell type decomposition in estimating neutrophil-to-lymphocyte ratio (NLR) from whole blood samples

Load single-cell reference data in the form of a `SeuratObject`

data("GSE107011_sc")

Calculate cell size by using ERCC spike-in controls

cellType <- "cellType"
cellSize <- cal_cellSize(blood_ref, cellType)
# remove ERCC spikes-ins
is.spikes <- grepl("^ERCC-", rownames(blood_ref))
blood_ref <- blood_ref[!is.spikes, ]

Load bulk data in the form of an `ExpressionSet`

data("GSE73072_SI_bulk")

Data preprocessing

This step involves excluding cell types that are not of interest from the scRNA-seq data, removing sparse features and cells, and trimming both bulk and scRNA-seq data to ensure they share the same features.

select.ct <- c("lymphocyte", "Neutrophils_LD", "myeloid")
prepData <- preprocessData(blood_bulk, blood_ref, cellType, select.ct)
bulk.eset <- prepData$bulk.eset
sc.eset <- prepData$sc.eset
cellSize <- cellSize[select.ct]

Build a differentially expressed matrix on the scRNA-seq data

res1 <- build_diffexpMat(sc.eset, cellType, test.use = "bimod")
ctDEGs <- res1$ctDEGs
diffexpMat <- res1$diffexpMat

Estimation of cell type proportions by `CSsingle`

Set cellSize = NULL when ERCC spike-ins are unavailable from scRNA-seq reference.

# Build a signature matrix by selecting the top 100 markers for each cell type
res2 <- build_sigMat(diffexpMat, ctDEGs, num.top = 100, sort.by = "p.value", rm.dupli = TRUE)
markers <- res2$markers
sigMat <-  res2$sigMat
# Estimation of cell type proportions
Est.prop.CSsingle <- CSsingle(sigMat, exprs(bulk.eset), markers, cellSize = cellSize, dampened = TRUE)

Estimation of cell type proportions by `CSsingle_optimize`

We created multiple signature matrices by varying the number of marker genes from 50 to 200 with step 50 for each cell type by default. In the presence of multiple signature matrices, we combined CSsingle with each signature matrix to get estimates of the cell type proportions. The optimal signature matrix \(S^*\) is chosen as the candidate matrix whose inferred bulk/ST gene expression data has the maximal Spearman correlation with the real bulk/ST gene expression data.

Est.prop.CSsingle <- CSsingle_optimize(diffexpMat, exprs(bulk.eset), ctDEGs, cellSize = cellSize, dampened=TRUE)

Plot the temporal alterations in the `NLR`, as assessed by `CSsingle`, against those observed in laboratory white blood cell counts

library(ggrepel)
library(ggpubr)
library(ggplot2)
library(gridExtra)
# We collected the laboratory measurements of neutrophil/lymphocyte counts from Table S6 of a previous published study (PLoS Genet. 2011 Aug;7(8):e1002234.)
data("wbc_counts")

neut <- wbc_counts[,"Neutrophils"]
neut <- neut[seq(2,length(neut),2)]
neut <- c(mean(neut[c(1,2)]),neut[c(-1,-2)])
lymp <- wbc_counts[,"Lymphocytes"]
lymp <- lymp[seq(2,length(lymp),2)]
lymp <- c(mean(lymp[c(1,2)]),lymp[c(-1,-2)])
nlr <- neut/lymp
time.slot <- list(c(-Inf,0),c(0,24),c(24,2*24),c(2*24,3*24),c(3*24,4*24),c(4*24,5*24),c(5*24,6*24),c(6*24,7*24))
label <- c("Baseline","Day1","Day2","Day3","Day4","Day5","Day6","Day7")
Lymp <- Est.prop.CSsingle[, "lymphocyte"]
Neut <- Est.prop.CSsingle[, "Neutrophils_LD"]
NLR <- Neut/Lymp
meta <- pData(bulk.eset)[names(Neut),]

mean.neut <- mean.lymp <- mean.nlr <- c()
for (j in 1:length(time.slot)){
  idx <- as.numeric(meta$time.hours) <= time.slot[[j]][2] & as.numeric(meta$time.hours) > time.slot[[j]][1]
  mean.neut[j] <- mean(Neut[idx])*100
  mean.lymp[j] <- mean(Lymp[idx])*100
  mean.nlr[j] <- mean(NLR[idx])
}
dataf <- cbind.data.frame(neut, lymp, nlr, mean.neut, mean.lymp, mean.nlr, label)

gplotn <- ggscatter(dataf,x="neut",y="mean.neut",fill="label",
                   shape=21,size=3.5,add = "reg.line",add.params = list(fill="darkgrey",color="black"),conf.int=T,ylab="Mean estimated neutrophil proportion (%)",
                   xlab="% of Neutrophil (laboratory measured)", title = NULL)+
  stat_cor(method="pearson")+theme(legend.position = "none")+
  geom_text_repel(aes(y=mean.neut, x=neut, label = label))

gplotl <- ggscatter(dataf,x="lymp",y="mean.lymp",fill="label",
                   shape=21,size=3.5,add = "reg.line",add.params = list(fill="darkgrey",color="black"),conf.int=T,ylab="Mean estimated lymphocyte proportion (%)",
                   xlab="% of Lymphocyte (laboratory measured)", title = NULL)+
  stat_cor(method="pearson")+theme(legend.position = "none")+
  geom_text_repel(aes(y=mean.lymp, x=lymp, label = label))

gplotr <- ggscatter(dataf,x="nlr",y="mean.nlr",fill="label",
                   shape=21,size=3.5,add = "reg.line",add.params = list(fill="darkgrey",color="black"),conf.int=T,ylab="Mean estimated NLR",
                   xlab="NLR (laboratory measured)", title = NULL)+ ylim(c(0,4))+ geom_hline(yintercept=1,lty=2, lwd=1)+
  stat_cor(method="pearson")+theme(legend.position = "none")+
  geom_text_repel(aes(y=mean.nlr, x=nlr, label = label))

gplot <- ggarrange(gplotn, gplotl, gplotr, ncol = 2, nrow = 2)
print(gplot)

Example 2: Cell type decomposition in human Barrett’s esophagus tissue

We reproduce here the analysis detailed in CSsingle manuscript:

Load single-cell reference data in the form of a `SeuratObject`

data("EGAS00001003144_sc")

t-SNE projection of scRNA-seq data from samples of BE (Barrett’s Esophagus), NE (Normal Esophagus), NGC (Normal Gastric Cardia), and ND (Normal Duodenum)

is.spikes <- grepl("^ERCC-", rownames(barrett_ref))
data <- barrett_ref[!is.spikes, ]
data <- SCTransform(data, verbose = FALSE)
data <- RunPCA(object = data)
opt_dim <- 12
data <- FindNeighbors(data, dims = 1:opt_dim)
data <- FindClusters(object = data, resolution = 0.8)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2340
## Number of edges: 67805
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8958
## Number of communities: 16
## Elapsed time: 0 seconds

data <- RunTSNE(object = data, dims = 1:opt_dim)
tsne.plot1=DimPlot(object = data, reduction = 'tsne', label=F, group.by = "patient",raster=FALSE) + labs(title="Patient ID") +
  theme(legend.title = element_text(size = 9), legend.text = element_text (size=8))
tsne.plot2=DimPlot(object = data, reduction = 'tsne', label=F, group.by = "tissue",raster=FALSE) + labs(title="Tissue") +
  theme(legend.title = element_text(size = 9), legend.text = element_text (size=8))
tsne.plot3=DimPlot(object = data, reduction = 'tsne', label=F, cols=c("orange", "darkblue", "purple", "darkgreen"), group.by = "global_cellType",raster=FALSE) + labs(title="Global cell type") + theme(legend.title = element_text(size = 9), legend.text = element_text (size=8))
tsne.plot4=DimPlot(object = data, reduction = 'tsne', label=F, cols=c("orange", "darkblue", "purple", "darkgreen"), group.by = "cellType",raster=FALSE) + labs(title="Cell type") + theme(legend.title = element_text(size = 9), legend.text = element_text (size=8))

tsne.plot <- ggarrange(tsne.plot1, tsne.plot2, tsne.plot3, tsne.plot4, ncol = 2, nrow =2)
print(tsne.plot)

Violin plots of marker gene expression in mosaic, gastric, and intestinal columnar cells

idx <- data@meta.data$cellType %in% c( "Mosaic", "Gastric columnar", "Intestinal columnar")
data.sub <- data[, idx]
markers <- c("PGC","MUC5AC","TFF1", "OLFM4","GPA33", "APOA4")
data.sub$cellType <- factor(data.sub$cellType, levels=c("Mosaic", "Gastric columnar", "Intestinal columnar"))
vplot <- VlnPlot(data.sub, markers, group.by = "cellType", cols = c("purple", "orange", "darkblue"), ncol = 3) & theme(axis.title.x = element_blank(), axis.text.x = element_blank())
vplot <- ggarrange(vplot, common.legend=TRUE)
print(vplot)

Estimate absolute RNA content by using ERCC spike-in controls

counts <- as.matrix(barrett_ref@assays$RNA@counts)
spike.counts <- counts[is.spikes, ]
gene.counts <- counts[!is.spikes, ]
normFactor <- apply(spike.counts,MARGIN=2,
                   FUN=quantile,probs=0.75,na.rm=TRUE)
gene.norm.counts <- t(t(gene.counts)/(normFactor+1))
abs.RNAcontent <- colSums(gene.norm.counts)

Celltype <- barrett_ref@meta.data[["cellType"]]
Celltype <- gsub("_", " ", Celltype)
Celltype <- factor(Celltype, levels = c( "Squamous", "Mosaic", "Gastric columnar", "Intestinal columnar"))
data <- cbind.data.frame(abs.RNAcontent, Celltype)
pal <- c("darkgreen", "purple", "orange", "darkblue")
gbplot <- ggbarplot(data, fill="Celltype",x = "Celltype", y = "abs.RNAcontent", add = "mean_se",
                color = "Celltype", palette = pal,
                position = position_dodge(0.8))+
  labs(fill="") +
  xlab("Cell type") + ylab("Total RNA content \n with spike-in normalization")+
  theme(legend.position="none") +
  theme(axis.title.x=element_blank(), axis.text.x=element_text(size=12, angle=45,vjust=0.6)) +
  stat_compare_means(method =  "kruskal.test", label.y = 1e5, size = 4) +
  scale_y_continuous(labels=scales::scientific)
print(gbplot)

Calulate cell size by using ERCC spike-in controls

cellType <- "cellType"
cellSize <- cal_cellSize(barrett_ref, cellType)

# remove ERCC spikes-ins
is.spikes <- grepl("^ERCC-", rownames(barrett_ref))
barrett_ref <- barrett_ref[!is.spikes, ]

Load bulk data in the form of an `ExpressionSet`

# bulk data of 233 epithelium samples derived from NE, NGC and BE
data("Barrett_bulk")

Data preprocessing

prepData <- preprocessData(barrett_bulk, barrett_ref, cellType)
bulk.eset <- prepData$bulk.eset
sc.eset <- prepData$sc.eset

Build a differentially expressed matrix

# Build a differentially expressed matrix by performing differential expression analysis on the scRNA-seq data
res1 <- build_diffexpMat(sc.eset, cellType, test.use = "bimod")
ctDEGs <- res1$ctDEGs
diffexpMat <- res1$diffexpMat

Estimation of cell type proportions by `CSsingle`

# Build a signature matrix by selecting the top 50 markers for each cell type
res2 <- build_sigMat(diffexpMat, ctDEGs, num.top = 50)
markers <- res2$markers
sigMat <-  res2$sigMat
# Estimation of cell type proportions
Est.prop.CSsingle <- CSsingle(sigMat, exprs(bulk.eset), markers, cellSize = cellSize, dampened = TRUE)

Plots of decomposition estimates for 233 epithelium samples derived from NE, NGC and BE.

Est.prop <- Est.prop.CSsingle
Celltype <- rep(colnames(Est.prop), each=nrow(Est.prop))
Celltype <- gsub("_", " ", Celltype)
Celltype <- factor(Celltype, levels = c( "Squamous", "Mosaic", "Gastric columnar", "Intestinal columnar"))
Tissue <- as.character(pData(bulk.eset)[,"Tissue"])
tissue <- rep(Tissue, ncol(Est.prop))
data <- data.frame(prop = c(Est.prop), Celltype = Celltype, tissue = factor(tissue, levels = c("Squamous esophagus", "Gastric cardia", "Barrett esophagus")))
gbplot <- ggbarplot(data, fill="Celltype",x = "Celltype", y = "prop", add = "mean_se",
                   color = "Celltype", palette = c("darkgreen", "purple", "orange", "darkblue"), facet.by = "tissue",
                   position = position_dodge(0.8))+ ylim(0, 1) +
  theme(axis.text.x=element_blank()) +
  theme(
    panel.background = element_blank(),
    axis.line.x = element_line(color="black"),
    axis.line.y = element_line(color="black"),
    panel.grid.major = element_blank()) +
  labs(title = "CSsingle",x="Cell type", y = "Mean estimated proportion") +
  theme(legend.position="top", legend.title = element_text(size = 10), legend.text = element_text(size=10))
print(gbplot)

Comparison of the estimated proportions of mosaic columnar cell (MCC) in NE, NGC, and BE.

Est.prop <- Est.prop.CSsingle[, "Mosaic"]
dat <- data.frame(est.prop = c(Est.prop), Tissue = factor(Tissue, levels=c("Squamous esophagus", "Gastric cardia", "Barrett esophagus")))
my_comparisons <- list(c("Squamous esophagus","Barrett esophagus"),c("Gastric cardia","Barrett esophagus"))
gbxplot <- ggplot(dat, aes(x = Tissue, y = est.prop, fill = Tissue)) +
  geom_boxplot() +
  stat_compare_means(comparisons = my_comparisons,label =  "p.format", method = "wilcox.test", method.args = list(alternative = "less")) +
  geom_jitter(aes(fill=Tissue),width =0.2,shape = 21,size=2) +
  labs(x="Tissue", y="Estimated proportion of mosaic cell population", title = "CSsingle") + theme_classic() +
  scale_fill_brewer() + theme(legend.position="none", legend.title = element_text(size = 10), legend.text = element_text(size=10))
print(gbxplot)

Example 3: Cell type decomposition in human fetal pancreas (real spatial transcriptomic data)

Load single-cell reference data in the form of a `SeuratObject`

data("GSE85241_sc")

Load spatial transcriptomic data in the form of a `SeuratObject`

data("GSM5914540_st")

Build a differentially expressed matrix on the scRNA-seq data

com.genes <- intersect(toupper(rownames(pancreas_ref)), rownames(pancreas_st))
cellType <- "cellType"
res1 <- build_diffexpMat(pancreas_ref[com.genes, ], cellType, test.use = "wilcox")
ctDEGs <- res1$ctDEGs
diffexpMat <- res1$diffexpMat

Estimation of cell type proportions by `CSsingle`.

When estimating cell type proportions using CSsingle, omit cell size correction (cellSize = NULL) for spot-based ST deconvolution. This is because incomplete cellular capture by 2D slides and stochastic cell overlaps dominate RNA contributions, obscuring biological size differences.

# Build a signature matrix by selecting the top 200 markers for each cell type
Est.prop.CSsingle <- CSsingle_optimize(diffexpMat, as.matrix(pancreas_st[["Spatial"]]$counts), ctDEGs, cellSize = NULL, enrichment = TRUE, enrich.thres = 0, dampened = TRUE, increment = 200)

# Automatically select the optimal signature matrix
start_time <- Sys.time()
Est.prop.CSsingle <- CSsingle_optimize(diffexpMat, as.matrix(pancreas_st[["Spatial"]]$counts), ctDEGs, cellSize = NULL, enrichment = TRUE, enrich.thres = 0, dampened = TRUE)
end_time <- Sys.time()
print(end_time-start_time)

## Time difference of 5.264478 mins

Visulization of cell-type spatial distribution

library(scatterpie)
mycol <- c("deepskyblue", "goldenrod2", "purple",
           "green2", "royalblue", "pink",
           "red2", "yellow", "orange")

plot_data <- as.data.frame(Est.prop.CSsingle)
plot_col <- sort(colnames(plot_data))
# scatterpie
spatial_loc <- data.frame(pancreas_st@images[["X12PCW_S2"]]@boundaries$centroids@coords)
spatial_loc <- data.frame(x=spatial_loc$x,
                          y=spatial_loc$y,
                          row.names = colnames(pancreas_st))
spot_coords <- spatial_loc[rownames(Est.prop.CSsingle), ]
plot_data$x <- as.numeric(as.character(spot_coords$x))
plot_data$y <- as.numeric(as.character(spot_coords$y))
min_x <- min(plot_data$x)
rad = 55
plot_data$radius <- rad
df <- data.frame()
p <- ggplot(df) + geom_point() + xlim(min(plot_data$x)-rad, max(plot_data$x)+rad) + ylim(min(plot_data$y)-rad, max(plot_data$y)+rad) + labs(title = "12PCW_S2") + xlab("x coordinates") + ylab("y coordinates")
pieplot <- p + geom_scatterpie(aes(x=x, y=y, r=radius), data=plot_data, cols=plot_col, color=NA, alpha=.8) +
  geom_scatterpie_legend(plot_data$radius, x=1, y=1) +
  theme_classic() +
  scale_fill_manual(name = "Cell type", values = mycol)
print(pieplot)

CSsingle: A Unified Tool for Robust Decomposition of Bulk and Spatial Transcriptomic Data Across Diverse Single-Cell References

Wenjun Shen, Cheng Liu, Yunfei Hu, Yuanfang Lei, Hau-San Wong, Si Wu, and Xin Maizie Zhou

2025-05-31

Introduction

Installation

Example 1: Cell type decomposition in estimating neutrophil-to-lymphocyte ratio (NLR) from whole blood samples

Load single-cell reference data in the form of a `SeuratObject`

Calculate cell size by using ERCC spike-in controls

Load bulk data in the form of an `ExpressionSet`

Data preprocessing

Build a differentially expressed matrix on the scRNA-seq data

Estimation of cell type proportions by `CSsingle`

Estimation of cell type proportions by `CSsingle_optimize`

Plot the temporal alterations in the `NLR`, as assessed by `CSsingle`, against those observed in laboratory white blood cell counts

Example 2: Cell type decomposition in human Barrett’s esophagus tissue

Load single-cell reference data in the form of a `SeuratObject`

t-SNE projection of scRNA-seq data from samples of BE (Barrett’s Esophagus), NE (Normal Esophagus), NGC (Normal Gastric Cardia), and ND (Normal Duodenum)

Violin plots of marker gene expression in mosaic, gastric, and intestinal columnar cells

Estimate absolute RNA content by using ERCC spike-in controls

Calulate cell size by using ERCC spike-in controls

Load bulk data in the form of an `ExpressionSet`

Data preprocessing

Build a differentially expressed matrix

Estimation of cell type proportions by `CSsingle`

Plots of decomposition estimates for 233 epithelium samples derived from NE, NGC and BE.

Comparison of the estimated proportions of mosaic columnar cell (MCC) in NE, NGC, and BE.

Example 3: Cell type decomposition in human fetal pancreas (real spatial transcriptomic data)

Load single-cell reference data in the form of a `SeuratObject`

Load spatial transcriptomic data in the form of a `SeuratObject`

Build a differentially expressed matrix on the scRNA-seq data

Estimation of cell type proportions by `CSsingle`.

Visulization of cell-type spatial distribution

CSsingle: A Unified Tool for Robust Decomposition of Bulk and Spatial Transcriptomic Data Across Diverse Single-Cell References

Wenjun Shen, Cheng Liu, Yunfei Hu, Yuanfang Lei, Hau-San Wong, Si Wu, and Xin Maizie Zhou

2025-05-31

Introduction

Installation

Example 1: Cell type decomposition in estimating neutrophil-to-lymphocyte ratio (NLR) from whole blood samples

Load single-cell reference data in the form of a SeuratObject

Calculate cell size by using ERCC spike-in controls

Load bulk data in the form of an ExpressionSet

Data preprocessing

Build a differentially expressed matrix on the scRNA-seq data

Estimation of cell type proportions by CSsingle

Estimation of cell type proportions by CSsingle_optimize

Plot the temporal alterations in the NLR, as assessed by CSsingle, against those observed in laboratory white blood cell counts

Example 2: Cell type decomposition in human Barrett’s esophagus tissue

Load single-cell reference data in the form of a SeuratObject

t-SNE projection of scRNA-seq data from samples of BE (Barrett’s Esophagus), NE (Normal Esophagus), NGC (Normal Gastric Cardia), and ND (Normal Duodenum)

Violin plots of marker gene expression in mosaic, gastric, and intestinal columnar cells

Estimate absolute RNA content by using ERCC spike-in controls

Calulate cell size by using ERCC spike-in controls

Load bulk data in the form of an ExpressionSet

Data preprocessing

Build a differentially expressed matrix

Estimation of cell type proportions by CSsingle

Plots of decomposition estimates for 233 epithelium samples derived from NE, NGC and BE.

Comparison of the estimated proportions of mosaic columnar cell (MCC) in NE, NGC, and BE.

Example 3: Cell type decomposition in human fetal pancreas (real spatial transcriptomic data)

Load single-cell reference data in the form of a SeuratObject

Load spatial transcriptomic data in the form of a SeuratObject

Build a differentially expressed matrix on the scRNA-seq data

Estimation of cell type proportions by CSsingle.

Visulization of cell-type spatial distribution

Load single-cell reference data in the form of a `SeuratObject`

Load bulk data in the form of an `ExpressionSet`

Estimation of cell type proportions by `CSsingle`

Estimation of cell type proportions by `CSsingle_optimize`

Plot the temporal alterations in the `NLR`, as assessed by `CSsingle`, against those observed in laboratory white blood cell counts

Load single-cell reference data in the form of a `SeuratObject`

Load bulk data in the form of an `ExpressionSet`

Estimation of cell type proportions by `CSsingle`

Load single-cell reference data in the form of a `SeuratObject`

Load spatial transcriptomic data in the form of a `SeuratObject`

Estimation of cell type proportions by `CSsingle`.