本文描述了bulk RNA-seq分析流程中降维聚类子模块,该模块可以得到样本的降维聚类结果,可以用来评估组间差异及组内样本重复情况。该模块用R语言编写。
该模块需要输入两方面的信息,利用基因水平的表达量和分组信息来完成降维和聚类的分析。表达量可用来自于DataSummary模块生成的raw.RData文件,分组信息需要自行提供,如下图所示。
suppressWarnings(library("DESeq2"))
suppressWarnings(library("ggplot2"))
suppressWarnings(library("umap"))
suppressWarnings(library("Rtsne"))
suppressWarnings(library("ggpubr"))
suppressWarnings(library("openxlsx"))
options(repr.plot.width=4, repr.plot.height=3, repr.plot.res = 200)
rawRdFile = "RNAseq/DataSummary/WPS_raw.RData"
samplesheet = "sampleSheet/LUSC_samplesheet_pass.txt"
outFolder = "RNAseq/DimReduction"
if(!dir.exists(outFolder))
dir.create(outFolder)
load(rawRdFile)
assign("raw", get("WPS_raw"))
ExM = t(log2(raw$txi$abundance + 1))
colnames(ExM) = raw$geneAnn$geneString
Tx = read.table(file = samplesheet, row.names = 1, header = TRUE, sep = "\t")
cNames = intersect(rownames(ExM), rownames(Tx))
Tx = Tx[cNames, , drop = FALSE]
ExM = ExM[cNames,]
ExS = ExM[, apply(ExM, 2, sd) > 0]
ExS[1:4, 1:4]
pca <- summary(prcomp(ExS, center=TRUE, scale= TRUE))
pcadata <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2])
pca_frame <- data.frame(pcadata, Tx)
xlab = paste0("PC1 (", pca$importance[2,"PC1"]*100, "%)")
ylab = paste0("PC2 (", pca$importance[2,"PC2"]*100, "%)")
p <- ggplot(pca_frame, aes(x=PC1, y=PC2, color=group)) + geom_point(size=1.5)
p <- p + theme_bw() + labs(x = xlab, y = ylab, title = "PCA")
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + theme(plot.title = element_text(hjust = 0.5)) + theme(legend.title=element_blank())
p <- p + theme(legend.title=element_blank())
p
UMAP是非线性的降维方式,一般情况下也会得到更好的展示效果。
umap_out <- umap(ExS)
umap_frame = data.frame(UMAP1 = umap_out$layout[,1], UMAP2 = umap_out$layout[,2], Tx)
p <- ggplot(umap_frame, aes(x = UMAP1, y = UMAP2, color = group)) + geom_point(size=1.5)
p <- p + theme_bw() + labs(x = "UMAP 1", y = "UMAP 2", title = "UMAP")
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + theme(plot.title = element_text(hjust = 0.5)) + theme(legend.title=element_blank())
p
t-SNE也是非常流行的非线性降维方式。
tsne_out = Rtsne(ExS, dims = 2, pca = TRUE, perplexity = 30, verbose = FALSE, max_iter = 500)
tsne_frame = data.frame(tSNE1 = tsne_out$Y[,1], tSNE2 = tsne_out$Y[,2], Tx)
p <- ggplot(tsne_frame, aes(x = tSNE1, y = tSNE2, color = group)) + geom_point(size = 1.5)
p <- p + theme_bw() + labs(x = "t-SNE 1", y = "t-SNE 2", title = "t-SNE")
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + theme(plot.title = element_text(hjust = 0.5)) + theme(legend.title=element_blank())
p <- p + theme(legend.position="none")
p
sessionInfo()