本文描述了bulk RNA-seq分析流程中差异分析子模块,该模块可以得到差异表达基因,是后续功能富集等分析的基础。该模块用R语言编写。
该模块需要输入两方面的信息,利用基因水平的表达量和分组信息来完成差异基因的分析。表达量可用来自于DataSummary模块生成的raw.RData文件,分组信息需要自行提供,如下图所示。
suppressWarnings(library("DESeq2"))
suppressWarnings(library("ggplot2"))
options(repr.plot.width=4, repr.plot.height=3, repr.plot.res = 200)
rawRdFile = "RNAseq/DataSummary/WPS_raw.RData"
samplesheet = "sampleSheet/LUSC_samplesheet_pass.txt"
outFolder = "RNAseq/DGE"
if(!dir.exists(outFolder))
dir.create(outFolder)
首先导入数据和sample sheet
load(rawRdFile)
assign("raw", WPS_raw)
txi = raw$txi
sampleT = read.table(file = samplesheet, row.names = 1, header = TRUE, sep = "\t")
kSamples = intersect(colnames(txi$counts), rownames(sampleT))
sampleTable = sampleT[kSamples, , drop = FALSE]
newTxi = list(abundance = txi$abundance[, kSamples],
counts = txi$counts[, kSamples],
length = txi$length[, kSamples],
countsFromAbundance = "no")
head(sampleT)
我们要比较Cancer和Normal这两类的基因表达差异。
dds <- DESeqDataSetFromTximport(newTxi, sampleTable, ~group)
dds <- DESeq(dds)
res <- results(dds, contrast=c("group", "Cancer", "Normal"))
DESeq2的差异表达分析结果如下所示:
rTable <- data.frame(res)
head(rTable)
gene <- raw$geneAnn[rownames(rTable), "geneString"]
s1 <- rownames(sampleTable)[sampleTable[, "group"] == "Cancer"]
s2 <- rownames(sampleTable)[sampleTable[, "group"] == "Normal"]
treatment_TPM <- signif(apply(txi$abundance[rownames(rTable), s1], 1, mean), 4)
control_TPM <- signif(apply(txi$abundance[rownames(rTable), s2], 1, mean), 4)
DGE <- rep("NC", nrow(rTable))
DGE[((rTable$padj) < 0.05) & (rTable$log2FoldChange > 0)] = "UP"
DGE[((rTable$padj) < 0.05) & (rTable$log2FoldChange < 0)] = "DN"
rTable = data.frame(gene, treatment_TPM, control_TPM, rTable[, c("log2FoldChange", "pvalue", "padj")], DGE)
head(rTable)
write.table(rTable, file = paste0(outFolder, "/WPS_Cancer_vs_Normal_DESeq2.txt"),
row.names = FALSE, col.names = TRUE, sep = "\t")
rTable$LogFDR = -log10(rTable$padj)
p <- ggplot(rTable, aes(x = log2FoldChange, y = LogFDR, color = DGE)) + xlim(-10, 10)
p <- p + theme_bw() + labs(x = "Log2 fold-change", y = "-log10(FDR)", title = "")
p <- p + geom_point(size = 0.1) + theme(plot.title = element_text(hjust = 0.5))
p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + scale_color_manual(values = c("UP" = "red", "DN" = "dodgerblue4", "NC" = "grey"))
p <- p + theme(legend.title=element_blank())
p
由于Cancer和Normal之间的巨大差异,相当大一部分基因在这两组之间存在差异表达(FDR < 5%)。
sessionInfo()