本文描述了bulk RNA-seq分析流程中GSEA(Gene Set Enrichment Analysis)富集子模块,该模块可以得到排序基因列表在基因集中的富集情况,可用来评估特定的基因集与表型或处理之间的关联性。
进行GSEA分析至少需要两个输入文件:(1)全基因组基因排序文件。一般可以根据差异表达分析的P-value来排序,当样本数比较少的时候,可以根据表达变化Log2 Flold-change来排序。(2)基因集。可以自定义,也可以从MSigDB数据库提取。
%load_ext rpy2.ipython
%%R
suppressPackageStartupMessages(library("msigdbr"))
suppressPackageStartupMessages(library("GSEABase"))
DGEtable = "RNAseq/DGE/WPS_Cancer_vs_Normal_DESeq2.txt"
outFolder = "RNAseq/GSEA"
gsName = "HALLMARK_MYC_TARGETS_V2"
outPrefix = "WPS"
if(!dir.exists(outFolder))
dir.create(outFolder)
write.gmt <- function(gsList, gmt_file){
sink(gmt_file)
for (i in 1:length(gsList)){
cat(names(gsList)[i])
cat('\tNA\t')
cat(paste(gsList[[i]],collapse = '\t'))
cat('\n')
}
sink()
}
%%R
Tx = read.table(file = DGEtable, row.names = 1, header = TRUE, sep = "\t")
head(Tx)
treatment_TPM control_TPM log2FoldChange ENSG00000000003.15 TSPAN6 20.04000 11.7700 0.9682494 ENSG00000000005.6 TNMD 0.03598 0.2433 -1.9451103 ENSG00000000419.12 DPM1 73.99000 48.8200 0.8308568 ENSG00000000457.14 SCYL3 7.25400 7.1880 0.1868544 ENSG00000000460.17 C1orf112 10.13000 3.3990 1.7626303 ENSG00000000938.13 FGR 7.99000 65.7600 -2.7090187 pvalue padj DGE ENSG00000000003.15 TSPAN6 1.887758e-27 1.549429e-26 UP ENSG00000000005.6 TNMD 6.022261e-06 1.476815e-05 DN ENSG00000000419.12 DPM1 2.719630e-30 2.509226e-29 UP ENSG00000000457.14 SCYL3 5.718560e-03 1.003688e-02 UP ENSG00000000460.17 C1orf112 1.641644e-110 1.511235e-108 UP ENSG00000000938.13 FGR 2.587081e-56 5.884496e-55 DN
%%R
Tx <- Tx[!is.na(Tx$pvalue), ]
Tx <- Tx[!is.na(Tx$log2FoldChange), ]
ID <- rownames(Tx)
ENSG <- sapply(strsplit(ID, " "), function(z) z[1])
Symbol <- sapply(strsplit(ID, " "), function(z) z[2])
pvalue <- as.numeric(Tx$pvalue)
pvalue[pvalue < 10^-300] <- 10^-300
zscore <- qnorm(pvalue/2, mean = 0, sd = 1, lower.tail = FALSE, log.p = FALSE)*sign(Tx$log2FoldChange)
oTable <- data.frame(ENSG, zscore)
rnkFile <- paste0(outFolder, "/", outPrefix,".rnk")
write.table(oTable, file = rnkFile, row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE)
head(oTable)
ENSG zscore 1 ENSG00000000003.15 10.855015 2 ENSG00000000005.6 -4.525606 3 ENSG00000000419.12 11.437384 4 ENSG00000000457.14 2.763497 5 ENSG00000000460.17 22.336278 6 ENSG00000000938.13 -15.811625
%%R
chip = data.frame(ENSG, Symbol, Symbol)
colnames(chip) = c("Probe Set ID", "Gene Symbol", "Gene Title")
chipFile = paste0(outFolder, "/", outPrefix, ".chip")
write.table(chip, file = chipFile, row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
head(chip)
Probe Set ID Gene Symbol Gene Title 1 ENSG00000000003.15 TSPAN6 TSPAN6 2 ENSG00000000005.6 TNMD TNMD 3 ENSG00000000419.12 DPM1 DPM1 4 ENSG00000000457.14 SCYL3 SCYL3 5 ENSG00000000460.17 C1orf112 C1orf112 6 ENSG00000000938.13 FGR FGR
%%R
m_df = msigdbr(species = "Homo sapiens", category = "H")
gmtFile = paste0(outFolder, "/", outPrefix, ".gmt")
gsList = split(m_df$human_gene_symbol, factor(m_df$gs_name))[gsName]
write.gmt(gsList, gmtFile)
%%R
cmd = "java -cp /sibcb2/bioinformatics/software/GSEA/gsea2-2.2.4.jar xtools.gsea.GseaPreranked"
cmd = paste(cmd, "-gmx", gmtFile)
cmd = paste(cmd, "-rnk", rnkFile)
cmd = paste(cmd, "-chip", chipFile)
cmd = paste(cmd, "-rpt_label", outPrefix)
cmd = paste(cmd, "-out", outFolder)
cmd = paste(cmd, "-scoring_scheme weighted -collapse true -mode Max_probe -norm None ")
cmd = paste(cmd, "-nperm 1000 -include_only_symbols true")
cmd = paste(cmd, "-make_sets true -plot_top_x 20 -rnd_seed timestamp -set_max 500 -set_min 15 -zip_report false -gui false")
system(cmd)
GSEA运行会产生report文件,是网页的形式,非常直观,其中的图可以直接用于文章的发表。
这个图示PNG格式,网上有个很方便的工具将分析的结果转成PDF高清格式。
%%R
cmd = paste("/sibcb2/bioinformatics/software/BcbioNG/anaconda/bin/gseapy replot -i RNAseq/GSEA/WPS.GseaPreranked.1598414006189 -o RNAseq/GSEA/WPS.GseaPreranked.1598414006189")
system(cmd)
重新产生的图是PDF格式的,可以使用Adobe Illustrator编辑。