作者:中国科学院分子细胞科学卓越创新中心生物信息平台
本文描述了bulk RNA-seq分析流程中基因调控网络子模块,该模块可以得到转录因子与靶基因之间的调控网络关系。
该模块需要输入两方面的信息,利用基因表达量和转录因子列表来完成调控网络分析,表达量可用来自于DataSummary模块生成的raw.RData文件,转录因子列表需要自行提供,如下图所示。
%load_ext rpy2.ipython
%%R
rawRdFile = "RNAseq/DataSummary/WPS_raw.RData"
samplesheet = "sampleSheet/LUSC_samplesheet_pass.txt"
outFolder = "RNAseq/ARACNe"
tfFile = "RNAseq/ARACNe/VIPER_Regulators.txt"
studyTag = "WPS"
if(!dir.exists(outFolder))
dir.create(outFolder)
%%R
sampleT = read.table(file = samplesheet, row.names = 1, header = TRUE, sep = "\t")
head(sampleT)
%%R
load(rawRdFile)
assign("raw", WPS_raw)
TPM = log2(raw$txi$abundance + 1)
geneAnn = raw$geneAnn
Symbol = sapply(strsplit(as.character(geneAnn[rownames(TPM), "geneString"]), " "), function(z) z[2])
aggT = aggregate(TPM, by = list(factor(Symbol)), FUN="mean")
ExM = as.matrix(aggT[, -1])
rownames(ExM) = as.character(aggT[,1])
kSamples = intersect(colnames(ExM), rownames(sampleT))
ExM = ExM[, kSamples]
ExM[1:4, 1:4]
一般需要对基因进行一定的筛选,保留变化大的基因
%%R
Idx1 = apply(ExM, 1, max) - apply(ExM, 1, min) > 1
sdValue = apply(ExM, 1, sd)
Idx2 = sdValue > quantile(sdValue, 0.25)
ExS = signif(ExM[Idx1 & Idx2, ], 4)
gene = rownames(ExS)
eFile = paste0(outFolder, "/", studyTag, "_expression.txt")
write.table(data.frame(gene, ExS), file = eFile, row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
dim(ExS)
首先计算当前样本量下MI阈值。
%%R
Res = paste0(outFolder, "/Res")
Log = paste0(outFolder, "/Log")
if(!dir.exists(Res))
dir.create(Res)
if(!dir.exists(Log))
dir.create(Log)
bin = "java -Xmx16g -jar /sibcb2/bioinformatics/software/ARACNe-AP/Aracne.jar"
cmd = paste(bin, "-e", eFile, "-o", Res, "--tfs", tfFile, "--pvalue 1E-8 --seed 1 --calculateThreshold\n")
system(cmd)
进行100次网络构建,每次采用不同的起始参数。
%%R
for(i in 2:100){
cmd = paste(bin, "-e", eFile, "-o", Res, "--tfs", tfFile, "--pvalue 1E-8 --seed", i)
jobName = paste0("run_",i)
outFile = paste0(Log,"/", jobName, ".out")
errFile = paste0(Log,"/", jobName, ".out")
submitCMD = paste0("echo ", "\"", cmd, "\"")
submitCMD = paste(submitCMD, "| qsub -N", jobName, "-cwd")
submitCMD = paste(submitCMD, "-o", outFile, "-e", errFile)
system(submitCMD)
#cat(submitCMD)
}
合并100次bootstrap的结果。
%%R
cmd = paste(bin, "-o", Res, "--consolidate\n")
system(cmd)
生成的Network如下所示:
%%R
network = read.table(file = "RNAseq/ARACNe/Res/network.txt", row.names = NULL, header = TRUE, sep = "\t")
head(network)
我们会有一个独立的文档来说明如何利用网络进行整合分析。