本文描述了bulk RNA-seq分析流程中数据汇集子模块,该模块收集个样本质量控制及基因定量的信息,是后续差异表达分析、聚类、分类、功能富集分析的基础。该模块用R语言编写。
如下图所示,该模块汇集了两个方面的信息:质量控制和基因定量。质量控制主要来自于STAR比对及后续的RNASeQC分析,转录本定量来自Kallisto。
suppressWarnings(library("ggplot2"))
suppressWarnings(library("ggpubr"))
suppressWarnings(library("tximport"))
suppressWarnings(library("reshape2"))
tx2g = "/sibcb2/bioinformatics/iGenome/STAR/GENCODE/human_hg38/ID/tx2g.txt"
runFolder = "/sibcb2/bioinformatics/WPS/Run_RNAseq/workflow/"
outFolder = "RNAseq/DataSummary"
studyTag = "WPS"
if(!dir.exists(outFolder))
dir.create(outFolder)
SRX = list.files(runFolder)
Ns = length(SRX)
cM = matrix(NA, 4, Ns)
for(i in 1:Ns){
tag = SRX[i]
fileIn = paste0(runFolder, "/", tag, "/STAR/", tag, "_STAR_QC.txt")
text = readLines(fileIn)
totalReads = as.numeric(gsub("\t", "", strsplit(grep("Number of input reads", text, value = TRUE), "\\|")[[1]][2]))
umapReads = as.numeric(gsub("\t", "", strsplit(grep("Uniquely mapped reads number", text, value = TRUE), "\\|")[[1]][2]))
multiReads = as.numeric(gsub("\t", "", strsplit(grep("Number of reads mapped to multiple loci", text, value = TRUE), "\\|")[[1]][2]))
chimericReads = as.numeric(gsub("\t", "", strsplit(grep("Number of chimeric reads", text, value = TRUE), "\\|")[[1]][2]))
unmappedReads = totalReads - umapReads - multiReads - chimericReads
cM[,i] = c(umapReads, multiReads, chimericReads, unmappedReads)
}
dimnames(cM) = list(c("unique_mapping", "multi_mapping", "chimericReads", "unmapped"), SRX)
write.table(cM, file = paste0(outFolder,"/Alignment_summary.txt"), row.names = TRUE, col.names = TRUE, sep = "\t")
合并的结果保存在Alignment_summary.txt文件中,其中的内容示例如下:
t(cM[,1:8])
mM = matrix(NA, 3, Ns)
for(i in 1:Ns){
tag = SRX[i]
fileIn = paste0(runFolder, "/", tag, "/report/", tag, ".bam.metrics.tsv")
text = readLines(fileIn)
ExonicReads = as.numeric(gsub("\t", "", strsplit(grep("Exonic Reads", text, value = TRUE), "\t")[[1]][2]))
IntronicReads = as.numeric(gsub("\t", "", strsplit(grep("Intronic Reads", text, value = TRUE), "\t")[[1]][2]))
IntergenicReads = as.numeric(gsub("\t", "", strsplit(grep("Intergenic Reads", text, value = TRUE), "\t")[[1]][2]))
mM[,i] = c(ExonicReads, IntronicReads, IntergenicReads)
}
dimnames(mM) = list(c("ExonicReads", "IntronicReads", "IntergenicReads"), SRX)
write.table(cM, file = paste0(outFolder,"/RNASeQC_summary.txt"), row.names = TRUE, col.names = TRUE, sep = "\t")
合并的结果保存在RNASeQC_summary.txt文件中,其中的内容示例如下:
t(mM[,1:8])
files = paste0(runFolder, "/", SRX, "/kallisto/", SRX, ".tsv")
names(files) = SRX
tx2gene = read.table(file = tx2g, row.names = NULL, header = TRUE, sep = "\t")
# txi <- tximport(files, type = "kallisto", tx2gene = tx2gene, ignoreAfterBar = FALSE)
geneString = unique(paste(tx2gene$ENSG, tx2gene$Symbol))
geneAnn = data.frame(geneString)
rownames(geneAnn) = sapply(strsplit(geneString, " "), function(z) z[1])
findType = function(z) paste0(sort(unique(z)), collapse = " ")
aggT = aggregate(tx2gene$mType, by = list(tx2gene$ENSG),FUN="findType")
rownames(aggT) = as.character(aggT[,1])
geneAnn$mType = aggT[rownames(geneAnn),2]
geneAnn = geneAnn[rownames(txi$abundance),]
基因定量的结果保存在txi中:
names(txi)
txi$abundance[1:4, 1:4]
基因注释保存在geneAnn中:
geneAnn[1:4, ]
assign("WPS_raw", list(geneAnn = geneAnn, txi = txi))
save(list = "WPS_raw", file = paste0(outFolder, "/WPS_raw.RData"))