loading required packages:
# Input CpGs
data_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/Methyl_bdg/"
samples=list.files(data_path,pattern="Count_CpG.bedGraph.gz")
# hg19 CGI
session <- browserSession()
genome(session) <- "hg19"
query <- ucscTableQuery(session, table = "cpgIslandExt")
CpGislands.gr <- track(query)
genome(CpGislands.gr) <- NA
# Remove CGI +/-5K CpGs
CpGislands.gr <-suppressWarnings(resize(CpGislands.gr, 5000, fix="center"))
for ( i in samples){
# Load Methylation
x <- fread(paste0(data_path,i)) %>% toGRanges()
names(mcols(x)) = c("M","Um")
mcols(x)[,"T"] = mcols(x)[,1] + mcols(x)[,2]
ranges(x) <- end(x)
mcols(x) <- mcols(x)[,c("T","M")]
tag <- gsub("_Merged_Count_CpG.bedGraph","",i)
# PMD
PMDsegments<-segmentPMDs(m=x, chr.sel="chr22",seqLengths=sLengths,num.cores=10)
# FDR cut-off
stats <- suppressWarnings(calculateFDRs(m=x, CGIs=CpGislands.gr,
PMDs=PMDsegments, num.cores=10))
FDR.cutoff <- 5
m.sel <- 0.5
n.sel=as.integer(names(stats$FDRs[as.character(m.sel), ]
[stats$FDRs[as.character(m.sel), ]<FDR.cutoff])[1])
# UMR LMR
UMRLMRsegments <- segmentUMRsLMRs(m=x, meth.cutoff=m.sel,
nCpG.cutoff=n.sel, PMDs=PMDsegments,
num.cores=10, myGenomeSeq=Hsapiens,minCover=5,
seqLengths=sLengths)
# Saving
# PMD
write.table(as.data.frame(PMDsegments[PMDsegments$type=="PMD"])[c(1:3)],file=paste0("res/",tag,"_PMDs.txt"),
sep="\t",quote=F,col.names=F,row.names=F)
# LMR & UMR
write.table(granges(UMRLMRsegments[UMRLMRsegments$type=="LMR",]),file=paste0("res/",tag,"_LMRs.txt"),
sep="\t",quote=F,col.names=F,row.names=F)
write.table(granges(UMRLMRsegments[UMRLMRsegments$type=="UMR",]),file=paste0("res/",tag,"_UMRs.txt"),
sep="\t",quote=F,col.names=F,row.names=F)
}
cancer_types <- list.files("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/") %>%
str_split_i(pattern="[_.]",i=2) %>% setdiff(c("OV","PACA"))
mn = list()
for ( sample in cancer_types ){
mhb_peak = fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/MHB_",sample,".bed.gz")) %>% toGRanges()
mhb_tag = sample
# Read the TCGA ATAC-seq Peaks
ATAC_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ATAC/"
if (mhb_tag == "BRCA"){
i = "BRCA_hg19_peak.bed.gz"
}else if(mhb_tag == "CESC"){
i = "CESC_hg19_peak.bed.gz"
}else if (mhb_tag == "COAD"){
i = "COAD_hg19_peak.bed.gz"
}else if (mhb_tag == "ESCA"){
i = "ESCA_hg19_peak.bed.gz"
}else if (mhb_tag =="HNSC"){
i = "HNSC_hg19_peak.bed.gz"
}else if (mhb_tag == "LIHC"){
i = "LIHC_hg19_peak.bed.gz"
}else if(mhb_tag == "NSCLC"){
i = "LUAD_hg19_peak.bed.gz"
}else if (mhb_tag == "STAD"){
i = "STAD_hg19_peak.bed.gz"
}else if(mhb_tag == "THCA"){
i = "THCA_hg19_peak.bed.gz"
}
tag = strsplit(i,"_")[[1]][1]
# ATAC-seq peaks
assign(tag,fread(paste0(ATAC_path,i)) %>% toGRanges())
# Cancer specific MHB
x=findOverlaps(get(tag),mhb_peak,type = "any", ignore.strand = TRUE)
# Overlapped
cM=length(unique(subjectHits(x)))
# total MHB
cL=length(mhb_peak)
m = round(100*cM/cL,2)
mn[[tag]] = m
}
# add LUSC
i="LUSC_hg19_peak.bed.gz"
tag = strsplit(i,"_")[[1]][1]
assign(tag,fread(paste0(ATAC_path,i)) %>% toGRanges())
x=findOverlaps(get(tag),mhb_peak,type = "any", ignore.strand = TRUE)
cM=length(unique(subjectHits(x)))
cL=length(mhb_peak)
m = round(100*cM/cL,0)
mn[[tag]] = m
MHB_Ov_ATAC=as.data.frame(do.call(rbind,mn))
MHB_Ov_ATAC$type = rownames(MHB_Ov_ATAC)
MHB_Ov_ATAC = MHB_Ov_ATAC[order(rownames(MHB_Ov_ATAC)),]
# PLOT
p<-ggplot(MHB_Ov_ATAC,aes(type,V1)) +
geom_bar(stat="identity",fill="steelblue",width=0.8)+
scale_y_continuous(expand=c(0,0),limits=c(0,80))+
labs(y="The Frequency of Cancer-type MHBs\n Overlap with TCGA ATAC peaks (%)")+
theme_bw() +
theme(panel.grid= element_blank(),panel.background=element_blank(),
panel.border=element_blank(),axis.line=element_line(color="black"),
axis.title.x=element_blank(),
axis.text.x = element_text(color="black",size=10,angle=45,vjust=1,hjust=1),
axis.text.y = element_text(color="black",size=10)) +
geom_text(aes(label=paste0(V1,"%")), vjust=-0.3, size=3.5,color="black")
print(p)
cancer_type = list.files("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/")
mn =list()
for (i in cancer_type ){
mhb = fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",i)) %>% toGRanges()
mhb_tag = i %>% str_split_i(pattern="[_.]",i=2)
# Read the cancer type PMD HMR LMR UMR regions
MR_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments/"
if (mhb_tag == "BRCA"){
tag = "breast_T"
}else if(mhb_tag == "CESC"){
tag = "cervix_T"
}else if (mhb_tag == "COAD"){
tag = "colon_T"
}else if (mhb_tag == "ESCA"){
tag = "esophagus_T"
}else if (mhb_tag =="HNSC"){
tag = "head_and_neck_T"
}else if (mhb_tag == "LIHC"){
tag = "liver_T"
}else if(mhb_tag == "NSCLC"){
tag = "lung_T"
}else if (mhb_tag == "STAD"){
tag = "stomach_T"
}else if(mhb_tag == "THCA"){
tag = "thyroid_T"
}else if(mhb_tag == "OV"){
tag = "ovary_T"
}else if(mhb_tag == "PACA"){
tag = "pancreas_T"
}
# Genomic feature regions
GFR = fread(paste0(MR_path,tag,"_genomic_segments.bed.gz")) %>% toGRanges()
# Cancer-type MHB overlaps
# PMD
x1=findOverlaps(subset(GFR,V4=="PMD"),mhb,type = "any", ignore.strand = TRUE)
m1=length(unique(subjectHits(x1)))
# HMR
x2=findOverlaps(subset(GFR,V4=="HMR"),mhb,type = "any", ignore.strand = TRUE)
m2=length(unique(subjectHits(x2)))
# LMR
x3=findOverlaps(subset(GFR,V4=="LMR"),mhb,type = "any", ignore.strand = TRUE)
m3=length(unique(subjectHits(x3)))
# UMR
x4=findOverlaps(subset(GFR,V4=="UMR"),mhb,type = "any", ignore.strand = TRUE)
m4=length(unique(subjectHits(x4)))
m = data.frame(freq = c(m1,m2,m3,m4)/sum(m1+m2+m3+m4))
rownames(m) = c("PMD","HMR","LMR","UMR")
names(m) = tag
mn[[tag]] = m
}
res = do.call(cbind,mn) %>% as.data.frame() %>% rownames_to_column(var="segments") %>%
pivot_longer(!segments,names_to="variable",values_to="value")
# PLOT
p<-ggplot(res,aes(x=variable,y=value*100,fill=factor(segments,level=c("HMR","PMD","LMR","UMR"))))+
geom_bar(stat="identity",position="stack")+
scale_fill_brewer(palette = "Set1") +
ggtitle("The Genomic distribution of Cancer-type MHBs")+
labs(x="",y="Percent (%)") +
theme_bw()+
theme(panel.grid = element_blank(),panel.background=element_blank(),
axis.ticks.length=unit(5,'pt'),axis.line=element_line(color="black"),
axis.title.x=element_blank(),
axis.text.x = element_text(color="black",size=10,angle=45,vjust=1,hjust=1),
axis.text.y = element_text(color="black",size=10),
axis.ticks = element_line(color="black")) +
guides(fill=guide_legend(title=NULL))
print(p)
mhb = "/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/"
background="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz"
files = list.files(mhb)
# LOLA enrichment
locResults = lapply(files,function(i){
# Input
mhb=fread(paste0(mhb,i)) %>% toGRanges()
# Universe_Sets Background
universe_set=fread(background) %>% toGRanges()
# Genomic features
states=loadRegionDB("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig2/genomic_segments/LOLA")
# Runing LOLA
runLOLA(mhb, universe_set, states, cores=10)
})
names(locResults) <-files
# enrichment output
res = lapply(files,function(i){
mx = locResults[[i]][,c("qValue","filename")]
mx$Type = i
mx
})
data <- as.data.frame(do.call(rbind,res))
# filter FDR < 0.01
data <- data %>% mutate(qValue=ifelse(qValue<=0.01,qValue,NA),qValue=-log10(qValue)) %>%
pivot_wider(names_from=filename,values_from=qValue)
# significance ranking
rank_t = function(x) {
m = rank(array(-x),na.last="keep",ties.method = "min")
return(m)
}
# UMR
umr <- as.data.frame(t(apply(data %>% select(contains("UMR")),1,rank_t)))
rownames(umr) <- files
names(umr) <- data %>% select(contains("UMR")) %>% names()
umr <- umr %>% select(sort(names(.)))
# LMR
lmr <- as.data.frame(t(apply(data %>% select(contains("LMR")),1,rank_t)))
rownames(lmr) <- files
names(lmr) <- data %>% select(contains("LMR")) %>% names()
lmr <- lmr %>% select(sort(names(.)))
# PMD
pmd <- as.data.frame(t(apply(data %>% select(contains("PMD")),1,rank_t)))
rownames(pmd) <- files
names(pmd) <- data %>% select(contains("PMD")) %>% names()
pmd <- pmd %>% select(sort(names(.)))
# HMR
hmr <- as.data.frame(t(apply(data %>% select(contains("HMR")),1,rank_t)))
rownames(hmr) <- files
names(hmr) <- data %>% select(contains("HMR")) %>% names()
hmr <- hmr %>% select(sort(names(.)))
# PLOT LMR, PMD
p1<- pheatmap(lmr,
main="Cancer type MHBs Overlap with LMRs",
show_colnames=T,show_rownames=T,
cluster_rows=F,cluster_cols=F,
scale="none",angle_col=45,
color=colorRampPalette(colors = c("firebrick","white","darkblue"))(1000))
print(p1)
p2<- pheatmap(pmd,
main="Cancer type MHBs Overlap with PMDs",
show_colnames=T,show_rownames=T,
cluster_rows=F,cluster_cols=F,
scale="none",angle_col=45,
color=colorRampPalette(colors = c("firebrick","white","darkblue"))(1000))
print(p2)
# Cancer Type ATAC-seq Peak
ATAC_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ATAC/"
# Genomic segments
segments_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments"
# CpG sites
hg19_CpG="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments/hg19_CpG.bed"
for i in `ls ${segments_path}/*_T_MHB_genomic_segments.bed`
do
a=${i##*/}
if [[ "${a}" == *"breast"* ]];then
tag="BRCA_hg19_peak.bed"
elif [[ "${a}" == *"cervix"* ]];then
tag="CESC_hg19_peak.bed"
elif [[ "${a}" == *"colon"* ]];then
tag="COAD_hg19_peak.bed"
elif [[ "${a}" == *"esophagus"* ]];then
tag="ESCA_hg19_peak.bed"
elif [[ "${a}" == *"head_and_neck"* ]];then
tag="HNSC_hg19_peak.bed"
elif [[ "${a}" == *"lung"* ]];then
tag="LUAD_hg19_peak.bed"
elif [[ "${a}" == *"liver"* ]];then
tag="LIHC_hg19_peak.bed"
elif [[ "${a}" == *"stomach"* ]];then
tag="STAD_hg19_peak.bed"
elif [[ "${a}" == *"thyroid"* ]];then
tag="THCA_hg19_peak.bed"
fi
# MHB LMR UMR PMD HMR
less ${i} |grep MHB | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a - -b \
${ATAC_path}/${tag} > ${tag%%_*}_segments_ATAC_MHB_coverage.bed
less ${i} |grep LMR | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a - -b \
${ATAC_path}/${tag} > ${tag%%_*}_segments_ATAC_LMR_coverage.bed
less ${i} |grep UMR | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a - -b \
${ATAC_path}/${tag} > ${tag%%_*}_segments_ATAC_UMR_coverage.bed
less ${i} |grep PMD | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a - -b \
${ATAC_path}/${tag} > ${tag%%_*}_segments_ATAC_PMD_coverage.bed
less ${i} |grep HMR | bedtools intersect -a ${hg19_CpG} -b - | bedtools coverage -a - -b \
${ATAC_path}/${tag} > ${tag%%_*}_segments_ATAC_HMR_coverage.bed
done
# add LUSC
less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep MHB | bedtools intersect -a ${hg19_CpG} -b - | \
bedtools coverage -a - -b ${ATAC_path}/LUSC_hg19_peak.bed >LUSC_segments_ATAC_MHB_coverage.bed
less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep LMR | bedtools intersect -a ${hg19_CpG} -b - | \
bedtools coverage -a - -b ${ATAC_path}/LUSC_hg19_peak.bed > LUSC_segments_ATAC_LMR_coverage.bed
less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep UMR | bedtools intersect -a ${hg19_CpG} -b - | \
bedtools coverage -a - -b ${ATAC_path}/LUSC_hg19_peak.bed > LUSC_segments_ATAC_UMR_coverage.bed
less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep PMD | bedtools intersect -a ${hg19_CpG} -b - | \
bedtools coverage -a - -b ${ATAC_path}/LUSC_hg19_peak.bed > LUSC_segments_ATAC_PMD_coverage.bed
less ${segments_path}/lung_T_MHB_genomic_segments.bed |grep HMR | bedtools intersect -a ${hg19_CpG} -b - | \
bedtools coverage -a - -b ${ATAC_path}/LUSC_hg19_peak.bed > LUSC_segments_ATAC_HMR_coverage.bed The Genomic segments overlap with TCGA ATAC-seq peaks in CpG base pairs level
path="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig2/ATAC_segment_coverage/"
samples=list.files(path,pattern="_MHB_coverage.bed")
# Methylation
beta_path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/Methyl_bdg/"
CpG_segment_ATAC_rate<-lapply(samples,function(x){
if(str_detect(x,"BRCA")){
beta<-paste0(beta_path,"breast_T_Merged_Count_CpG.bedGraph.gz")
}else if(str_detect(x,"CESC")){
beta<-paste0(beta_path,"cervix_T_Merged_Count_CpG.bedGraph.gz")
} else if(str_detect(x,"COAD")){
beta<-paste0(beta_path,"colon_T_Merged_Count_CpG.bedGraph.gz")
} else if(str_detect(x,"ESCA")){
beta<-paste0(beta_path,"esophagus_T_Merged_Count_CpG.bedGraph.gz")
} else if(str_detect(x,"HNSC")){
beta<-paste0(beta_path,"head_and_neck_T_Merged_Count_CpG.bedGraph.gz")
} else if(str_detect(x,"LIHC")){
beta<-paste0(beta_path,"liver_T_Merged_Count_CpG.bedGraph.gz")
} else if(str_detect(x,"LUAD")){
beta<-paste0(beta_path,"lung_T_Merged_Count_CpG.bedGraph.gz")
} else if(str_detect(x,"LUSC")){
beta<-paste0(beta_path,"lung_T_Merged_Count_CpG.bedGraph.gz")
} else if(str_detect(x,"STAD")){
beta<-paste0(beta_path,"stomach_T_Merged_Count_CpG.bedGraph.gz")
} else if(str_detect(x,"THCA")){
beta<-paste0(beta_path,"thyroid_T_Merged_Count_CpG.bedGraph.gz")
}
# read mhb data
mhb_coverage<- fread(paste0(path,x)) %>%
select(c(1:3)) %>% mutate(MHB="MHB")
# Genomic features were divided into MHB overlapping or not
gf <- c("LMR","UMR","PMD","HMR")
Genomic_coverage <- lapply(gf, function(m) {
tag<-str_replace_all(x,"MHB",m)
fread(paste0(path,tag)) %>% select(c(1:3,6,7)) %>% mutate(Type=m) %>%
left_join(mhb_coverage,by=c("V1","V2","V3")) %>%
mutate(MHB = ifelse(is.na(MHB),paste0(m,"_NonMHB"),paste0(m,"_MHB")))
})
coverage <- do.call(rbind,Genomic_coverage) %>% as.data.frame()
MM <- fread(beta) %>% transmute(V1=V1,V2=V2,V3=V3,V4=ifelse(V4+V5>=10,round(V4/(V4+V5),2),NA))
# merge
left_join(coverage,MM,by=intersect(names(coverage),names(MM))) %>% drop_na()
})
names(CpG_segment_ATAC_rate) <- samples %>% str_split_i(pattern="_",i=1)
# In cancer type
Cancer_segment_ATAC<- lapply(names(CpG_segment_ATAC_rate),function(i){
CpG_segment_ATAC_rate[[i]] %>% dplyr::rename("olen"="V6","tlen"="V7","Beta"="V4") %>% arrange(Beta)
} )
names(Cancer_segment_ATAC) <- names(CpG_segment_ATAC_rate)
# All the Cancer merged
Tx=do.call(rbind,Cancer_segment_ATAC)
Tx$Tag = sapply(strsplit(rownames(Tx),"\\."),function(x) x[1])
### HMR
tHMR = Tx[Tx$Type == "HMR",] %>% arrange(Beta)
tHMR$Cluster = paste0("cluster_",rep(1:10,each=trunc(nrow(tHMR)/10)+1))[1:nrow(tHMR)]
trHMR = as.data.frame(tHMR[c("olen","tlen","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=sum))
cluster_tHMR = as.data.frame(tHMR[c("Beta","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=median))
trHMR = inner_join(trHMR,cluster_tHMR,by=c("Cluster","MHB")) %>% mutate(Freq=100*round(olen/tlen,4))
### PMD
tPMD = Tx[Tx$Type == "PMD",] %>% arrange(Beta)
tPMD$Cluster = paste0("cluster_",rep(1:10,each=trunc(nrow(tPMD)/10)+1))[1:nrow(tPMD)]
trPMD = as.data.frame(tPMD[c("olen","tlen","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=sum))
cluster_tPMD = as.data.frame(tPMD[c("Beta","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=median))
trPMD = inner_join(trPMD,cluster_tPMD,by=c("Cluster","MHB")) %>% mutate(Freq=100*round(olen/tlen,4))
### LMR
tLMR = Tx[Tx$Type == "LMR",] %>% arrange(Beta)
tLMR$Cluster = paste0("cluster_",rep(1:10,each=trunc(nrow(tLMR)/10)+1))[1:nrow(tLMR)]
trLMR = as.data.frame(tLMR[c("olen","tlen","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=sum))
cluster_tLMR = as.data.frame(tLMR[c("Beta","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=median))
trLMR = inner_join(trLMR,cluster_tLMR,by=c("Cluster","MHB")) %>% mutate(Freq=100*round(olen/tlen,4))
###UMR
tUMR = Tx[Tx$Type == "UMR",] %>% arrange(Beta)
tUMR$Cluster = paste0("cluster_",rep(1:10,each=trunc(nrow(tUMR)/10)+1))[1:nrow(tUMR)]
trUMR = as.data.frame(tUMR[c("olen","tlen","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=sum))
cluster_tUMR = as.data.frame(tUMR[c("Beta","Cluster","MHB")] %>% group_by(Cluster,MHB) %>% summarise_each(funs=median))
trUMR = inner_join(trUMR,cluster_tUMR,by=c("Cluster","MHB")) %>% mutate(Freq=100*round(olen/tlen,4))
TMR= do.call(rbind,list("UMR"=trUMR,"LMR"=trLMR,"PMD"=trPMD,"HMR"=trHMR))
TMR$Segment = factor(sapply(strsplit(rownames(TMR),"\\."),function(x) x[1]),levels=c("UMR","LMR","PMD","HMR"))
save(TMR,file="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig2/Genomic_segments_ATAC_MHB.RData")
# load pre-processed data
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig2/Genomic_segments_ATAC_MHB.RData")
p<- TMR %>% mutate(Type=str_split_i(MHB,pattern="_",i=2),Segment=fct_inorder(Segment)) %>%
ggplot(aes(x=Beta,y=Freq,color=Type)) +
geom_smooth(aes(color=Type),se=F,linewidth=0.8) +
scale_y_continuous(expand=c(0,0),limits=c(0,80))+
scale_x_continuous(limits=c(0,1))+
scale_color_manual(values=c("#df8f44","#00a1d5")) +
labs(x="Mean Methylation",y="The Percentage of CpGs covered by TCGA ATAC-seq Peaks(%)",
title="Pan-Cancer") +
theme_bw()+
theme(panel.grid = element_blank(),panel.background = element_blank(),
axis.ticks.length=unit(5,'pt'), axis.line=element_line(color="black"),
plot.title = element_text(hjust=0.5,vjust=0.5),
strip.background.x = element_rect(color="black",fill="gray", size=0.8, linetype="solid")) +
geom_vline(aes(xintercept = 0.05),colour="black",linetype="dashed") +
facet_wrap(~Segment,scales="fixed")
print(p)
Enrichment of MHBs in ChIA-PET by permutation test, the ChIApet_enrichment.R script can be downloaded from here.
Rscript ChIApet_enrichment.R \
--forward MCF-7_BRCA_ChIA_PET_Forward.bed \
--reverse MCF-7_BRCA_ChIA_PET_Reverse.bed \
--genome hg19.chrom.sizes \
--testBED BRCA_MHB \
--nTimes 1000 \
--tmp ./ \
--outFile BRCA_MHB_ChIA_enrichment.txt
done# Enrichment score ChIA-PET MCF7/MCF-10A
# The scores are estimated by the fold enrichment in Pol II ChIA-PET of between MHB and random regions.
MCF7_N=28.17;MCF7_T=40.06
MCF10A_N=60;MCF10A_T=53.62
data = data.frame(MCF7=c(MCF7_N,MCF7_T),MCF10A=c(MCF10A_N,MCF10A_T)) %>%
mutate(Type=factor(c("Normal_MHB","Tumor_MHB"),levels=c("Tumor_MHB","Normal_MHB"))) %>%
pivot_longer(!Type,names_to="variable",values_to="value")
p<-ggplot(data,aes(x=variable,y=value,fill=Type))+
geom_bar(stat="identity", position=position_dodge()) +
scale_fill_brewer(palette="Set1") +
labs(y="Enrichment Score")+
theme_bw()+
theme(panel.grid= element_blank(),panel.background=element_blank(),
panel.border=element_blank(),axis.line=element_line(color="black"),
axis.title.x=element_blank(),axis.text.x = element_text(color="black",size=10,
angle=45,vjust=1,hjust=1),
axis.text.y = element_text(color="black",size=10),
axis.ticks = element_line(color="black"),axis.ticks.length = unit(5, "pt"))+
geom_text(aes(label=value), vjust=-0.3, size=3.5,color="black")
print(p)
# BRCA Genomic Features LMR/UMR/PMD/HMR
GF_BRCA <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments/breast_T_genomic_segments.bed.gz") %>%
toGRanges()
# CpG sites
CpG <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig2/genomic_segments/hg19_CpG.bed") %>% toGRanges()
Tx <- findOverlaps(CpG,GF_BRCA,type="any",ignore.strand=T)
GF <- CpG[queryHits(Tx)]
mcols(GF)$Type <- mcols(GF_BRCA[subjectHits(Tx)])$V4
# ChIA-PET BRCA
# forward
Chia_pet_BRCA_F <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ChIA-PET/MCF-7_ENCFF377SXL_Pol2_forward.bed.gz") %>%
toGRanges()
Td_F <- findOverlaps(Chia_pet_BRCA_F,GF,type="any",ignore.strand=T)
Chia_GF_F <- GF[subjectHits(Td_F)] %>% unique()
# reverse
Chia_pet_BRCA_R <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ChIA-PET/MCF-7_ENCFF377SXL_Pol2_reverse.bed.gz") %>%
toGRanges()
Td_R <- findOverlaps(Chia_pet_BRCA_R,GF,type="any",ignore.strand=T)
Chia_GF_R <- GF[subjectHits(Td_R)] %>% unique()
# MHB BRCA
MHB_BRCA <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/MHB_BRCA.bed.gz") %>% toGRanges()
Th <- findOverlaps(CpG,MHB_BRCA,type="any",ignore.strand=T)
MHB_CpG <- CpG[queryHits(Th)]
# fold enrichment (permutation test)
genomic_enrichment <- function(Chia_GF_F,Chia_GF_R,MHB,CpG_sites,ntimes=100) {
### input features
## Forward
T.F <- findOverlaps(MHB,Chia_GF_F,type="any",ignore.strand=T)
a = queryHits(T.F) %>% length()
## Reverse
T.R <- findOverlaps(MHB,Chia_GF_R,type="any",ignore.strand=T)
b = queryHits(T.R) %>% length()
## Observe
obs = a + b
# MHB random times: Random_fold_enrichment
xx <- seqnames(MHB) %>% as.data.frame() %>% group_by(value) %>% count() %>% dplyr::rename("chr"="value","count"="n")
chrom <- xx %>% pull(chr) %>% as.character()
RFE <- pbapply::pblapply(1:ntimes,function(times){
z <-lapply(chrom,function(y){
##chromsome counts
size <- xx %>% filter(chr==y) %>% pull(count)
CpG_sites %>% plyranges::filter(seqnames==y) %>% sample(size=size)
})
# random regions
random.MHB <- GRangesList(z) %>% unlist()
# random.MHB overlap with ChIA-PET forward & reverse
# Forward
T.RF <- findOverlaps(random.MHB,Chia_GF_F,type="any",ignore.strand=T)
a1 = queryHits(T.RF) %>% length()
# Reverse
T.RR <- findOverlaps(random.MHB,Chia_GF_R,type="any",ignore.strand=T)
b1 = queryHits(T.RR) %>% length()
# expect
exp <- a1 + b1
})
# random test : median enrichemt score
random <- unlist(RFE) %>% median()
res <- data.frame("obs"= obs,"random"=random) %>% mutate(fold_enrichment = (obs+1)/(random+1) )
return(res)
}
# Methylation features
features=c("HMR","PMD","UMR","LMR")
features <- combn(features,m=2,simplify= F) %>% c(features)
## 1. Do enrichment BRCA MHB & BRCA ChIA-PET
res1 = lapply(features,function(x){
y <- Chia_GF_F %>% plyranges::filter(Type %in% x )
z <- Chia_GF_R %>% plyranges::filter(Type %in% x )
genomic_enrichment(y,z,MHB_CpG,CpG_sites=CpG,ntimes=100)
})
names(res1) <- sapply(features,function(x){paste0(x,collapse="_")})
res.TT <- do.call(rbind,res1) %>%rownames_to_column(var="Type") %>% mutate(fold_enrichment=obs/random)
fwrite(res.TT,file="BRCA_ChIA-PET_GF_T_MHB_T_enrichment100.txt",sep="\t",quote=F,row.names=F)
# BRCA MHB & normal breast ChIA-PET
# breast nomral Genomic Features lmr/umr/pmd ...
GF_breast <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/genomic_segments/breast_N_genomic_segments.bed.gz") %>%
toGRanges()
Tx.N <- findOverlaps(CpG,GF_breast,type="any",ignore.strand=T)
GF.N <- CpG[queryHits(Tx.N)]
mcols(GF.N)$Type <- mcols(GF_breast[subjectHits(Tx.N)])$V4
# ChIA-PET normal
# forward
Chia_pet_N_F <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ChIA-PET/MCF-10A_ENCFF750BGL_Pol2_forward.bed.gz") %>%
toGRanges()
Td_NF <- findOverlaps(Chia_pet_N_F,GF.N,type="any",ignore.strand=T)
Chia_GF_NF <- GF.N[subjectHits(Td_NF)] %>% unique()
# reverse
Chia_pet_N_R <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig2/ChIA-PET/MCF-10A_ENCFF750BGL_Pol2_reverse.bed.gz") %>%
toGRanges()
Td_NR <- findOverlaps(Chia_pet_N_R,GF.N,type="any",ignore.strand=T)
Chia_GF_NR <- GF.N[subjectHits(Td_NR)] %>% unique()
## 2. Do enrichment BRCA MHB & Normal ChIA-PET
res2 = lapply(features,function(x){
y <- Chia_GF_NF %>% plyranges::filter(Type %in% x )
z <- Chia_GF_NR %>% plyranges::filter(Type %in% x )
genomic_enrichment(y,z,MHB_CpG,CpG_sites=CpG,ntimes=100)
})
names(res2) <- sapply(features,function(x){paste0(x,collapse="_")})
res.TN <- do.call(rbind,res2) %>%rownames_to_column(var="Type")
fwrite(res.TN,file="BRCA_ChIA-PET_GF_N_MHB_T_enrichment100.txt",sep="\t",quote=F,row.names=F)
# Breast normal MHB & BRCA ChIA-PET
MHB_N <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/normal/MHB_breast_normal.bed.gz") %>% toGRanges()
Th.N <- findOverlaps(CpG,MHB_N,type="any",ignore.strand=T)
MHB_CpG.N <- CpG[queryHits(Th.N)]
## 3. Do enrichment breast normal MHB & BRCA ChIA-PET
res3 <- lapply(features,function(x){
y <- Chia_GF_F %>% plyranges::filter(Type %in% x )
z <- Chia_GF_R %>% plyranges::filter(Type %in% x )
genomic_enrichment(y,z,MHB_CpG.N,CpG_sites=CpG,ntimes=100)
})
names(res3) <- sapply(features,function(x){paste0(x,collapse="_")})
res.NT <- do.call(rbind,res3) %>%rownames_to_column(var="Type")
fwrite(res.NT,file="BRCA_ChIA-PET_GF_T_MHB_N_enrichment100.txt",sep="\t",quote=F,row.names=F)
# 4. Do enrichment breast normal MHB & normal ChIA-PET
res4 = lapply(features,function(x){
y <- Chia_GF_NF %>% plyranges::filter(Type %in% x )
z <- Chia_GF_NR %>% plyranges::filter(Type %in% x )
genomic_enrichment(y,z,MHB_CpG.N,CpG_sites=CpG,ntimes=100)
})
names(res4) <- sapply(features,function(x){paste0(x,collapse="_")})
res.NN <- do.call(rbind,res4) %>%rownames_to_column(var="Type")
fwrite(res.NN,file="BRCA_ChIA-PET_GF_N_MHB_N_enrichment100.txt",sep="\t",quote=F,row.names=F)
# load pre-processed data
path="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig2/ChIA-PET/"
res.TT = fread(paste0(path,"BRCA_ChIA-PET_GF_T_MHB_T_enrichment100.txt.gz"))
res.TN = fread(paste0(path,"BRCA_ChIA-PET_GF_N_MHB_T_enrichment100.txt.gz"))
res.NT = fread(paste0(path,"BRCA_ChIA-PET_GF_T_MHB_N_enrichment100.txt.gz"))
res.NN = fread(paste0(path,"BRCA_ChIA-PET_GF_N_MHB_N_enrichment100.txt.gz"))
# PLOT
data <- rbind(res.TT,res.NT,res.TN,res.NN) %>% mutate(Clusters = rep(c("TT","NT","TN","NN"),each=10)) %>%
mutate(MHB=str_sub(Clusters,1,1),Tissue=str_sub(Clusters,2,2),
Tissue=ifelse(Tissue=="N","MCF10A","MCF7") %>%fct_rev(),
MHB = ifelse(MHB=="T","BRCA MHBs","Normal MHBs"),
Type = factor(Type,levels=c("LMR","PMD","UMR","HMR","PMD_LMR","UMR_LMR","PMD_UMR","HMR_PMD","HMR_UMR","HMR_LMR")))
p <- ggplot(data,aes(x=Type,y=fold_enrichment,fill=MHB)) +
geom_bar(stat="identity",position=position_dodge()) +
scale_fill_brewer(palette="Set1") +
labs(x="",y="fold enrichment",title="MHB in ChIA-PET enrichment") +
geom_hline(yintercept = 1,linetype="dashed",colour = "grey",linewidth=0.5) +
theme_bw() +
theme(axis.text.x =element_text(angle=60,hjust=1),
panel.background=element_blank(),
panel.grid=element_blank(),
plot.title=element_text(hjust=0.5)) +
facet_wrap(~Tissue,ncol=1)
print(p)