loading required packages:
The DNA Methylation haplotype blocks (MHBs) were identified by mHapSuite.
java -jar mHapSuite-2.0-jar-with-dependencies.jar MHBDiscovery \
-mhapPath BRCA.mhap.gz \
-cpgPath hg19_CpG.gz \
-tag MHB_BRCA \
-outputDir MHB/tumorcancer_type = list.files("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",pattern=".bed")
counts <- lapply(cancer_type,function(x) {
fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",x),sep="\t") %>% nrow()})
names(counts) <- cancer_type %>% str_split_i("[_.]",i=2)
Ncount=as.data.frame(do.call(rbind,counts)) %>% rownames_to_column(var="Type")
p<-ggplot(Ncount,aes(Type,V1)) +
geom_bar(stat="identity",fill="steelblue",width=0.8)+
scale_y_continuous(expand=c(0,0),limits=c(0,30000))+
labs(y="The Number of MHBs")+
theme_bw()+
theme(panel.grid= element_blank(),
panel.background=element_blank(),
panel.border=element_blank(),
axis.line=element_line(color="black"),
axis.title.x=element_blank(),
axis.text.x = element_text(color="black",size=10, angle=45,vjust=1,hjust=1),
axis.text.y = element_text(color="black",size=10),
axis.ticks = element_line(color="black"),
axis.ticks.length = unit(5, "pt") )+
geom_text(aes(label=V1), vjust=-0.3, size=3.5,color="black")
print(p)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
mhb_clusters_Anno <- annotatePeak("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz",
tssRegion=c(-2000, 2000),
TxDb=txdb,
annoDb="org.Hs.eg.db")
>> loading peak file... 2024-07-01 04:41:24 PM
>> preparing features information... 2024-07-01 04:41:24 PM
>> identifying nearest features... 2024-07-01 04:41:25 PM
>> calculating distance from peak to TSS... 2024-07-01 04:41:27 PM
>> assigning genomic annotation... 2024-07-01 04:41:27 PM
>> adding gene annotation... 2024-07-01 04:41:42 PM
>> assigning chromosome lengths 2024-07-01 04:41:42 PM
>> done... 2024-07-01 04:41:42 PM
plotAnnoPie(mhb_clusters_Anno)
MHB_length = as.data.frame(mhb_clusters_Anno)$width
hist(MHB_length,
main ="",
xlab="The Length of MHB (bp)",
ylab="Frequency",col="steelblue",
ylim = c(0,60000),
breaks=seq(0,2500,100))
text(1500,40000, paste0("mean = ",round(mean(MHB_length),0)," bp"))
text(1500,45000, paste0("median = ",round(median(MHB_length),0)," bp"))
mhb_CT_matrix = read.table("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_CT_matrix.txt.gz",sep = "\t",header=T)
names(mhb_CT_matrix) = gsub("MHB_","",names(mhb_CT_matrix))
mhb_CT_matrix = mhb_CT_matrix[c(1:6,11:31,8:10,7)]
mhb_CT_matrix=mhb_CT_matrix[order(mhb_CT_matrix$num,mhb_CT_matrix$cluster,decreasing=F),]
# transform 1-0 matrix
mhb_CT_matrix[-c(1:6)][mhb_CT_matrix[-c(1:6)]>=1]<-1
rownames(mhb_CT_matrix) = paste(mhb_CT_matrix$chr,mhb_CT_matrix$start,mhb_CT_matrix$end,sep="_")
# Ordering function
order_f <-function(x,Tag,cluster){
num = table(x[Tag][x$cluster==paste0(cluster,"_cluster"),])
x[Tag][x$cluster==paste0(cluster,"_cluster"),]<-c(rep(0,num["0"]),rep(1,num["1"]))
data = x
return(data)
}
# Public GEO data : CRC_P_RRBS ESCC_P LIHC_P
mhb_CT_matrix = order_f(mhb_CT_matrix,"CRC_P","COAD")
mhb_CT_matrix = order_f(mhb_CT_matrix,"ESCC_P","ESCA")
mhb_CT_matrix = order_f(mhb_CT_matrix,"LIHC_P","LIHC")
# Our previous data : Cancer of Unknown Primary data (CUP)
cup<-c("BRCA","CESC","COADREAD","ESCA","HNSC","LIHC","LUSCLUAD","OV","STAD","THCA")
for (tag in cup) {
if (tag == "COADREAD" ) {
flag <- "COAD"
}else if (tag == "LUSCLUAD") {
flag <- "NSCLC"
}else{
flag <- tag
}
mhb_CT_matrix = order_f(mhb_CT_matrix,paste0("CUP_",tag),flag)
}
# MHB Map plot
annotation_row = mhb_CT_matrix[c(6)]
annotation_col = data.frame(Assay = factor(c(rep("WGBS",11),rep("RRBS",11),rep("WGBS",3))),
Tissue_Type = factor(c(rep("Tumor",24),rep("Normal",1))))
rownames(annotation_col) = names(mhb_CT_matrix[-c(1:6)])
annotation_colors = list(Assay=c(WGBS="#4DAF4A",RRBS="#984EA3"),
Tissue_Type=c(Tumor="#E41A1C",Normal="#377EB8"),
cluster=c(BRCA_cluster="#A6CEE3",CESC_cluster="#1F78B4",
COAD_cluster="#B2DF8A",ESCA_cluster="#FDBF6F",
HNSC_cluster="#33A02C",LIHC_cluster="#FB9A99",
NSCLC_cluster="#E31A1C",OV_cluster="#FF7F00",
PACA_cluster="#CAB2D6", STAD_cluster="#6A3D9A",
THCA_cluster="#B15928",cluster12="#1B9E77",
cluster13="#D95F02",cluster14="#7570B3",
cluster15="#E7298A",cluster16="#66A61E"))
data = as.data.frame(mhb_CT_matrix[-c(1:6)])
p <- pheatmap(data,show_rownames=F,cluster_cols= F,cluster_row=F,
annotation_legend=T,legend=T,
annotation_row=annotation_row,annotation_col=annotation_col,
annotation_colors=annotation_colors,
annotation_names_row=T,border = F,border_color = F,
gaps_col = c(11,21,24),scale = "none",angle_col = 45,
color = colorRampPalette(colors = c("white","red"))(1000)
)
print(p)
mhb_T="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/mhb_clusters/"
background="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_11_cancers_clusters.bed"
files = list.files(mhb_T)
locResults <- lapply(files,function(i){
# Read input
mhb=readBed(paste0(mhb_T,i))
# Universe_Sets Background
universe_set=readBed(background)
# Overlapped normal mhb
states=loadRegionDB("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/LOLA_Normal_DB")
# Runing LOLA
runLOLA(mhb, universe_set, states, cores=10)
})
names(locResults) <- files
data <- lapply(files,function(x){ locResults[[x]][,c("filename","pValueLog")] %>% mutate(Type=x)})
data = as.data.frame(do.call(rbind,data))
dta = dcast(data,Type~filename,value.var = "pValueLog")
rownames(dta) = dta$Type;dta$Type=NULL
dta = dta[c("BRCA_cluster.bed","CESC_cluster.bed","COAD_cluster.bed",
"ESCA_cluster.bed","HNSC_cluster.bed","LIHC_cluster.bed",
"NSCLC_cluster.bed","OV_cluster.bed","PACA_cluster.bed",
"STAD_cluster.bed", "THCA_cluster.bed","cluster12","cluster13",
"cluster14","cluster15","cluster16"),]
# filter P value < 0.05
dta[dta< -log10(0.05)]<-NA
# enrichemnt ranking
rank_t = function(x) {
m = rank(array(-x),na.last="keep",ties.method = "min")
return(m)
}
data_rank= as.data.frame(t(apply(dta,1,rank_t)))
names(data_rank) = names(dta)
tag<-names(data_rank)[-c(9)]
data_rank<- data_rank[c(tag,"MHB_placenta_normal.bed")]
# The enrichment rank of Cancer MHBs vs. Normal MHBs
p<- pheatmap(data_rank,
main="Cancer MHBs clusters vs. Normal MHBs",
show_colnames=T,show_rownames=T,
cluster_rows=F,cluster_cols=F,
scale="none",angle_col=45,
color=colorRampPalette(colors = c("firebrick","white","darkblue"))(1000)
)
print(p)
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_correlation/RnBeads_450K_hg19_Probes_GR.RData")
RnBeads_450K_hg19_GR = RnBeads_450K_hg19_GR[RnBeads_450K_hg19_GR$QC]
sourceFolder = "/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/TCGA_DNAm"
# CpG correlation
ArrayIMCorrelation <-function(MM,IntervalGR,cpgGR,Tag){
# check seqlevels
if(length(intersect(seqlevels(IntervalGR), seqlevels(cpgGR))) == 0)
stop("No shared seqlevels found.\n")
# only keep CpGs that locate in intervals
Idx1 = countOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE) > 0
Idx2 = names(cpgGR) %in% rownames(MM)
Idx = Idx1 & Idx2
if(sum(Idx) == 0)
stop("No CpGs found in pre-defined regions.\n")
cpgGR = cpgGR[Idx, ]
# keep shared CpGs
cNames = intersect(rownames(MM), names(cpgGR))
cpgGR = cpgGR[cNames]
MM = MM[cNames, ]
# Only select the tumor samples
MM = MM[,!grepl("11",substr(colnames(MM),14,15))]
# Sample-by-sample processing to control memory requirements
mergedM = matrix(NA, length(IntervalGR), 1)
x = findOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE)
cM = list()
for (i in unique(subjectHits(x)) ){
a = which(subjectHits(x) == i)
tag = paste0("S",i)
if (length(a)<2){
cM[[tag]] = NA
}else if (nrow(na.omit(MM[a,]))<2) {
cM[[tag]] = NA
}else {
b = as.vector(cor(t(na.omit(MM[a,]))))
cM[[tag]] = median(b[b!=1])
}
}
aT = do.call(rbind,cM)
aID = as.character(gsub("S","",rownames(aT)))
aMM = as.matrix(aT[,1])
mergedM[as.integer(aID), ] = aMM
colnames(mergedM) = Tag
return(mergedM)
}
# general MHBs from Guo.et al.2017 (Nature genetics: NG)
NG_MHB = toGRanges("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_correlation/NG_MHB.bed")
vFiles = list.files(sourceFolder)
Ns = length(vFiles)
ngList = list()
for(i in 1:Ns){
Tag = gsub(".RData", "", vFiles[i])
cat("Processing", Tag, "\n")
fileIn = paste0(sourceFolder, "/", vFiles[i])
load(fileIn)
assign("MM", get(Tag))
rm(list = Tag)
ngList[[i]] = ArrayIMCorrelation(MM, NG_MHB, RnBeads_450K_hg19_GR,Tag)
}
names(ngList) = gsub(".RData", "", vFiles)
# Intra-MHB correlation
NG_List = do.call(cbind,ngList)
mcols(NG_MHB) = NG_List
# Cancer MHBs from In-house WGBS data
path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor"
files = list.files(path)
cTList=list()
for (cT in files ){
# Read Cancer MHB
Cancer_MHB = toGRanges(paste0(path,"/",cT))
Type = gsub("MHB_","",gsub(".bed","",cT))
Ns = length(vFiles)
cList = list()
for(i in 1:Ns){
Tag = gsub(".RData", "", vFiles[i])
cat("Processing", Tag, "\n")
fileIn = paste0(sourceFolder, "/", vFiles[i])
load(fileIn)
assign("MM", get(Tag))
rm(list = Tag)
cList[[i]] = ArrayIMCorrelation(MM, Cancer_MHB, RnBeads_450K_hg19_GR,Tag)
}
names(cList) = gsub(".RData", "", vFiles)
mcols(Cancer_MHB)= do.call(cbind,cList)
cTList[[Type]] = Cancer_MHB
}
names(cTList) = gsub("MHB_","",gsub(".bed","",files))
Plot intra-MHB CpGs correlation between general MHBs and In-house MHBs
# load pre-processed data
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_correlation/TCGA_Cancer_specific_MHB_ArrayCorrelation.RData")
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_correlation/TCGA_NG_MHB_ArrayCorrelation.RData")
vFiles=names(cTList)
data = list()
for (i in vFiles){
if ( i=="BRCA" ) {
tag = i
}else if(i=="CESC"){
tag = i
}else if(i=="COAD") {
tag = c("COAD","COADREAD","READ")
}else if(i=="ESCA"){
tag = c("ESCA","STES")
}else if(i=="HNSC"){
tag = "HNSC"
}else if( i=="LIHC"){
tag = "LIHC"
}else if(i=="NSCLC"){
tag = c("LUAD","LUSC")
}else if ( i=="OV"){
tag = "OV"
}else if (i=="PACA"){
tag = "PAAD"
}else if(i=="STAD"){
tag = c("STAD","STES")
}else if( i=="THCA") {
tag = "THCA"
}
for (j in tag){
# Cancer-specific MHB
x = na.omit(mcols(cTList[[i]])[j])
x$Group = "CT_MHB"
# NG MHB
y= na.omit(mcols(NG_MHB)[j])
y$Group = "NG_MHB"
mat = as.data.frame(rbind(x,y))
data[[j]] = mat
}
}
# Merged data, showing in boxplot
for (i in names(data)){
names(data[[i]]) <- c("r","Group")
}
MHB_Cor = as.data.frame(do.call(rbind,data))
MHB_Cor$Tissue = sapply(strsplit(rownames(MHB_Cor),"\\."),function(x) x[1])
MHB_Cor$Group = factor(MHB_Cor$Group,levels=c("NG_MHB","CT_MHB"))
# Plot Cancer intra-MHB CpGs Methylation correlation
p<-ggplot(MHB_Cor, aes(x=r, y=Tissue, fill= Group)) +
geom_boxplot(outlier.shape = NA) +
labs(x="Pearson's Correlation Coefficience (r)") +
scale_x_continuous(position = "top") +
scale_fill_manual(values=c("#377EB8","#E41A1C")) +
scale_y_discrete(limits=rev) +
theme_bw() +
theme(panel.background = element_blank(),panel.border = element_blank(),
panel.grid = element_blank(),axis.line=element_line(color="black"),
axis.title.y=element_blank(),axis.text.x = element_text(color="black",size=10),
axis.text.y = element_text(color="black",size=10),axis.ticks = element_line(color="black"),
axis.ticks.length = unit(5, "pt"),legend.position="bottom")
print(p)
The enrichment of Cancer MHBs in genomic features (CGIs, promoters, enhancers and et al.) by using permutation test.
CGI="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/hg19_cpgIsland.bed"
shelf="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/hg19/hg19_cpg_shlves.bed"
shore="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/hg19_cpg_shores.bed"
LAD="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/laminB1Lads.bed"
FANTOM5_enhancer="/genome/hg19/enhancer/FANTOM5/FANTOM5_enhancer.bed"
UCSC_promoter="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/UCSC_promoters_hg19.bed"
TAD_IMR90="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/TAD_IMR90.bed"
TAD_HMEC="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/TAD_HMEC.bed"
TAD_A549="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/TAD_A549.bed"
UCE="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/Ultraconserved_elements_hg19.bed"
# MHB
MHB="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed"
# do Permutation
for i in $CGI $shelf $shore $LAD $FANTOM5_enhancer $UCSC_promoter $TAD_IMR90 $TAD_HMEC $TAD_A549 $UCE
do
tag=${i##*/}
Rscript /sibcb2/bioinformatics2/zhangzhiqiang/script/DNAme_pipeline/ChIApet_enrichment.R \
--forward ${i} \
--reverse ${i} \
--genome /sibcb2/bioinformatics2/zhangzhiqiang/genome/hg19/hg19.chrom.sizes \
--testBED ${MHB} \
--nTimes 1000 \
--tmp ./ \
--outFile res/${tag%.*}_MHB_enrichment.txt
done# plot the genomic ranges permutation test enrichment
data = data.frame(CGI=20.752,shelf=0.973,shore=3.439,LAD=0.9297,
FANTOM5_enhancer=6.383,UCSC_promoter=4.727,
TAD_IMR90=0.952,TAD_HMEC=0.947,TAD_A549=0.981,UCE=5)
data = as.data.frame(t(data))
names(data) = "Enrichment"
data$Type = rownames(data)
p <- ggplot(data,aes(y=reorder(Type,Enrichment),x=Enrichment)) +
geom_bar(stat="identity",fill="steelblue",width=0.8) +
scale_x_continuous(expand=c(0,0),limits=c(0,30),position = "top")+
labs(x="Enrichment Score")+
theme_bw()+
theme(panel.background = element_blank(),panel.border = element_blank(),
panel.grid = element_blank(),axis.line=element_line(color="black"),
axis.title.y=element_blank(),axis.text.x = element_text(color="black",size=10),
axis.text.y = element_text(color="black",size=10),
axis.ticks = element_line(color="black"),
axis.ticks.length = unit(5, "pt")) +
geom_text(aes(label=round(Enrichment,2)), vjust=0.5,hjust=0, size=3.5,color="black")
print(p)