Identification of MHBs in cancers

loading required packages:

library(tidyverse)
library(stringr)
library(data.table)
library(ggplot2)
library(pheatmap)
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(LOLA)
library(GenomicRanges)
library(reshape2)
library(regioneR)

Identification of MHBs

The DNA Methylation haplotype blocks (MHBs) were identified by mHapSuite.

java -jar mHapSuite-2.0-jar-with-dependencies.jar MHBDiscovery \
                                  -mhapPath BRCA.mhap.gz \
                                  -cpgPath hg19_CpG.gz \
                                  -tag MHB_BRCA  \
                                  -outputDir MHB/tumor

Figure 1C

cancer_type = list.files("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",pattern=".bed")
counts <- lapply(cancer_type,function(x) {
            fread(paste0("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor/",x),sep="\t") %>% nrow()})
names(counts) <- cancer_type %>% str_split_i("[_.]",i=2)
Ncount=as.data.frame(do.call(rbind,counts)) %>% rownames_to_column(var="Type")

p<-ggplot(Ncount,aes(Type,V1)) + 
    geom_bar(stat="identity",fill="steelblue",width=0.8)+
    scale_y_continuous(expand=c(0,0),limits=c(0,30000))+
    labs(y="The Number of MHBs")+
    theme_bw()+
    theme(panel.grid= element_blank(),
          panel.background=element_blank(),
          panel.border=element_blank(),
          axis.line=element_line(color="black"),
          axis.title.x=element_blank(),
          axis.text.x = element_text(color="black",size=10, angle=45,vjust=1,hjust=1),
          axis.text.y = element_text(color="black",size=10),
          axis.ticks = element_line(color="black"),
          axis.ticks.length = unit(5, "pt") )+
    geom_text(aes(label=V1), vjust=-0.3, size=3.5,color="black")
print(p)

Figure 1E

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

mhb_clusters_Anno <- annotatePeak("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz",
                                  tssRegion=c(-2000, 2000),
                                  TxDb=txdb, 
                                  annoDb="org.Hs.eg.db")

>> loading peak file...              2024-07-01 04:41:24 PM 
>> preparing features information...         2024-07-01 04:41:24 PM 
>> identifying nearest features...       2024-07-01 04:41:25 PM 
>> calculating distance from peak to TSS...  2024-07-01 04:41:27 PM 
>> assigning genomic annotation...       2024-07-01 04:41:27 PM 
>> adding gene annotation...             2024-07-01 04:41:42 PM 
>> assigning chromosome lengths          2024-07-01 04:41:42 PM 
>> done...                   2024-07-01 04:41:42 PM

plotAnnoPie(mhb_clusters_Anno)

Figure 1D

MHB_length = as.data.frame(mhb_clusters_Anno)$width
hist(MHB_length,
            main ="",
            xlab="The Length of MHB (bp)",
            ylab="Frequency",col="steelblue",
            ylim = c(0,60000),
           breaks=seq(0,2500,100))
      text(1500,40000, paste0("mean = ",round(mean(MHB_length),0)," bp"))
      text(1500,45000, paste0("median = ",round(median(MHB_length),0)," bp"))

Figure 1F

mhb_CT_matrix = read.table("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_CT_matrix.txt.gz",sep = "\t",header=T)
names(mhb_CT_matrix) = gsub("MHB_","",names(mhb_CT_matrix))
mhb_CT_matrix = mhb_CT_matrix[c(1:6,11:31,8:10,7)]
mhb_CT_matrix=mhb_CT_matrix[order(mhb_CT_matrix$num,mhb_CT_matrix$cluster,decreasing=F),]  
# transform 1-0 matrix
mhb_CT_matrix[-c(1:6)][mhb_CT_matrix[-c(1:6)]>=1]<-1
rownames(mhb_CT_matrix) = paste(mhb_CT_matrix$chr,mhb_CT_matrix$start,mhb_CT_matrix$end,sep="_")
# Ordering function
order_f <-function(x,Tag,cluster){
    num = table(x[Tag][x$cluster==paste0(cluster,"_cluster"),])
    x[Tag][x$cluster==paste0(cluster,"_cluster"),]<-c(rep(0,num["0"]),rep(1,num["1"]))
    data = x
  return(data)
}

# Public GEO data : CRC_P_RRBS ESCC_P LIHC_P
mhb_CT_matrix = order_f(mhb_CT_matrix,"CRC_P","COAD")
mhb_CT_matrix = order_f(mhb_CT_matrix,"ESCC_P","ESCA")
mhb_CT_matrix = order_f(mhb_CT_matrix,"LIHC_P","LIHC")
# Our previous data : Cancer of Unknown Primary data (CUP)
cup<-c("BRCA","CESC","COADREAD","ESCA","HNSC","LIHC","LUSCLUAD","OV","STAD","THCA")
for (tag in cup) {
   if (tag == "COADREAD" ) {
         flag <- "COAD"
   }else if (tag == "LUSCLUAD") {
         flag <- "NSCLC"
   }else{
         flag <- tag
   }
     mhb_CT_matrix = order_f(mhb_CT_matrix,paste0("CUP_",tag),flag)
}

# MHB Map plot
annotation_row = mhb_CT_matrix[c(6)]
annotation_col = data.frame(Assay = factor(c(rep("WGBS",11),rep("RRBS",11),rep("WGBS",3))), 
                                          Tissue_Type = factor(c(rep("Tumor",24),rep("Normal",1))))
rownames(annotation_col) = names(mhb_CT_matrix[-c(1:6)])
annotation_colors = list(Assay=c(WGBS="#4DAF4A",RRBS="#984EA3"),
                        Tissue_Type=c(Tumor="#E41A1C",Normal="#377EB8"),
                        cluster=c(BRCA_cluster="#A6CEE3",CESC_cluster="#1F78B4",
                        COAD_cluster="#B2DF8A",ESCA_cluster="#FDBF6F",
                        HNSC_cluster="#33A02C",LIHC_cluster="#FB9A99",
                        NSCLC_cluster="#E31A1C",OV_cluster="#FF7F00",
                        PACA_cluster="#CAB2D6", STAD_cluster="#6A3D9A",
                        THCA_cluster="#B15928",cluster12="#1B9E77",
                        cluster13="#D95F02",cluster14="#7570B3",
                        cluster15="#E7298A",cluster16="#66A61E"))
data = as.data.frame(mhb_CT_matrix[-c(1:6)])

p <- pheatmap(data,show_rownames=F,cluster_cols= F,cluster_row=F,
                annotation_legend=T,legend=T,
                annotation_row=annotation_row,annotation_col=annotation_col,
                annotation_colors=annotation_colors,
                annotation_names_row=T,border = F,border_color = F,
                gaps_col = c(11,21,24),scale = "none",angle_col = 45,
                color = colorRampPalette(colors = c("white","red"))(1000)
              )
print(p)

Figure 1G

mhb_T="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/mhb_clusters/"
background="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_11_cancers_clusters.bed"
files = list.files(mhb_T)
locResults <- lapply(files,function(i){
            # Read input
            mhb=readBed(paste0(mhb_T,i))
            # Universe_Sets Background
            universe_set=readBed(background)
            # Overlapped normal mhb
            states=loadRegionDB("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/LOLA_Normal_DB")
            # Runing LOLA
            runLOLA(mhb, universe_set, states, cores=10)
            })
names(locResults) <- files

data <- lapply(files,function(x){ locResults[[x]][,c("filename","pValueLog")] %>% mutate(Type=x)})
data = as.data.frame(do.call(rbind,data))
dta = dcast(data,Type~filename,value.var = "pValueLog")
rownames(dta) = dta$Type;dta$Type=NULL
dta = dta[c("BRCA_cluster.bed","CESC_cluster.bed","COAD_cluster.bed",
            "ESCA_cluster.bed","HNSC_cluster.bed","LIHC_cluster.bed",
            "NSCLC_cluster.bed","OV_cluster.bed","PACA_cluster.bed",
            "STAD_cluster.bed", "THCA_cluster.bed","cluster12","cluster13",
            "cluster14","cluster15","cluster16"),]
# filter P value < 0.05
dta[dta< -log10(0.05)]<-NA
# enrichemnt ranking
rank_t = function(x) {
      m = rank(array(-x),na.last="keep",ties.method = "min")
      return(m)
  }
data_rank= as.data.frame(t(apply(dta,1,rank_t)))
names(data_rank) = names(dta)
tag<-names(data_rank)[-c(9)]
data_rank<- data_rank[c(tag,"MHB_placenta_normal.bed")]

# The enrichment rank of Cancer MHBs vs. Normal MHBs 
p<- pheatmap(data_rank,
                main="Cancer MHBs clusters vs. Normal MHBs",
                show_colnames=T,show_rownames=T,
                cluster_rows=F,cluster_cols=F,
                scale="none",angle_col=45,
                color=colorRampPalette(colors = c("firebrick","white","darkblue"))(1000)
                )
print(p)

Figure 1H

load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_correlation/RnBeads_450K_hg19_Probes_GR.RData")
RnBeads_450K_hg19_GR = RnBeads_450K_hg19_GR[RnBeads_450K_hg19_GR$QC]
sourceFolder = "/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/TCGA_DNAm"
# CpG correlation
ArrayIMCorrelation <-function(MM,IntervalGR,cpgGR,Tag){
  # check seqlevels
    if(length(intersect(seqlevels(IntervalGR), seqlevels(cpgGR))) == 0)
            stop("No shared seqlevels found.\n")
    # only keep CpGs that locate in intervals
    Idx1 = countOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE) > 0
    Idx2 = names(cpgGR) %in% rownames(MM)
    Idx  = Idx1 & Idx2
    if(sum(Idx) == 0)
        stop("No CpGs found in pre-defined regions.\n")
    cpgGR = cpgGR[Idx, ]
    # keep shared CpGs
    cNames = intersect(rownames(MM), names(cpgGR))
    cpgGR  = cpgGR[cNames]
    MM    = MM[cNames, ]
    # Only select the tumor samples
    MM = MM[,!grepl("11",substr(colnames(MM),14,15))]
    # Sample-by-sample processing to control memory requirements
    mergedM = matrix(NA, length(IntervalGR), 1)
    x = findOverlaps(cpgGR, IntervalGR, type = "any", ignore.strand = TRUE)
    cM = list()
    for (i in unique(subjectHits(x)) ){
              a  = which(subjectHits(x) == i)
              tag = paste0("S",i)
        if (length(a)<2){
              cM[[tag]] = NA
        }else if (nrow(na.omit(MM[a,]))<2) {
              cM[[tag]] = NA
        }else {
              b = as.vector(cor(t(na.omit(MM[a,]))))
              cM[[tag]]  = median(b[b!=1])
          }
    }
  aT = do.call(rbind,cM)
  aID = as.character(gsub("S","",rownames(aT)))
  aMM = as.matrix(aT[,1])
  mergedM[as.integer(aID), ] = aMM
  colnames(mergedM) = Tag
  return(mergedM)
}

# general MHBs from Guo.et al.2017 (Nature genetics: NG)
NG_MHB = toGRanges("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_correlation/NG_MHB.bed")
vFiles = list.files(sourceFolder)
Ns = length(vFiles)
ngList = list()

for(i in 1:Ns){
    Tag = gsub(".RData", "", vFiles[i])
    cat("Processing", Tag, "\n")
    fileIn = paste0(sourceFolder, "/", vFiles[i])
    load(fileIn)
    assign("MM", get(Tag))
    rm(list = Tag)
    ngList[[i]] = ArrayIMCorrelation(MM, NG_MHB, RnBeads_450K_hg19_GR,Tag)
}
names(ngList) = gsub(".RData", "", vFiles)
# Intra-MHB correlation
NG_List = do.call(cbind,ngList)
mcols(NG_MHB) = NG_List

# Cancer MHBs from In-house WGBS data
path="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/tumor"
files = list.files(path)
cTList=list()
for (cT in files ){
    # Read Cancer MHB
    Cancer_MHB = toGRanges(paste0(path,"/",cT))
    Type = gsub("MHB_","",gsub(".bed","",cT))
    Ns = length(vFiles)
    cList = list()
    for(i in 1:Ns){
            Tag = gsub(".RData", "", vFiles[i])
            cat("Processing", Tag, "\n")
            fileIn = paste0(sourceFolder, "/", vFiles[i])
            load(fileIn)
            assign("MM", get(Tag))
            rm(list = Tag)
            cList[[i]] = ArrayIMCorrelation(MM, Cancer_MHB, RnBeads_450K_hg19_GR,Tag)
        }
    names(cList) = gsub(".RData", "", vFiles)
    mcols(Cancer_MHB)= do.call(cbind,cList)
    cTList[[Type]] = Cancer_MHB
    }
names(cTList) = gsub("MHB_","",gsub(".bed","",files))

Plot intra-MHB CpGs correlation between general MHBs and In-house MHBs

# load pre-processed data
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_correlation/TCGA_Cancer_specific_MHB_ArrayCorrelation.RData")
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/MHB/MHB_correlation/TCGA_NG_MHB_ArrayCorrelation.RData")

vFiles=names(cTList)
data = list()
for (i in vFiles){
        if ( i=="BRCA" ) {
            tag = i
        }else if(i=="CESC"){
            tag = i
        }else if(i=="COAD") {
            tag = c("COAD","COADREAD","READ")
        }else if(i=="ESCA"){
            tag = c("ESCA","STES")
        }else if(i=="HNSC"){
            tag = "HNSC"
        }else if( i=="LIHC"){
            tag = "LIHC"
        }else if(i=="NSCLC"){
            tag = c("LUAD","LUSC")
        }else if ( i=="OV"){
            tag = "OV"
        }else if (i=="PACA"){
            tag = "PAAD"
        }else if(i=="STAD"){
            tag = c("STAD","STES")
        }else if( i=="THCA") {
            tag = "THCA"
        }
    for (j in tag){
      # Cancer-specific MHB
        x = na.omit(mcols(cTList[[i]])[j])
        x$Group = "CT_MHB"
        # NG MHB
        y= na.omit(mcols(NG_MHB)[j])
        y$Group = "NG_MHB"
        mat = as.data.frame(rbind(x,y))
        data[[j]] = mat
    }
}
# Merged data, showing in boxplot
for (i in names(data)){
    names(data[[i]]) <- c("r","Group")
}

MHB_Cor = as.data.frame(do.call(rbind,data))
MHB_Cor$Tissue = sapply(strsplit(rownames(MHB_Cor),"\\."),function(x) x[1])
MHB_Cor$Group = factor(MHB_Cor$Group,levels=c("NG_MHB","CT_MHB"))

# Plot Cancer intra-MHB CpGs Methylation correlation
p<-ggplot(MHB_Cor, aes(x=r, y=Tissue, fill= Group)) + 
        geom_boxplot(outlier.shape = NA) +
        labs(x="Pearson's Correlation Coefficience (r)") +
        scale_x_continuous(position = "top") +
        scale_fill_manual(values=c("#377EB8","#E41A1C")) +
        scale_y_discrete(limits=rev) +
        theme_bw() +
        theme(panel.background = element_blank(),panel.border = element_blank(),
              panel.grid = element_blank(),axis.line=element_line(color="black"),
              axis.title.y=element_blank(),axis.text.x = element_text(color="black",size=10),
              axis.text.y = element_text(color="black",size=10),axis.ticks = element_line(color="black"),
              axis.ticks.length = unit(5, "pt"),legend.position="bottom") 
print(p)

Figure 1I

The enrichment of Cancer MHBs in genomic features (CGIs, promoters, enhancers and et al.) by using permutation test.

CGI="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/hg19_cpgIsland.bed"
shelf="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/hg19/hg19_cpg_shlves.bed"
shore="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/hg19_cpg_shores.bed"
LAD="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/laminB1Lads.bed"
FANTOM5_enhancer="/genome/hg19/enhancer/FANTOM5/FANTOM5_enhancer.bed"
UCSC_promoter="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/UCSC_promoters_hg19.bed"
TAD_IMR90="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/TAD_IMR90.bed"
TAD_HMEC="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/TAD_HMEC.bed"
TAD_A549="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/TAD_A549.bed"
UCE="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig1/Genomic_Features/Ultraconserved_elements_hg19.bed"
# MHB
MHB="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed"
# do Permutation
for i in $CGI $shelf $shore $LAD $FANTOM5_enhancer $UCSC_promoter $TAD_IMR90 $TAD_HMEC $TAD_A549 $UCE
do
    tag=${i##*/}
    Rscript /sibcb2/bioinformatics2/zhangzhiqiang/script/DNAme_pipeline/ChIApet_enrichment.R  \
            --forward ${i} \
            --reverse ${i} \
            --genome /sibcb2/bioinformatics2/zhangzhiqiang/genome/hg19/hg19.chrom.sizes \
            --testBED ${MHB} \
            --nTimes  1000 \
            --tmp ./  \
            --outFile  res/${tag%.*}_MHB_enrichment.txt
done

# plot the genomic ranges permutation test enrichment
data = data.frame(CGI=20.752,shelf=0.973,shore=3.439,LAD=0.9297,
                  FANTOM5_enhancer=6.383,UCSC_promoter=4.727,
                  TAD_IMR90=0.952,TAD_HMEC=0.947,TAD_A549=0.981,UCE=5)
data = as.data.frame(t(data))
names(data) = "Enrichment"
data$Type = rownames(data)

p <- ggplot(data,aes(y=reorder(Type,Enrichment),x=Enrichment)) + 
        geom_bar(stat="identity",fill="steelblue",width=0.8) +
        scale_x_continuous(expand=c(0,0),limits=c(0,30),position = "top")+
        labs(x="Enrichment Score")+
        theme_bw()+
        theme(panel.background = element_blank(),panel.border = element_blank(),
              panel.grid = element_blank(),axis.line=element_line(color="black"),
              axis.title.y=element_blank(),axis.text.x = element_text(color="black",size=10),
              axis.text.y = element_text(color="black",size=10),
              axis.ticks = element_line(color="black"),
              axis.ticks.length = unit(5, "pt")) +
        geom_text(aes(label=round(Enrichment,2)), vjust=0.5,hjust=0, size=3.5,color="black")
print(p)