Validation of colon cancer MHBs with single-cell data

loading required packages:

Identification of single-cell colon cancer MHBs

The single cell DNA Methylation haplotype blocks (scMHBs) were identified by mHapSC.

# cell barcode
bcFile="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/GSE97693_scMethyl_tumor_id.txt"    
# 1000 CpGs interval
bed="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/hg19_1000CpG.bed"
java -jar mHapSc-1.0-jar-with-dependencies.jar MHBDiscovery -mHapPath $mhap -bedFile $bed \
          -cpgPath hg19_CpG.gz  -bcFile $bcFile -window 5 -r2 0.5 -pvalue 0.05 \
          -outputDir ./ -tag CRC_Tumor_MHB 

Figure 4D

scMHB_Tumor="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/CRC_MHB_PT.bed.gz"
background="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz"
# do LOLA enrichment
      mhb=fread(scMHB_Tumor) %>% toGRanges()
      # Universe_Sets Background
      universe_set=fread(background) %>% toGRanges()
      # regionDB
      states=loadRegionDB("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/scLOLA/")
      # Runing LOLA
      locResults = runLOLA(mhb, universe_set, states, cores=10) 
CI <- locResults %>% select(support,b,c,d) %>% apply(1,function(x){
               CI.low <-  matrix(x,ncol=2,nrow=2) %>% fisher.test() %>% broom::tidy() %>% pull(3)
               CI.high <-  matrix(x,ncol=2,nrow=2) %>% fisher.test() %>% broom::tidy() %>% pull(4)
                  return(c(CI.low,CI.high))}) %>% 
            t() %>% as.data.frame() %>% 
            rename_with(~c("CI.low","CI.high"))
locResults <- locResults %>% cbind(CI) %>% mutate(OVR=100*round(support/size,4),
                                                logFDR=ifelse(qValue==0,300,-log10(qValue)),
                                                filename=fct_rev(fct_inorder(str_split_i(filename,"[_.]",i=2))))
# PLOT
p<- ggplot(locResults,aes(x=oddsRatio,y=filename,col=logFDR)) + 
        geom_point(aes(size=OVR,col=logFDR),shape=18) +
        geom_errorbarh(aes(xmax = CI.high, xmin = CI.low), height = 0,size=1)+
        scale_color_gradient2(midpoint=0.5*max(locResults$logFDR),space ="Lab",
                              low="steelblue",mid="orange",high="firebrick") + 
        geom_vline(aes(xintercept = 1),color="gray",linetype="dashed", size = 0.5)+
        scale_x_continuous(limits=c(0.1, 3.2), breaks = seq(0, 3.5, 0.5))+
        xlab('Odds Ratio') + ylab(" ") + 
        theme_classic() +
        theme(legend.position = "top",
              axis.text = element_text(size=15,color="black"),
              axis.title.x = element_text(size=20,color="black"),
              axis.ticks.length=unit(0.2, "cm") )
print(p)

Figure 4E

# Step 1: promoter methylation
scMethyl_Mcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_methylated.txt.gz",header=T) %>%
                    distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_total.txt.gz",header=T) %>% 
                    distinct() %>% column_to_rownames(var="region")
# filter Promoter Total covered reads count <3
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts 
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),sep="[:-]") %>%
                         mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>% 
            dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
# toGR
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# sample id  with Paired RNA & Methylation 
id <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/annotation/GSE97693_scRNA_scMethyl_matched_info.txt.gz",header=T) %>% 
      select(ID,patients,lesions,assay,tissue,tag) %>% 
      mutate(Methyl_ID = str_split_i(ID,",",i=1),Exp_ID=str_split_i(ID,",",i=2))
# Promoters overlap with MHBs
tag=c("LN","PT","ML","MP","Tumor")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
    path="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/MHB/"
    MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
    # overlapping
    Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
    a<- rep(NA,length(scMethyl.GR))
    # MHB 
    a[unique(queryHits(Tx))]<-"T"
    a[is.na(a)]<-"N"
    a<- a %>% as.data.frame()
    names(a)<-"MHB"
    # sample GSM ID
    if(x!="Tumor"){
        sid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID)
    }else{
        sid <- id %>% pull(Methyl_ID)
    }  
    # select
    mcols(scMethyl.GR) %>% as.data.frame() %>% 
                            select(SYMBOL,all_of(sid)) %>%
                            cbind(a) %>% separate_rows(SYMBOL,sep=",")
  })
names(scMHB_Methyl) <-tag

# Step 2: scRNA + Methylation
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")

scMme <- pbapply::pblapply(tag,function(x){
    # cat(x,"\n")
    # methylation id
    if(x!="Tumor"){
        mid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID) 
    }else{
        mid <- id %>% pull(Methyl_ID)
    }  
    # methylation data reshape
    data_methyl <- scMHB_Methyl[[x]] %>% melt() %>% drop_na() %>% nest(.by=variable) %>% 
                                          dplyr::mutate(MM = lapply(data,function(y){
                                                                 y %>% mutate(type=ifelse(value>=0.8,"high",
                                                                                   ifelse(value<=0.2,"low","intermediate"))) 
                                                              }))
    # show the paired methylation & expression ,one by one
    SME <- pbapply::pblapply(mid,function(m) {
            # mean vs median
              if(1){
                ExM <- function(z) mean(z, na.rm = TRUE)
              }else{
                ExM <- function(z) median(z, na.rm = TRUE)
              }
            # cat("Processing",m,"\n")
            # Methylation
            data <-  data_methyl  %>% filter(variable==m) %>% pull(MM) %>% as.data.frame() %>% 
                                                              group_by(MHB,type) %>% 
                                                              transmute(genes=paste0(SYMBOL, collapse=",")) %>% distinct()
              # Exp
              # test the RNA data with paired Exp id
                eid <- id %>% filter(str_detect(Methyl_ID,m)) %>% pull(Exp_ID) 
                if(sum(str_detect(names(RNA.TPM),eid))==1){
                    RNA <- RNA.TPM
                    # cat("RNA_TPM ",x, " ",m, " ", eid,"\n")
                    flag<-"TPM"
                }else{
                    RNA <- RNA.FPKM
                    # cat("RNA_FPKM ",x, " ",m, " ", eid,"\n")
                    flag <-"FPKM"
                }
                            # 6 groups
                            # 1.high_mhb
                                  Hm<- data %>% filter(MHB=="T" & type=="high") %>% pull(genes) %>% str_split_1(pattern=",")
                                  HmE<- RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Hm) %>% pull(1)
                                  HmE <- log2(ExM(HmE) +1)
                            # 2.high_nonmhb
                                  Hnm<- data %>% filter(MHB=="N" & type=="high") %>% pull(genes) %>% str_split_1(pattern=",")
                                  HnmE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Hnm) %>% pull(1)
                                  HnmE <- log2(ExM(HnmE) +1)
                            # 3.intermediate_mhb
                                  Im<- data %>% filter(MHB=="T" & type=="intermediate") %>% pull(genes) %>% str_split_1(pattern=",")
                                  ImE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Im) %>% pull(1)
                                  ImE <- log2(ExM(ImE) +1)
                            # 4.intermediate_nonmhb
                                  Inm<- data %>% filter(MHB=="N" & type=="intermediate") %>% pull(genes) %>% str_split_1(pattern=",")
                                  InmE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Inm) %>% pull(1)
                                  InmE <- log2(ExM(InmE) +1)
                            # 5.low_mhb
                                  Lm<- data %>% filter(MHB=="T" & type=="low") %>% pull(genes) %>% str_split_1(pattern=",")
                                  LmE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Lm) %>% pull(1)
                                  LmE <- log2(ExM(LmE) +1)
                            # 6.low_nonmhb
                                  Lnm<- data %>% filter(MHB=="N" & type=="low") %>% pull(genes) %>% str_split_1(pattern=",")
                                  LnmE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Lnm) %>% pull(1)
                                  LnmE <- log2(ExM(LnmE) +1)
                            
                            # the Exp of methylation level in one cell
                            c(HmE,HnmE,ImE,InmE,LmE,LnmE,flag)
    })
              names(SME) <- mid
              # cancer type : promoter methyltion + MHB +  Expression 
              CME <- do.call(rbind,SME) %>% as.data.frame() %>%
                    dplyr::rename_with(~c("HmE","HnmE","ImE","InmE","LmE","LnmE","Ex")) %>%
                     mutate(ID=rownames(.))
  })

names(scMme) <- tag 
# sc_MMET: Methylation + MHB + Exp
GSE97693_scMMET <- scMme

# Step 3: PLOT
n=0
for ( x in tag ){

    # cat(x,"\n")
    n = n + 1
    if (!is.null(GSE97693_scMMET[[x]])){   ###check is null

        data <- GSE97693_scMMET[[x]] %>% as.data.frame() %>% 
                gather("Type","Exp",-c(ID,Ex)) %>% mutate(Me=str_sub(Type,0,1),MHB=toupper(str_sub(Type,2,2)),
                                                          Exp=as.numeric(Exp),Ex=fct_inorder(Ex))
        
        ## plot
        compare_means(Exp ~ MHB, data = data, group.by = c("Me","Ex"))
        assign(paste0("p",n),ggboxplot(data, x = "Me", y = "Exp", color = "MHB",
                        palette ="nejm",width=0.8,
                        bxp.errorbar = T,
                        add = "jitter",
                        add.params = list(size = 0.05),
                        facet.by = "Ex", short.panel.labs = FALSE
                        ) +
                        xlab("Methylation-level")+ 
                        ylab("Expression")+
                        ggtitle(ifelse(x=="MHB_T","Tumor",x))+ 
                        stat_compare_means(aes(group = MHB),size=2) +
                        theme_bw() +    
                        theme(panel.grid.major=element_line(colour=NA),
                        panel.background = element_rect(fill = "transparent",colour = NA),
                        plot.background = element_rect(fill = "transparent",colour = NA),
                        panel.grid.minor = element_blank(),
                        axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1),
                        plot.title = element_text(hjust = 0.5, size = 18),
                        panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))
             )
        #     cat(n,"\n")
    }
}
# paste to one figure 
p <- wrap_plots(p1, p2, p3, p4,p5)
print(p)

Figure 4F

scMethyl_Mcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_methylated.txt.gz",header=T) %>% 
                    distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_total.txt.gz",header=T) %>% 
                    distinct() %>% column_to_rownames(var="region")

# filter promoter total reads count <3 
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts 
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),sep="[:-]") %>% 
                         mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>%
            dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
# toGR
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# sample id  with Paired RNA & Methylation 
id <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/annotation/GSE97693_scRNA_scMethyl_matched_info.txt.gz",header=T) %>% 
      select(ID,patients,lesions,assay,tissue,tag) %>% 
      mutate(Methyl_ID = str_split_i(ID,",",i=1),Exp_ID=str_split_i(ID,",",i=2))
# promoter overlaps with MHB
tag=c("LN","PT","ML","MP","Tumor")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
    path="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/MHB/"
    MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
    # overlapping 
    Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
    a<- rep(NA,length(scMethyl.GR))
    # MHB 
    a[unique(queryHits(Tx))]<-"T"
    a[is.na(a)]<-"N"
    a<- a %>% as.data.frame()
    names(a)<-"MHB"
    # sample GSM ID
    if(x!="Tumor"){
        sid <- id %>% dplyr::filter(str_detect(tissue,x)) %>% pull(Methyl_ID)
    }else{
        sid <- id %>% pull(Methyl_ID)
    }  
    # select
    mcols(scMethyl.GR) %>% as.data.frame() %>% 
                            select(SYMBOL,all_of(sid)) %>%
                            cbind(a) 
})
names(scMHB_Methyl) <-tag

# Step 2: scRNA + Methylation
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")

scMme <- pbapply::pblapply(tag,function(x) {
    # cat(x,"\n")
    # methylation id
    if(x!="Tumor"){
        mid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID) 
    }else{
        mid <- id %>% pull(Methyl_ID)
    }  
    # methylation data reshape
    data_methyl <- scMHB_Methyl[[x]] %>% melt() %>% drop_na() %>% nest(.by=variable) %>% 
                                          dplyr::mutate(MM = lapply(data,function(y){
                                                                 y %>% mutate(type=ifelse(value>=0.8,"high",
                                                                                   ifelse(value<=0.2,"low","Intermediate")))
                                                              }))
# MHB Methylation
scMethyl_MHB_Mcounts <- fread(paste0("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/CRC_MHB_",x,"_methylated.txt.gz"),header=T) %>% 
distinct() %>% column_to_rownames(var="region")
scMethyl_MHB_Tcounts <- fread(paste0("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/CRC_MHB_",x,"_total.txt.gz"),header=T) %>% 
distinct() %>%column_to_rownames(var="region")
# filter MHB with Total reads count <3
scMethyl_MHB_Tcounts[scMethyl_MHB_Tcounts<3]<- NA
scMethyl_MHB <- scMethyl_MHB_Mcounts/scMethyl_MHB_Tcounts 

scMethyl_MHB.GR <- scMethyl_MHB  %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),sep="[:-]") %>% 
                         mutate(start=as.numeric(start)-1,end=as.numeric(end)) %>% 
                         toGRanges() %>% great(cores=10,"MSigDB:H", "GREAT:hg19") %>% 
                         getRegionGeneAssociations() %>% 
                         as.data.frame() %>%  mutate(annotated_genes=lapply(annotated_genes,function(x){
                         x %>% paste(collapse=",")
                         }) %>% do.call(rbind,.) %>% as.character(),
                        dist_to_TSS = lapply(dist_to_TSS,function(x){
                                            x %>% paste(collapse=",")
                        }) %>% do.call(rbind,.) %>% as.character()) %>% 
                        separate_rows(annotated_genes,dist_to_TSS,sep=",",convert=TRUE)  %>%
                        filter(abs(dist_to_TSS)< 1000)  %>% 
                        select(contains("GSM"),annotated_genes)  %>% 
                        group_by(annotated_genes) %>% summarise_each(funs(mean))
        
    # show the paired methylation & expression one by one
    SME <- pbapply::pblapply(mid,function(m) {
            # mean vs median
              if(1){
                ExM <- function(z) mean(z, na.rm = TRUE)
              }else{
                ExM <- function(z) median(z, na.rm = TRUE)
              }
            # cat("Processing",m,"\n")
            # Methylation
            data_Me <-  data_methyl  %>% filter(variable==m) %>% pull(MM) %>% as.data.frame() 
            data_MHB <-  scMethyl_MHB.GR %>% select(c(m,"annotated_genes")) %>% 
                          rename_with(~c("value","SYMBOL")) %>% mutate(type=ifelse(value>=0.8,"high",
                                                                            ifelse(value<=0.2,"low","intermediate"))) %>%
                          group_by(type) %>% summarise(genes=paste0(SYMBOL, collapse=",")) %>% na.omit()
            # Exp
            ## test the RNA data  with paired Exp id
                eid <- id %>% filter(str_detect(Methyl_ID,m)) %>% pull(Exp_ID) 
                if(sum(str_detect(names(RNA.TPM),eid))==1){
                    RNA <- RNA.TPM
                #    cat("RNA_TPM ",x, " ",m, " ", eid,"\n")
                    flag<-"TPM"
                }else{
                    RNA <- RNA.FPKM
                #    cat("RNA_FPKM ",x, " ",m, " ", eid,"\n")
                    flag <-"FPKM"
                }
                # two groups
                # 1. promoter intermediate methylation 
                # MHB high  methylation 
                MHB_h <- data_MHB %>% filter(type=="high") %>% pull(genes) %>% str_split_1(pattern=",") 
                # MHB low  methylation
                MHB_l <- data_MHB %>% filter(type=="low") %>% pull(genes) %>% str_split_1(pattern=",") 

                data_Me1 <- data_Me %>% mutate(MHB_Methyl = ifelse(SYMBOL %in% MHB_h ,"High",
                                                ifelse(SYMBOL %in% MHB_l,"Low","Other")))  %>% 
                                        filter(MHB_Methyl!="Other",type!="high")
                
                IE <- RNA %>%  select(contains(eid)) %>% rownames_to_column(var="SYMBOL") %>% 
                         mutate(EID = eid,flag=flag) %>% inner_join(data_Me1,by="SYMBOL") %>% 
                         dplyr::rename_with(~c("SYMBOL","Exp","EID","flag","MHB","TSS_MM","TSS_type","MHB_type")) %>% 
                         group_by(TSS_type,EID,MHB_type,flag) %>% 
                         summarise(Exp=ExM(Exp),ncount=length(SYMBOL)) 
    })
     
          names(SME) <- mid
          # cancer type : promoter methyltion + MHB + Expression 
           CME <- do.call(rbind,SME) %>% as.data.frame() %>%
                  dplyr::rename_with(~c("TSS_type","EID","MHB","flag","Exp","ncount"))
                    
})
names(scMme) <- tag 
# sc_MMET: Tumor Methylation + MHB + Exp
GSE97693_scME_MHBGene <- scMme 

# Step3: PLOT
n=0
for ( x in tag ){

    # cat(x,"\n")
    n = n + 1
    if (!is.null(GSE97693_scME_MHBGene[[x]])){   ###check is null

      data <- GSE97693_scME_MHBGene[[x]] %>% as.data.frame() %>% mutate(Exp=log2(Exp+1))
      ## plot
        compare_means(Exp ~ MHB, data = data, group.by = c("TSS_type","flag"))
        assign(paste0("p",n),ggboxplot(data, x = "TSS_type", y = "Exp",color="MHB",
                        palette ="nejm",width=0.8,
                        bxp.errorbar = T,
                        add = "jitter",
                        add.params = list(size = 0.1),
                        facet.by = c("flag"), short.panel.labs = FALSE
                        ) +
                        xlab("Promoter Methylation-level")+ 
                        ylab("Expression")+
                        ggtitle(ifelse(x=="Tumor","Tumor",x))+ 
                        stat_compare_means(aes(group = MHB),size=2) +
                        theme_bw() +    
                        theme(panel.grid.major=element_line(colour=NA),
                        panel.background = element_rect(fill = "transparent",colour = NA),
                        plot.background = element_rect(fill = "transparent",colour = NA),
                        panel.grid.minor = element_blank(),
                        axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1),
                        plot.title = element_text(hjust = 0.5, size = 18),
                        panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))
             )
            # cat(n,"\n")
    }
}

# paste to one figure 
p <- wrap_plots(p1, p2, p3, p4,p5)
print(p)

Figure 4I

# Step1: Single-cell Expression
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")
# select the single cell with TPM data. (only with PT, LN, and Normal cell)
    if(0){
        RNA <- RNA.FPKM
       # cat("RNA FPKM ","\n")
    }else{
        RNA <- RNA.TPM
       # cat("RNA TPM ","\n")
    }
names(RNA) <- str_split_i(names(RNA),"_",i=1)
# load meta data
id <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/annotation/GSE97693_Anno.txt.gz",header=T) %>% 
      select(ID,patients,lesions,assay,tissue,tag) %>% filter(tissue!="HeLa") 
id.RNA <- id %>% filter(ID  %in% names(RNA)) 
# Find DEG Markers
findDEG <- function(x,Idents) {
    # annotation
    treat <- id.RNA %>% dplyr::filter(str_detect(tissue,Idents)) %>% pull(ID)
    nc <- id.RNA %>% dplyr::filter(tissue=="NC") %>% pull(ID)
    # wilcoxon test
    z <- function(z) {
    d1 <- z[treat]%>% as.character() %>% as.numeric()
    d2 <- z[nc] %>% as.character() %>% as.numeric()

    p <-  wilcox.test(d1,d2)$p.value
    group1 <- mean(d1)
    group2 <- mean(d2)
    logFC <- log2(group1+1) - log2(group2+1)  
    
    c(group1,group2,logFC,p)
    }
    
    res <- t(apply(x,1,z)) %>% as.data.frame() %>% rename_with(~c("Treat","NC","logFC","p_value")) %>% 
            mutate(p_val_adj=p.adjust(p_value,method="fdr",length(p_value)),
                   DEG = ifelse(abs(logFC)>=1 & p_val_adj<=0.05,"Yes","No"))
    return(res)
}

PT.markers <- findDEG(RNA,Idents="PT") %>% rownames_to_column(var="SYMBOL")
LN.markers <- findDEG(RNA,Idents="LN") %>% rownames_to_column(var="SYMBOL")

# Step2: promoter DNA methylation
scMethyl_Mcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_methylated.txt.gz",header=T) %>%
                    distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_total.txt.gz",header=T) %>% 
                    distinct() %>% column_to_rownames(var="region")
# filter promoters with Total reads count <3
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts 
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),sep="[:-]") %>% 
                         mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>%
            dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# promoter overlap with MHB 
tag=c("PT","LN")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
    
    path="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/MHB/"
    MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
    # overlapping
    Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
    a<- rep(NA,length(scMethyl.GR))
    # MHB 
    a[unique(queryHits(Tx))]<-"T"
    a[is.na(a)]<-"N"
    a<- a %>% as.data.frame()
    names(a)<-"MHB"
    # sample GSM ID paired data
    sid <- id %>% dplyr::filter(str_detect(tissue,x),assay=="Met") %>% pull(ID)
    nc <- id %>% dplyr::filter(tissue=="NC",assay=="Met") %>% pull(ID)

    ids <- c(sid,nc)
    # select
    mcols(scMethyl.GR) %>% as.data.frame() %>% 
                            select(SYMBOL,all_of(ids)) %>%
                            cbind(a) 
})
names(scMHB_Methyl) <-tag

# find Promoter DMR markers
# Remove NA
sum_of_na <- function(x){
  sum(is.na(x))
}
target_genes <- function(x) {
      Mm <- apply(x,1,sum_of_na) > 0.3*ncol(x)  # promoter:0.3, MHB:0.9
      genes <- x$SYMBOL[Mm!="TRUE"]
      return(genes)
}

# DMG function
findDMGenes <- function(dataset,Idents){
    # annotation
    treat <- id %>% dplyr::filter(str_detect(tissue,Idents),assay=="Met") %>% pull(ID)
    nc <- id %>% dplyr::filter(tissue=="NC",assay=="Met") %>% pull(ID)
    # dataset 1 & dataset 2 merging
    dataset1 <- dataset[[Idents]] %>% dplyr::select(SYMBOL,MHB,all_of(treat)) %>% filter(SYMBOL %in% target_genes(.)) 
    dataset2 <- dataset[[Idents]] %>% dplyr::select(SYMBOL,all_of(nc)) %>% filter(SYMBOL %in% target_genes(.))
    data <-  inner_join(dataset1,dataset2,by="SYMBOL") 
    # find DMG
    dmg <- function(x){

        d1 <- x[treat]%>% as.character() %>% as.numeric()
        d2 <- x[nc] %>% as.character() %>% as.numeric()

        p <-  wilcox.test(d1,d2)$p.value
        delta <- mean(d1,na.rm=T) - mean(d2,na.rm=T)
        res <- c(delta,p)
        return(res)
    }

    res <-  apply(data,1,dmg) %>% t() %>% as.data.frame() %>% 
                    rename_with(~c("delta","P.val")) %>% mutate(fdr=p.adjust(P.val,n=length(P.val))) %>% 
            cbind(data) %>% mutate(DMG = ifelse(abs(delta)>0.1 & fdr <= 0.05, "Yes","No")) %>%
            dplyr::select(SYMBOL,delta,P.val,fdr,DMG,MHB)
    # Return
    return(res)
}
DMG.PT<- findDMGenes(scMHB_Methyl,Idents="PT")
DMG.LN<- findDMGenes(scMHB_Methyl,Idents="LN")
# Identify DEGs + NonDMR + MHB Genes 
            # PT
            NonDMG.PT <-  DMG.PT  %>% filter(DMG=="No") %>% select(SYMBOL,MHB)
            DEG.PT <-  PT.markers %>% filter(DEG=="Yes") %>% mutate(DE = ifelse(logFC >= 1,"UP","DN")) %>% select(SYMBOL,DE)
            data.PT<- inner_join(DEG.PT,NonDMG.PT,by="SYMBOL")
            # LN            
            NonDMG.LN <-  DMG.LN %>% filter(DMG=="No")  %>% select(SYMBOL,MHB)
            DEG.LN <-  LN.markers %>% filter(DEG=="Yes") %>% mutate(DE = ifelse(logFC >= 1,"UP","DN")) %>% select(SYMBOL,DE)
            data.LN <- inner_join(DEG.LN,NonDMG.LN,by="SYMBOL")
            
            PT.MHB.NonDMR.Up <- data.PT %>% filter(DE=="UP",MHB=="T")  %>% pull(SYMBOL)
            LN.MHB.NonDMR.Up <-  data.LN %>% filter(DE=="UP",MHB=="T")  %>% pull(SYMBOL)

            geneset <-  intersect(PT.MHB.NonDMR.Up,LN.MHB.NonDMR.Up)
            test <- list("geneset"=geneset)
# saving 
write_gmt <- function(x,filename) {
    
    output<-file(filename,open = "wt")
    name <- names(x)
    lapply(name,function(y){
     writeLines(paste(c(y,"NA",x[[y]]),collapse = "\t"),con = output)
    
     })
    close(output)
}

write_gmt(test,filename="PT-LN.MHB.NonDMR.Up.gmt")
fwrite(as.data.frame(geneset),file="PT-LN.MHB.NonDMR.Up_geneset.txt",sep="\t",quote=F,row.names=F,col.names=F) 


# Step3: Geneset enrichment 
# Result From Msigdb website
msig <- data.frame(ID=c("HALLMARK_MYC_TARGETS_V1","HALLMARK_UNFOLDED_PROTEIN_RESPONSE","HALLMARK_G2M_CHECKPOINT"),
                  qvalue = c(5.21e-8,2.33e-6,5.67e-4))
# PLOT
p <- ggplot(msig,aes(x=fct_rev(fct_inorder(ID)),y= -log10(qvalue))) +
     geom_bar(stat="identity",fill="steelblue") + ylim(0,10) +
     labs(x="",y="-Log10(FDR)",title="Enrichment of hallmark pathways") +
     coord_flip() + 
     theme_bw()+
     theme(panel.grid=element_blank(),
            panel.background =element_blank(),
            plot.title = element_text(hjust=0.5),
            axis.text = element_text(color="black"))
print(p)

Figure 4J

# Step 5: show the 42 common genes are shared by PT and LN.
# RNA merging with DNA methylation matrix
RNA.DEG.PT <- PT.markers %>% filter(DEG=="Yes") %>% filter(SYMBOL %in% geneset)  %>% 
                inner_join(DMG.PT,by="SYMBOL") %>% dplyr::rename("PT.FC"="logFC","PT.delta"="delta") %>%  
                dplyr::select(SYMBOL,PT.FC,PT.delta)
RNA.DEG.LN <- LN.markers %>% filter(DEG=="Yes") %>% filter(SYMBOL %in% geneset)  %>%
                inner_join(DMG.LN,by="SYMBOL") %>% dplyr::rename("LN.FC"="logFC","LN.delta"="delta") %>% 
                dplyr::select(SYMBOL,LN.FC,LN.delta)
RNA.DEG <- RNA.DEG.PT %>% inner_join(RNA.DEG.LN,by="SYMBOL")

log2.RNA <- log2(RNA+1) %>% rownames_to_column(var="SYMBOL")
Methyl <- scMethyl %>% select(-c(1:4))
data <-  RNA.DEG %>% inner_join(log2.RNA,by="SYMBOL") %>% 
            inner_join(Methyl,by="SYMBOL")
# RNA flag
id.RNA.NC <- id.RNA %>% filter(tissue=="NC") %>% pull(ID)
id.RNA.PT <- id.RNA %>% filter(tissue=="PT") %>% pull(ID)
id.RNA.LN <- id.RNA %>% filter(tissue=="LN") %>% pull(ID)
id.RNA.tag <- c(id.RNA.NC,id.RNA.PT,id.RNA.LN)
# Methylation flag
id.Met.NC <- id %>% filter(tissue=="NC",assay=="Met") %>% pull(ID)
id.Met.PT <- id %>% filter(tissue=="PT",assay=="Met") %>% pull(ID)
id.Met.LN <- id %>% filter(tissue=="LN",assay=="Met") %>% pull(ID)
id.Met.ML <- id %>% filter(tissue=="ML",assay=="Met") %>% pull(ID)
id.Met.MP <- id %>% filter(tissue=="MP",assay=="Met") %>% pull(ID)
id.Met.tag <- c(id.Met.NC,id.Met.PT,id.Met.LN,id.Met.ML,id.Met.MP)

# In tissue-level
mhb_N_Tx <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/CRC_MHB_NC.bed.gz")  %>% toGRanges() %>% 
            findOverlaps(scMethyl.GR,ignore.strand=T,type= "any")
mhb_N_genes <- mcols(scMethyl.GR[subjectHits(mhb_N_Tx)]) %>% as.data.frame() %>%  pull(SYMBOL) %>% unique()
## RNA
RNA.mean <- lapply(c("PT","LN","NC"),function(x){
     ID <-  id.RNA %>% filter(tissue==x) %>% pull(ID)
     RNA %>% dplyr::select(contains(ID)) %>% rowMeans(na.rm=T) %>% as.data.frame() %>% rename_with(~x)
})
RNA.data <- do.call(cbind,RNA.mean) %>% rownames_to_column(var="SYMBOL") %>% 
            filter(SYMBOL %in% geneset) %>% column_to_rownames(var="SYMBOL")
## Methylation
Met.mean <- lapply(c("PT","LN","ML","MP","NC"),function(x){
     ID <-  id %>% filter(tissue==x,assay=="Met") %>% pull(ID)
     scMethyl %>% select(-c(1:4))  %>% dplyr::select(contains(ID)) %>% rowMeans(na.rm=T) %>% as.data.frame() %>% rename_with(~x)
})
Met.data <- do.call(cbind,Met.mean) %>% as.data.frame() %>% cbind(scMethyl[,5]) %>% 
            filter(SYMBOL %in% geneset) %>% arrange(SYMBOL) %>% column_to_rownames(var="SYMBOL")
# PLOT 1 
## Methylation
column_ha_1 = HeatmapAnnotation(Assay = rep("WGBS",5) , Type = names(Met.data))
row_ha_1 = rowAnnotation(MHB_PT = rep(1,42),MHB_LN = rep(1,42), MHB_NC= sapply(rownames(Met.data),
                                                                            function(x) {ifelse(x %in% mhb_N_genes,1,0)}))
colfun11 <- colorRamp2(breaks = c(0, 0.5, 1), colors = c('darkblue', 'white', 'firebrick'))
## RNA
column_ha_2 = HeatmapAnnotation(Assay = rep("RNA",3) , Type = names(RNA.data) )
colfun21 <- colorRamp2(breaks = c(-4, 0, 4), colors = c('darkblue', 'white', 'firebrick'))
data.RNA.scale = t(scale(t(as.matrix(RNA.data))))

p1 <- Heatmap(as.matrix(Met.data),name="Methylation",
                col=colfun11,cluster_columns= F,
                cluster_rows=F, left_annotation = row_ha_1,
                top_annotation = column_ha_1)
p2 <- Heatmap(as.matrix(data.RNA.scale ),name="TPM(scaled)",
                col=colfun21,cluster_columns= F,
                 cluster_rows=F,
                top_annotation = column_ha_2)
p<- p1+p2
print(p)

Figure 4K

# TCGA COAD validation MYC target & G2M  genes
# load DNAme data
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/TCGA_Promoter_Methylation_UD1000.RData")
MM_COAD <- cbind(mcols(Mregion)["name"],rList$COAD)

# load expression data
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig5/TCGA_Exp/Firehose_Expression.RData")
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig5/TCGA_Exp/Firehose_SampleSheet.RData")
Exp_COAD <- Firehose_Expression$COAD
# clinical data 
COAD_clinical <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig5/TCGA_Exp/COAD.merged_only_clinical_clin_format.txt.gz") %>% 
                filter(V1 %in% c("patient.bcr_patient_barcode","patient.stage_event.pathologic_stage","patient.vital_status")) %>%
                column_to_rownames(var="V1") %>% t() %>% as_tibble() %>% 
                rename_with(~c("ID","Stage","status")) %>% 
                mutate(ID = str_to_upper(ID),Stage=str_replace_all(Stage,"stage ","") %>% str_to_upper())
# Combine DNAme & Exp & clinical data
match.id.COAD <- intersect(names(MM_COAD)[-1] %>% str_sub(1,16) ,colnames(Exp_COAD) %>% str_sub(1,16))
match.id.COAD <- data.frame("Tag" =match.id.COAD) %>% mutate(ID = str_sub(Tag,1,12)) %>% 
                left_join(COAD_clinical,by="ID") %>% 
                mutate(Stage=ifelse(str_detect(Stage,"III"),"III",
                              ifelse(str_detect(Stage,"II"),"II",
                              ifelse(str_detect(Stage,"IV"),"IV",
                              ifelse(str_detect(Stage,"I"),"I",NA)))),
                        Stage = ifelse(str_detect(Tag,"-11"),"Normal",Stage)) %>%
                        na.omit()
# signature selection
signature <- c("CBX3","DDX21","KARS","KIF5B","NOLC1","PSMA7","PSMB3","RANBP1","SLC12A2","TOP1")
signature.me <- lapply(c("I","II","III","IV","Normal"),function(x){
            tag <- match.id.COAD %>% filter(Stage==x) %>% pull(Tag) %>% str_replace_all("-",".")
            MM_COAD %>% as_tibble() %>% dplyr::select("name",contains(tag)) %>% 
            filter(name %in% signature) %>% arrange(name) %>% column_to_rownames(var="name") %>% rowMeans() %>% 
            as_tibble() %>% mutate(Tag = x,gene=signature)
})
signature.me <- do.call(rbind,signature.me) %>%as.data.frame() %>% pivot_wider(values_from="value",names_from="Tag") %>% 
            column_to_rownames(var="gene")
signature.exp <- lapply(c("I","II","III","IV","Normal"),function(x){
           tag <- match.id.COAD %>% filter(Stage==x) %>% pull(Tag) 
           ## to TPM 
           temp <- Exp_COAD %>% as.data.frame() %>% dplyr::select(contains(tag)) %>% rownames_to_column(var="name") %>% 
                    filter(name %in% signature) %>% arrange(name) %>% column_to_rownames(var="name") 
           tpm  <- 2^(temp)-1 
           tpm  %>%  rowMeans() %>%  as_tibble() %>% mutate(Tag = x,gene=signature)
})
signature.exp <- do.call(rbind,signature.exp) %>%as.data.frame() %>% pivot_wider(values_from="value",names_from="Tag") %>% 
                 column_to_rownames(var="gene")
# PLOT 2
## Methylation
column_ha_1 = HeatmapAnnotation(Type = paste0("Stage ",names(signature.me)))
colfun11 <- colorRamp2(breaks = c(0, 0.5, 1), colors = c('darkblue', 'white', 'firebrick'))
## RNA
column_ha_2 = HeatmapAnnotation(Type = paste0("Stage ",names(signature.exp) ))
colfun21 <- colorRamp2(breaks = c(-2, 0, 2), colors = c('darkblue', 'white', 'firebrick'))
data.RNA.scale = t(scale(t(as.matrix(signature.exp))))

p1 <- Heatmap(as.matrix(signature.me),name="Methylation",
                col=colfun11,cluster_columns= F,
                cluster_rows=F, 
                top_annotation = column_ha_1)
p2 <- Heatmap(as.matrix(data.RNA.scale ),name="TPM(scaled)",
                col=colfun21,cluster_columns= F,
                 cluster_rows=F,
                top_annotation = column_ha_2)
p<- p1+p2
print(p)