loading required packages:
The single cell DNA Methylation haplotype blocks (scMHBs) were identified by mHapSC.
# cell barcode
bcFile="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/GSE97693_scMethyl_tumor_id.txt"
# 1000 CpGs interval
bed="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/annotation/hg19_1000CpG.bed"
java -jar mHapSc-1.0-jar-with-dependencies.jar MHBDiscovery -mHapPath $mhap -bedFile $bed \
-cpgPath hg19_CpG.gz -bcFile $bcFile -window 5 -r2 0.5 -pvalue 0.05 \
-outputDir ./ -tag CRC_Tumor_MHB scMHB_Tumor="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/CRC_MHB_PT.bed.gz"
background="/sibcb1/bioinformatics/dataupload/cancermhaps/Fig1/MHB/MHB_11_cancers_clusters.bed.gz"
# do LOLA enrichment
mhb=fread(scMHB_Tumor) %>% toGRanges()
# Universe_Sets Background
universe_set=fread(background) %>% toGRanges()
# regionDB
states=loadRegionDB("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/scLOLA/")
# Runing LOLA
locResults = runLOLA(mhb, universe_set, states, cores=10)
CI <- locResults %>% select(support,b,c,d) %>% apply(1,function(x){
CI.low <- matrix(x,ncol=2,nrow=2) %>% fisher.test() %>% broom::tidy() %>% pull(3)
CI.high <- matrix(x,ncol=2,nrow=2) %>% fisher.test() %>% broom::tidy() %>% pull(4)
return(c(CI.low,CI.high))}) %>%
t() %>% as.data.frame() %>%
rename_with(~c("CI.low","CI.high"))
locResults <- locResults %>% cbind(CI) %>% mutate(OVR=100*round(support/size,4),
logFDR=ifelse(qValue==0,300,-log10(qValue)),
filename=fct_rev(fct_inorder(str_split_i(filename,"[_.]",i=2))))
# PLOT
p<- ggplot(locResults,aes(x=oddsRatio,y=filename,col=logFDR)) +
geom_point(aes(size=OVR,col=logFDR),shape=18) +
geom_errorbarh(aes(xmax = CI.high, xmin = CI.low), height = 0,size=1)+
scale_color_gradient2(midpoint=0.5*max(locResults$logFDR),space ="Lab",
low="steelblue",mid="orange",high="firebrick") +
geom_vline(aes(xintercept = 1),color="gray",linetype="dashed", size = 0.5)+
scale_x_continuous(limits=c(0.1, 3.2), breaks = seq(0, 3.5, 0.5))+
xlab('Odds Ratio') + ylab(" ") +
theme_classic() +
theme(legend.position = "top",
axis.text = element_text(size=15,color="black"),
axis.title.x = element_text(size=20,color="black"),
axis.ticks.length=unit(0.2, "cm") )
print(p)
# Step 1: promoter methylation
scMethyl_Mcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_methylated.txt.gz",header=T) %>%
distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_total.txt.gz",header=T) %>%
distinct() %>% column_to_rownames(var="region")
# filter Promoter Total covered reads count <3
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),sep="[:-]") %>%
mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>%
dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
# toGR
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# sample id with Paired RNA & Methylation
id <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/annotation/GSE97693_scRNA_scMethyl_matched_info.txt.gz",header=T) %>%
select(ID,patients,lesions,assay,tissue,tag) %>%
mutate(Methyl_ID = str_split_i(ID,",",i=1),Exp_ID=str_split_i(ID,",",i=2))
# Promoters overlap with MHBs
tag=c("LN","PT","ML","MP","Tumor")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
path="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/MHB/"
MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
# overlapping
Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
a<- rep(NA,length(scMethyl.GR))
# MHB
a[unique(queryHits(Tx))]<-"T"
a[is.na(a)]<-"N"
a<- a %>% as.data.frame()
names(a)<-"MHB"
# sample GSM ID
if(x!="Tumor"){
sid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID)
}else{
sid <- id %>% pull(Methyl_ID)
}
# select
mcols(scMethyl.GR) %>% as.data.frame() %>%
select(SYMBOL,all_of(sid)) %>%
cbind(a) %>% separate_rows(SYMBOL,sep=",")
})
names(scMHB_Methyl) <-tag
# Step 2: scRNA + Methylation
load("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")
scMme <- pbapply::pblapply(tag,function(x){
# cat(x,"\n")
# methylation id
if(x!="Tumor"){
mid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID)
}else{
mid <- id %>% pull(Methyl_ID)
}
# methylation data reshape
data_methyl <- scMHB_Methyl[[x]] %>% melt() %>% drop_na() %>% nest(.by=variable) %>%
dplyr::mutate(MM = lapply(data,function(y){
y %>% mutate(type=ifelse(value>=0.8,"high",
ifelse(value<=0.2,"low","intermediate")))
}))
# show the paired methylation & expression ,one by one
SME <- pbapply::pblapply(mid,function(m) {
# mean vs median
if(1){
ExM <- function(z) mean(z, na.rm = TRUE)
}else{
ExM <- function(z) median(z, na.rm = TRUE)
}
# cat("Processing",m,"\n")
# Methylation
data <- data_methyl %>% filter(variable==m) %>% pull(MM) %>% as.data.frame() %>%
group_by(MHB,type) %>%
transmute(genes=paste0(SYMBOL, collapse=",")) %>% distinct()
# Exp
# test the RNA data with paired Exp id
eid <- id %>% filter(str_detect(Methyl_ID,m)) %>% pull(Exp_ID)
if(sum(str_detect(names(RNA.TPM),eid))==1){
RNA <- RNA.TPM
# cat("RNA_TPM ",x, " ",m, " ", eid,"\n")
flag<-"TPM"
}else{
RNA <- RNA.FPKM
# cat("RNA_FPKM ",x, " ",m, " ", eid,"\n")
flag <-"FPKM"
}
# 6 groups
# 1.high_mhb
Hm<- data %>% filter(MHB=="T" & type=="high") %>% pull(genes) %>% str_split_1(pattern=",")
HmE<- RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Hm) %>% pull(1)
HmE <- log2(ExM(HmE) +1)
# 2.high_nonmhb
Hnm<- data %>% filter(MHB=="N" & type=="high") %>% pull(genes) %>% str_split_1(pattern=",")
HnmE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Hnm) %>% pull(1)
HnmE <- log2(ExM(HnmE) +1)
# 3.intermediate_mhb
Im<- data %>% filter(MHB=="T" & type=="intermediate") %>% pull(genes) %>% str_split_1(pattern=",")
ImE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Im) %>% pull(1)
ImE <- log2(ExM(ImE) +1)
# 4.intermediate_nonmhb
Inm<- data %>% filter(MHB=="N" & type=="intermediate") %>% pull(genes) %>% str_split_1(pattern=",")
InmE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Inm) %>% pull(1)
InmE <- log2(ExM(InmE) +1)
# 5.low_mhb
Lm<- data %>% filter(MHB=="T" & type=="low") %>% pull(genes) %>% str_split_1(pattern=",")
LmE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Lm) %>% pull(1)
LmE <- log2(ExM(LmE) +1)
# 6.low_nonmhb
Lnm<- data %>% filter(MHB=="N" & type=="low") %>% pull(genes) %>% str_split_1(pattern=",")
LnmE<-RNA %>% select(contains(eid)) %>% filter(rownames(.) %in% Lnm) %>% pull(1)
LnmE <- log2(ExM(LnmE) +1)
# the Exp of methylation level in one cell
c(HmE,HnmE,ImE,InmE,LmE,LnmE,flag)
})
names(SME) <- mid
# cancer type : promoter methyltion + MHB + Expression
CME <- do.call(rbind,SME) %>% as.data.frame() %>%
dplyr::rename_with(~c("HmE","HnmE","ImE","InmE","LmE","LnmE","Ex")) %>%
mutate(ID=rownames(.))
})
names(scMme) <- tag
# sc_MMET: Methylation + MHB + Exp
GSE97693_scMMET <- scMme
# Step 3: PLOT
n=0
for ( x in tag ){
# cat(x,"\n")
n = n + 1
if (!is.null(GSE97693_scMMET[[x]])){ ###check is null
data <- GSE97693_scMMET[[x]] %>% as.data.frame() %>%
gather("Type","Exp",-c(ID,Ex)) %>% mutate(Me=str_sub(Type,0,1),MHB=toupper(str_sub(Type,2,2)),
Exp=as.numeric(Exp),Ex=fct_inorder(Ex))
## plot
compare_means(Exp ~ MHB, data = data, group.by = c("Me","Ex"))
assign(paste0("p",n),ggboxplot(data, x = "Me", y = "Exp", color = "MHB",
palette ="nejm",width=0.8,
bxp.errorbar = T,
add = "jitter",
add.params = list(size = 0.05),
facet.by = "Ex", short.panel.labs = FALSE
) +
xlab("Methylation-level")+
ylab("Expression")+
ggtitle(ifelse(x=="MHB_T","Tumor",x))+
stat_compare_means(aes(group = MHB),size=2) +
theme_bw() +
theme(panel.grid.major=element_line(colour=NA),
panel.background = element_rect(fill = "transparent",colour = NA),
plot.background = element_rect(fill = "transparent",colour = NA),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1),
plot.title = element_text(hjust = 0.5, size = 18),
panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))
)
# cat(n,"\n")
}
}
# paste to one figure
p <- wrap_plots(p1, p2, p3, p4,p5)
print(p)
scMethyl_Mcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_methylated.txt.gz",header=T) %>%
distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_total.txt.gz",header=T) %>%
distinct() %>% column_to_rownames(var="region")
# filter promoter total reads count <3
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),sep="[:-]") %>%
mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>%
dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
# toGR
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# sample id with Paired RNA & Methylation
id <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/annotation/GSE97693_scRNA_scMethyl_matched_info.txt.gz",header=T) %>%
select(ID,patients,lesions,assay,tissue,tag) %>%
mutate(Methyl_ID = str_split_i(ID,",",i=1),Exp_ID=str_split_i(ID,",",i=2))
# promoter overlaps with MHB
tag=c("LN","PT","ML","MP","Tumor")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
path="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/MHB/"
MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
# overlapping
Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
a<- rep(NA,length(scMethyl.GR))
# MHB
a[unique(queryHits(Tx))]<-"T"
a[is.na(a)]<-"N"
a<- a %>% as.data.frame()
names(a)<-"MHB"
# sample GSM ID
if(x!="Tumor"){
sid <- id %>% dplyr::filter(str_detect(tissue,x)) %>% pull(Methyl_ID)
}else{
sid <- id %>% pull(Methyl_ID)
}
# select
mcols(scMethyl.GR) %>% as.data.frame() %>%
select(SYMBOL,all_of(sid)) %>%
cbind(a)
})
names(scMHB_Methyl) <-tag
# Step 2: scRNA + Methylation
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")
scMme <- pbapply::pblapply(tag,function(x) {
# cat(x,"\n")
# methylation id
if(x!="Tumor"){
mid <- id %>% filter(str_detect(tissue,x)) %>% pull(Methyl_ID)
}else{
mid <- id %>% pull(Methyl_ID)
}
# methylation data reshape
data_methyl <- scMHB_Methyl[[x]] %>% melt() %>% drop_na() %>% nest(.by=variable) %>%
dplyr::mutate(MM = lapply(data,function(y){
y %>% mutate(type=ifelse(value>=0.8,"high",
ifelse(value<=0.2,"low","Intermediate")))
}))
# MHB Methylation
scMethyl_MHB_Mcounts <- fread(paste0("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/CRC_MHB_",x,"_methylated.txt.gz"),header=T) %>%
distinct() %>% column_to_rownames(var="region")
scMethyl_MHB_Tcounts <- fread(paste0("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/CRC_MHB_",x,"_total.txt.gz"),header=T) %>%
distinct() %>%column_to_rownames(var="region")
# filter MHB with Total reads count <3
scMethyl_MHB_Tcounts[scMethyl_MHB_Tcounts<3]<- NA
scMethyl_MHB <- scMethyl_MHB_Mcounts/scMethyl_MHB_Tcounts
scMethyl_MHB.GR <- scMethyl_MHB %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),sep="[:-]") %>%
mutate(start=as.numeric(start)-1,end=as.numeric(end)) %>%
toGRanges() %>% great(cores=10,"MSigDB:H", "GREAT:hg19") %>%
getRegionGeneAssociations() %>%
as.data.frame() %>% mutate(annotated_genes=lapply(annotated_genes,function(x){
x %>% paste(collapse=",")
}) %>% do.call(rbind,.) %>% as.character(),
dist_to_TSS = lapply(dist_to_TSS,function(x){
x %>% paste(collapse=",")
}) %>% do.call(rbind,.) %>% as.character()) %>%
separate_rows(annotated_genes,dist_to_TSS,sep=",",convert=TRUE) %>%
filter(abs(dist_to_TSS)< 1000) %>%
select(contains("GSM"),annotated_genes) %>%
group_by(annotated_genes) %>% summarise_each(funs(mean))
# show the paired methylation & expression one by one
SME <- pbapply::pblapply(mid,function(m) {
# mean vs median
if(1){
ExM <- function(z) mean(z, na.rm = TRUE)
}else{
ExM <- function(z) median(z, na.rm = TRUE)
}
# cat("Processing",m,"\n")
# Methylation
data_Me <- data_methyl %>% filter(variable==m) %>% pull(MM) %>% as.data.frame()
data_MHB <- scMethyl_MHB.GR %>% select(c(m,"annotated_genes")) %>%
rename_with(~c("value","SYMBOL")) %>% mutate(type=ifelse(value>=0.8,"high",
ifelse(value<=0.2,"low","intermediate"))) %>%
group_by(type) %>% summarise(genes=paste0(SYMBOL, collapse=",")) %>% na.omit()
# Exp
## test the RNA data with paired Exp id
eid <- id %>% filter(str_detect(Methyl_ID,m)) %>% pull(Exp_ID)
if(sum(str_detect(names(RNA.TPM),eid))==1){
RNA <- RNA.TPM
# cat("RNA_TPM ",x, " ",m, " ", eid,"\n")
flag<-"TPM"
}else{
RNA <- RNA.FPKM
# cat("RNA_FPKM ",x, " ",m, " ", eid,"\n")
flag <-"FPKM"
}
# two groups
# 1. promoter intermediate methylation
# MHB high methylation
MHB_h <- data_MHB %>% filter(type=="high") %>% pull(genes) %>% str_split_1(pattern=",")
# MHB low methylation
MHB_l <- data_MHB %>% filter(type=="low") %>% pull(genes) %>% str_split_1(pattern=",")
data_Me1 <- data_Me %>% mutate(MHB_Methyl = ifelse(SYMBOL %in% MHB_h ,"High",
ifelse(SYMBOL %in% MHB_l,"Low","Other"))) %>%
filter(MHB_Methyl!="Other",type!="high")
IE <- RNA %>% select(contains(eid)) %>% rownames_to_column(var="SYMBOL") %>%
mutate(EID = eid,flag=flag) %>% inner_join(data_Me1,by="SYMBOL") %>%
dplyr::rename_with(~c("SYMBOL","Exp","EID","flag","MHB","TSS_MM","TSS_type","MHB_type")) %>%
group_by(TSS_type,EID,MHB_type,flag) %>%
summarise(Exp=ExM(Exp),ncount=length(SYMBOL))
})
names(SME) <- mid
# cancer type : promoter methyltion + MHB + Expression
CME <- do.call(rbind,SME) %>% as.data.frame() %>%
dplyr::rename_with(~c("TSS_type","EID","MHB","flag","Exp","ncount"))
})
names(scMme) <- tag
# sc_MMET: Tumor Methylation + MHB + Exp
GSE97693_scME_MHBGene <- scMme
# Step3: PLOT
n=0
for ( x in tag ){
# cat(x,"\n")
n = n + 1
if (!is.null(GSE97693_scME_MHBGene[[x]])){ ###check is null
data <- GSE97693_scME_MHBGene[[x]] %>% as.data.frame() %>% mutate(Exp=log2(Exp+1))
## plot
compare_means(Exp ~ MHB, data = data, group.by = c("TSS_type","flag"))
assign(paste0("p",n),ggboxplot(data, x = "TSS_type", y = "Exp",color="MHB",
palette ="nejm",width=0.8,
bxp.errorbar = T,
add = "jitter",
add.params = list(size = 0.1),
facet.by = c("flag"), short.panel.labs = FALSE
) +
xlab("Promoter Methylation-level")+
ylab("Expression")+
ggtitle(ifelse(x=="Tumor","Tumor",x))+
stat_compare_means(aes(group = MHB),size=2) +
theme_bw() +
theme(panel.grid.major=element_line(colour=NA),
panel.background = element_rect(fill = "transparent",colour = NA),
plot.background = element_rect(fill = "transparent",colour = NA),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1),
plot.title = element_text(hjust = 0.5, size = 18),
panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))
)
# cat(n,"\n")
}
}
# paste to one figure
p <- wrap_plots(p1, p2, p3, p4,p5)
print(p)
# Step1: Single-cell Expression
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Exp/GSE97693_scRNA_TPM_FPKM.RData")
# select the single cell with TPM data. (only with PT, LN, and Normal cell)
if(0){
RNA <- RNA.FPKM
# cat("RNA FPKM ","\n")
}else{
RNA <- RNA.TPM
# cat("RNA TPM ","\n")
}
names(RNA) <- str_split_i(names(RNA),"_",i=1)
# load meta data
id <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/annotation/GSE97693_Anno.txt.gz",header=T) %>%
select(ID,patients,lesions,assay,tissue,tag) %>% filter(tissue!="HeLa")
id.RNA <- id %>% filter(ID %in% names(RNA))
# Find DEG Markers
findDEG <- function(x,Idents) {
# annotation
treat <- id.RNA %>% dplyr::filter(str_detect(tissue,Idents)) %>% pull(ID)
nc <- id.RNA %>% dplyr::filter(tissue=="NC") %>% pull(ID)
# wilcoxon test
z <- function(z) {
d1 <- z[treat]%>% as.character() %>% as.numeric()
d2 <- z[nc] %>% as.character() %>% as.numeric()
p <- wilcox.test(d1,d2)$p.value
group1 <- mean(d1)
group2 <- mean(d2)
logFC <- log2(group1+1) - log2(group2+1)
c(group1,group2,logFC,p)
}
res <- t(apply(x,1,z)) %>% as.data.frame() %>% rename_with(~c("Treat","NC","logFC","p_value")) %>%
mutate(p_val_adj=p.adjust(p_value,method="fdr",length(p_value)),
DEG = ifelse(abs(logFC)>=1 & p_val_adj<=0.05,"Yes","No"))
return(res)
}
PT.markers <- findDEG(RNA,Idents="PT") %>% rownames_to_column(var="SYMBOL")
LN.markers <- findDEG(RNA,Idents="LN") %>% rownames_to_column(var="SYMBOL")
# Step2: promoter DNA methylation
scMethyl_Mcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_methylated.txt.gz",header=T) %>%
distinct() %>% column_to_rownames(var="region")
scMethyl_Tcounts <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/promoter_total.txt.gz",header=T) %>%
distinct() %>% column_to_rownames(var="region")
# filter promoters with Total reads count <3
scMethyl_Tcounts[scMethyl_Tcounts<3]<- NA
scMethyl <- scMethyl_Mcounts/scMethyl_Tcounts
scMethyl <- scMethyl %>% rownames_to_column(var="region") %>% separate(region,into=c("chr","start","end"),sep="[:-]") %>%
mutate(start=as.numeric(start)-1,end=as.numeric(end))
# promoter anno
Promoter <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig3/ESCC/Methy/GREATv4.genes.promoter_UD1000.hg19.bed.gz") %>%
dplyr::rename_with(~c("chr","start","end","strand","SYMBOL"))
scMethyl <- Promoter %>% inner_join(scMethyl,by=c("chr","start","end"))
scMethyl.GR <- makeGRangesFromDataFrame(scMethyl,keep.extra=T)
# promoter overlap with MHB
tag=c("PT","LN")
scMHB_Methyl <- pbapply::pblapply(tag,function(x){
path="/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/MHB/"
MHB<-fread(paste0(path,"CRC_MHB_",x,".bed.gz")) %>% toGRanges()
# overlapping
Tx <- findOverlaps(scMethyl.GR,MHB,ignore.strand=T)
a<- rep(NA,length(scMethyl.GR))
# MHB
a[unique(queryHits(Tx))]<-"T"
a[is.na(a)]<-"N"
a<- a %>% as.data.frame()
names(a)<-"MHB"
# sample GSM ID paired data
sid <- id %>% dplyr::filter(str_detect(tissue,x),assay=="Met") %>% pull(ID)
nc <- id %>% dplyr::filter(tissue=="NC",assay=="Met") %>% pull(ID)
ids <- c(sid,nc)
# select
mcols(scMethyl.GR) %>% as.data.frame() %>%
select(SYMBOL,all_of(ids)) %>%
cbind(a)
})
names(scMHB_Methyl) <-tag
# find Promoter DMR markers
# Remove NA
sum_of_na <- function(x){
sum(is.na(x))
}
target_genes <- function(x) {
Mm <- apply(x,1,sum_of_na) > 0.3*ncol(x) # promoter:0.3, MHB:0.9
genes <- x$SYMBOL[Mm!="TRUE"]
return(genes)
}
# DMG function
findDMGenes <- function(dataset,Idents){
# annotation
treat <- id %>% dplyr::filter(str_detect(tissue,Idents),assay=="Met") %>% pull(ID)
nc <- id %>% dplyr::filter(tissue=="NC",assay=="Met") %>% pull(ID)
# dataset 1 & dataset 2 merging
dataset1 <- dataset[[Idents]] %>% dplyr::select(SYMBOL,MHB,all_of(treat)) %>% filter(SYMBOL %in% target_genes(.))
dataset2 <- dataset[[Idents]] %>% dplyr::select(SYMBOL,all_of(nc)) %>% filter(SYMBOL %in% target_genes(.))
data <- inner_join(dataset1,dataset2,by="SYMBOL")
# find DMG
dmg <- function(x){
d1 <- x[treat]%>% as.character() %>% as.numeric()
d2 <- x[nc] %>% as.character() %>% as.numeric()
p <- wilcox.test(d1,d2)$p.value
delta <- mean(d1,na.rm=T) - mean(d2,na.rm=T)
res <- c(delta,p)
return(res)
}
res <- apply(data,1,dmg) %>% t() %>% as.data.frame() %>%
rename_with(~c("delta","P.val")) %>% mutate(fdr=p.adjust(P.val,n=length(P.val))) %>%
cbind(data) %>% mutate(DMG = ifelse(abs(delta)>0.1 & fdr <= 0.05, "Yes","No")) %>%
dplyr::select(SYMBOL,delta,P.val,fdr,DMG,MHB)
# Return
return(res)
}
DMG.PT<- findDMGenes(scMHB_Methyl,Idents="PT")
DMG.LN<- findDMGenes(scMHB_Methyl,Idents="LN")
# Identify DEGs + NonDMR + MHB Genes
# PT
NonDMG.PT <- DMG.PT %>% filter(DMG=="No") %>% select(SYMBOL,MHB)
DEG.PT <- PT.markers %>% filter(DEG=="Yes") %>% mutate(DE = ifelse(logFC >= 1,"UP","DN")) %>% select(SYMBOL,DE)
data.PT<- inner_join(DEG.PT,NonDMG.PT,by="SYMBOL")
# LN
NonDMG.LN <- DMG.LN %>% filter(DMG=="No") %>% select(SYMBOL,MHB)
DEG.LN <- LN.markers %>% filter(DEG=="Yes") %>% mutate(DE = ifelse(logFC >= 1,"UP","DN")) %>% select(SYMBOL,DE)
data.LN <- inner_join(DEG.LN,NonDMG.LN,by="SYMBOL")
PT.MHB.NonDMR.Up <- data.PT %>% filter(DE=="UP",MHB=="T") %>% pull(SYMBOL)
LN.MHB.NonDMR.Up <- data.LN %>% filter(DE=="UP",MHB=="T") %>% pull(SYMBOL)
geneset <- intersect(PT.MHB.NonDMR.Up,LN.MHB.NonDMR.Up)
test <- list("geneset"=geneset)
# saving
write_gmt <- function(x,filename) {
output<-file(filename,open = "wt")
name <- names(x)
lapply(name,function(y){
writeLines(paste(c(y,"NA",x[[y]]),collapse = "\t"),con = output)
})
close(output)
}
write_gmt(test,filename="PT-LN.MHB.NonDMR.Up.gmt")
fwrite(as.data.frame(geneset),file="PT-LN.MHB.NonDMR.Up_geneset.txt",sep="\t",quote=F,row.names=F,col.names=F)
# Step3: Geneset enrichment
# Result From Msigdb website
msig <- data.frame(ID=c("HALLMARK_MYC_TARGETS_V1","HALLMARK_UNFOLDED_PROTEIN_RESPONSE","HALLMARK_G2M_CHECKPOINT"),
qvalue = c(5.21e-8,2.33e-6,5.67e-4))
# PLOT
p <- ggplot(msig,aes(x=fct_rev(fct_inorder(ID)),y= -log10(qvalue))) +
geom_bar(stat="identity",fill="steelblue") + ylim(0,10) +
labs(x="",y="-Log10(FDR)",title="Enrichment of hallmark pathways") +
coord_flip() +
theme_bw()+
theme(panel.grid=element_blank(),
panel.background =element_blank(),
plot.title = element_text(hjust=0.5),
axis.text = element_text(color="black"))
print(p)
# Step 5: show the 42 common genes are shared by PT and LN.
# RNA merging with DNA methylation matrix
RNA.DEG.PT <- PT.markers %>% filter(DEG=="Yes") %>% filter(SYMBOL %in% geneset) %>%
inner_join(DMG.PT,by="SYMBOL") %>% dplyr::rename("PT.FC"="logFC","PT.delta"="delta") %>%
dplyr::select(SYMBOL,PT.FC,PT.delta)
RNA.DEG.LN <- LN.markers %>% filter(DEG=="Yes") %>% filter(SYMBOL %in% geneset) %>%
inner_join(DMG.LN,by="SYMBOL") %>% dplyr::rename("LN.FC"="logFC","LN.delta"="delta") %>%
dplyr::select(SYMBOL,LN.FC,LN.delta)
RNA.DEG <- RNA.DEG.PT %>% inner_join(RNA.DEG.LN,by="SYMBOL")
log2.RNA <- log2(RNA+1) %>% rownames_to_column(var="SYMBOL")
Methyl <- scMethyl %>% select(-c(1:4))
data <- RNA.DEG %>% inner_join(log2.RNA,by="SYMBOL") %>%
inner_join(Methyl,by="SYMBOL")
# RNA flag
id.RNA.NC <- id.RNA %>% filter(tissue=="NC") %>% pull(ID)
id.RNA.PT <- id.RNA %>% filter(tissue=="PT") %>% pull(ID)
id.RNA.LN <- id.RNA %>% filter(tissue=="LN") %>% pull(ID)
id.RNA.tag <- c(id.RNA.NC,id.RNA.PT,id.RNA.LN)
# Methylation flag
id.Met.NC <- id %>% filter(tissue=="NC",assay=="Met") %>% pull(ID)
id.Met.PT <- id %>% filter(tissue=="PT",assay=="Met") %>% pull(ID)
id.Met.LN <- id %>% filter(tissue=="LN",assay=="Met") %>% pull(ID)
id.Met.ML <- id %>% filter(tissue=="ML",assay=="Met") %>% pull(ID)
id.Met.MP <- id %>% filter(tissue=="MP",assay=="Met") %>% pull(ID)
id.Met.tag <- c(id.Met.NC,id.Met.PT,id.Met.LN,id.Met.ML,id.Met.MP)
# In tissue-level
mhb_N_Tx <- fread("/sibcb1/bioinformatics/dataupload/cancermhaps/Fig4/MHB/CRC_MHB_NC.bed.gz") %>% toGRanges() %>%
findOverlaps(scMethyl.GR,ignore.strand=T,type= "any")
mhb_N_genes <- mcols(scMethyl.GR[subjectHits(mhb_N_Tx)]) %>% as.data.frame() %>% pull(SYMBOL) %>% unique()
## RNA
RNA.mean <- lapply(c("PT","LN","NC"),function(x){
ID <- id.RNA %>% filter(tissue==x) %>% pull(ID)
RNA %>% dplyr::select(contains(ID)) %>% rowMeans(na.rm=T) %>% as.data.frame() %>% rename_with(~x)
})
RNA.data <- do.call(cbind,RNA.mean) %>% rownames_to_column(var="SYMBOL") %>%
filter(SYMBOL %in% geneset) %>% column_to_rownames(var="SYMBOL")
## Methylation
Met.mean <- lapply(c("PT","LN","ML","MP","NC"),function(x){
ID <- id %>% filter(tissue==x,assay=="Met") %>% pull(ID)
scMethyl %>% select(-c(1:4)) %>% dplyr::select(contains(ID)) %>% rowMeans(na.rm=T) %>% as.data.frame() %>% rename_with(~x)
})
Met.data <- do.call(cbind,Met.mean) %>% as.data.frame() %>% cbind(scMethyl[,5]) %>%
filter(SYMBOL %in% geneset) %>% arrange(SYMBOL) %>% column_to_rownames(var="SYMBOL")
# PLOT 1
## Methylation
column_ha_1 = HeatmapAnnotation(Assay = rep("WGBS",5) , Type = names(Met.data))
row_ha_1 = rowAnnotation(MHB_PT = rep(1,42),MHB_LN = rep(1,42), MHB_NC= sapply(rownames(Met.data),
function(x) {ifelse(x %in% mhb_N_genes,1,0)}))
colfun11 <- colorRamp2(breaks = c(0, 0.5, 1), colors = c('darkblue', 'white', 'firebrick'))
## RNA
column_ha_2 = HeatmapAnnotation(Assay = rep("RNA",3) , Type = names(RNA.data) )
colfun21 <- colorRamp2(breaks = c(-4, 0, 4), colors = c('darkblue', 'white', 'firebrick'))
data.RNA.scale = t(scale(t(as.matrix(RNA.data))))
p1 <- Heatmap(as.matrix(Met.data),name="Methylation",
col=colfun11,cluster_columns= F,
cluster_rows=F, left_annotation = row_ha_1,
top_annotation = column_ha_1)
p2 <- Heatmap(as.matrix(data.RNA.scale ),name="TPM(scaled)",
col=colfun21,cluster_columns= F,
cluster_rows=F,
top_annotation = column_ha_2)
p<- p1+p2
print(p)
# TCGA COAD validation MYC target & G2M genes
# load DNAme data
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig4/Methyl/TCGA_Promoter_Methylation_UD1000.RData")
MM_COAD <- cbind(mcols(Mregion)["name"],rList$COAD)
# load expression data
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig5/TCGA_Exp/Firehose_Expression.RData")
load("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig5/TCGA_Exp/Firehose_SampleSheet.RData")
Exp_COAD <- Firehose_Expression$COAD
# clinical data
COAD_clinical <- fread("/sibcb2/bioinformatics2/zhangzhiqiang/CancerMHBs/Fig5/TCGA_Exp/COAD.merged_only_clinical_clin_format.txt.gz") %>%
filter(V1 %in% c("patient.bcr_patient_barcode","patient.stage_event.pathologic_stage","patient.vital_status")) %>%
column_to_rownames(var="V1") %>% t() %>% as_tibble() %>%
rename_with(~c("ID","Stage","status")) %>%
mutate(ID = str_to_upper(ID),Stage=str_replace_all(Stage,"stage ","") %>% str_to_upper())
# Combine DNAme & Exp & clinical data
match.id.COAD <- intersect(names(MM_COAD)[-1] %>% str_sub(1,16) ,colnames(Exp_COAD) %>% str_sub(1,16))
match.id.COAD <- data.frame("Tag" =match.id.COAD) %>% mutate(ID = str_sub(Tag,1,12)) %>%
left_join(COAD_clinical,by="ID") %>%
mutate(Stage=ifelse(str_detect(Stage,"III"),"III",
ifelse(str_detect(Stage,"II"),"II",
ifelse(str_detect(Stage,"IV"),"IV",
ifelse(str_detect(Stage,"I"),"I",NA)))),
Stage = ifelse(str_detect(Tag,"-11"),"Normal",Stage)) %>%
na.omit()
# signature selection
signature <- c("CBX3","DDX21","KARS","KIF5B","NOLC1","PSMA7","PSMB3","RANBP1","SLC12A2","TOP1")
signature.me <- lapply(c("I","II","III","IV","Normal"),function(x){
tag <- match.id.COAD %>% filter(Stage==x) %>% pull(Tag) %>% str_replace_all("-",".")
MM_COAD %>% as_tibble() %>% dplyr::select("name",contains(tag)) %>%
filter(name %in% signature) %>% arrange(name) %>% column_to_rownames(var="name") %>% rowMeans() %>%
as_tibble() %>% mutate(Tag = x,gene=signature)
})
signature.me <- do.call(rbind,signature.me) %>%as.data.frame() %>% pivot_wider(values_from="value",names_from="Tag") %>%
column_to_rownames(var="gene")
signature.exp <- lapply(c("I","II","III","IV","Normal"),function(x){
tag <- match.id.COAD %>% filter(Stage==x) %>% pull(Tag)
## to TPM
temp <- Exp_COAD %>% as.data.frame() %>% dplyr::select(contains(tag)) %>% rownames_to_column(var="name") %>%
filter(name %in% signature) %>% arrange(name) %>% column_to_rownames(var="name")
tpm <- 2^(temp)-1
tpm %>% rowMeans() %>% as_tibble() %>% mutate(Tag = x,gene=signature)
})
signature.exp <- do.call(rbind,signature.exp) %>%as.data.frame() %>% pivot_wider(values_from="value",names_from="Tag") %>%
column_to_rownames(var="gene")
# PLOT 2
## Methylation
column_ha_1 = HeatmapAnnotation(Type = paste0("Stage ",names(signature.me)))
colfun11 <- colorRamp2(breaks = c(0, 0.5, 1), colors = c('darkblue', 'white', 'firebrick'))
## RNA
column_ha_2 = HeatmapAnnotation(Type = paste0("Stage ",names(signature.exp) ))
colfun21 <- colorRamp2(breaks = c(-2, 0, 2), colors = c('darkblue', 'white', 'firebrick'))
data.RNA.scale = t(scale(t(as.matrix(signature.exp))))
p1 <- Heatmap(as.matrix(signature.me),name="Methylation",
col=colfun11,cluster_columns= F,
cluster_rows=F,
top_annotation = column_ha_1)
p2 <- Heatmap(as.matrix(data.RNA.scale ),name="TPM(scaled)",
col=colfun21,cluster_columns= F,
cluster_rows=F,
top_annotation = column_ha_2)
p<- p1+p2
print(p)