Load required packages and functions
# Packages
library(readr)
library(stringr)
library(ggplot2)
library(ggpubr)
library(ggridges)
library(dplyr)
library(forcats)
library(minpack.lm)
library(mefa4)
library(reshape)
library(gridExtra)
library(plyr)
library(DESeq2)
library(gplots)
#Functions
LoadingCountFile <- function(file=""){
Count <- read.table(file, header=T)
return(Count)
}
Allelic.Ratio <- function(g1_counts,g2_counts){
Xa <- as.numeric(g1_counts)
Xi <- as.numeric(g2_counts)
AllelicRatio <- ( Xi / (Xi + Xa))
return(AllelicRatio)
}
AR.table <- function(counts_table){
AllelicRatio_table <- counts_table[1:6]
for (j in seq(8,ncol(counts_table),3)){
AllelicRatio_table <- cbind(AllelicRatio_table,
Allelic.Ratio(as.matrix(counts_table[j]),
as.matrix(counts_table[j+1]) ) )
}
return(AllelicRatio_table)
}
give.n <- function(x){
return(c(y = 0, label = length(x)))}
chrRNA_data_path <- "/path/to/folder/with/chrRNA/data/"
chrRNA_counts_path <- "/path/to/folder/with/chrRNA/data/chrRNA_counts/"
#Load iXist-Chrx-Dom chrRNA counts matrices
chrRNA_A11B2_ES_d0r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"WT_NoDox.sortn.count"))
chrRNA_A11B2_ES_d0r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"WTB2_NoDox_Rep1.sortn.count"))
chrRNA_A11B2_ES_d1r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"WT_DoxA.sortn.count"))
chrRNA_A11B2_ES_d1r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"WTB2_24hrDox_Rep1.sortn.count"))
chrRNA_A11B2_d2r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"1_A11B2_NPC_d2_mm.sortn.count"))
chrRNA_A11B2_d2r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"1_A11B2_NPCd2_mm.sortn.count"))
chrRNA_A11B2_d3r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"2_A11B2_NPC_d3_mm.sortn.count"))
chrRNA_A11B2_d3r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"2_A11B2_NPCd3_mm.sortn.count"))
chrRNA_A11B2_d5r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"4_A11B2_NPC_d5_mm.sortn.count"))
chrRNA_A11B2_d5r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"3_A11B2_NPCd5_mm.sortn.count"))
chrRNA_A11B2_d7r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"6_A11B2_NPC_d7_mm.sortn.count"))
chrRNA_A11B2_d7r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"4_A11B2_NPCd7_mm.sortn.count"))
chrRNA_A11B2_d15 <- LoadingCountFile(paste0(chrRNA_counts_path,"7_A11B2_NPC_d15_mm.sortn.count"))
chrRNA_A11B2_d21 <- LoadingCountFile(paste0(chrRNA_counts_path,"6_A11B2_NPCd21_mm.sortn.count"))
chrRNA_A11B2_counts_table <- cbind( chrRNA_A11B2_ES_d0r1[1:9],chrRNA_A11B2_ES_d0r2[7:9],
chrRNA_A11B2_ES_d1r1[7:9],chrRNA_A11B2_ES_d1r2[7:9],
chrRNA_A11B2_d2r1[7:9],chrRNA_A11B2_d2r2[7:9],
chrRNA_A11B2_d3r1[7:9],chrRNA_A11B2_d3r2[7:9],
chrRNA_A11B2_d5r1[7:9],chrRNA_A11B2_d5r2[7:9],
chrRNA_A11B2_d7r1[7:9],chrRNA_A11B2_d7r2[7:9],
chrRNA_A11B2_d15[7:9],chrRNA_A11B2_d21[7:9])
Filter the counts matrices by genes amenable to allelic silencing analysis and make allelic ratio tables
#Load iXist-ChrX-Dom_GeneInfo from ChrRNA analysis folder
GeneInfo_A11B2 <- read_delim(paste0(chrRNA_data_path,"GeneInfo_A11B2.txt"), "\t", escape_double = FALSE, col_names = TRUE, trim_ws = TRUE)
colnames(GeneInfo_A11B2) #check column order and re-name as appropriate
## [1] "Geneid" "Halftime" "Starts_numeric"
## [4] "Dist_Xist" "InitialTPM" "Distance_Xist"
## [7] "Expression_group" "SilencingRate" "SmcHD1_dependence"
colnames(GeneInfo_A11B2) <- c("Geneid","Halftime","Starts_numeric","Dist_Xist","InitialTPM","Distance_Xist","Expression_group","SilencingRate","SmcHD1_dependence")
#Filter counts matrices by genes that passed allelic analysis in our previous study
chrRNA_A11B2_counts_table_filt <- chrRNA_A11B2_counts_table[which(chrRNA_A11B2_counts_table$Geneid %in% GeneInfo_A11B2$Geneid),]
#Make allelic ratio table from counts table
chrRNA_A11B2_AR_table <- AR.table(chrRNA_A11B2_counts_table_filt)
Prepare data for ribbon plots
#re-name columns of allelic ratio table with sample names
colnames(chrRNA_A11B2_AR_table)[7:ncol(chrRNA_A11B2_AR_table)] <-c("chrRNA_A11B2_ES_d0r1","chrRNA_A11B2_ES_d0r2",
"chrRNA_A11B2_ES_d1r1","chrRNA_A11B2_ES_d1r2",
"chrRNA_A11B2_d2r1","chrRNA_A11B2_d2r2",
"chrRNA_A11B2_d3r1","chrRNA_A11B2_d3r2",
"chrRNA_A11B2_d5r1","chrRNA_A11B2_d5r2",
"chrRNA_A11B2_d7r1","chrRNA_A11B2_d7r2",
"chrRNA_A11B2_d15","chrRNA_A11B2_d21")
# chrRNA - average together replicates for AR ribbon plot
chrRNA_A11B2_AR_table_merge <- data.frame(chrRNA_A11B2_AR_table[,1:6],
0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_ES_d0r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_ES_d0r2),
0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_ES_d1r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_ES_d1r2),
0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_d2r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_d2r2),
0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_d3r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_d3r2),
0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_d5r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_d5r2),
0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_d7r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_d7r2),
chrRNA_A11B2_AR_table$chrRNA_A11B2_d15, chrRNA_A11B2_AR_table$chrRNA_A11B2_d21)
# make numerical column names for sumarising for ribbon plot
chrRNA_timepoints_merge <- c(0,1,2,3,5,7,15,21)
colnames(chrRNA_A11B2_AR_table_merge)[7:ncol(chrRNA_A11B2_AR_table_merge)] <- as.numeric(chrRNA_timepoints_merge)
chrRNA_A11B2_ARsummary <- as.array(apply(chrRNA_A11B2_AR_table_merge[,7:ncol(chrRNA_A11B2_AR_table_merge)], 2, summary))
chrRNA_A11B2_ARsummary_t <- data.frame(chrRNA_timepoints_merge,
t(chrRNA_A11B2_ARsummary))
colnames(chrRNA_A11B2_ARsummary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")
#directory for allelic ATAC-seq analysis
ATAC_analysis_path <- "/path/to/your/ATAC/analysis/folder/"
# Load ATAC allelic ratio tables
ATAC_NPC_AR_table <- read_table2(paste0(ATAC_analysis_path,"ATAC_NPC_AR_table.txt"))
ATAC_NPC_AR_table_merge <- read_table2(paste0(ATAC_analysis_path,"ATAC_NPC_AR_table_merge.txt"))
# Filter by .bed file of peaks (n=790) remaining after EDM modelling and classification of persistent/depleted peaks
ATAC_peaks_EDM <- read_delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.bed"), delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
colnames(ATAC_peaks_EDM) <-c("Chr","Start","End","Label","Halftime","Peakid")
ATAC_NPC_AR_table <- ATAC_NPC_AR_table[which(ATAC_NPC_AR_table$Peakid %in% ATAC_peaks_EDM$Peakid),]
ATAC_NPC_AR_table_merge <- ATAC_NPC_AR_table_merge[which(ATAC_NPC_AR_table_merge$Peakid %in% ATAC_peaks_EDM$Peakid),]
# make numerical column names for sumarising for ribbon plot
ATAC_timepoints_merge <- c(0,1,3,6,17)
colnames(ATAC_NPC_AR_table_merge)[7:ncol(ATAC_NPC_AR_table_merge)] <- as.numeric(ATAC_timepoints_merge)
ATAC_A11B2_ARsummary <- as.array(apply(ATAC_NPC_AR_table_merge[,7:ncol(ATAC_NPC_AR_table_merge)], 2, summary))
ATAC_A11B2_ARsummary_t <- data.frame(ATAC_timepoints_merge,
t(ATAC_A11B2_ARsummary))
colnames(ATAC_A11B2_ARsummary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")
ggplot() + theme_light() +
coord_cartesian(xlim =c(0,17), ylim = c(0,0.8),expand = FALSE) +
#chrRNA
geom_ribbon(data=chrRNA_A11B2_ARsummary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
geom_point(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1)) +
geom_line(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
#ATAC
geom_ribbon(data=ATAC_A11B2_ARsummary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.2, fill="blue") +
geom_point(data=ATAC_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), colour="blue") +
geom_line(data=ATAC_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), colour="blue", linetype="dotted")
ChrRNA of individual genes
To plot exponential decay fit for each gene individually, we needed the output from running the exponential decay model fitting on chrRNA-seq timecourse (performed in our previous study).
#Load output of EDM for chrRNA
OutTableEDM_A11B2 <- read_delim(paste0(chrRNA_data_path,"OutTableEDM_A11B2.txt"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
#plot 3 genes: points and EDM fits
chrRNA_A11B2_AR_table_merge_melt <- melt(chrRNA_A11B2_AR_table_merge, id=c("Geneid","Chr","Start","End","Strand","Length"), stringsAsFactors = FALSE)
colnames(chrRNA_A11B2_AR_table_merge_melt)[7:8] <- c("time", "ratio")
chrRNA_A11B2_AR_table_merge_melt$time <- as.numeric(as.character(chrRNA_A11B2_AR_table_merge_melt$time))
ggplot(data=chrRNA_A11B2_AR_table_merge_melt, aes(x=time, y=ratio, group=Geneid)) + theme_bw() +
coord_cartesian(xlim =c(0,17), ylim = c(0,0.9),expand = FALSE) +
geom_point(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Pdk3")), colour="red") +
stat_function(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Pdk3")),
fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Pdk3")] +
(OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Pdk3")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Pdk3")])), colour="red") +
geom_point(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Mecp2")), colour="blue") +
stat_function(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Mecp2")),
fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Mecp2")] + (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Mecp2")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Mecp2")])), colour="blue") +
geom_point(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Ogt")), colour="orange") +
stat_function(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Ogt")),
fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ogt")] + (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Ogt")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Ogt")])), colour="orange") +
geom_point(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Ddx3x")), colour="green") +
stat_function(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Ddx3x")),
fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ddx3x")] + ((OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Ddx3x")]-OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ddx3x")]))*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Ddx3x")])), colour="green")
ChrRNA with technical replicates as separate data points
#melted allelic ratio table
chrRNA_A11B2_AR_table_melt <- melt(chrRNA_A11B2_AR_table, id=c("Geneid","Chr","Start","End","Strand","Length"), stringsAsFactors = FALSE)
colnames(chrRNA_A11B2_AR_table_melt)[7:8] <- c("time", "ratio")
#Change values in "time" column to numeric (does not work before because of strange averaging properties of melt() function)
chrRNA_A11B2_AR_table_melt <- chrRNA_A11B2_AR_table_melt %>%
mutate(time = case_when(
time == "chrRNA_A11B2_ES_d0r1" ~ 0, time == "chrRNA_A11B2_ES_d0r2" ~ 0 ,
time == "chrRNA_A11B2_ES_d1r1" ~ 1, time == "chrRNA_A11B2_ES_d1r2" ~ 1 ,
time == "chrRNA_A11B2_d2r1" ~ 2, time == "chrRNA_A11B2_d2r2" ~ 2 ,
time == "chrRNA_A11B2_d3r1" ~ 3, time == "chrRNA_A11B2_d3r2" ~ 3 ,
time == "chrRNA_A11B2_d5r1" ~ 5, time == "chrRNA_A11B2_d5r2" ~ 5 ,
time == "chrRNA_A11B2_d7r1" ~ 7, time == "chrRNA_A11B2_d7r2" ~ 7 ,
time == "chrRNA_A11B2_d15" ~ 15, time == "chrRNA_A11B2_d21" ~ 21 ))
#plot replicates for each gene
ggplot(data=chrRNA_A11B2_AR_table_melt, aes(x=time, y=ratio, group=Geneid)) + theme_bw() +
coord_cartesian(xlim =c(0,17), ylim = c(0,0.9),expand = FALSE) +
geom_point(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Pdk3")), colour="red") +
stat_function(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Pdk3")),
fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Pdk3")] +
(OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Pdk3")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Pdk3")])), colour="red") +
geom_point(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Mecp2")), colour="blue") +
stat_function(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Mecp2")),
fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Mecp2")] + (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Mecp2")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Mecp2")])), colour="blue") +
geom_point(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Ogt")), colour="orange") +
stat_function(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Ogt")),
fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ogt")] + (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Ogt")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Ogt")])), colour="orange") +
geom_point(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Ddx3x")), colour="green") +
stat_function(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Ddx3x")),
fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ddx3x")] + ((OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Ddx3x")]-OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ddx3x")]))*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Ddx3x")])), colour="green")
ATAC-seq of individual genes
#Load output of EDM for ATAC
OutTableEDM_ATAC <- read_delim(paste0(ATAC_analysis_path,"OutTableEDM_subset.txt"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
#plot 3 peaks: points and EDM fits
library(reshape2)
ATAC_NPC_AR_table_merge_melt <- melt(ATAC_NPC_AR_table_merge, id.vars=c("Peakid","Chr","Start","End","Strand","Length"), stringsAsFactors = FALSE)
colnames(ATAC_NPC_AR_table_merge_melt)[7:8] <- c("time", "ratio")
ATAC_NPC_AR_table_merge_melt$time <- as.numeric(as.character(ATAC_NPC_AR_table_merge_melt$time))
ggplot(data=ATAC_NPC_AR_table_merge_melt, aes(x=time, y=ratio, group=Peakid)) + theme_bw() +
coord_cartesian(xlim =c(0,17), ylim = c(0,0.9),expand = FALSE) +
geom_point(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1689")), colour="red") +
stat_function(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1689")),
fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1689")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1689")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1689")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1689")])), colour="red") +
geom_point(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1385")), colour="blue") +
stat_function(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1385")),
fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1385")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1385")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1385")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1385")])), colour="blue") +
geom_point(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1923")), colour="orange") +
stat_function(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1923")),
fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1923")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1923")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1923")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1923")])), colour="orange") +
geom_point(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_337")), colour="green") +
stat_function(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_337")),
fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_337")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_337")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_337")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_337")])), colour="green")
ATAC-seq with technical replicates as separate data points
#melted allelic ratio table
ATAC_NPC_AR_table_melt <- melt(ATAC_NPC_AR_table, id=c("Peakid","Chr","Start","End","Strand","Length"), stringsAsFactors = FALSE)
colnames(ATAC_NPC_AR_table_melt)[7:8] <- c("time", "ratio")
#Change values in "time" column to numeric (does not work before because of strange averaging properties of melt() function)
ATAC_NPC_AR_table_melt <- ATAC_NPC_AR_table_melt %>%
mutate(time = case_when(
time == "NPC_0d_Rep1" ~ 0, time == "NPC_0d_Rep2" ~ 0 , time == "NPC_0d_Rep3" ~ 0,
time == "NPC_1d_Rep1" ~ 1, time == "NPC_1d_Rep2" ~ 1 ,
time == "NPC_3d_Rep1" ~ 3, time == "NPC_3d_Rep2" ~ 3 ,
time == "NPC_6d_Rep1" ~ 6, time == "NPC_6d_Rep2" ~ 6 ,
time == "NPC_17d_Rep1" ~ 17, time == "NPC_17d_Rep2" ~ 17 ))
ggplot(data=ATAC_NPC_AR_table_melt, aes(x=time, y=ratio, group=Peakid)) + theme_bw() +
coord_cartesian(xlim =c(0,17), ylim = c(0,0.9),expand = FALSE) +
geom_point(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1689")), colour="red") +
stat_function(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1689")),
fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1689")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1689")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1689")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1689")])), colour="red") +
geom_point(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1385")), colour="blue") +
stat_function(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1385")),
fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1385")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1385")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1385")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1385")])), colour="blue") +
geom_point(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1923")), colour="orange") +
stat_function(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1923")),
fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1923")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1923")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1923")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1923")])), colour="orange") +
geom_point(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_337")), colour="green") +
stat_function(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_337")),
fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_337")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_337")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_337")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_337")])), colour="green")
I run plotly separately at the end to identify the noticeable persistent peaks by finding them in a genome browser. Load the ATAC_ConsensusPeaks.rmtn5.bed file from GSE240677
#Load extra packages
library(viridis)
library(plotly)
## heatmap of allic ratio of RE over time course #
ATAC_NPC_AR_table_merge_melt$time <- as.character(ATAC_NPC_AR_table_merge_melt$time)
data.table::setorderv(ATAC_NPC_AR_table_merge_melt, cols = c("Start"))
p <- subset(ATAC_NPC_AR_table_merge_melt, time %in% c("0", "1","3","6","17")) %>%
mutate(time = factor(time, levels=c("17","6","3","1","0"))) %>%
ggplot(aes(time, reorder(Peakid, Start), fill= ratio)) +
geom_tile() + scale_fill_viridis() +
theme(panel.background = element_rect(fill = "white"),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
labs(title="ATAC-seq: allelic ratio REs over time course") +
coord_flip()
p
q <- subset(ATAC_NPC_AR_table_merge_melt, time %in% c("0", "1","3","6","17")) %>%
mutate(time = factor(time, levels=c("17","6","3","1","0"))) %>%
ggplot(aes(time, reorder(Peakid, Start), fill= ratio)) +
geom_tile() + scale_fill_viridis() +
theme(panel.background = element_rect(fill = "white"),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
labs(title="ATAC-seq: allelic ratio REs over time course")
#ggplotly(q)
Add more information to ATAC-seq peaks
Prior to running these analyses, I overlapped the remaining ATAC-seq peak set (“ATAC_peaks_EDM.bed”) with:
“Promoter Regions” defined by 500bp -/+ TSS (see additional script “MakeTSSannotations” for generation of this file)
CTCF ChIP-seq peaks (called from public mESC data GSM4776653)
This overlap was performed with “bedtools intersect” to produce the files “ATAC_peaks_EDM.CTCF.bed” and “ATAC_peaks_EDM.TSS1KB.bed”.
Additionally, I used “bedtools closest” to assign each peak to its nearest gene TSS to make the file “ATAC_peaks_EDM_closestTSS.bed”
bedtools sort -i ATAC_peaks_EDM.bed > ATAC_peaks_EDM.sort.bed
bedtools intersect -u -a ATAC_peaks_EDM.sort.bed -b /path/to/CTCF_peaks.bed > ATAC_peaks_EDM.CTCF.bed
bedtools intersect -u -a ATAC_peaks_EDM.sort.bed -b /path/to/Genes_Nonredundant_chrX1_allelic_TSS1KB.gtf > ATAC_peaks_EDM.TSS1KB.bed
bedtools closest -d -t all -a ATAC_peaks_EDM.sort.bed -b /path/to/Genes_Nonredundant_chrX1_allelic_TSS.bed > ATAC_peaks_EDM_closestTSS.bed
I combined these categorisations into a single file with all the information - “ATAC_peaks_GeneInfo.txt”
# First load file with each peak assigned to closest gene
ATAC_peaks_EDM_closestTSS <- read.delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM_closestTSS.bed"), header=FALSE)
colnames(ATAC_peaks_EDM_closestTSS) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid","TSS_chr","TSS_Start","TSS_End","closestGene","score","strand","source","type","phase","info","distTSS")
# Then add CTCF and Distal/Promoter annotations to "ATAC_peaks_EDM_closestTSS" table
#CTCFsite
ATAC_peaks_EDM_CTCF <- read.delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.CTCF.bed"), header=FALSE)
colnames(ATAC_peaks_EDM_CTCF) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid")
ATAC_peaks_EDM_closestTSS <- mutate(ATAC_peaks_EDM_closestTSS,
CTCFsite = case_when(
ATAC_peaks_EDM_closestTSS$Peakid %in% ATAC_peaks_EDM_CTCF$Peakid ~ "CTCF", TRUE ~ "nonCTCF") )
#Distal vs TSS1KB
ATAC_peaks_EDM_TSS1KB <- read.delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.TSS1KB.bed"), header=FALSE)
colnames(ATAC_peaks_EDM_TSS1KB) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid")
ATAC_peaks_EDM_closestTSS <- mutate(ATAC_peaks_EDM_closestTSS,
dist_prom = case_when(
ATAC_peaks_EDM_closestTSS$Peakid %in% ATAC_peaks_EDM_TSS1KB$Peakid ~ "prom", TRUE ~ "distal") )
#merge with GeneInfo to add more information about nearest gene to each peak
gene_cat <- data.frame(GeneInfo_A11B2$Geneid, GeneInfo_A11B2$Halftime, GeneInfo_A11B2$SilencingRate, GeneInfo_A11B2$InitialTPM)
colnames(gene_cat) <- c("closestGene","gene_halftime","gene_cat", "gene_initialTPM")
ATAC_peaks_EDM_closestTSS_1 <- ATAC_peaks_EDM_closestTSS[,c("Chr","Start","End","per_v_dep","peak_halftime","Peakid","closestGene","distTSS","CTCFsite","dist_prom")]
ATAC_peaks_info <- merge(ATAC_peaks_EDM_closestTSS_1, gene_cat, all.x = TRUE)
#change gene_halftime to days
ATAC_peaks_info$gene_halftime <- ATAC_peaks_info$gene_halftime/24
#remove where exactly the same gene and peak
ATAC_peaks_info <- ddply(ATAC_peaks_info, c("closestGene","Peakid"), head, 1)
#When two genes are equally close, label the least expressed as a duplicate
ATAC_peaks_info <- ATAC_peaks_info %>% arrange(dplyr::desc(ATAC_peaks_info$gene_initialTPM))
ATAC_peaks_info$peak_duplicated <- with(ATAC_peaks_info,duplicated(ATAC_peaks_info$Peakid))
Export ATAC_peaks_info file as resource to publish
## Export ATAC_peaks_GeneInfo ####
#write.table(ATAC_peaks_info,
# row.names = FALSE, col.names = TRUE, quote = FALSE,
# paste0(ATAC_analysis_path,"ATAC_peaks_GeneInfo.txt"), sep="\t")
Figure S1D -Analysis of peak half-time by RE category
Promoters and CTCF sites remain accessible longer than other distal RE categories over the course of Xist-mediated XCI.
#subset peaks by 3 categories: promoter, distal CTCF, and distal non-CTCF
ATAC_peaks_info$comp3 <- ifelse(ATAC_peaks_info$dist_prom == "prom", 'prom',
ifelse(ATAC_peaks_info$CTCFsite == "CTCF" & ATAC_peaks_info$dist_prom == "distal", 'dist_CTCF',
'dist_nonCTCF'))
#boxplots of peak halftimes by CTCF binding
comparisons <- list(c("nonCTCF", "CTCF"))
box_CTCF <- subset(ATAC_peaks_info, peak_duplicated == FALSE) %>%
mutate(CTCFsite = factor(CTCFsite, levels=c("CTCF","nonCTCF"))) %>%
ggplot(aes(x=CTCFsite,y=peak_halftime)) + geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
stat_summary(fun.data = give.n, geom = "text", fun.y = median)
comparisons <- list(c("prom", "distal"))
box_promdist <- subset(ATAC_peaks_info, peak_duplicated == FALSE) %>%
mutate(dist_prom = factor(dist_prom, levels=c("prom", "distal"))) %>%
ggplot(aes(x=dist_prom,y=peak_halftime)) + geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
stat_summary(fun.data = give.n, geom = "text", fun.y = median)
grid.arrange(box_CTCF, box_promdist, nrow = 1)
#make boxplot of 3 categories with p values
comparisons <- list(c("prom", "dist_CTCF"),
c("prom","dist_nonCTCF"),
c("dist_nonCTCF","dist_CTCF"))
box_comp3 <- subset(ATAC_peaks_info, peak_duplicated == FALSE) %>%
mutate(comp3 = factor(comp3, levels=c("prom","dist_nonCTCF","dist_CTCF"))) %>%
ggplot(aes(x=comp3,y=peak_halftime)) + geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
stat_summary(fun.data = give.n, geom = "text", fun.y = median)
box_comp3
#recapitulate S1C ribbon plot but separating by RE category
ATAC_peaks_info_nodup <- subset(ATAC_peaks_info, peak_duplicated == FALSE)
ATAC_NPC_AR_table_merge_comp <- merge(ATAC_NPC_AR_table_merge, ATAC_peaks_info_nodup[,c("Peakid","comp3")], by="Peakid",all.x=TRUE)
ATAC_A11B2_ARsummary_prom <- subset(ATAC_NPC_AR_table_merge_comp, comp3 == "prom")
ATAC_A11B2_ARsummary_prom <- as.array(apply(ATAC_A11B2_ARsummary_prom[,7:11], 2, summary))
ATAC_A11B2_ARsummary_prom_t <- data.frame(ATAC_timepoints_merge,
t(ATAC_A11B2_ARsummary_prom))
colnames(ATAC_A11B2_ARsummary_prom_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")
ATAC_A11B2_ARsummary_dist_CTCF <- subset(ATAC_NPC_AR_table_merge_comp, comp3 == "dist_CTCF")
ATAC_A11B2_ARsummary_dist_CTCF <- as.array(apply(ATAC_A11B2_ARsummary_dist_CTCF[,7:11], 2, summary))
ATAC_A11B2_ARsummary_dist_CTCF_t <- data.frame(ATAC_timepoints_merge,
t(ATAC_A11B2_ARsummary_dist_CTCF))
colnames(ATAC_A11B2_ARsummary_dist_CTCF_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")
ATAC_A11B2_ARsummary_dist_nonCTCF <- subset(ATAC_NPC_AR_table_merge_comp, comp3 == "dist_nonCTCF")
ATAC_A11B2_ARsummary_dist_nonCTCF <- as.array(apply(ATAC_A11B2_ARsummary_dist_nonCTCF[,7:11], 2, summary))
ATAC_A11B2_ARsummary_dist_nonCTCF_t <- data.frame(ATAC_timepoints_merge,
t(ATAC_A11B2_ARsummary_dist_nonCTCF))
colnames(ATAC_A11B2_ARsummary_dist_nonCTCF_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")
Recapitulate S1C but for separately for promoter and distal-nonCTCF elements
#chrRNA vs promoter
chr_vs_prom_ribbon <- ggplot() + theme_light() +
coord_cartesian(xlim =c(0,17), ylim = c(0,0.8),expand = FALSE) +
#chrRNA
geom_ribbon(data=chrRNA_A11B2_ARsummary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
geom_point(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1)) +
geom_line(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
#ATAC promoter
geom_ribbon(data=ATAC_A11B2_ARsummary_prom_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="lightblue") +
geom_point(data=ATAC_A11B2_ARsummary_prom_t, aes(x=time,y=Median, group=1)) +
geom_line(data=ATAC_A11B2_ARsummary_prom_t, aes(x=time,y=Median, group=1), linetype="dotted")
#chrRNA vs distal nonCTCF
chr_vs_distnonCTCF_ribbon <- ggplot() + theme_light() +
coord_cartesian(xlim =c(0,17), ylim = c(0,0.8),expand = FALSE) +
#chrRNA
geom_ribbon(data=chrRNA_A11B2_ARsummary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
geom_point(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1)) +
geom_line(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
#ATAC distal nonCTCF
geom_ribbon(data=ATAC_A11B2_ARsummary_dist_nonCTCF_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkblue") +
geom_point(data=ATAC_A11B2_ARsummary_dist_nonCTCF_t, aes(x=time,y=Median, group=1)) +
geom_line(data=ATAC_A11B2_ARsummary_dist_nonCTCF_t, aes(x=time,y=Median, group=1), linetype="dotted")
grid.arrange(chr_vs_prom_ribbon, chr_vs_distnonCTCF_ribbon, nrow = 1)
Figure 1C & Figure 1D - plot scatter plots and boxplots
#correlation gene and peak halftimes for promoter and distal non-CTCF groups
prom_scatter <- subset(ATAC_peaks_info, comp3 == "prom" &
!is.na(peak_halftime) & !is.na(gene_cat)) %>%
ggplot( aes(x=gene_halftime, y=peak_halftime)) +
geom_point(shape=16) + theme_bw() +
scale_x_continuous(expand = c(0, 0), limits = c(0, 4.5)) + scale_y_continuous(expand = c(0, 0), limits = c(0, 17)) +
geom_smooth(method = "lm",se=FALSE, formula=y~x-1,linetype="dashed") +
stat_cor(method="spearman") +
annotate(geom="text", x=4, y=16.2, label = paste0("n = ", nrow(subset(ATAC_peaks_info, ATAC_peaks_info$distTSS < 500))))
#plot correlation for distal non-CTCF elements - remove any >100KB from nearest gene (at this distance, the likelihood these REs regulate the gene they are closest to is low)
dist_scatter <- subset(ATAC_peaks_info, comp3 == "dist_nonCTCF" & distTSS < 100000 &
!is.na(peak_halftime) & !is.na(gene_cat)) %>%
ggplot( aes(x=gene_halftime, y=peak_halftime)) +
geom_point(shape=16) + theme_bw() +
scale_x_continuous(expand = c(0, 0), limits = c(0, 4.5)) + scale_y_continuous(expand = c(0, 0), limits = c(0, 17)) +
geom_smooth(method = "lm",se=FALSE, formula=y~x-1,linetype="dashed") +
stat_cor(method="spearman") +
annotate(geom="text", x=4, y=16.2, label = paste0("n = ", nrow(subset(ATAC_peaks_info, comp3 == "dist_nonCTCF" & distTSS < 100000 &
!is.na(peak_halftime) & !is.na(gene_cat)))))
grid.arrange(prom_scatter, dist_scatter, nrow = 1)
#boxplots of peak halftimes by silencing rate categories of genes - use same definition of distal non-CTCF elements
comparisons <- list(c("fastsilencing", "mediumsilencing"),c("mediumsilencing", "slowsilencing"),c("fastsilencing", "slowsilencing"))
box_TSS <- subset(ATAC_peaks_info, comp3 == "prom" &
!is.na(peak_halftime) & !is.na(gene_cat)) %>%
mutate(gene_cat = factor(gene_cat, levels=c("fastsilencing", "mediumsilencing","slowsilencing"))) %>%
ggplot(aes(x=gene_cat,y=peak_halftime)) + geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
stat_summary(fun.data = give.n, geom = "text", fun.y = median)
box_Dist <- subset(ATAC_peaks_info, comp3 == "dist_nonCTCF" & distTSS < 100000 &
!is.na(peak_halftime) & !is.na(gene_cat)) %>%
mutate(gene_cat = factor(gene_cat, levels=c("fastsilencing", "mediumsilencing","slowsilencing"))) %>%
ggplot(aes(x=gene_cat,y=peak_halftime)) + geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
stat_compare_means(comparisons = comparisons, method = "wilcox.test",label="p.format") +
stat_summary(fun.data = give.n, geom = "text", fun.y = median)
grid.arrange(box_TSS, box_Dist, nrow = 1)