Load required packages and functions

# Packages
library(readr)
library(stringr)
library(ggplot2)
library(ggpubr) 
library(ggridges)
library(dplyr)
library(forcats)
library(minpack.lm)
library(mefa4)
library(reshape)
library(gridExtra)
library(plyr)
library(DESeq2)
library(gplots)
#Functions
LoadingCountFile <- function(file=""){
  Count <- read.table(file, header=T)
  return(Count)
}
Allelic.Ratio <- function(g1_counts,g2_counts){
  Xa <- as.numeric(g1_counts)
  Xi <- as.numeric(g2_counts)
  AllelicRatio <- ( Xi / (Xi + Xa))
  return(AllelicRatio)
}
AR.table <- function(counts_table){
  AllelicRatio_table <- counts_table[1:6]
  for (j in seq(8,ncol(counts_table),3)){
    AllelicRatio_table <- cbind(AllelicRatio_table,
                                Allelic.Ratio(as.matrix(counts_table[j]),
                                              as.matrix(counts_table[j+1]) ) )
  }
  return(AllelicRatio_table)
}
give.n <- function(x){
  return(c(y = 0, label = length(x)))}

Load chrRNA-seq data of gene silencing

chrRNA_data_path <- "/path/to/folder/with/chrRNA/data/"
chrRNA_counts_path <- "/path/to/folder/with/chrRNA/data/chrRNA_counts/"
#Load iXist-Chrx-Dom chrRNA counts matrices
chrRNA_A11B2_ES_d0r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"WT_NoDox.sortn.count"))
chrRNA_A11B2_ES_d0r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"WTB2_NoDox_Rep1.sortn.count"))
chrRNA_A11B2_ES_d1r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"WT_DoxA.sortn.count")) 
chrRNA_A11B2_ES_d1r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"WTB2_24hrDox_Rep1.sortn.count"))
chrRNA_A11B2_d2r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"1_A11B2_NPC_d2_mm.sortn.count"))
chrRNA_A11B2_d2r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"1_A11B2_NPCd2_mm.sortn.count"))
chrRNA_A11B2_d3r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"2_A11B2_NPC_d3_mm.sortn.count"))
chrRNA_A11B2_d3r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"2_A11B2_NPCd3_mm.sortn.count"))
chrRNA_A11B2_d5r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"4_A11B2_NPC_d5_mm.sortn.count"))
chrRNA_A11B2_d5r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"3_A11B2_NPCd5_mm.sortn.count"))
chrRNA_A11B2_d7r1 <- LoadingCountFile(paste0(chrRNA_counts_path,"6_A11B2_NPC_d7_mm.sortn.count"))
chrRNA_A11B2_d7r2 <- LoadingCountFile(paste0(chrRNA_counts_path,"4_A11B2_NPCd7_mm.sortn.count"))
chrRNA_A11B2_d15 <- LoadingCountFile(paste0(chrRNA_counts_path,"7_A11B2_NPC_d15_mm.sortn.count"))
chrRNA_A11B2_d21 <- LoadingCountFile(paste0(chrRNA_counts_path,"6_A11B2_NPCd21_mm.sortn.count"))

chrRNA_A11B2_counts_table <- cbind( chrRNA_A11B2_ES_d0r1[1:9],chrRNA_A11B2_ES_d0r2[7:9],
                                    chrRNA_A11B2_ES_d1r1[7:9],chrRNA_A11B2_ES_d1r2[7:9],
                                    chrRNA_A11B2_d2r1[7:9],chrRNA_A11B2_d2r2[7:9],
                                    chrRNA_A11B2_d3r1[7:9],chrRNA_A11B2_d3r2[7:9],
                                    chrRNA_A11B2_d5r1[7:9],chrRNA_A11B2_d5r2[7:9],
                                    chrRNA_A11B2_d7r1[7:9],chrRNA_A11B2_d7r2[7:9],
                                    chrRNA_A11B2_d15[7:9],chrRNA_A11B2_d21[7:9])

Filter the counts matrices by genes amenable to allelic silencing analysis and make allelic ratio tables

#Load iXist-ChrX-Dom_GeneInfo from ChrRNA analysis folder
GeneInfo_A11B2 <- read_delim(paste0(chrRNA_data_path,"GeneInfo_A11B2.txt"), "\t", escape_double = FALSE, col_names = TRUE, trim_ws = TRUE)
colnames(GeneInfo_A11B2) #check column order and re-name as appropriate
## [1] "Geneid"            "Halftime"          "Starts_numeric"   
## [4] "Dist_Xist"         "InitialTPM"        "Distance_Xist"    
## [7] "Expression_group"  "SilencingRate"     "SmcHD1_dependence"
colnames(GeneInfo_A11B2) <-  c("Geneid","Halftime","Starts_numeric","Dist_Xist","InitialTPM","Distance_Xist","Expression_group","SilencingRate","SmcHD1_dependence")

#Filter counts matrices by genes that passed allelic analysis in our previous study
chrRNA_A11B2_counts_table_filt <- chrRNA_A11B2_counts_table[which(chrRNA_A11B2_counts_table$Geneid %in% GeneInfo_A11B2$Geneid),]

#Make allelic ratio table from counts table
chrRNA_A11B2_AR_table <- AR.table(chrRNA_A11B2_counts_table_filt)

Prepare data for ribbon plots

#re-name columns of allelic ratio table with sample names
colnames(chrRNA_A11B2_AR_table)[7:ncol(chrRNA_A11B2_AR_table)] <-c("chrRNA_A11B2_ES_d0r1","chrRNA_A11B2_ES_d0r2",
                                                                   "chrRNA_A11B2_ES_d1r1","chrRNA_A11B2_ES_d1r2",
                                                                   "chrRNA_A11B2_d2r1","chrRNA_A11B2_d2r2",
                                                                   "chrRNA_A11B2_d3r1","chrRNA_A11B2_d3r2",
                                                                   "chrRNA_A11B2_d5r1","chrRNA_A11B2_d5r2",
                                                                   "chrRNA_A11B2_d7r1","chrRNA_A11B2_d7r2",
                                                                   "chrRNA_A11B2_d15","chrRNA_A11B2_d21")
# chrRNA - average together replicates for AR ribbon plot
chrRNA_A11B2_AR_table_merge <- data.frame(chrRNA_A11B2_AR_table[,1:6],
  0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_ES_d0r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_ES_d0r2),
  0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_ES_d1r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_ES_d1r2),
  0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_d2r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_d2r2),
  0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_d3r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_d3r2),
  0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_d5r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_d5r2),
  0.5*(chrRNA_A11B2_AR_table$chrRNA_A11B2_d7r1 + chrRNA_A11B2_AR_table$chrRNA_A11B2_d7r2),
  chrRNA_A11B2_AR_table$chrRNA_A11B2_d15, chrRNA_A11B2_AR_table$chrRNA_A11B2_d21)

# make numerical column names for sumarising for ribbon plot
chrRNA_timepoints_merge <- c(0,1,2,3,5,7,15,21)
colnames(chrRNA_A11B2_AR_table_merge)[7:ncol(chrRNA_A11B2_AR_table_merge)] <- as.numeric(chrRNA_timepoints_merge)
chrRNA_A11B2_ARsummary <- as.array(apply(chrRNA_A11B2_AR_table_merge[,7:ncol(chrRNA_A11B2_AR_table_merge)], 2, summary))
chrRNA_A11B2_ARsummary_t <- data.frame(chrRNA_timepoints_merge,
                                       t(chrRNA_A11B2_ARsummary))
colnames(chrRNA_A11B2_ARsummary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

Load ATAC-seq data

#directory for allelic ATAC-seq analysis
ATAC_analysis_path <- "/path/to/your/ATAC/analysis/folder/"
# Load ATAC allelic ratio tables
ATAC_NPC_AR_table <- read_table2(paste0(ATAC_analysis_path,"ATAC_NPC_AR_table.txt"))
ATAC_NPC_AR_table_merge <- read_table2(paste0(ATAC_analysis_path,"ATAC_NPC_AR_table_merge.txt"))

# Filter by .bed file of peaks (n=790) remaining after EDM modelling and classification of persistent/depleted peaks
ATAC_peaks_EDM <- read_delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.bed"), delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
colnames(ATAC_peaks_EDM) <-c("Chr","Start","End","Label","Halftime","Peakid")
ATAC_NPC_AR_table <- ATAC_NPC_AR_table[which(ATAC_NPC_AR_table$Peakid %in% ATAC_peaks_EDM$Peakid),]
ATAC_NPC_AR_table_merge <- ATAC_NPC_AR_table_merge[which(ATAC_NPC_AR_table_merge$Peakid %in% ATAC_peaks_EDM$Peakid),]

# make numerical column names for sumarising for ribbon plot
ATAC_timepoints_merge <- c(0,1,3,6,17)
colnames(ATAC_NPC_AR_table_merge)[7:ncol(ATAC_NPC_AR_table_merge)] <- as.numeric(ATAC_timepoints_merge)
ATAC_A11B2_ARsummary <- as.array(apply(ATAC_NPC_AR_table_merge[,7:ncol(ATAC_NPC_AR_table_merge)], 2, summary))
ATAC_A11B2_ARsummary_t <- data.frame(ATAC_timepoints_merge,
                                       t(ATAC_A11B2_ARsummary))
colnames(ATAC_A11B2_ARsummary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

Figure S1C - plot ribbon plot of chrRNA and ATAC (ggplot)

ggplot() +  theme_light() + 
  coord_cartesian(xlim =c(0,17), ylim = c(0,0.8),expand = FALSE) +
  #chrRNA
  geom_ribbon(data=chrRNA_A11B2_ARsummary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
  geom_point(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
  #ATAC
  geom_ribbon(data=ATAC_A11B2_ARsummary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.2, fill="blue") +
  geom_point(data=ATAC_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), colour="blue") +
  geom_line(data=ATAC_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), colour="blue", linetype="dotted")

Figure 1B - individual genes

ChrRNA of individual genes

To plot exponential decay fit for each gene individually, we needed the output from running the exponential decay model fitting on chrRNA-seq timecourse (performed in our previous study).

#Load output of EDM for chrRNA
OutTableEDM_A11B2 <- read_delim(paste0(chrRNA_data_path,"OutTableEDM_A11B2.txt"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)

#plot 3 genes: points and EDM fits
chrRNA_A11B2_AR_table_merge_melt <- melt(chrRNA_A11B2_AR_table_merge, id=c("Geneid","Chr","Start","End","Strand","Length"), stringsAsFactors = FALSE)
colnames(chrRNA_A11B2_AR_table_merge_melt)[7:8] <- c("time", "ratio")
chrRNA_A11B2_AR_table_merge_melt$time <- as.numeric(as.character(chrRNA_A11B2_AR_table_merge_melt$time))

ggplot(data=chrRNA_A11B2_AR_table_merge_melt, aes(x=time, y=ratio, group=Geneid)) + theme_bw() +
  coord_cartesian(xlim =c(0,17), ylim = c(0,0.9),expand = FALSE) +
  geom_point(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Pdk3")), colour="red") +
  stat_function(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Pdk3")),
                fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Pdk3")] +
                                   (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Pdk3")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Pdk3")])), colour="red") +
  geom_point(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Mecp2")), colour="blue") +
  stat_function(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Mecp2")),
                fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Mecp2")] + (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Mecp2")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Mecp2")])), colour="blue") +
  geom_point(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Ogt")), colour="orange") +
  stat_function(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Ogt")),
                fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ogt")] + (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Ogt")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Ogt")])), colour="orange") +
  geom_point(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Ddx3x")), colour="green") +
  stat_function(data = subset(chrRNA_A11B2_AR_table_merge_melt, Geneid %in% c("Ddx3x")),
                fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ddx3x")] + ((OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Ddx3x")]-OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ddx3x")]))*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Ddx3x")])), colour="green")

ChrRNA with technical replicates as separate data points

#melted allelic ratio table
chrRNA_A11B2_AR_table_melt <- melt(chrRNA_A11B2_AR_table, id=c("Geneid","Chr","Start","End","Strand","Length"), stringsAsFactors = FALSE)
colnames(chrRNA_A11B2_AR_table_melt)[7:8] <- c("time", "ratio")
#Change values in "time" column to numeric (does not work before because of strange averaging properties of melt() function)
chrRNA_A11B2_AR_table_melt <- chrRNA_A11B2_AR_table_melt %>%
  mutate(time = case_when(
    time == "chrRNA_A11B2_ES_d0r1" ~ 0, time == "chrRNA_A11B2_ES_d0r2" ~ 0 ,
    time == "chrRNA_A11B2_ES_d1r1" ~ 1, time == "chrRNA_A11B2_ES_d1r2" ~ 1 ,
    time == "chrRNA_A11B2_d2r1" ~ 2, time == "chrRNA_A11B2_d2r2" ~ 2 ,
    time == "chrRNA_A11B2_d3r1" ~ 3, time == "chrRNA_A11B2_d3r2" ~ 3 ,
    time == "chrRNA_A11B2_d5r1" ~ 5, time == "chrRNA_A11B2_d5r2" ~ 5 ,
    time == "chrRNA_A11B2_d7r1" ~ 7, time == "chrRNA_A11B2_d7r2" ~ 7 ,
    time == "chrRNA_A11B2_d15" ~ 15, time == "chrRNA_A11B2_d21" ~ 21  ))


#plot replicates for each gene 
ggplot(data=chrRNA_A11B2_AR_table_melt, aes(x=time, y=ratio, group=Geneid)) + theme_bw() +
  coord_cartesian(xlim =c(0,17), ylim = c(0,0.9),expand = FALSE) +
  geom_point(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Pdk3")), colour="red") +
  stat_function(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Pdk3")),
                fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Pdk3")] +
                                   (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Pdk3")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Pdk3")])), colour="red") +
  geom_point(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Mecp2")), colour="blue") +
  stat_function(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Mecp2")),
                fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Mecp2")] + (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Mecp2")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Mecp2")])), colour="blue") +
  geom_point(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Ogt")), colour="orange") +
  stat_function(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Ogt")),
                fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ogt")] + (OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Ogt")])*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Ogt")])), colour="orange") +
  geom_point(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Ddx3x")), colour="green") +
  stat_function(data = subset(chrRNA_A11B2_AR_table_melt, Geneid %in% c("Ddx3x")),
                fun=function(x) (OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ddx3x")] + ((OutTableEDM_A11B2$EDM_start[which(OutTableEDM_A11B2$Geneid == "Ddx3x")]-OutTableEDM_A11B2$EDM_final[which(OutTableEDM_A11B2$Geneid == "Ddx3x")]))*exp(-x/OutTableEDM_A11B2$EDM_rate[which(OutTableEDM_A11B2$Geneid == "Ddx3x")])), colour="green")

ATAC-seq of individual genes

#Load output of EDM for ATAC
OutTableEDM_ATAC <- read_delim(paste0(ATAC_analysis_path,"OutTableEDM_subset.txt"),  delim = "\t", escape_double = FALSE, trim_ws = TRUE)

#plot 3 peaks: points and EDM fits
library(reshape2)
ATAC_NPC_AR_table_merge_melt <- melt(ATAC_NPC_AR_table_merge, id.vars=c("Peakid","Chr","Start","End","Strand","Length"), stringsAsFactors = FALSE)
colnames(ATAC_NPC_AR_table_merge_melt)[7:8] <- c("time", "ratio")
ATAC_NPC_AR_table_merge_melt$time <- as.numeric(as.character(ATAC_NPC_AR_table_merge_melt$time))

ggplot(data=ATAC_NPC_AR_table_merge_melt, aes(x=time, y=ratio, group=Peakid)) + theme_bw() +
  coord_cartesian(xlim =c(0,17), ylim = c(0,0.9),expand = FALSE) +
  geom_point(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1689")), colour="red") +
  stat_function(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1689")),
                fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1689")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1689")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1689")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1689")])), colour="red") +
  geom_point(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1385")), colour="blue") +
  stat_function(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1385")),
                fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1385")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1385")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1385")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1385")])), colour="blue") +
  geom_point(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1923")), colour="orange") +
  stat_function(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_1923")),
                fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1923")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1923")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1923")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1923")])), colour="orange") +
  geom_point(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_337")), colour="green") +
  stat_function(data = subset(ATAC_NPC_AR_table_merge_melt, Peakid %in% c("peak_337")),
                fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_337")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_337")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_337")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_337")])), colour="green")

ATAC-seq with technical replicates as separate data points

#melted allelic ratio table
ATAC_NPC_AR_table_melt <- melt(ATAC_NPC_AR_table, id=c("Peakid","Chr","Start","End","Strand","Length"), stringsAsFactors = FALSE)
colnames(ATAC_NPC_AR_table_melt)[7:8] <- c("time", "ratio")
#Change values in "time" column to numeric (does not work before because of strange averaging properties of melt() function)
ATAC_NPC_AR_table_melt <- ATAC_NPC_AR_table_melt %>%
  mutate(time = case_when(
    time == "NPC_0d_Rep1" ~ 0, time == "NPC_0d_Rep2" ~ 0 , time == "NPC_0d_Rep3" ~ 0, 
    time == "NPC_1d_Rep1" ~ 1, time == "NPC_1d_Rep2" ~ 1 ,
    time == "NPC_3d_Rep1" ~ 3, time == "NPC_3d_Rep2" ~ 3 ,
    time == "NPC_6d_Rep1" ~ 6, time == "NPC_6d_Rep2" ~ 6 ,
    time == "NPC_17d_Rep1" ~ 17, time == "NPC_17d_Rep2" ~ 17 ))


ggplot(data=ATAC_NPC_AR_table_melt, aes(x=time, y=ratio, group=Peakid)) + theme_bw() +
  coord_cartesian(xlim =c(0,17), ylim = c(0,0.9),expand = FALSE) +
  geom_point(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1689")), colour="red") +
  stat_function(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1689")),
                fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1689")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1689")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1689")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1689")])), colour="red") +
  geom_point(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1385")), colour="blue") +
  stat_function(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1385")),
                fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1385")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1385")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1385")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1385")])), colour="blue") +
  geom_point(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1923")), colour="orange") +
  stat_function(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_1923")),
                fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1923")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_1923")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_1923")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_1923")])), colour="orange") +
  geom_point(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_337")), colour="green") +
  stat_function(data = subset(ATAC_NPC_AR_table_melt, Peakid %in% c("peak_337")),
                fun=function(x) (OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_337")] + (OutTableEDM_ATAC$EDM_start[which(OutTableEDM_ATAC$Peakid == "peak_337")] - OutTableEDM_ATAC$EDM_final[which(OutTableEDM_ATAC$Peakid == "peak_337")])*exp(-x/OutTableEDM_ATAC$EDM_rate[which(OutTableEDM_ATAC$Peakid == "peak_337")])), colour="green")

Figure S1 - Heatmap of all REs over timecourse

I run plotly separately at the end to identify the noticeable persistent peaks by finding them in a genome browser. Load the ATAC_ConsensusPeaks.rmtn5.bed file from GSE240677

#Load extra packages
library(viridis)
library(plotly)
## heatmap of allic ratio of RE over time course #
ATAC_NPC_AR_table_merge_melt$time <- as.character(ATAC_NPC_AR_table_merge_melt$time)
data.table::setorderv(ATAC_NPC_AR_table_merge_melt, cols = c("Start"))

p <- subset(ATAC_NPC_AR_table_merge_melt, time %in% c("0", "1","3","6","17")) %>%
  mutate(time = factor(time, levels=c("17","6","3","1","0"))) %>%
  ggplot(aes(time, reorder(Peakid, Start), fill= ratio)) +
  geom_tile() + scale_fill_viridis() + 
  theme(panel.background = element_rect(fill = "white"),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + 
  labs(title="ATAC-seq: allelic ratio REs over time course") +
  coord_flip()
p

q <- subset(ATAC_NPC_AR_table_merge_melt, time %in% c("0", "1","3","6","17")) %>%
  mutate(time = factor(time, levels=c("17","6","3","1","0"))) %>%
  ggplot(aes(time, reorder(Peakid, Start), fill= ratio)) +
  geom_tile() + scale_fill_viridis() + 
  theme(panel.background = element_rect(fill = "white"),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) + 
  labs(title="ATAC-seq: allelic ratio REs over time course")

#ggplotly(q)

Comparisons by RE categories

Add more information to ATAC-seq peaks

Prior to running these analyses, I overlapped the remaining ATAC-seq peak set (“ATAC_peaks_EDM.bed”) with:

This overlap was performed with “bedtools intersect” to produce the files “ATAC_peaks_EDM.CTCF.bed” and “ATAC_peaks_EDM.TSS1KB.bed”.

Additionally, I used “bedtools closest” to assign each peak to its nearest gene TSS to make the file “ATAC_peaks_EDM_closestTSS.bed”

bedtools sort -i ATAC_peaks_EDM.bed > ATAC_peaks_EDM.sort.bed
bedtools intersect -u -a ATAC_peaks_EDM.sort.bed -b /path/to/CTCF_peaks.bed > ATAC_peaks_EDM.CTCF.bed
bedtools intersect -u -a ATAC_peaks_EDM.sort.bed -b /path/to/Genes_Nonredundant_chrX1_allelic_TSS1KB.gtf > ATAC_peaks_EDM.TSS1KB.bed
bedtools closest -d -t all -a ATAC_peaks_EDM.sort.bed -b /path/to/Genes_Nonredundant_chrX1_allelic_TSS.bed > ATAC_peaks_EDM_closestTSS.bed

I combined these categorisations into a single file with all the information - “ATAC_peaks_GeneInfo.txt”

# First load file with each peak assigned to closest gene
ATAC_peaks_EDM_closestTSS <- read.delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM_closestTSS.bed"), header=FALSE)
colnames(ATAC_peaks_EDM_closestTSS) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid","TSS_chr","TSS_Start","TSS_End","closestGene","score","strand","source","type","phase","info","distTSS")

# Then add CTCF and Distal/Promoter annotations to "ATAC_peaks_EDM_closestTSS" table
#CTCFsite
ATAC_peaks_EDM_CTCF <- read.delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.CTCF.bed"), header=FALSE)
colnames(ATAC_peaks_EDM_CTCF) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid")
ATAC_peaks_EDM_closestTSS <-  mutate(ATAC_peaks_EDM_closestTSS,
                                     CTCFsite = case_when(
                                    ATAC_peaks_EDM_closestTSS$Peakid %in% ATAC_peaks_EDM_CTCF$Peakid ~ "CTCF", TRUE ~ "nonCTCF") )

#Distal vs TSS1KB
ATAC_peaks_EDM_TSS1KB <- read.delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.TSS1KB.bed"), header=FALSE)
colnames(ATAC_peaks_EDM_TSS1KB) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid")
ATAC_peaks_EDM_closestTSS <-  mutate(ATAC_peaks_EDM_closestTSS,
                                     dist_prom = case_when(
                                    ATAC_peaks_EDM_closestTSS$Peakid %in% ATAC_peaks_EDM_TSS1KB$Peakid ~ "prom", TRUE ~ "distal") )


#merge with GeneInfo to add more information about nearest gene to each peak
gene_cat <- data.frame(GeneInfo_A11B2$Geneid, GeneInfo_A11B2$Halftime, GeneInfo_A11B2$SilencingRate, GeneInfo_A11B2$InitialTPM)
colnames(gene_cat) <- c("closestGene","gene_halftime","gene_cat", "gene_initialTPM")
ATAC_peaks_EDM_closestTSS_1 <- ATAC_peaks_EDM_closestTSS[,c("Chr","Start","End","per_v_dep","peak_halftime","Peakid","closestGene","distTSS","CTCFsite","dist_prom")]
ATAC_peaks_info <- merge(ATAC_peaks_EDM_closestTSS_1, gene_cat, all.x = TRUE)
#change gene_halftime to days
ATAC_peaks_info$gene_halftime <- ATAC_peaks_info$gene_halftime/24
#remove where exactly the same gene and peak
ATAC_peaks_info <- ddply(ATAC_peaks_info, c("closestGene","Peakid"), head, 1)
#When two genes are equally close, label the least expressed as a duplicate
ATAC_peaks_info <- ATAC_peaks_info %>% arrange(dplyr::desc(ATAC_peaks_info$gene_initialTPM))
ATAC_peaks_info$peak_duplicated <- with(ATAC_peaks_info,duplicated(ATAC_peaks_info$Peakid))

Export ATAC_peaks_info file as resource to publish

## Export ATAC_peaks_GeneInfo ####
#write.table(ATAC_peaks_info,
#            row.names = FALSE, col.names = TRUE, quote = FALSE,
#            paste0(ATAC_analysis_path,"ATAC_peaks_GeneInfo.txt"), sep="\t")

Figure S1D -Analysis of peak half-time by RE category

Promoters and CTCF sites remain accessible longer than other distal RE categories over the course of Xist-mediated XCI.

#subset peaks by 3 categories: promoter, distal CTCF, and distal non-CTCF
ATAC_peaks_info$comp3 <- ifelse(ATAC_peaks_info$dist_prom == "prom", 'prom',
                                ifelse(ATAC_peaks_info$CTCFsite == "CTCF" & ATAC_peaks_info$dist_prom == "distal", 'dist_CTCF',
                                       'dist_nonCTCF'))

#boxplots of peak halftimes by CTCF binding 
comparisons <- list(c("nonCTCF", "CTCF"))
box_CTCF <- subset(ATAC_peaks_info, peak_duplicated == FALSE) %>%
  mutate(CTCFsite = factor(CTCFsite, levels=c("CTCF","nonCTCF"))) %>%
  ggplot(aes(x=CTCFsite,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text", fun.y = median)

comparisons <- list(c("prom", "distal"))
box_promdist <- subset(ATAC_peaks_info, peak_duplicated == FALSE) %>%
  mutate(dist_prom = factor(dist_prom, levels=c("prom", "distal"))) %>%
  ggplot(aes(x=dist_prom,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text", fun.y = median)

grid.arrange(box_CTCF, box_promdist, nrow = 1)

#make boxplot of 3 categories with p values
comparisons <- list(c("prom", "dist_CTCF"),
                    c("prom","dist_nonCTCF"),
                    c("dist_nonCTCF","dist_CTCF"))
box_comp3 <- subset(ATAC_peaks_info, peak_duplicated == FALSE) %>%
  mutate(comp3 = factor(comp3, levels=c("prom","dist_nonCTCF","dist_CTCF"))) %>%
  ggplot(aes(x=comp3,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text", fun.y = median)
box_comp3

#recapitulate S1C ribbon plot but separating by RE category
ATAC_peaks_info_nodup <- subset(ATAC_peaks_info, peak_duplicated == FALSE)
ATAC_NPC_AR_table_merge_comp <- merge(ATAC_NPC_AR_table_merge, ATAC_peaks_info_nodup[,c("Peakid","comp3")], by="Peakid",all.x=TRUE)

ATAC_A11B2_ARsummary_prom <- subset(ATAC_NPC_AR_table_merge_comp, comp3 == "prom")
ATAC_A11B2_ARsummary_prom <- as.array(apply(ATAC_A11B2_ARsummary_prom[,7:11], 2, summary))
ATAC_A11B2_ARsummary_prom_t <- data.frame(ATAC_timepoints_merge,
                                       t(ATAC_A11B2_ARsummary_prom))
colnames(ATAC_A11B2_ARsummary_prom_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

ATAC_A11B2_ARsummary_dist_CTCF <- subset(ATAC_NPC_AR_table_merge_comp, comp3 == "dist_CTCF")
ATAC_A11B2_ARsummary_dist_CTCF <- as.array(apply(ATAC_A11B2_ARsummary_dist_CTCF[,7:11], 2, summary))
ATAC_A11B2_ARsummary_dist_CTCF_t <- data.frame(ATAC_timepoints_merge,
                                       t(ATAC_A11B2_ARsummary_dist_CTCF))
colnames(ATAC_A11B2_ARsummary_dist_CTCF_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

ATAC_A11B2_ARsummary_dist_nonCTCF <- subset(ATAC_NPC_AR_table_merge_comp, comp3 == "dist_nonCTCF")
ATAC_A11B2_ARsummary_dist_nonCTCF <- as.array(apply(ATAC_A11B2_ARsummary_dist_nonCTCF[,7:11], 2, summary))
ATAC_A11B2_ARsummary_dist_nonCTCF_t <- data.frame(ATAC_timepoints_merge,
                                       t(ATAC_A11B2_ARsummary_dist_nonCTCF))
colnames(ATAC_A11B2_ARsummary_dist_nonCTCF_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

Recapitulate S1C but for separately for promoter and distal-nonCTCF elements

#chrRNA vs promoter
chr_vs_prom_ribbon <- ggplot() +  theme_light() + 
  coord_cartesian(xlim =c(0,17), ylim = c(0,0.8),expand = FALSE) +
  #chrRNA
  geom_ribbon(data=chrRNA_A11B2_ARsummary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
  geom_point(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
  #ATAC promoter
  geom_ribbon(data=ATAC_A11B2_ARsummary_prom_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="lightblue") +
  geom_point(data=ATAC_A11B2_ARsummary_prom_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=ATAC_A11B2_ARsummary_prom_t, aes(x=time,y=Median, group=1), linetype="dotted")

#chrRNA vs distal nonCTCF
chr_vs_distnonCTCF_ribbon <- ggplot() +  theme_light() + 
  coord_cartesian(xlim =c(0,17), ylim = c(0,0.8),expand = FALSE) +
  #chrRNA
  geom_ribbon(data=chrRNA_A11B2_ARsummary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
  geom_point(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=chrRNA_A11B2_ARsummary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
  #ATAC distal nonCTCF
  geom_ribbon(data=ATAC_A11B2_ARsummary_dist_nonCTCF_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkblue") +
  geom_point(data=ATAC_A11B2_ARsummary_dist_nonCTCF_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=ATAC_A11B2_ARsummary_dist_nonCTCF_t, aes(x=time,y=Median, group=1), linetype="dotted")

grid.arrange(chr_vs_prom_ribbon, chr_vs_distnonCTCF_ribbon, nrow = 1)

Figure 1C & Figure 1D - plot scatter plots and boxplots

#correlation gene and peak halftimes for promoter and distal non-CTCF groups 
prom_scatter <- subset(ATAC_peaks_info, comp3 == "prom" &
                         !is.na(peak_halftime) & !is.na(gene_cat)) %>%
  ggplot( aes(x=gene_halftime, y=peak_halftime)) + 
  geom_point(shape=16) + theme_bw() +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 4.5)) + scale_y_continuous(expand = c(0, 0), limits = c(0, 17))  + 
  geom_smooth(method = "lm",se=FALSE, formula=y~x-1,linetype="dashed") +
  stat_cor(method="spearman") +
  annotate(geom="text", x=4, y=16.2, label = paste0("n = ", nrow(subset(ATAC_peaks_info, ATAC_peaks_info$distTSS < 500))))

#plot correlation for distal non-CTCF elements - remove any >100KB from nearest gene (at this distance, the likelihood these REs regulate the gene they are closest to is low)
dist_scatter <- subset(ATAC_peaks_info, comp3 == "dist_nonCTCF" & distTSS < 100000 &
                         !is.na(peak_halftime) & !is.na(gene_cat)) %>%
    ggplot( aes(x=gene_halftime, y=peak_halftime)) + 
  geom_point(shape=16) + theme_bw() +
  scale_x_continuous(expand = c(0, 0), limits = c(0, 4.5)) + scale_y_continuous(expand = c(0, 0), limits = c(0, 17))  + 
  geom_smooth(method = "lm",se=FALSE, formula=y~x-1,linetype="dashed") +
  stat_cor(method="spearman") +
  annotate(geom="text", x=4, y=16.2, label = paste0("n = ", nrow(subset(ATAC_peaks_info, comp3 == "dist_nonCTCF" & distTSS < 100000 &
                         !is.na(peak_halftime) & !is.na(gene_cat)))))

grid.arrange(prom_scatter, dist_scatter, nrow = 1)

#boxplots of peak halftimes by silencing rate categories of genes - use same definition of distal non-CTCF elements
comparisons <- list(c("fastsilencing", "mediumsilencing"),c("mediumsilencing", "slowsilencing"),c("fastsilencing", "slowsilencing"))

box_TSS <- subset(ATAC_peaks_info, comp3 == "prom" &
                    !is.na(peak_halftime) & !is.na(gene_cat)) %>%
  mutate(gene_cat = factor(gene_cat, levels=c("fastsilencing", "mediumsilencing","slowsilencing"))) %>%
  ggplot(aes(x=gene_cat,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text", fun.y = median)

box_Dist <- subset(ATAC_peaks_info, comp3 == "dist_nonCTCF" & distTSS < 100000 &
                     !is.na(peak_halftime) & !is.na(gene_cat)) %>%
  mutate(gene_cat = factor(gene_cat, levels=c("fastsilencing", "mediumsilencing","slowsilencing"))) %>%
  ggplot(aes(x=gene_cat,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons, method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text", fun.y = median)
grid.arrange(box_TSS, box_Dist, nrow = 1)