Load required packages and functions

library(readr)
library(plyr)
library(dplyr)
library(reshape)
library(ggpubr)
library(gridExtra)
give.n <- function(x){
  return(c(y = 0, label = length(x)))}

ATAC-seq: association of persistent peaks with YY1 binding

Load files from previous analyses

#sort ATAC_peaks_EDM
bedtools sort -i ATAC_peaks_EDM.bed > ATAC_peaks_EDM.sort.bed
bedtools closest -d -t all -a ATAC_peaks_EDM.sort.bed -b /path/to/YY1ab_ConsensusPeaks.rmInput.bed > ATAC_peaks_EDM_closestYY1ab.bed
awk '{ if ($13 < 1) { print } }' ATAC_peaks_EDM_closestYY1ab.bed > ATAC_peaks_EDM_YY1bound_YY1ab.bed
data_path <- "/path/to/folder/with/chrRNA/data/"
ATAC_analysis_path <- "/path/to/your/ATAC/analysis/folder/"
GeneInfo_A11B2 <- read_delim(paste0(data_path,"GeneInfo_A11B2.txt"), "\t", escape_double = FALSE, col_names = TRUE, trim_ws = TRUE)

ATAC_NPC_AR_table_merge <- read_delim(paste0(ATAC_analysis_path,"ATAC_NPC_AR_table_merge.txt"),
                                             delim = "\t", escape_double = FALSE,  trim_ws = TRUE)
ATAC_peaks_EDM_closestYY1 <- read_delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM_closestYY1ab.bed"), 
                                        delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
colnames(ATAC_peaks_EDM_closestYY1) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid",
                                         "YY1peak_chr","YY1peak_Start","YY1peak_End","info","YY1peakScore","info2","distYY1")

Label ATAC peaks by YY1 binding

#By YY1-target yes/no
ATAC_NPC_AR_table_merge <- subset(ATAC_NPC_AR_table_merge,ATAC_NPC_AR_table_merge$Peakid %in% ATAC_peaks_EDM_closestYY1$Peakid)
ATAC_peaks_EDM_closestYY1 <- subset(ATAC_peaks_EDM_closestYY1, select = -c(info,info2,YY1peak_chr) )
ATAC_NPC_AR_table_byYY1 <- merge(ATAC_NPC_AR_table_merge,ATAC_peaks_EDM_closestYY1)

#Use only unique ATAC-peaks for plots
ATAC_NPC_AR_table_byYY1 <- ATAC_NPC_AR_table_byYY1 %>% 
  arrange(Peakid, dplyr::desc(YY1peakScore))
ATAC_NPC_AR_table_byYY1 <- ddply(ATAC_NPC_AR_table_byYY1, c("Peakid"), head, 1)

#Add extra column of YY1target
ATAC_NPC_AR_table_byYY1 <-  mutate(ATAC_NPC_AR_table_byYY1,
                                   YY1target = case_when(
                                     ATAC_NPC_AR_table_byYY1$distYY1 < 0.5 ~ "YY1bound",
                                     ATAC_NPC_AR_table_byYY1$distYY1 > 0.5 ~ "nonYY1"))

Label ATAC peaks by number of YY1 motifs

(Note. analysis by number of YY1 motifs is not displayed in the paper)

bedtools getfasta -fi /path/to/whole/genome/fasta/mm10.fa -bed ATAC_peaks_EDM.bed > ATAC_peaks_EDM.fa
#remove extra lines from tsv file 
tail -n +2 YY1_motifs_ATACpeaks_FIMO.tsv > YY1_motifs_ATACpeaks_FIMO2.tsv
head -n -4 YY1_motifs_ATACpeaks_FIMO2.tsv > YY1_motifs_ATACpeaks_FIMO3.tsv
#coerces tsv file into bed of motifs:
awk 'BEGIN { OFS="\t"; } { n = split($3, a, /[:-]/); print a[1],a[2]+$4,a[2]+$5,$6,$7,$9,$10 }' YY1_motifs_ATACpeaks_FIMO3.tsv > YY1_motifs_ATACpeaks_FIMO_strand.bed
awk 'BEGIN { OFS="\t"; } { n = split($3, a, /[:-]/); print a[1],a[2]+$4,a[2]+$5,$10,$7,$6,$9 }' YY1_motifs_ATACpeaks_FIMO3.tsv > YY1_motifs_ATACpeaks_FIMO_sequence.bed

rm YY1_motifs_ATACpeaks_FIMO2.tsv YY1_motifs_ATACpeaks_FIMO3.tsv

#Count number of YY1 motifs in each ATAC peak
bedtools intersect -wa -c -a ATAC_peaks_EDM.sort.bed -b YY1_motifs_ATACpeaks_FIMO_sequence.bed > ATAC_peaks_EDM.YY1motifCount.bed

Load this “ATAC_peaks_EDM.YY1motifCount.bed” into R to add extra column (“YY1motif_group”) to ATAC-seq peak table:

ATAC_peaks_EDM_YY1motifCount <- read_delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.YY1motifCount.bed"), 
                                           delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
ATAC_peaks_EDM_YY1motifCount <- ATAC_peaks_EDM_YY1motifCount[,c(1:3,7)]
colnames(ATAC_peaks_EDM_YY1motifCount) <- c("Chr","Start","End","YY1motifs")

ATAC_NPC_AR_table_byYY1 <-  merge(ATAC_NPC_AR_table_byYY1, ATAC_peaks_EDM_YY1motifCount)

ATAC_NPC_AR_table_byYY1 <-  mutate(ATAC_NPC_AR_table_byYY1,
                                   YY1motif_group = case_when(
                                     ATAC_NPC_AR_table_byYY1$YY1motifs > 1 ~ "MultipleYY1motif",
                                     ATAC_NPC_AR_table_byYY1$YY1motifs > 0 ~ "SingleYY1motif",
                                     ATAC_NPC_AR_table_byYY1$YY1motifs < 1 ~ "NoYY1motif")  )
length(which(ATAC_NPC_AR_table_byYY1$YY1motif_group == "MultipleYY1motif"))
## [1] 69
length(which(ATAC_NPC_AR_table_byYY1$YY1motif_group == "SingleYY1motif"))
## [1] 157
length(which(ATAC_NPC_AR_table_byYY1$YY1motif_group == "NoYY1motif"))
## [1] 564

Plots of ATAC peaks by YY1 binding and no of YY1 motifs

1. allelic ratio boxplot

colnames(ATAC_NPC_AR_table_byYY1)
##  [1] "Chr"            "Start"          "End"            "Peakid"        
##  [5] "Strand"         "Length"         "NPC_0d"         "NPC_1d"        
##  [9] "NPC_3d"         "NPC_6d"         "NPC_17d"        "per_v_dep"     
## [13] "peak_halftime"  "YY1peak_Start"  "YY1peak_End"    "YY1peakScore"  
## [17] "distYY1"        "YY1target"      "YY1motifs"      "YY1motif_group"
ATAC_NPC_AR_table_byYY1_melt <- melt(ATAC_NPC_AR_table_byYY1, 
                                     id=c("Peakid","Chr","Start","End","Strand","Length",
                                          "per_v_dep","peak_halftime","YY1peak_Start","YY1peak_End",
                                          "YY1peakScore","distYY1","YY1target","YY1motifs", "YY1motif_group"),
                                     stringsAsFactors = FALSE)
colnames(ATAC_NPC_AR_table_byYY1_melt)[(ncol(ATAC_NPC_AR_table_byYY1_melt)-1):ncol(ATAC_NPC_AR_table_byYY1_melt)] <- c("sample","ratio")
ATAC_NPC_AR_table_byYY1_melt$sample <- as.character(ATAC_NPC_AR_table_byYY1_melt$sample)
#YY1 vs non-YY1
AR_YY1target <- mutate(ATAC_NPC_AR_table_byYY1_melt, 
                       YY1target = factor(YY1target, levels=c("nonYY1", "YY1bound"))) %>%
  mutate(sample = factor(sample, levels=c("NPC_0d","NPC_1d","NPC_3d","NPC_6d","NPC_17d"))) %>%
  ggplot(aes(x=sample,y=ratio,fill=YY1target) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  geom_hline(yintercept=0.5, linetype="dashed") 
AR_YY1target

#YY1 motif group
AR_YY1motif_group <- mutate(ATAC_NPC_AR_table_byYY1_melt, 
                      YY1motif_group = factor(YY1motif_group, levels=c("NoYY1motif","SingleYY1motif","MultipleYY1motif"))) %>%
  mutate(sample = factor(sample, levels=c("NPC_0d","NPC_1d","NPC_3d","NPC_6d","NPC_17d"))) %>%
  ggplot(aes(x=sample,y=ratio,fill=YY1motif_group) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  geom_hline(yintercept=0.5, linetype="dashed") 
AR_YY1motif_group

2. Accessibility halftime boxplot

#YY1 vs non-YY1
comparisons <- list(c("nonYY1", "YY1bound"))
YY1target_box <- mutate(ATAC_NPC_AR_table_byYY1, YY1target = factor(YY1target, levels=c("nonYY1", "YY1bound"))) %>%
  ggplot(aes(x=YY1target,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1target_box

#for promoters and distal elements separately
#Distal vs TSS1KB
ATAC_peaks_EDM_TSS1KB <- read.delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.TSS1KB.bed"), header=FALSE)
colnames(ATAC_peaks_EDM_TSS1KB) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid")

ATAC_NPC_AR_table_byYY1 <-  mutate(ATAC_NPC_AR_table_byYY1,
                                     dist_prom = case_when(
                                       ATAC_NPC_AR_table_byYY1$Peakid %in% ATAC_peaks_EDM_TSS1KB$Peakid ~ "prom",
                                       TRUE ~ "distal") )

YY1target_box_prom <- subset(ATAC_NPC_AR_table_byYY1, dist_prom %in% c("prom")) %>%
  mutate(YY1target = factor(YY1target, levels=c("nonYY1", "YY1bound"))) %>%
  ggplot(aes(x=YY1target,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1target_box_distal <- subset(ATAC_NPC_AR_table_byYY1, dist_prom %in% c("distal")) %>%
  mutate(YY1target = factor(YY1target, levels=c("nonYY1", "YY1bound"))) %>%
  ggplot(aes(x=YY1target,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
grid.arrange(YY1target_box_prom, YY1target_box_distal, nrow = 1)

#YY1 motif group
comparisons <- list(c("NoYY1motif","SingleYY1motif"),c("SingleYY1motif","MultipleYY1motif"))
YY1motif_group_box <- mutate(ATAC_NPC_AR_table_byYY1, YY1motif_group = factor(YY1motif_group, levels=c("NoYY1motif","SingleYY1motif","MultipleYY1motif"))) %>%
  ggplot(aes(x=YY1motif_group,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1motif_group_box

3. pie chart, by depleted vs persistent peaks

#YY1 vs non-YY1
depleted_pie <- subset(ATAC_NPC_AR_table_byYY1, per_v_dep %in% c("depleted_peak"))
depleted_YY1count <- c(length(which(depleted_pie$YY1target == "YY1bound")),
                       length(which(depleted_pie$YY1target == "nonYY1")))
depleted_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), depleted_YY1count )

persistent_pie <- subset(ATAC_NPC_AR_table_byYY1, per_v_dep %in% c("persistent_peak"))
persistent_YY1count <- c(length(which(persistent_pie$YY1target == "YY1bound")),
                         length(which(persistent_pie$YY1target == "nonYY1")))
persistent_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), persistent_YY1count )

pie_depleted <- ggplot(depleted_pie, aes(x="", y=depleted_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("depleted-peaks") +
  geom_text(aes(label = paste(depleted_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_persistent <- ggplot(persistent_pie, aes(x="", y=persistent_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("persistent-peaks") +
  geom_text(aes(label = paste(persistent_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

YY1target_pie <- grid.arrange(pie_depleted, pie_persistent, nrow = 1)

#By YY1motif
depleted_pie <- subset(ATAC_NPC_AR_table_byYY1, per_v_dep %in% c("depleted_peak"))
depleted_YY1count <- depleted_pie %>% dplyr::count(YY1motifs) 
persistent_pie <- subset(ATAC_NPC_AR_table_byYY1, per_v_dep %in% c("persistent_peak"))
persistent_YY1count <- persistent_pie %>% dplyr::count(YY1motifs) 

#total numbers of YY1 motifs in persisten vs depeleted peaks:
sum(persistent_pie$YY1motifs) 
## [1] 239
sum(depleted_pie$YY1motifs)
## [1] 112
pie_depleted <- ggplot(depleted_YY1count, aes(x="", y=n, fill=as.character(YY1motifs))) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("depleted-peaks") +
  geom_text(aes(label = unlist(depleted_YY1count[2])), position = position_stack(vjust = 0.5)) #+
#coord_polar("y", start=0) + theme_void()
pie_persistent <- ggplot(persistent_YY1count, aes(x="", y=n, fill=as.character(YY1motifs))) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("persistent-peaks") +
  geom_text(aes(label = unlist(persistent_YY1count[2])), position = position_stack(vjust = 0.5)) #+
#coord_polar("y", start=0) + theme_void()
grid.arrange(pie_depleted, pie_persistent, nrow = 1)

4. pie chart, by category

#YY1 vs non-YY1
nonYY1_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1target %in% c("nonYY1"))
nonYY1_count <- c(length(which(nonYY1_pie$per_v_dep == "persistent_peak")),
                  length(which(nonYY1_pie$per_v_dep == "depleted_peak")))
nonYY1_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), nonYY1_count )

YY1_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1target %in% c("YY1bound"))
YY1_count <- c(length(which(YY1_pie$per_v_dep == "persistent_peak")),
               length(which(YY1_pie$per_v_dep == "depleted_peak")))
YY1_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), YY1_count )

pie_nonYY1 <- ggplot(nonYY1_pie, aes(x="", y=nonYY1_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("nonYY1") +
  geom_text(aes(label = paste(nonYY1_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

pie_YY1 <- ggplot(YY1_pie, aes(x="", y=YY1_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("YY1") +
  geom_text(aes(label = paste(YY1_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

per_v_dep_pie <- grid.arrange(pie_nonYY1, pie_YY1, nrow = 1)

#By YY1motif
NoYY1motif_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1motif_group %in% c("NoYY1motif"))
NoYY1motif_count <- c(length(which(NoYY1motif_pie$per_v_dep == "persistent_peak")),
                  length(which(NoYY1motif_pie$per_v_dep == "depleted_peak")))
NoYY1motif_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), NoYY1motif_count )

SingleYY1motif_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1motif_group %in% c("SingleYY1motif"))
SingleYY1motif_count <- c(length(which(SingleYY1motif_pie$per_v_dep == "persistent_peak")),
                      length(which(SingleYY1motif_pie$per_v_dep == "depleted_peak")))
SingleYY1motif_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), SingleYY1motif_count )

MultipleYY1motif_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1motif_group %in% c("MultipleYY1motif"))
MultipleYY1motif_count <- c(length(which(MultipleYY1motif_pie$per_v_dep == "persistent_peak")),
                          length(which(MultipleYY1motif_pie$per_v_dep == "depleted_peak")))
MultipleYY1motif_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), MultipleYY1motif_count )

pie_NoYY1motif <- ggplot(NoYY1motif_pie, aes(x="", y=NoYY1motif_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("NoYY1motif") +
  geom_text(aes(label = paste(NoYY1motif_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

pie_SingleYY1motif <- ggplot(SingleYY1motif_pie, aes(x="", y=SingleYY1motif_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("SingleYY1motif") +
  geom_text(aes(label = paste(SingleYY1motif_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

pie_MultipleYY1motif <- ggplot(MultipleYY1motif_pie, aes(x="", y=MultipleYY1motif_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("MultipleYY1motif") +
  geom_text(aes(label = paste(MultipleYY1motif_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

per_v_dep_pie <- grid.arrange(pie_NoYY1motif, pie_SingleYY1motif, pie_MultipleYY1motif, nrow = 1)

ChrRNA-seq:

Load published chrRNA-seq data

#Load AR tables, keep only WT samples, and merge replicates
C7H8_AR_table <- read_delim(paste0(data_path,"C7H8_AR_table_merged_discreet.txt"), #replace these with GEO file names later
                            delim = "\t", escape_double = FALSE, trim_ws = TRUE)
A11B2_AR_table <- read_delim(paste0(data_path, "A11B2_AR_table_merged_discreet.txt"), #replace these with GEO file names later
                            delim = "\t", escape_double = FALSE, trim_ws = TRUE)

#Load GeneInfo tables
GeneInfo_A11B2 <- read_delim(paste0(data_path,"GeneInfo_A11B2.txt"), 
                             "\t", escape_double = FALSE, col_names = TRUE, trim_ws = TRUE)
GeneInfo_C7H8 <- read_delim(paste0(data_path,"GeneInfo_C7H8.txt"), 
                            "\t", escape_double = FALSE, col_names = TRUE, trim_ws = TRUE)
#make halftimes in days rather than hours
GeneInfo_A11B2$Halftime <- (1/24)*GeneInfo_A11B2$Halftime
GeneInfo_C7H8$Halftime <- (1/24)*GeneInfo_C7H8$Halftime

Direct YY1-target genes

#Genes_Nonredundant_GenomeWide_TSS.gtf is extracted from genome annotation gtf used for chrRNA-seq analysis
gtf2bed < Genes_Nonredundant_GenomeWide_TSS.gtf > Genes_Nonredundant_GenomeWide_TSS.bed
bedtools closest -d -t all -a Genes_Nonredundant_GenomeWide_TSS.bed -b /path/to/YY1ab_ConsensusPeaks.rmInput.bed > GeneTSS_closestYY1peak.bed
#Only take lines where there is exact overlap between TSS and YY1 peak (ie. YY1 in promoter)
awk -F "\t" '{ if ($17 < 1) { print } }' GeneTSS_closestYY1peak.bed > GeneTSS_YY1prom.bed
#Load file of YY1-bound TSS
GeneTSS_YY1prom <- read.delim(paste0(data_path,"GeneTSS_YY1prom.bed"), header=FALSE)
#Append YY1target to C7H8_AR_table
C7H8_AR_table_byYY1 <-  mutate(C7H8_AR_table,
                                   YY1target = case_when(
                                     C7H8_AR_table$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                                     !(C7H8_AR_table$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )

C7H8_AR_YY1 <- C7H8_AR_table_byYY1[which(C7H8_AR_table_byYY1$YY1target == "YY1"),]
C7H8_AR_nonYY1 <- C7H8_AR_table_byYY1[which(C7H8_AR_table_byYY1$YY1target == "nonYY1"),]

Plot boxplots- chrRNA YY1targets vs non-targets

colnames(C7H8_AR_table_byYY1)
##  [1] "Geneid"    "Chr"       "Start"     "End"       "Strand"    "Length"   
##  [7] "0h"        "24h"       "48h"       "72h"       "120h"      "168h"     
## [13] "NPC"       "YY1target"
C7H8_AR_table_byYY1_melt <- melt(as.data.frame(na.omit(C7H8_AR_table_byYY1)), id=c("Geneid","Chr","Start","End","Strand","Length","YY1target"), stringsAsFactors = FALSE)
colnames(C7H8_AR_table_byYY1_melt)[(ncol(C7H8_AR_table_byYY1_melt)-1):ncol(C7H8_AR_table_byYY1_melt)] <- c("sample","ratio")
C7H8_AR_table_byYY1_melt$sample <- as.character(C7H8_AR_table_byYY1_melt$sample)

#plot boxplots
AR_YY1target <- mutate(C7H8_AR_table_byYY1_melt, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  mutate(sample = factor(sample, levels=c("0h","24h","48h","72h","120h","168h","NPC"))) %>%
  ggplot(aes(x=sample,y=ratio,fill=YY1target) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  geom_hline(yintercept=0.5, linetype="dashed") 
AR_YY1target

Plot Ribbon plot - chrRNA YY1targets vs non-targets

C7H8_AR_table_timepoints <- c(0,1,2,3,5,7,18)
colnames(C7H8_AR_table)[7:13] <- as.numeric(C7H8_AR_table_timepoints)

C7H8_AR_YY1summary <- as.array(apply(C7H8_AR_YY1[,7:13], 2, summary))
C7H8_AR_YY1summary_t <- data.frame(C7H8_AR_table_timepoints, t(C7H8_AR_YY1summary))
colnames(C7H8_AR_YY1summary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

C7H8_AR_nonYY1summary <- as.array(apply(na.omit(C7H8_AR_nonYY1[,7:13]), 2, summary))
C7H8_AR_nonYY1summary_t <- data.frame(C7H8_AR_table_timepoints, t(C7H8_AR_nonYY1summary))
colnames(C7H8_AR_nonYY1summary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

ggplot() +  theme_light() + 
  coord_cartesian(xlim =c(0,10), ylim = c(0,0.8),expand = FALSE) +
  #scale_y_continuous(limits = c(0,0.8), expand = c(0, 0)) + scale_x_continuous(limits = c(0,17), expand = c(0, 0)) +
  #YY1
  geom_ribbon(data=C7H8_AR_YY1summary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
  geom_point(data=C7H8_AR_YY1summary_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=C7H8_AR_YY1summary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
  #nonYY1
  geom_ribbon(data=C7H8_AR_nonYY1summary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.2, fill="blue") +
  geom_point(data=C7H8_AR_nonYY1summary_t, aes(x=time,y=Median, group=1), colour="blue") +
  geom_line(data=C7H8_AR_nonYY1summary_t, aes(x=time,y=Median, group=1), colour="blue", linetype="dotted")

Append YY1targets to GeneInfo and plot boxplots by GeneHalftime

C7H8_AR_table_byYY1_Info <- merge(C7H8_AR_table_byYY1, GeneInfo_C7H8)

#Just YY1 v non-YY1
comparisons <- list(c("nonYY1", "YY1"))
YY1target_box <- mutate(C7H8_AR_table_byYY1_Info, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=Halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1target_box

Pie charts of genes by YY1 targets and YY1 groups

#By YY1target
fast_pie <- subset(C7H8_AR_table_byYY1_Info, SilencingRate %in% c("fastsilencing"))
fast_YY1count <- c(length(which(fast_pie$YY1target == "YY1")),
                              length(which(fast_pie$YY1target == "nonYY1")))
fast_pie <- data.frame(YY1target=c("YY1", "nonYY1"), fast_YY1count )

medium_pie <- subset(C7H8_AR_table_byYY1_Info, SilencingRate %in% c("mediumsilencing"))
medium_YY1count <- c(length(which(medium_pie$YY1target == "YY1")),
                   length(which(medium_pie$YY1target == "nonYY1")))
medium_pie <- data.frame(YY1target=c("YY1", "nonYY1"), medium_YY1count )

slow_pie <- subset(C7H8_AR_table_byYY1_Info, SilencingRate %in% c("slowsilencing"))
slow_YY1count <- c(length(which(slow_pie$YY1target == "YY1")),
                   length(which(slow_pie$YY1target == "nonYY1")))
slow_pie <- data.frame(YY1target=c("YY1", "nonYY1"), slow_YY1count )

pie_fast <- ggplot(fast_pie, aes(x="", y=fast_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("fast-silencing") +
  geom_text(aes(label = paste(fast_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_medium <- ggplot(medium_pie, aes(x="", y=medium_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("medium-silencing") +
  geom_text(aes(label = paste(medium_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_slow <- ggplot(slow_pie, aes(x="", y=slow_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("slow-silencing") +
  geom_text(aes(label = paste(slow_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

YY1target_pie <- grid.arrange(pie_fast, pie_medium, pie_slow, nrow = 1)

iXist-ChrX-Dom ChrRNA by YY1

# Append YY1target to A11B2_AR_table 
A11B2_AR_table_byYY1 <-  mutate(A11B2_AR_table,
                               YY1target = case_when(
                                 A11B2_AR_table$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                                 !(A11B2_AR_table$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )

A11B2_AR_YY1 <- A11B2_AR_table_byYY1[which(A11B2_AR_table_byYY1$YY1target == "YY1"),]
A11B2_AR_nonYY1 <- A11B2_AR_table_byYY1[which(A11B2_AR_table_byYY1$YY1target == "nonYY1"),]

AR boxplot

#chrRNA YY1targets vs non-targets
colnames(A11B2_AR_table_byYY1)
##  [1] "Geneid"    "Chr"       "Start"     "End"       "Strand"    "Length"   
##  [7] "0h"        "24h"       "48h"       "72h"       "120h"      "168h"     
## [13] "NPC"       "YY1target"
A11B2_AR_table_byYY1_melt <- melt(as.data.frame(na.omit(A11B2_AR_table_byYY1)), id=c("Geneid","Chr","Start","End","Strand","Length","YY1target"), stringsAsFactors = FALSE)
colnames(A11B2_AR_table_byYY1_melt)[(ncol(A11B2_AR_table_byYY1_melt)-1):ncol(A11B2_AR_table_byYY1_melt)] <- c("sample","ratio")
A11B2_AR_table_byYY1_melt$sample <- as.character(A11B2_AR_table_byYY1_melt$sample)

#plot boxplots
AR_YY1target <- mutate(A11B2_AR_table_byYY1_melt, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  mutate(sample = factor(sample, levels=c("0h","24h","48h","72h","120h","168h","NPC"))) %>%
  ggplot(aes(x=sample,y=ratio,fill=YY1target) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  geom_hline(yintercept=0.5, linetype="dashed") 
AR_YY1target

Ribbon plot

#chrRNA YY1targets vs non-targets
A11B2_AR_table_timepoints <- c(0,1,2,3,5,7,18)
colnames(A11B2_AR_table)[7:13] <- as.numeric(A11B2_AR_table_timepoints)

A11B2_AR_YY1summary <- as.array(apply(A11B2_AR_YY1[,7:13], 2, summary))
A11B2_AR_YY1summary_t <- data.frame(A11B2_AR_table_timepoints, t(A11B2_AR_YY1summary))
colnames(A11B2_AR_YY1summary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

A11B2_AR_nonYY1summary <- as.array(apply(na.omit(A11B2_AR_nonYY1[,7:13]), 2, summary))
A11B2_AR_nonYY1summary_t <- data.frame(A11B2_AR_table_timepoints, t(A11B2_AR_nonYY1summary))
colnames(A11B2_AR_nonYY1summary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

ggplot() +  theme_light() + 
  coord_cartesian(xlim =c(0,10), ylim = c(0,0.8),expand = FALSE) +
  #scale_y_continuous(limits = c(0,0.8), expand = c(0, 0)) + scale_x_continuous(limits = c(0,17), expand = c(0, 0)) +
  #YY1
  geom_ribbon(data=A11B2_AR_YY1summary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
  geom_point(data=A11B2_AR_YY1summary_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=A11B2_AR_YY1summary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
  #nonYY1
  geom_ribbon(data=A11B2_AR_nonYY1summary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.2, fill="blue") +
  geom_point(data=A11B2_AR_nonYY1summary_t, aes(x=time,y=Median, group=1), colour="blue") +
  geom_line(data=A11B2_AR_nonYY1summary_t, aes(x=time,y=Median, group=1), colour="blue", linetype="dotted")

Gene halftime boxplot

# Append YY1targets to GeneInfo and plot boxplots by GeneHalftime
A11B2_AR_table_byYY1_Info <- merge(A11B2_AR_table_byYY1,GeneInfo_A11B2)

#Just YY1 v non-YY1
comparisons <- list(c("nonYY1", "YY1"))
YY1target_box <- mutate(A11B2_AR_table_byYY1_Info, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=Halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  scale_y_continuous(expand = c(0, 0.5), limits = c(0, 6)) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1target_box

Pie charts of genes by YY1 targets and YY1 groups

#By YY1target
fast_pie <- subset(A11B2_AR_table_byYY1_Info, SilencingRate %in% c("fastsilencing"))
fast_YY1count <- c(length(which(fast_pie$YY1target == "YY1")),
                   length(which(fast_pie$YY1target == "nonYY1")))
fast_pie <- data.frame(YY1target=c("YY1", "nonYY1"), fast_YY1count )

medium_pie <- subset(A11B2_AR_table_byYY1_Info, SilencingRate %in% c("mediumsilencing"))
medium_YY1count <- c(length(which(medium_pie$YY1target == "YY1")),
                     length(which(medium_pie$YY1target == "nonYY1")))
medium_pie <- data.frame(YY1target=c("YY1", "nonYY1"), medium_YY1count )

slow_pie <- subset(A11B2_AR_table_byYY1_Info, SilencingRate %in% c("slowsilencing"))
slow_YY1count <- c(length(which(slow_pie$YY1target == "YY1")),
                   length(which(slow_pie$YY1target == "nonYY1")))
slow_pie <- data.frame(YY1target=c("YY1", "nonYY1"), slow_YY1count )

pie_fast <- ggplot(fast_pie, aes(x="", y=fast_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("fast-silencing") +
  geom_text(aes(label = paste(fast_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_medium <- ggplot(medium_pie, aes(x="", y=medium_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("medium-silencing") +
  geom_text(aes(label = paste(medium_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_slow <- ggplot(slow_pie, aes(x="", y=slow_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("slow-silencing") +
  geom_text(aes(label = paste(slow_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

YY1target_pie <- grid.arrange(pie_fast, pie_medium, pie_slow, nrow = 1)

mESC silencing by YY1 target

#Load files
C7H8_mESCd10_AR <-  read_delim(paste0(data_path,"iXist-ChrX-Cast_mESC_d10_AllelicRatio_table.txt"), 
                               delim = "\t", escape_double = FALSE, trim_ws = TRUE)
A11B2_mESCd10_AR <- read_delim(paste0(data_path,"iXist-ChrX-Dom_mESC_d10_AllelicRatio_table.txt"), 
                              delim = "\t", escape_double = FALSE, trim_ws = TRUE)

#C7H8
C7H8_ES_d10 <- (1/3)*rowSums(C7H8_mESCd10_AR[,7:9])
C7H8_mESCd10_AR <- cbind(C7H8_mESCd10_AR[,c(1:6)], C7H8_ES_d10)
rm(C7H8_ES_d10)
C7H8_mESCd10_AR_byYY1 <- mutate(C7H8_mESCd10_AR,
                                YY1target = case_when(
                                  C7H8_mESCd10_AR$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                                  !(C7H8_mESCd10_AR$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )
C7H8_mESCd10_AR_byYY1_melt <- melt(as.data.frame(na.omit(C7H8_mESCd10_AR_byYY1)), id=c("Geneid","Chr","Start","End","Strand","Length","YY1target"), stringsAsFactors = FALSE)
colnames(C7H8_mESCd10_AR_byYY1_melt)[(ncol(C7H8_mESCd10_AR_byYY1_melt)-1):ncol(C7H8_mESCd10_AR_byYY1_melt)] <- c("sample","ratio")
C7H8_mESCd10_AR_byYY1_melt$sample <- as.character(C7H8_mESCd10_AR_byYY1_melt$sample)

#plot boxplots
comparisons <- list(c("nonYY1","YY1"))
mutate(C7H8_mESCd10_AR_byYY1_melt, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=ratio) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(-0.05, 1.05)) +
  geom_hline(yintercept=0.5, linetype="dashed") +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") + stat_summary(fun.data = give.n, geom = "text")

#A11B2
A11B2_ES_d10 <- (1/4)*rowSums(A11B2_mESCd10_AR[,7:10])
A11B2_mESCd10_AR <- cbind(A11B2_mESCd10_AR[,c(1:6)], A11B2_ES_d10)
rm(A11B2_ES_d10)
A11B2_mESCd10_AR_byYY1 <- mutate(A11B2_mESCd10_AR,
                                YY1target = case_when(
                                  A11B2_mESCd10_AR$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                                  !(A11B2_mESCd10_AR$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )
A11B2_mESCd10_AR_byYY1_melt <- melt(as.data.frame(na.omit(A11B2_mESCd10_AR_byYY1)), id=c("Geneid","Chr","Start","End","Strand","Length","YY1target"), stringsAsFactors = FALSE)
colnames(A11B2_mESCd10_AR_byYY1_melt)[(ncol(A11B2_mESCd10_AR_byYY1_melt)-1):ncol(A11B2_mESCd10_AR_byYY1_melt)] <- c("sample","ratio")
A11B2_mESCd10_AR_byYY1_melt$sample <- as.character(A11B2_mESCd10_AR_byYY1_melt$sample)

#plot boxplots
comparisons <- list(c("nonYY1","YY1"))
mutate(A11B2_mESCd10_AR_byYY1_melt, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=ratio) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(-0.05, 1.05)) +
  geom_hline(yintercept=0.5, linetype="dashed") +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") + stat_summary(fun.data = give.n, geom = "text")

Analysis for Revisions

Reviewer suggestion 1

Investigation by association between YY1 binding and gene expression level

  • Here I look at the cross-associations between gene silencing rate, initial expression level and YY1 binding.
  • First, I investigate if YY1 targets on average, more highly expressed than non-YY1 targets genome-wide. I load in our previous data of mRNA-seq in iXist-ChrX GSE185869 to calculate TPM for all genes in the genome, and compare this with the list of all YY1 target genes in the gneome from GeneTSS_YY1prom. YY1 target genes indeed tend to be more highly expressed, both genome-wide and on ChrX.
  • I then look within the C7H8_AR_table_byYY1_Info table (master table of chrRNA-seq gene silencing anaysis) if YY1 target genes are slower-silencing whilst controlling for initial expression levels (ie. within categories of high/medium/low-expressed genes). Within the group of medium-expressed genes in particular, YY1 targets remain signficantly slower to silence.
#Genes which meet criteria for allelic analysis genome-wide            
GenomeWide_Allelic_Genes <- read_delim(paste0(data_path,"GenomeWide_Allelic_Genes_ARtable.txt"), 
    delim = "\t", escape_double = FALSE, 
    col_types = cols_only(Geneid = col_guess()), 
    trim_ws = TRUE)

#TPM genome wide
A11B2_mRNA_0h_count <- read_table2(paste0(data_path,"A11B2_mRNA_0h_count_exon"),skip = 1)
A11B2_mRNA_0h_count <- cbind(A11B2_mRNA_0h_count[1:6], 0.5*(A11B2_mRNA_0h_count[7] + A11B2_mRNA_0h_count[8]))
RPK <- sweep(A11B2_mRNA_0h_count[7:ncol(A11B2_mRNA_0h_count)], MARGIN=1,FUN="/",STATS=A11B2_mRNA_0h_count$Length)
RPK <- cbind(A11B2_mRNA_0h_count[1:6],RPK)
#Sum RPKs then divide through norm factor to make TPM
RPK_Sums <- colSums(RPK[7:ncol(RPK)])
RPK_NormFactors <- rep(RPK_Sums[seq(1,length(RPK_Sums),3)], each=3)/1000000
TPM_mRNA <- cbind(RPK[1:6],
                  sweep(RPK[7:ncol(RPK)], 2, RPK_NormFactors, "/"))
colnames(TPM_mRNA)[7] <- "GeneTPM"

#Add YY1 target annotated to genome-wide TPM file
TPM_mRNA <-  mutate(TPM_mRNA,
                    YY1target = case_when(
                      TPM_mRNA$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                      !(TPM_mRNA$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )

# subset by only genes which would pass allelic analysis
TPM_mRNA_sub <- subset(TPM_mRNA, Geneid %in% GenomeWide_Allelic_Genes$Geneid | 
                         Geneid %in% C7H8_AR_table$Geneid) 
table(TPM_mRNA_sub$YY1target)
## 
## nonYY1    YY1 
##  10329   1946
#how many YY1 targets on autosomes vs chrX?
table(subset(TPM_mRNA_sub, !grepl("chrX",TPM_mRNA_sub$Chr))$YY1target) #autosomes
## 
## nonYY1    YY1 
##   9983   1882
1882/(1882+9983)
## [1] 0.1586178
table(subset(TPM_mRNA_sub, grepl("chrX",TPM_mRNA_sub$Chr))$YY1target) #chrX
## 
## nonYY1    YY1 
##    346     64
64/(64+346)
## [1] 0.1560976
#subset by autosomes
YY1target_TPM_autosomes <- subset(TPM_mRNA_sub, !grepl("chrX",TPM_mRNA_sub$Chr)) %>%
  mutate(YY1target = factor(YY1target, levels=c("nonYY1", "YY1")))
#TPM of autosomes by if they are YY1 targets
YY1target_TPM_autosomes_box <-  ggplot(YY1target_TPM_autosomes, aes(x=YY1target,y=GeneTPM)) +   
  geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  scale_y_log10(expand = c(0, 0.5)) +
  stat_compare_means(comparisons = comparisons, method = "wilcox.test", label="p.format") +
  stat_summary(fun.data = give.n, geom = "text") 
YY1target_TPM_autosomes_box

#calculate median TPM of autosomes
median(subset(YY1target_TPM_autosomes, YY1target == "YY1")$GeneTPM)
## [1] 29.63787
nrow(subset(YY1target_TPM_autosomes, YY1target == "YY1"))
## [1] 1882
median(subset(YY1target_TPM_autosomes, YY1target == "nonYY1")$GeneTPM)
## [1] 8.037361
nrow(subset(YY1target_TPM_autosomes, YY1target == "nonYY1"))
## [1] 9983
#subset by chrX
YY1target_TPM_chrX <- subset(TPM_mRNA_sub, grepl("chrX",TPM_mRNA_sub$Chr)) %>%
  mutate(YY1target = factor(YY1target, levels=c("nonYY1", "YY1")))
#TPM of chrX by if they are YY1 targets
YY1target_TPM_chrX_box <-  ggplot(YY1target_TPM_chrX, aes(x=YY1target,y=GeneTPM)) +   
  geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  scale_y_log10(expand = c(0, 0.5)) +
  stat_compare_means(comparisons = comparisons, method = "wilcox.test", label="p.format") +
  stat_summary(fun.data = give.n, geom = "text") 
YY1target_TPM_chrX_box

#calculate median TPM of chrX genes
median(subset(YY1target_TPM_chrX, YY1target == "YY1")$GeneTPM)
## [1] 70.81174
nrow(subset(YY1target_TPM_chrX, YY1target == "YY1"))
## [1] 64
median(subset(YY1target_TPM_chrX, YY1target == "nonYY1")$GeneTPM)
## [1] 30.69529
nrow(subset(YY1target_TPM_chrX, YY1target == "nonYY1"))
## [1] 346
#within all YY1 target genes, is YY1 enrichment correlated with expression level
TPM_mRNA_minimal <- TPM_mRNA[,c("Geneid","GeneTPM")]
YY1genes_minimal <- GeneTSS_YY1prom[,c(4,15)]
colnames(YY1genes_minimal) <- c("Geneid","YY1enrichment")
YY1_targets_TPM <- merge(YY1genes_minimal, TPM_mRNA_minimal, by = "Geneid", all.x = TRUE)
cor(YY1_targets_TPM$YY1enrichment, YY1_targets_TPM$GeneTPM)
## [1] -0.05388916
scatter_YY1_TPM <- ggplot(YY1_targets_TPM, aes(x=YY1enrichment, y=GeneTPM)) + 
  geom_point(shape=16) + theme_bw() +
  scale_y_log10()  + 
  scale_x_log10()  +
  stat_cor(method="spearman") 
scatter_YY1_TPM

#Initial TPM by YY1 v non-YY1 in ChrX data
#replace TPM in GenInfo table by TPM re-calculated above
C7H8_AR_table_byYY1_Info$TPM
## NULL
TPM_replace <- subset(TPM_mRNA, Geneid %in% C7H8_AR_table_byYY1_Info$Geneid)
C7H8_AR_table_byYY1_Info <- merge(C7H8_AR_table_byYY1_Info,TPM_replace[,c("Geneid","GeneTPM")])
#I do not know why the old and new TPM values are not identical, but they are extremely well correlated (R=0.99) so the differences are negligible
cor(C7H8_AR_table_byYY1_Info$InitialTPM, C7H8_AR_table_byYY1_Info$GeneTPM)
## [1] 0.9923476
comparisons <- list(c("nonYY1", "YY1"))
#plot boxplot
YY1target_TPM_box <- mutate(C7H8_AR_table_byYY1_Info, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=GeneTPM)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  scale_y_log10(expand = c(0, 0.5)) +
  stat_compare_means(comparisons = comparisons, method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text") 
YY1target_TPM_box

#calculate medians
median(subset(C7H8_AR_table_byYY1_Info, YY1target == "YY1")$GeneTPM)
## [1] 70.81174
nrow(subset(C7H8_AR_table_byYY1_Info, YY1target == "YY1"))
## [1] 64
median(subset(C7H8_AR_table_byYY1_Info, YY1target == "nonYY1")$GeneTPM)
## [1] 31.4976
nrow(subset(C7H8_AR_table_byYY1_Info, YY1target == "nonYY1"))
## [1] 335
#plot by YY1-target controlling for the same expression group
 Halftime_byExpression_byYY1 <- mutate(C7H8_AR_table_byYY1_Info, 
                                       Expression_group = factor(Expression_group, levels=c("lowexpressed","mediumexpressed","highexpressed"))) %>%
  ggplot(aes(x=Expression_group,y=Halftime,fill=YY1target) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  stat_compare_means(method = "wilcox.test", label="p.format",label.y=1) + 
   stat_summary(fun.data = give.n, geom = "text")
Halftime_byExpression_byYY1

Reviewer suggestion 2

Stratify peaks by quartiles rather than a binary persistent/depleted categorisation

  • The assumption that biallelically accessible peaks in NPCs = persistent peaks (by kinetics of accessibility decrease) may not hold for a quartile-based strategy. Thus, for this analysis I will keep these biallelic peaks as a separate (n=133), and then group the remaining 662 peaks into 4 groups of slow (aS) | intermediate (aI)| fast (aF) | biallalic (aB) accessibility decrease. The a stands for “accessibility” and distinguishes from fast/intermediate/slow silencing genes.
  • I then see with these fine-grained categorisation of REs whether numbers of YY1 motifs and YY1 / non-YY1 increase in these four categories. The association of YY1 binding with slower loss of accessibility holds regardless of analysis strategy.
#make equal-sized peak-accessbility groups
fast_threshold <- 2.39
slow_threshold <- 6.03
ATAC_NPC_AR_table_byYY1 <-  mutate(ATAC_NPC_AR_table_byYY1,
                                   four_categ = case_when(
                                     is.na(ATAC_NPC_AR_table_byYY1$peak_halftime) ~ "aBiallelic",
                                     ATAC_NPC_AR_table_byYY1$peak_halftime > slow_threshold ~ "aSlow",
                                     ATAC_NPC_AR_table_byYY1$peak_halftime < slow_threshold & 
                                       ATAC_NPC_AR_table_byYY1$peak_halftime > fast_threshold ~ "aIntermediate",
                                     ATAC_NPC_AR_table_byYY1$peak_halftime < fast_threshold ~ "aFast"))
#Distributions of the four categories in YY1 vs non-YY1 peaks
nonYY1_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1target %in% c("nonYY1"))
nonYY1_count <- c(length(which(nonYY1_pie$four_categ == "aBiallelic")),
                  length(which(nonYY1_pie$four_categ == "aSlow")),
                  length(which(nonYY1_pie$four_categ == "aIntermediate")),
                  length(which(nonYY1_pie$four_categ == "aFast")))
nonYY1_pie <- data.frame(four_categ=c("aBiallelic", "aSlow","aIntermediate","aFast"), nonYY1_count )

YY1_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1target %in% c("YY1bound"))
YY1_count <- c(length(which(YY1_pie$four_categ == "aBiallelic")),
                  length(which(YY1_pie$four_categ == "aSlow")),
                  length(which(YY1_pie$four_categ == "aIntermediate")),
                  length(which(YY1_pie$four_categ == "aFast")))
YY1_pie <- data.frame(four_categ=c("aBiallelic", "aSlow","aIntermediate","aFast"), YY1_count )

nonYY1_pie <- mutate(nonYY1_pie, 
                       four_categ = factor(four_categ, levels=c("aBiallelic", "aSlow","aIntermediate","aFast")))
pie_nonYY1 <- ggplot(nonYY1_pie, aes(x="", y=nonYY1_count, fill=four_categ)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("nonYY1") +
  geom_text(aes(label = paste(nonYY1_count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_nonYY1

YY1_pie <- mutate(YY1_pie, 
                       four_categ = factor(four_categ, levels=c("aBiallelic", "aSlow","aIntermediate","aFast")))
pie_YY1 <- ggplot(YY1_pie, aes(x="", y=YY1_count, fill=four_categ)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("YY1") +
  geom_text(aes(label = paste(YY1_count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_YY1

#Distributions of YY1 vs non-YY1 within the four categories
#By YY1target
aFast_pie <- subset(ATAC_NPC_AR_table_byYY1, four_categ %in% c("aFast"))
aFast_YY1count <- c(length(which(aFast_pie$YY1target == "YY1bound")),
                              length(which(aFast_pie$YY1target == "nonYY1")))
aFast_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), aFast_YY1count )

aIntermediate_pie <- subset(ATAC_NPC_AR_table_byYY1, four_categ %in% c("aIntermediate"))
aIntermediate_YY1count <- c(length(which(aIntermediate_pie$YY1target == "YY1bound")),
                   length(which(aIntermediate_pie$YY1target == "nonYY1")))
aIntermediate_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), aIntermediate_YY1count )

aSlow_pie <- subset(ATAC_NPC_AR_table_byYY1, four_categ %in% c("aSlow"))
aSlow_YY1count <- c(length(which(aSlow_pie$YY1target == "YY1bound")),
                   length(which(aSlow_pie$YY1target == "nonYY1")))
aSlow_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), aSlow_YY1count )

aBiallelic_pie <- subset(ATAC_NPC_AR_table_byYY1, four_categ %in% c("aBiallelic"))
aBiallelic_YY1count <- c(length(which(aBiallelic_pie$YY1target == "YY1bound")),
                   length(which(aBiallelic_pie$YY1target == "nonYY1")))
aBiallelic_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), aBiallelic_YY1count )


pie_aFast <- ggplot(aFast_pie, aes(x="", y=aFast_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("aFast") +
  geom_text(aes(label = paste(aFast_YY1count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_aIntermediate <- ggplot(aIntermediate_pie, aes(x="", y=aIntermediate_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("aIntermediate") +
  geom_text(aes(label = paste(aIntermediate_YY1count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_aSlow <- ggplot(aSlow_pie, aes(x="", y=aSlow_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("aSlow") +
  geom_text(aes(label = paste(aSlow_YY1count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_aBiallelic <- ggplot(aBiallelic_pie, aes(x="", y=aBiallelic_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("aBiallelic") +
  geom_text(aes(label = paste(aBiallelic_YY1count)), position = position_stack(vjust = 0.5)) +
  theme_void()

YY1target_pie <- grid.arrange(pie_aFast, pie_aIntermediate, pie_aSlow, pie_aBiallelic, nrow = 1)

YY1target_pie
## TableGrob (1 x 4) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (1-1,4-4) arrange gtable[layout]

Reviewer suggestion 3

Compare YY1 target genes by SmcHD1-dependence categories

  • There seems to be some correlation between YY1 target genes and SmcHD1 dependence. However, this correspondence is not exact, and the fact that a different model system was used to define SmcHD1 dependence (SmcHD1 KO MEFs) makes this comparison less appropriate.
table(subset(C7H8_AR_table_byYY1_Info, YY1target == "YY1")$SmcHD1_dependence)
## 
##                MEF_escapee           SmcHD1_dependent 
##                          4                         13 
##       SmcHD1_not_dependent SmcHD1_partially_dependent 
##                          8                         34
table(subset(C7H8_AR_table_byYY1_Info, YY1target == "nonYY1")$SmcHD1_dependence)
## 
##                MEF_escapee           SmcHD1_dependent 
##                         14                         43 
##       SmcHD1_not_dependent SmcHD1_partially_dependent 
##                         93                        109
table(subset(C7H8_AR_table_byYY1_Info, SmcHD1_dependence == "SmcHD1_dependent")$YY1target)
## 
## nonYY1    YY1 
##     43     13
table(subset(C7H8_AR_table_byYY1_Info, SmcHD1_dependence == "MEF_escapee")$YY1target)
## 
## nonYY1    YY1 
##     14      4