ATAC-seq: association of persistent peaks with YY1 binding
Plots of ATAC peaks by YY1 binding and no of YY1 motifs
ChrRNA-seq:
iXist-ChrX-Dom ChrRNA by YY1
mESC silencing by YY1 target
Analysis for Revisions

Load required packages and functions

library(readr)
library(plyr)
library(dplyr)
library(reshape)
library(ggpubr)
library(gridExtra)
give.n <- function(x){
  return(c(y = 0, label = length(x)))}

ATAC-seq: association of persistent peaks with YY1 binding

Load files from previous analyses

The iXist-ChrX-Dom “GeneInfo” file can be downloaded from this link. It needs to be converted from .xlsx to .txt file type to load in R. Note that A11B2 is the clone synonym of the iXist-ChrX-Dom cell line.
Find nearest YY1 ChIP-seq peak to ATAC-seq peak set (“ATAC_peaks_EDM.bed”) using bedtools closest. YY1 peaks in iXist-ChrX mESCs (called by macs2 over ChIP-seq peaks using an endogenous anti-YY1 antibody) can be found at GSE240682.

#sort ATAC_peaks_EDM
bedtools sort -i ATAC_peaks_EDM.bed > ATAC_peaks_EDM.sort.bed
bedtools closest -d -t all -a ATAC_peaks_EDM.sort.bed -b /path/to/YY1ab_ConsensusPeaks.rmInput.bed > ATAC_peaks_EDM_closestYY1ab.bed
awk '{ if ($13 < 1) { print } }' ATAC_peaks_EDM_closestYY1ab.bed > ATAC_peaks_EDM_YY1bound_YY1ab.bed

data_path <- "/path/to/folder/with/chrRNA/data/"
ATAC_analysis_path <- "/path/to/your/ATAC/analysis/folder/"

GeneInfo_A11B2 <- read_delim(paste0(data_path,"GeneInfo_A11B2.txt"), "\t", escape_double = FALSE, col_names = TRUE, trim_ws = TRUE)

ATAC_NPC_AR_table_merge <- read_delim(paste0(ATAC_analysis_path,"ATAC_NPC_AR_table_merge.txt"),
                                             delim = "\t", escape_double = FALSE,  trim_ws = TRUE)
ATAC_peaks_EDM_closestYY1 <- read_delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM_closestYY1ab.bed"), 
                                        delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
colnames(ATAC_peaks_EDM_closestYY1) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid",
                                         "YY1peak_chr","YY1peak_Start","YY1peak_End","info","YY1peakScore","info2","distYY1")

Label ATAC peaks by YY1 binding

#By YY1-target yes/no
ATAC_NPC_AR_table_merge <- subset(ATAC_NPC_AR_table_merge,ATAC_NPC_AR_table_merge$Peakid %in% ATAC_peaks_EDM_closestYY1$Peakid)
ATAC_peaks_EDM_closestYY1 <- subset(ATAC_peaks_EDM_closestYY1, select = -c(info,info2,YY1peak_chr) )
ATAC_NPC_AR_table_byYY1 <- merge(ATAC_NPC_AR_table_merge,ATAC_peaks_EDM_closestYY1)

#Use only unique ATAC-peaks for plots
ATAC_NPC_AR_table_byYY1 <- ATAC_NPC_AR_table_byYY1 %>% 
  arrange(Peakid, dplyr::desc(YY1peakScore))
ATAC_NPC_AR_table_byYY1 <- ddply(ATAC_NPC_AR_table_byYY1, c("Peakid"), head, 1)

#Add extra column of YY1target
ATAC_NPC_AR_table_byYY1 <-  mutate(ATAC_NPC_AR_table_byYY1,
                                   YY1target = case_when(
                                     ATAC_NPC_AR_table_byYY1$distYY1 < 0.5 ~ "YY1bound",
                                     ATAC_NPC_AR_table_byYY1$distYY1 > 0.5 ~ "nonYY1"))

Label ATAC peaks by number of YY1 motifs

(Note. analysis by number of YY1 motifs is not displayed in the paper)

Count the numbers of YY1 motifs within each ATAC-seq peak using the FIMO online tool from MEME suite: https://meme-suite.org/meme/tools/fimo
Input files were the PWM for YY1 bidning in the JASPAR database (YY1_MA0095.2.meme) and a fasta file for all DNA sequence contents within ATAC peaks:

bedtools getfasta -fi /path/to/whole/genome/fasta/mm10.fa -bed ATAC_peaks_EDM.bed > ATAC_peaks_EDM.fa

Save output as tsv file
Count number of YY1 motifs in each ATAC-seq peak via awk in the command line:

#remove extra lines from tsv file 
tail -n +2 YY1_motifs_ATACpeaks_FIMO.tsv > YY1_motifs_ATACpeaks_FIMO2.tsv
head -n -4 YY1_motifs_ATACpeaks_FIMO2.tsv > YY1_motifs_ATACpeaks_FIMO3.tsv
#coerces tsv file into bed of motifs:
awk 'BEGIN { OFS="\t"; } { n = split($3, a, /[:-]/); print a[1],a[2]+$4,a[2]+$5,$6,$7,$9,$10 }' YY1_motifs_ATACpeaks_FIMO3.tsv > YY1_motifs_ATACpeaks_FIMO_strand.bed
awk 'BEGIN { OFS="\t"; } { n = split($3, a, /[:-]/); print a[1],a[2]+$4,a[2]+$5,$10,$7,$6,$9 }' YY1_motifs_ATACpeaks_FIMO3.tsv > YY1_motifs_ATACpeaks_FIMO_sequence.bed

rm YY1_motifs_ATACpeaks_FIMO2.tsv YY1_motifs_ATACpeaks_FIMO3.tsv

#Count number of YY1 motifs in each ATAC peak
bedtools intersect -wa -c -a ATAC_peaks_EDM.sort.bed -b YY1_motifs_ATACpeaks_FIMO_sequence.bed > ATAC_peaks_EDM.YY1motifCount.bed

Load this “ATAC_peaks_EDM.YY1motifCount.bed” into R to add extra column (“YY1motif_group”) to ATAC-seq peak table:

ATAC_peaks_EDM_YY1motifCount <- read_delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.YY1motifCount.bed"), 
                                           delim = "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE)
ATAC_peaks_EDM_YY1motifCount <- ATAC_peaks_EDM_YY1motifCount[,c(1:3,7)]
colnames(ATAC_peaks_EDM_YY1motifCount) <- c("Chr","Start","End","YY1motifs")

ATAC_NPC_AR_table_byYY1 <-  merge(ATAC_NPC_AR_table_byYY1, ATAC_peaks_EDM_YY1motifCount)

ATAC_NPC_AR_table_byYY1 <-  mutate(ATAC_NPC_AR_table_byYY1,
                                   YY1motif_group = case_when(
                                     ATAC_NPC_AR_table_byYY1$YY1motifs > 1 ~ "MultipleYY1motif",
                                     ATAC_NPC_AR_table_byYY1$YY1motifs > 0 ~ "SingleYY1motif",
                                     ATAC_NPC_AR_table_byYY1$YY1motifs < 1 ~ "NoYY1motif")  )
length(which(ATAC_NPC_AR_table_byYY1$YY1motif_group == "MultipleYY1motif"))

## [1] 69

length(which(ATAC_NPC_AR_table_byYY1$YY1motif_group == "SingleYY1motif"))

## [1] 157

length(which(ATAC_NPC_AR_table_byYY1$YY1motif_group == "NoYY1motif"))

## [1] 564

Plots of ATAC peaks by YY1 binding and no of YY1 motifs

1. allelic ratio boxplot

colnames(ATAC_NPC_AR_table_byYY1)

##  [1] "Chr"            "Start"          "End"            "Peakid"        
##  [5] "Strand"         "Length"         "NPC_0d"         "NPC_1d"        
##  [9] "NPC_3d"         "NPC_6d"         "NPC_17d"        "per_v_dep"     
## [13] "peak_halftime"  "YY1peak_Start"  "YY1peak_End"    "YY1peakScore"  
## [17] "distYY1"        "YY1target"      "YY1motifs"      "YY1motif_group"

ATAC_NPC_AR_table_byYY1_melt <- melt(ATAC_NPC_AR_table_byYY1, 
                                     id=c("Peakid","Chr","Start","End","Strand","Length",
                                          "per_v_dep","peak_halftime","YY1peak_Start","YY1peak_End",
                                          "YY1peakScore","distYY1","YY1target","YY1motifs", "YY1motif_group"),
                                     stringsAsFactors = FALSE)
colnames(ATAC_NPC_AR_table_byYY1_melt)[(ncol(ATAC_NPC_AR_table_byYY1_melt)-1):ncol(ATAC_NPC_AR_table_byYY1_melt)] <- c("sample","ratio")
ATAC_NPC_AR_table_byYY1_melt$sample <- as.character(ATAC_NPC_AR_table_byYY1_melt$sample)
#YY1 vs non-YY1
AR_YY1target <- mutate(ATAC_NPC_AR_table_byYY1_melt, 
                       YY1target = factor(YY1target, levels=c("nonYY1", "YY1bound"))) %>%
  mutate(sample = factor(sample, levels=c("NPC_0d","NPC_1d","NPC_3d","NPC_6d","NPC_17d"))) %>%
  ggplot(aes(x=sample,y=ratio,fill=YY1target) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  geom_hline(yintercept=0.5, linetype="dashed") 
AR_YY1target

#YY1 motif group
AR_YY1motif_group <- mutate(ATAC_NPC_AR_table_byYY1_melt, 
                      YY1motif_group = factor(YY1motif_group, levels=c("NoYY1motif","SingleYY1motif","MultipleYY1motif"))) %>%
  mutate(sample = factor(sample, levels=c("NPC_0d","NPC_1d","NPC_3d","NPC_6d","NPC_17d"))) %>%
  ggplot(aes(x=sample,y=ratio,fill=YY1motif_group) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  geom_hline(yintercept=0.5, linetype="dashed") 
AR_YY1motif_group

2. Accessibility halftime boxplot

#YY1 vs non-YY1
comparisons <- list(c("nonYY1", "YY1bound"))
YY1target_box <- mutate(ATAC_NPC_AR_table_byYY1, YY1target = factor(YY1target, levels=c("nonYY1", "YY1bound"))) %>%
  ggplot(aes(x=YY1target,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1target_box

#for promoters and distal elements separately
#Distal vs TSS1KB
ATAC_peaks_EDM_TSS1KB <- read.delim(paste0(ATAC_analysis_path,"ATAC_peaks_EDM.TSS1KB.bed"), header=FALSE)
colnames(ATAC_peaks_EDM_TSS1KB) <- c("Chr","Start","End","per_v_dep","peak_halftime","Peakid")

ATAC_NPC_AR_table_byYY1 <-  mutate(ATAC_NPC_AR_table_byYY1,
                                     dist_prom = case_when(
                                       ATAC_NPC_AR_table_byYY1$Peakid %in% ATAC_peaks_EDM_TSS1KB$Peakid ~ "prom",
                                       TRUE ~ "distal") )

YY1target_box_prom <- subset(ATAC_NPC_AR_table_byYY1, dist_prom %in% c("prom")) %>%
  mutate(YY1target = factor(YY1target, levels=c("nonYY1", "YY1bound"))) %>%
  ggplot(aes(x=YY1target,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1target_box_distal <- subset(ATAC_NPC_AR_table_byYY1, dist_prom %in% c("distal")) %>%
  mutate(YY1target = factor(YY1target, levels=c("nonYY1", "YY1bound"))) %>%
  ggplot(aes(x=YY1target,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
grid.arrange(YY1target_box_prom, YY1target_box_distal, nrow = 1)

#YY1 motif group
comparisons <- list(c("NoYY1motif","SingleYY1motif"),c("SingleYY1motif","MultipleYY1motif"))
YY1motif_group_box <- mutate(ATAC_NPC_AR_table_byYY1, YY1motif_group = factor(YY1motif_group, levels=c("NoYY1motif","SingleYY1motif","MultipleYY1motif"))) %>%
  ggplot(aes(x=YY1motif_group,y=peak_halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1motif_group_box

3. pie chart, by depleted vs persistent peaks

#YY1 vs non-YY1
depleted_pie <- subset(ATAC_NPC_AR_table_byYY1, per_v_dep %in% c("depleted_peak"))
depleted_YY1count <- c(length(which(depleted_pie$YY1target == "YY1bound")),
                       length(which(depleted_pie$YY1target == "nonYY1")))
depleted_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), depleted_YY1count )

persistent_pie <- subset(ATAC_NPC_AR_table_byYY1, per_v_dep %in% c("persistent_peak"))
persistent_YY1count <- c(length(which(persistent_pie$YY1target == "YY1bound")),
                         length(which(persistent_pie$YY1target == "nonYY1")))
persistent_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), persistent_YY1count )

pie_depleted <- ggplot(depleted_pie, aes(x="", y=depleted_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("depleted-peaks") +
  geom_text(aes(label = paste(depleted_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_persistent <- ggplot(persistent_pie, aes(x="", y=persistent_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("persistent-peaks") +
  geom_text(aes(label = paste(persistent_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

YY1target_pie <- grid.arrange(pie_depleted, pie_persistent, nrow = 1)

#By YY1motif
depleted_pie <- subset(ATAC_NPC_AR_table_byYY1, per_v_dep %in% c("depleted_peak"))
depleted_YY1count <- depleted_pie %>% dplyr::count(YY1motifs) 
persistent_pie <- subset(ATAC_NPC_AR_table_byYY1, per_v_dep %in% c("persistent_peak"))
persistent_YY1count <- persistent_pie %>% dplyr::count(YY1motifs) 

#total numbers of YY1 motifs in persisten vs depeleted peaks:
sum(persistent_pie$YY1motifs)

## [1] 239

sum(depleted_pie$YY1motifs)

## [1] 112

pie_depleted <- ggplot(depleted_YY1count, aes(x="", y=n, fill=as.character(YY1motifs))) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("depleted-peaks") +
  geom_text(aes(label = unlist(depleted_YY1count[2])), position = position_stack(vjust = 0.5)) #+
#coord_polar("y", start=0) + theme_void()
pie_persistent <- ggplot(persistent_YY1count, aes(x="", y=n, fill=as.character(YY1motifs))) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("persistent-peaks") +
  geom_text(aes(label = unlist(persistent_YY1count[2])), position = position_stack(vjust = 0.5)) #+
#coord_polar("y", start=0) + theme_void()
grid.arrange(pie_depleted, pie_persistent, nrow = 1)

4. pie chart, by category

#YY1 vs non-YY1
nonYY1_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1target %in% c("nonYY1"))
nonYY1_count <- c(length(which(nonYY1_pie$per_v_dep == "persistent_peak")),
                  length(which(nonYY1_pie$per_v_dep == "depleted_peak")))
nonYY1_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), nonYY1_count )

YY1_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1target %in% c("YY1bound"))
YY1_count <- c(length(which(YY1_pie$per_v_dep == "persistent_peak")),
               length(which(YY1_pie$per_v_dep == "depleted_peak")))
YY1_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), YY1_count )

pie_nonYY1 <- ggplot(nonYY1_pie, aes(x="", y=nonYY1_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("nonYY1") +
  geom_text(aes(label = paste(nonYY1_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

pie_YY1 <- ggplot(YY1_pie, aes(x="", y=YY1_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("YY1") +
  geom_text(aes(label = paste(YY1_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

per_v_dep_pie <- grid.arrange(pie_nonYY1, pie_YY1, nrow = 1)

#By YY1motif
NoYY1motif_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1motif_group %in% c("NoYY1motif"))
NoYY1motif_count <- c(length(which(NoYY1motif_pie$per_v_dep == "persistent_peak")),
                  length(which(NoYY1motif_pie$per_v_dep == "depleted_peak")))
NoYY1motif_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), NoYY1motif_count )

SingleYY1motif_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1motif_group %in% c("SingleYY1motif"))
SingleYY1motif_count <- c(length(which(SingleYY1motif_pie$per_v_dep == "persistent_peak")),
                      length(which(SingleYY1motif_pie$per_v_dep == "depleted_peak")))
SingleYY1motif_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), SingleYY1motif_count )

MultipleYY1motif_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1motif_group %in% c("MultipleYY1motif"))
MultipleYY1motif_count <- c(length(which(MultipleYY1motif_pie$per_v_dep == "persistent_peak")),
                          length(which(MultipleYY1motif_pie$per_v_dep == "depleted_peak")))
MultipleYY1motif_pie <- data.frame(per_v_dep=c("persistent_peak", "depleted_peak"), MultipleYY1motif_count )

pie_NoYY1motif <- ggplot(NoYY1motif_pie, aes(x="", y=NoYY1motif_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("NoYY1motif") +
  geom_text(aes(label = paste(NoYY1motif_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

pie_SingleYY1motif <- ggplot(SingleYY1motif_pie, aes(x="", y=SingleYY1motif_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("SingleYY1motif") +
  geom_text(aes(label = paste(SingleYY1motif_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

pie_MultipleYY1motif <- ggplot(MultipleYY1motif_pie, aes(x="", y=MultipleYY1motif_count, fill=per_v_dep)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("MultipleYY1motif") +
  geom_text(aes(label = paste(MultipleYY1motif_count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

per_v_dep_pie <- grid.arrange(pie_NoYY1motif, pie_SingleYY1motif, pie_MultipleYY1motif, nrow = 1)

ChrRNA-seq:

Allelic ratio tables for iXist-ChrX-Dom (aka C7H8) and iXist-ChrX-Cast (aka A11B2) are provided within GSE240684
Most of the samples in these allelic ratio tables can be found within GSE185843, along with methods of how the samples are processed to generate ‘allelic ratios’ for each gene. Exceptions are ES (d0 and d2) samples, which are taken from our previous publication and can be found at GSE119602.
Gene info tables can be downloaded from supplementary files of 10.1016/j.celrep.2022.110830
I intially did all the same analysis in this section with iXist-ChrX-Cast (C7H8) ChrRNA-seq data, simply because it allows for the analysis of a greater number of X-linked genes (n=399 compared to n=246) compared to iXist-ChrX-Dom (A11B2), which has a recombination of the chromosome after the Xist locus.
The same analysis for iXist-ChrX-Dom (A11B2) is below and shows the same trends, confering more confidence in our results.

Load published chrRNA-seq data

#Load AR tables, keep only WT samples, and merge replicates
C7H8_AR_table <- read_delim(paste0(data_path,"C7H8_AR_table_merged_discreet.txt"), #replace these with GEO file names later
                            delim = "\t", escape_double = FALSE, trim_ws = TRUE)
A11B2_AR_table <- read_delim(paste0(data_path, "A11B2_AR_table_merged_discreet.txt"), #replace these with GEO file names later
                            delim = "\t", escape_double = FALSE, trim_ws = TRUE)

#Load GeneInfo tables
GeneInfo_A11B2 <- read_delim(paste0(data_path,"GeneInfo_A11B2.txt"), 
                             "\t", escape_double = FALSE, col_names = TRUE, trim_ws = TRUE)
GeneInfo_C7H8 <- read_delim(paste0(data_path,"GeneInfo_C7H8.txt"), 
                            "\t", escape_double = FALSE, col_names = TRUE, trim_ws = TRUE)
#make halftimes in days rather than hours
GeneInfo_A11B2$Halftime <- (1/24)*GeneInfo_A11B2$Halftime
GeneInfo_C7H8$Halftime <- (1/24)*GeneInfo_C7H8$Halftime

Direct YY1-target genes

I generated a file of Gene TSS which directly overlap YY1 peaks using bedtools closest as follows. This classified 2168 genes in the genome as putative “direct” YY1 targets.

#Genes_Nonredundant_GenomeWide_TSS.gtf is extracted from genome annotation gtf used for chrRNA-seq analysis
gtf2bed < Genes_Nonredundant_GenomeWide_TSS.gtf > Genes_Nonredundant_GenomeWide_TSS.bed
bedtools closest -d -t all -a Genes_Nonredundant_GenomeWide_TSS.bed -b /path/to/YY1ab_ConsensusPeaks.rmInput.bed > GeneTSS_closestYY1peak.bed
#Only take lines where there is exact overlap between TSS and YY1 peak (ie. YY1 in promoter)
awk -F "\t" '{ if ($17 < 1) { print } }' GeneTSS_closestYY1peak.bed > GeneTSS_YY1prom.bed

Load this file into R and append to allelic ratio tables of iXist-ChrX-Cast

#Load file of YY1-bound TSS
GeneTSS_YY1prom <- read.delim(paste0(data_path,"GeneTSS_YY1prom.bed"), header=FALSE)
#Append YY1target to C7H8_AR_table
C7H8_AR_table_byYY1 <-  mutate(C7H8_AR_table,
                                   YY1target = case_when(
                                     C7H8_AR_table$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                                     !(C7H8_AR_table$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )

C7H8_AR_YY1 <- C7H8_AR_table_byYY1[which(C7H8_AR_table_byYY1$YY1target == "YY1"),]
C7H8_AR_nonYY1 <- C7H8_AR_table_byYY1[which(C7H8_AR_table_byYY1$YY1target == "nonYY1"),]

Plot boxplots- chrRNA YY1targets vs non-targets

colnames(C7H8_AR_table_byYY1)

##  [1] "Geneid"    "Chr"       "Start"     "End"       "Strand"    "Length"   
##  [7] "0h"        "24h"       "48h"       "72h"       "120h"      "168h"     
## [13] "NPC"       "YY1target"

C7H8_AR_table_byYY1_melt <- melt(as.data.frame(na.omit(C7H8_AR_table_byYY1)), id=c("Geneid","Chr","Start","End","Strand","Length","YY1target"), stringsAsFactors = FALSE)
colnames(C7H8_AR_table_byYY1_melt)[(ncol(C7H8_AR_table_byYY1_melt)-1):ncol(C7H8_AR_table_byYY1_melt)] <- c("sample","ratio")
C7H8_AR_table_byYY1_melt$sample <- as.character(C7H8_AR_table_byYY1_melt$sample)

#plot boxplots
AR_YY1target <- mutate(C7H8_AR_table_byYY1_melt, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  mutate(sample = factor(sample, levels=c("0h","24h","48h","72h","120h","168h","NPC"))) %>%
  ggplot(aes(x=sample,y=ratio,fill=YY1target) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  geom_hline(yintercept=0.5, linetype="dashed") 
AR_YY1target

Plot Ribbon plot - chrRNA YY1targets vs non-targets

C7H8_AR_table_timepoints <- c(0,1,2,3,5,7,18)
colnames(C7H8_AR_table)[7:13] <- as.numeric(C7H8_AR_table_timepoints)

C7H8_AR_YY1summary <- as.array(apply(C7H8_AR_YY1[,7:13], 2, summary))
C7H8_AR_YY1summary_t <- data.frame(C7H8_AR_table_timepoints, t(C7H8_AR_YY1summary))
colnames(C7H8_AR_YY1summary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

C7H8_AR_nonYY1summary <- as.array(apply(na.omit(C7H8_AR_nonYY1[,7:13]), 2, summary))
C7H8_AR_nonYY1summary_t <- data.frame(C7H8_AR_table_timepoints, t(C7H8_AR_nonYY1summary))
colnames(C7H8_AR_nonYY1summary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

ggplot() +  theme_light() + 
  coord_cartesian(xlim =c(0,10), ylim = c(0,0.8),expand = FALSE) +
  #scale_y_continuous(limits = c(0,0.8), expand = c(0, 0)) + scale_x_continuous(limits = c(0,17), expand = c(0, 0)) +
  #YY1
  geom_ribbon(data=C7H8_AR_YY1summary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
  geom_point(data=C7H8_AR_YY1summary_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=C7H8_AR_YY1summary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
  #nonYY1
  geom_ribbon(data=C7H8_AR_nonYY1summary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.2, fill="blue") +
  geom_point(data=C7H8_AR_nonYY1summary_t, aes(x=time,y=Median, group=1), colour="blue") +
  geom_line(data=C7H8_AR_nonYY1summary_t, aes(x=time,y=Median, group=1), colour="blue", linetype="dotted")

Append YY1targets to GeneInfo and plot boxplots by GeneHalftime

C7H8_AR_table_byYY1_Info <- merge(C7H8_AR_table_byYY1, GeneInfo_C7H8)

#Just YY1 v non-YY1
comparisons <- list(c("nonYY1", "YY1"))
YY1target_box <- mutate(C7H8_AR_table_byYY1_Info, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=Halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1target_box

Pie charts of genes by YY1 targets and YY1 groups

#By YY1target
fast_pie <- subset(C7H8_AR_table_byYY1_Info, SilencingRate %in% c("fastsilencing"))
fast_YY1count <- c(length(which(fast_pie$YY1target == "YY1")),
                              length(which(fast_pie$YY1target == "nonYY1")))
fast_pie <- data.frame(YY1target=c("YY1", "nonYY1"), fast_YY1count )

medium_pie <- subset(C7H8_AR_table_byYY1_Info, SilencingRate %in% c("mediumsilencing"))
medium_YY1count <- c(length(which(medium_pie$YY1target == "YY1")),
                   length(which(medium_pie$YY1target == "nonYY1")))
medium_pie <- data.frame(YY1target=c("YY1", "nonYY1"), medium_YY1count )

slow_pie <- subset(C7H8_AR_table_byYY1_Info, SilencingRate %in% c("slowsilencing"))
slow_YY1count <- c(length(which(slow_pie$YY1target == "YY1")),
                   length(which(slow_pie$YY1target == "nonYY1")))
slow_pie <- data.frame(YY1target=c("YY1", "nonYY1"), slow_YY1count )

pie_fast <- ggplot(fast_pie, aes(x="", y=fast_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("fast-silencing") +
  geom_text(aes(label = paste(fast_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_medium <- ggplot(medium_pie, aes(x="", y=medium_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("medium-silencing") +
  geom_text(aes(label = paste(medium_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_slow <- ggplot(slow_pie, aes(x="", y=slow_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("slow-silencing") +
  geom_text(aes(label = paste(slow_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

YY1target_pie <- grid.arrange(pie_fast, pie_medium, pie_slow, nrow = 1)

iXist-ChrX-Dom ChrRNA by YY1

# Append YY1target to A11B2_AR_table 
A11B2_AR_table_byYY1 <-  mutate(A11B2_AR_table,
                               YY1target = case_when(
                                 A11B2_AR_table$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                                 !(A11B2_AR_table$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )

A11B2_AR_YY1 <- A11B2_AR_table_byYY1[which(A11B2_AR_table_byYY1$YY1target == "YY1"),]
A11B2_AR_nonYY1 <- A11B2_AR_table_byYY1[which(A11B2_AR_table_byYY1$YY1target == "nonYY1"),]

AR boxplot

#chrRNA YY1targets vs non-targets
colnames(A11B2_AR_table_byYY1)

##  [1] "Geneid"    "Chr"       "Start"     "End"       "Strand"    "Length"   
##  [7] "0h"        "24h"       "48h"       "72h"       "120h"      "168h"     
## [13] "NPC"       "YY1target"

A11B2_AR_table_byYY1_melt <- melt(as.data.frame(na.omit(A11B2_AR_table_byYY1)), id=c("Geneid","Chr","Start","End","Strand","Length","YY1target"), stringsAsFactors = FALSE)
colnames(A11B2_AR_table_byYY1_melt)[(ncol(A11B2_AR_table_byYY1_melt)-1):ncol(A11B2_AR_table_byYY1_melt)] <- c("sample","ratio")
A11B2_AR_table_byYY1_melt$sample <- as.character(A11B2_AR_table_byYY1_melt$sample)

#plot boxplots
AR_YY1target <- mutate(A11B2_AR_table_byYY1_melt, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  mutate(sample = factor(sample, levels=c("0h","24h","48h","72h","120h","168h","NPC"))) %>%
  ggplot(aes(x=sample,y=ratio,fill=YY1target) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1)) +
  geom_hline(yintercept=0.5, linetype="dashed") 
AR_YY1target

Ribbon plot

#chrRNA YY1targets vs non-targets
A11B2_AR_table_timepoints <- c(0,1,2,3,5,7,18)
colnames(A11B2_AR_table)[7:13] <- as.numeric(A11B2_AR_table_timepoints)

A11B2_AR_YY1summary <- as.array(apply(A11B2_AR_YY1[,7:13], 2, summary))
A11B2_AR_YY1summary_t <- data.frame(A11B2_AR_table_timepoints, t(A11B2_AR_YY1summary))
colnames(A11B2_AR_YY1summary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

A11B2_AR_nonYY1summary <- as.array(apply(na.omit(A11B2_AR_nonYY1[,7:13]), 2, summary))
A11B2_AR_nonYY1summary_t <- data.frame(A11B2_AR_table_timepoints, t(A11B2_AR_nonYY1summary))
colnames(A11B2_AR_nonYY1summary_t) <- c("time","Min","Lower","Median","Mean","Upper","Max")

ggplot() +  theme_light() + 
  coord_cartesian(xlim =c(0,10), ylim = c(0,0.8),expand = FALSE) +
  #scale_y_continuous(limits = c(0,0.8), expand = c(0, 0)) + scale_x_continuous(limits = c(0,17), expand = c(0, 0)) +
  #YY1
  geom_ribbon(data=A11B2_AR_YY1summary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.4, fill="darkgrey") +
  geom_point(data=A11B2_AR_YY1summary_t, aes(x=time,y=Median, group=1)) +
  geom_line(data=A11B2_AR_YY1summary_t, aes(x=time,y=Median, group=1), linetype="dotted") +
  #nonYY1
  geom_ribbon(data=A11B2_AR_nonYY1summary_t, aes(x=time, y=Median, group=1, ymin=Lower, ymax=Upper), alpha=0.2, fill="blue") +
  geom_point(data=A11B2_AR_nonYY1summary_t, aes(x=time,y=Median, group=1), colour="blue") +
  geom_line(data=A11B2_AR_nonYY1summary_t, aes(x=time,y=Median, group=1), colour="blue", linetype="dotted")

Gene halftime boxplot

# Append YY1targets to GeneInfo and plot boxplots by GeneHalftime
A11B2_AR_table_byYY1_Info <- merge(A11B2_AR_table_byYY1,GeneInfo_A11B2)

#Just YY1 v non-YY1
comparisons <- list(c("nonYY1", "YY1"))
YY1target_box <- mutate(A11B2_AR_table_byYY1_Info, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=Halftime)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  scale_y_continuous(expand = c(0, 0.5), limits = c(0, 6)) +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text")
YY1target_box

Pie charts of genes by YY1 targets and YY1 groups

#By YY1target
fast_pie <- subset(A11B2_AR_table_byYY1_Info, SilencingRate %in% c("fastsilencing"))
fast_YY1count <- c(length(which(fast_pie$YY1target == "YY1")),
                   length(which(fast_pie$YY1target == "nonYY1")))
fast_pie <- data.frame(YY1target=c("YY1", "nonYY1"), fast_YY1count )

medium_pie <- subset(A11B2_AR_table_byYY1_Info, SilencingRate %in% c("mediumsilencing"))
medium_YY1count <- c(length(which(medium_pie$YY1target == "YY1")),
                     length(which(medium_pie$YY1target == "nonYY1")))
medium_pie <- data.frame(YY1target=c("YY1", "nonYY1"), medium_YY1count )

slow_pie <- subset(A11B2_AR_table_byYY1_Info, SilencingRate %in% c("slowsilencing"))
slow_YY1count <- c(length(which(slow_pie$YY1target == "YY1")),
                   length(which(slow_pie$YY1target == "nonYY1")))
slow_pie <- data.frame(YY1target=c("YY1", "nonYY1"), slow_YY1count )

pie_fast <- ggplot(fast_pie, aes(x="", y=fast_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("fast-silencing") +
  geom_text(aes(label = paste(fast_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_medium <- ggplot(medium_pie, aes(x="", y=medium_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("medium-silencing") +
  geom_text(aes(label = paste(medium_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()
pie_slow <- ggplot(slow_pie, aes(x="", y=slow_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("slow-silencing") +
  geom_text(aes(label = paste(slow_YY1count)), position = position_stack(vjust = 0.5)) +
  coord_polar("y", start=0) + theme_void()

YY1target_pie <- grid.arrange(pie_fast, pie_medium, pie_slow, nrow = 1)

mESC silencing by YY1 target

Here I also looked at the chrRNA-seq data of iXist-ChrX cells induced with dox for an extended period (10 days) but maintained in mESC conditions.
These allelic ratio tables can be downloaded directly from the Supplementary Files of GSE185852

#Load files
C7H8_mESCd10_AR <-  read_delim(paste0(data_path,"iXist-ChrX-Cast_mESC_d10_AllelicRatio_table.txt"), 
                               delim = "\t", escape_double = FALSE, trim_ws = TRUE)
A11B2_mESCd10_AR <- read_delim(paste0(data_path,"iXist-ChrX-Dom_mESC_d10_AllelicRatio_table.txt"), 
                              delim = "\t", escape_double = FALSE, trim_ws = TRUE)

#C7H8
C7H8_ES_d10 <- (1/3)*rowSums(C7H8_mESCd10_AR[,7:9])
C7H8_mESCd10_AR <- cbind(C7H8_mESCd10_AR[,c(1:6)], C7H8_ES_d10)
rm(C7H8_ES_d10)
C7H8_mESCd10_AR_byYY1 <- mutate(C7H8_mESCd10_AR,
                                YY1target = case_when(
                                  C7H8_mESCd10_AR$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                                  !(C7H8_mESCd10_AR$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )
C7H8_mESCd10_AR_byYY1_melt <- melt(as.data.frame(na.omit(C7H8_mESCd10_AR_byYY1)), id=c("Geneid","Chr","Start","End","Strand","Length","YY1target"), stringsAsFactors = FALSE)
colnames(C7H8_mESCd10_AR_byYY1_melt)[(ncol(C7H8_mESCd10_AR_byYY1_melt)-1):ncol(C7H8_mESCd10_AR_byYY1_melt)] <- c("sample","ratio")
C7H8_mESCd10_AR_byYY1_melt$sample <- as.character(C7H8_mESCd10_AR_byYY1_melt$sample)

#plot boxplots
comparisons <- list(c("nonYY1","YY1"))
mutate(C7H8_mESCd10_AR_byYY1_melt, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=ratio) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(-0.05, 1.05)) +
  geom_hline(yintercept=0.5, linetype="dashed") +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") + stat_summary(fun.data = give.n, geom = "text")

#A11B2
A11B2_ES_d10 <- (1/4)*rowSums(A11B2_mESCd10_AR[,7:10])
A11B2_mESCd10_AR <- cbind(A11B2_mESCd10_AR[,c(1:6)], A11B2_ES_d10)
rm(A11B2_ES_d10)
A11B2_mESCd10_AR_byYY1 <- mutate(A11B2_mESCd10_AR,
                                YY1target = case_when(
                                  A11B2_mESCd10_AR$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                                  !(A11B2_mESCd10_AR$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )
A11B2_mESCd10_AR_byYY1_melt <- melt(as.data.frame(na.omit(A11B2_mESCd10_AR_byYY1)), id=c("Geneid","Chr","Start","End","Strand","Length","YY1target"), stringsAsFactors = FALSE)
colnames(A11B2_mESCd10_AR_byYY1_melt)[(ncol(A11B2_mESCd10_AR_byYY1_melt)-1):ncol(A11B2_mESCd10_AR_byYY1_melt)] <- c("sample","ratio")
A11B2_mESCd10_AR_byYY1_melt$sample <- as.character(A11B2_mESCd10_AR_byYY1_melt$sample)

#plot boxplots
comparisons <- list(c("nonYY1","YY1"))
mutate(A11B2_mESCd10_AR_byYY1_melt, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=ratio) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  scale_y_continuous(expand = c(0, 0), limits = c(-0.05, 1.05)) +
  geom_hline(yintercept=0.5, linetype="dashed") +
  stat_compare_means(comparisons = comparisons,method = "wilcox.test",label="p.format") + stat_summary(fun.data = give.n, geom = "text")

Analysis for Revisions

Reviewer suggestion 1

Investigation by association between YY1 binding and gene expression level

Here I look at the cross-associations between gene silencing rate, initial expression level and YY1 binding.
First, I investigate if YY1 targets on average, more highly expressed than non-YY1 targets genome-wide. I load in our previous data of mRNA-seq in iXist-ChrX GSE185869 to calculate TPM for all genes in the genome, and compare this with the list of all YY1 target genes in the gneome from GeneTSS_YY1prom. YY1 target genes indeed tend to be more highly expressed, both genome-wide and on ChrX.
I then look within the C7H8_AR_table_byYY1_Info table (master table of chrRNA-seq gene silencing anaysis) if YY1 target genes are slower-silencing whilst controlling for initial expression levels (ie. within categories of high/medium/low-expressed genes). Within the group of medium-expressed genes in particular, YY1 targets remain signficantly slower to silence.

#Genes which meet criteria for allelic analysis genome-wide            
GenomeWide_Allelic_Genes <- read_delim(paste0(data_path,"GenomeWide_Allelic_Genes_ARtable.txt"), 
    delim = "\t", escape_double = FALSE, 
    col_types = cols_only(Geneid = col_guess()), 
    trim_ws = TRUE)

#TPM genome wide
A11B2_mRNA_0h_count <- read_table2(paste0(data_path,"A11B2_mRNA_0h_count_exon"),skip = 1)
A11B2_mRNA_0h_count <- cbind(A11B2_mRNA_0h_count[1:6], 0.5*(A11B2_mRNA_0h_count[7] + A11B2_mRNA_0h_count[8]))
RPK <- sweep(A11B2_mRNA_0h_count[7:ncol(A11B2_mRNA_0h_count)], MARGIN=1,FUN="/",STATS=A11B2_mRNA_0h_count$Length)
RPK <- cbind(A11B2_mRNA_0h_count[1:6],RPK)
#Sum RPKs then divide through norm factor to make TPM
RPK_Sums <- colSums(RPK[7:ncol(RPK)])
RPK_NormFactors <- rep(RPK_Sums[seq(1,length(RPK_Sums),3)], each=3)/1000000
TPM_mRNA <- cbind(RPK[1:6],
                  sweep(RPK[7:ncol(RPK)], 2, RPK_NormFactors, "/"))
colnames(TPM_mRNA)[7] <- "GeneTPM"

#Add YY1 target annotated to genome-wide TPM file
TPM_mRNA <-  mutate(TPM_mRNA,
                    YY1target = case_when(
                      TPM_mRNA$Geneid %in% GeneTSS_YY1prom[,4] ~ "YY1",
                      !(TPM_mRNA$Geneid %in% GeneTSS_YY1prom[,4]) ~ "nonYY1")  )

# subset by only genes which would pass allelic analysis
TPM_mRNA_sub <- subset(TPM_mRNA, Geneid %in% GenomeWide_Allelic_Genes$Geneid | 
                         Geneid %in% C7H8_AR_table$Geneid) 
table(TPM_mRNA_sub$YY1target)

## 
## nonYY1    YY1 
##  10329   1946

#how many YY1 targets on autosomes vs chrX?
table(subset(TPM_mRNA_sub, !grepl("chrX",TPM_mRNA_sub$Chr))$YY1target) #autosomes

## 
## nonYY1    YY1 
##   9983   1882

1882/(1882+9983)

## [1] 0.1586178

table(subset(TPM_mRNA_sub, grepl("chrX",TPM_mRNA_sub$Chr))$YY1target) #chrX

## 
## nonYY1    YY1 
##    346     64

64/(64+346)

## [1] 0.1560976

#subset by autosomes
YY1target_TPM_autosomes <- subset(TPM_mRNA_sub, !grepl("chrX",TPM_mRNA_sub$Chr)) %>%
  mutate(YY1target = factor(YY1target, levels=c("nonYY1", "YY1")))
#TPM of autosomes by if they are YY1 targets
YY1target_TPM_autosomes_box <-  ggplot(YY1target_TPM_autosomes, aes(x=YY1target,y=GeneTPM)) +   
  geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  scale_y_log10(expand = c(0, 0.5)) +
  stat_compare_means(comparisons = comparisons, method = "wilcox.test", label="p.format") +
  stat_summary(fun.data = give.n, geom = "text") 
YY1target_TPM_autosomes_box

#calculate median TPM of autosomes
median(subset(YY1target_TPM_autosomes, YY1target == "YY1")$GeneTPM)

## [1] 29.63787

nrow(subset(YY1target_TPM_autosomes, YY1target == "YY1"))

## [1] 1882

median(subset(YY1target_TPM_autosomes, YY1target == "nonYY1")$GeneTPM)

## [1] 8.037361

nrow(subset(YY1target_TPM_autosomes, YY1target == "nonYY1"))

## [1] 9983

#subset by chrX
YY1target_TPM_chrX <- subset(TPM_mRNA_sub, grepl("chrX",TPM_mRNA_sub$Chr)) %>%
  mutate(YY1target = factor(YY1target, levels=c("nonYY1", "YY1")))
#TPM of chrX by if they are YY1 targets
YY1target_TPM_chrX_box <-  ggplot(YY1target_TPM_chrX, aes(x=YY1target,y=GeneTPM)) +   
  geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  scale_y_log10(expand = c(0, 0.5)) +
  stat_compare_means(comparisons = comparisons, method = "wilcox.test", label="p.format") +
  stat_summary(fun.data = give.n, geom = "text") 
YY1target_TPM_chrX_box

#calculate median TPM of chrX genes
median(subset(YY1target_TPM_chrX, YY1target == "YY1")$GeneTPM)

## [1] 70.81174

nrow(subset(YY1target_TPM_chrX, YY1target == "YY1"))

## [1] 64

median(subset(YY1target_TPM_chrX, YY1target == "nonYY1")$GeneTPM)

## [1] 30.69529

nrow(subset(YY1target_TPM_chrX, YY1target == "nonYY1"))

## [1] 346

#within all YY1 target genes, is YY1 enrichment correlated with expression level
TPM_mRNA_minimal <- TPM_mRNA[,c("Geneid","GeneTPM")]
YY1genes_minimal <- GeneTSS_YY1prom[,c(4,15)]
colnames(YY1genes_minimal) <- c("Geneid","YY1enrichment")
YY1_targets_TPM <- merge(YY1genes_minimal, TPM_mRNA_minimal, by = "Geneid", all.x = TRUE)
cor(YY1_targets_TPM$YY1enrichment, YY1_targets_TPM$GeneTPM)

## [1] -0.05388916

scatter_YY1_TPM <- ggplot(YY1_targets_TPM, aes(x=YY1enrichment, y=GeneTPM)) + 
  geom_point(shape=16) + theme_bw() +
  scale_y_log10()  + 
  scale_x_log10()  +
  stat_cor(method="spearman") 
scatter_YY1_TPM

#Initial TPM by YY1 v non-YY1 in ChrX data
#replace TPM in GenInfo table by TPM re-calculated above
C7H8_AR_table_byYY1_Info$TPM

## NULL

TPM_replace <- subset(TPM_mRNA, Geneid %in% C7H8_AR_table_byYY1_Info$Geneid)
C7H8_AR_table_byYY1_Info <- merge(C7H8_AR_table_byYY1_Info,TPM_replace[,c("Geneid","GeneTPM")])
#I do not know why the old and new TPM values are not identical, but they are extremely well correlated (R=0.99) so the differences are negligible
cor(C7H8_AR_table_byYY1_Info$InitialTPM, C7H8_AR_table_byYY1_Info$GeneTPM)

## [1] 0.9923476

comparisons <- list(c("nonYY1", "YY1"))
#plot boxplot
YY1target_TPM_box <- mutate(C7H8_AR_table_byYY1_Info, YY1target = factor(YY1target, levels=c("nonYY1", "YY1"))) %>%
  ggplot(aes(x=YY1target,y=GeneTPM)) +   geom_boxplot() + theme_bw() + theme(axis.ticks = element_line(colour = "grey27")) +
  scale_y_log10(expand = c(0, 0.5)) +
  stat_compare_means(comparisons = comparisons, method = "wilcox.test",label="p.format") +
  stat_summary(fun.data = give.n, geom = "text") 
YY1target_TPM_box

#calculate medians
median(subset(C7H8_AR_table_byYY1_Info, YY1target == "YY1")$GeneTPM)

## [1] 70.81174

nrow(subset(C7H8_AR_table_byYY1_Info, YY1target == "YY1"))

## [1] 64

median(subset(C7H8_AR_table_byYY1_Info, YY1target == "nonYY1")$GeneTPM)

## [1] 31.4976

nrow(subset(C7H8_AR_table_byYY1_Info, YY1target == "nonYY1"))

## [1] 335

#plot by YY1-target controlling for the same expression group
 Halftime_byExpression_byYY1 <- mutate(C7H8_AR_table_byYY1_Info, 
                                       Expression_group = factor(Expression_group, levels=c("lowexpressed","mediumexpressed","highexpressed"))) %>%
  ggplot(aes(x=Expression_group,y=Halftime,fill=YY1target) ) + theme_bw() + theme(axis.text.x=element_blank()) +
  geom_boxplot() +
  stat_compare_means(method = "wilcox.test", label="p.format",label.y=1) + 
   stat_summary(fun.data = give.n, geom = "text")
Halftime_byExpression_byYY1

Reviewer suggestion 2

Stratify peaks by quartiles rather than a binary persistent/depleted categorisation

The assumption that biallelically accessible peaks in NPCs = persistent peaks (by kinetics of accessibility decrease) may not hold for a quartile-based strategy. Thus, for this analysis I will keep these biallelic peaks as a separate (n=133), and then group the remaining 662 peaks into 4 groups of slow (aS) | intermediate (aI)| fast (aF) | biallalic (aB) accessibility decrease. The a stands for “accessibility” and distinguishes from fast/intermediate/slow silencing genes.
I then see with these fine-grained categorisation of REs whether numbers of YY1 motifs and YY1 / non-YY1 increase in these four categories. The association of YY1 binding with slower loss of accessibility holds regardless of analysis strategy.

#make equal-sized peak-accessbility groups
fast_threshold <- 2.39
slow_threshold <- 6.03
ATAC_NPC_AR_table_byYY1 <-  mutate(ATAC_NPC_AR_table_byYY1,
                                   four_categ = case_when(
                                     is.na(ATAC_NPC_AR_table_byYY1$peak_halftime) ~ "aBiallelic",
                                     ATAC_NPC_AR_table_byYY1$peak_halftime > slow_threshold ~ "aSlow",
                                     ATAC_NPC_AR_table_byYY1$peak_halftime < slow_threshold & 
                                       ATAC_NPC_AR_table_byYY1$peak_halftime > fast_threshold ~ "aIntermediate",
                                     ATAC_NPC_AR_table_byYY1$peak_halftime < fast_threshold ~ "aFast"))
#Distributions of the four categories in YY1 vs non-YY1 peaks
nonYY1_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1target %in% c("nonYY1"))
nonYY1_count <- c(length(which(nonYY1_pie$four_categ == "aBiallelic")),
                  length(which(nonYY1_pie$four_categ == "aSlow")),
                  length(which(nonYY1_pie$four_categ == "aIntermediate")),
                  length(which(nonYY1_pie$four_categ == "aFast")))
nonYY1_pie <- data.frame(four_categ=c("aBiallelic", "aSlow","aIntermediate","aFast"), nonYY1_count )

YY1_pie <- subset(ATAC_NPC_AR_table_byYY1, YY1target %in% c("YY1bound"))
YY1_count <- c(length(which(YY1_pie$four_categ == "aBiallelic")),
                  length(which(YY1_pie$four_categ == "aSlow")),
                  length(which(YY1_pie$four_categ == "aIntermediate")),
                  length(which(YY1_pie$four_categ == "aFast")))
YY1_pie <- data.frame(four_categ=c("aBiallelic", "aSlow","aIntermediate","aFast"), YY1_count )

nonYY1_pie <- mutate(nonYY1_pie, 
                       four_categ = factor(four_categ, levels=c("aBiallelic", "aSlow","aIntermediate","aFast")))
pie_nonYY1 <- ggplot(nonYY1_pie, aes(x="", y=nonYY1_count, fill=four_categ)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("nonYY1") +
  geom_text(aes(label = paste(nonYY1_count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_nonYY1

YY1_pie <- mutate(YY1_pie, 
                       four_categ = factor(four_categ, levels=c("aBiallelic", "aSlow","aIntermediate","aFast")))
pie_YY1 <- ggplot(YY1_pie, aes(x="", y=YY1_count, fill=four_categ)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("YY1") +
  geom_text(aes(label = paste(YY1_count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_YY1

#Distributions of YY1 vs non-YY1 within the four categories
#By YY1target
aFast_pie <- subset(ATAC_NPC_AR_table_byYY1, four_categ %in% c("aFast"))
aFast_YY1count <- c(length(which(aFast_pie$YY1target == "YY1bound")),
                              length(which(aFast_pie$YY1target == "nonYY1")))
aFast_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), aFast_YY1count )

aIntermediate_pie <- subset(ATAC_NPC_AR_table_byYY1, four_categ %in% c("aIntermediate"))
aIntermediate_YY1count <- c(length(which(aIntermediate_pie$YY1target == "YY1bound")),
                   length(which(aIntermediate_pie$YY1target == "nonYY1")))
aIntermediate_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), aIntermediate_YY1count )

aSlow_pie <- subset(ATAC_NPC_AR_table_byYY1, four_categ %in% c("aSlow"))
aSlow_YY1count <- c(length(which(aSlow_pie$YY1target == "YY1bound")),
                   length(which(aSlow_pie$YY1target == "nonYY1")))
aSlow_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), aSlow_YY1count )

aBiallelic_pie <- subset(ATAC_NPC_AR_table_byYY1, four_categ %in% c("aBiallelic"))
aBiallelic_YY1count <- c(length(which(aBiallelic_pie$YY1target == "YY1bound")),
                   length(which(aBiallelic_pie$YY1target == "nonYY1")))
aBiallelic_pie <- data.frame(YY1target=c("YY1bound", "nonYY1"), aBiallelic_YY1count )


pie_aFast <- ggplot(aFast_pie, aes(x="", y=aFast_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("aFast") +
  geom_text(aes(label = paste(aFast_YY1count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_aIntermediate <- ggplot(aIntermediate_pie, aes(x="", y=aIntermediate_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("aIntermediate") +
  geom_text(aes(label = paste(aIntermediate_YY1count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_aSlow <- ggplot(aSlow_pie, aes(x="", y=aSlow_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("aSlow") +
  geom_text(aes(label = paste(aSlow_YY1count)), position = position_stack(vjust = 0.5)) +
  theme_void()
pie_aBiallelic <- ggplot(aBiallelic_pie, aes(x="", y=aBiallelic_YY1count, fill=YY1target)) +
  geom_bar(stat="identity", width=1, color="white") + ggtitle("aBiallelic") +
  geom_text(aes(label = paste(aBiallelic_YY1count)), position = position_stack(vjust = 0.5)) +
  theme_void()

YY1target_pie <- grid.arrange(pie_aFast, pie_aIntermediate, pie_aSlow, pie_aBiallelic, nrow = 1)

YY1target_pie

## TableGrob (1 x 4) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (1-1,4-4) arrange gtable[layout]

Reviewer suggestion 3

Compare YY1 target genes by SmcHD1-dependence categories

There seems to be some correlation between YY1 target genes and SmcHD1 dependence. However, this correspondence is not exact, and the fact that a different model system was used to define SmcHD1 dependence (SmcHD1 KO MEFs) makes this comparison less appropriate.

table(subset(C7H8_AR_table_byYY1_Info, YY1target == "YY1")$SmcHD1_dependence)

## 
##                MEF_escapee           SmcHD1_dependent 
##                          4                         13 
##       SmcHD1_not_dependent SmcHD1_partially_dependent 
##                          8                         34

table(subset(C7H8_AR_table_byYY1_Info, YY1target == "nonYY1")$SmcHD1_dependence)

## 
##                MEF_escapee           SmcHD1_dependent 
##                         14                         43 
##       SmcHD1_not_dependent SmcHD1_partially_dependent 
##                         93                        109

table(subset(C7H8_AR_table_byYY1_Info, SmcHD1_dependence == "SmcHD1_dependent")$YY1target)

## 
## nonYY1    YY1 
##     43     13

table(subset(C7H8_AR_table_byYY1_Info, SmcHD1_dependence == "MEF_escapee")$YY1target)

## 
## nonYY1    YY1 
##     14      4

Figure 2 - YY1 association

Joe Bowness