Prepare ENCODE ATAC/DNase Peaks 05

Explore and visualize the data

Set environment

Code
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
TXT_REGION_FOLDER = "encode_open_chromatin"

Import data

Code
txt_folder = TXT_REGION_FOLDER
txt_fdiry  = file.path(FD_RES, "region", txt_folder)

vec = dir(txt_fdiry)
for (txt in vec){cat(txt, "\n")}
K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz 
K562.hg38.ENCSR000EOT.ENCFF185XRG.DNase.bed.gz 
K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz 
K562.hg38.ENCSR483RKN.ENCFF925CYR.ATAC.bed.gz 
K562.hg38.ENCSR868FGK.ENCFF333TAT.ATAC.bed.gz 
K562.hg38.ENCSR868FGK.ENCFF948AFM.ATAC.bed.gz 
summary 
Code
txt_folder = TXT_REGION_FOLDER
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")

vec = dir(txt_fdiry)
for (txt in vec){cat(txt, "\n")}
description.tsv 
metadata.label.tsv 

Get column names

Code
### set file path
txt_folder = TXT_REGION_FOLDER
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname = "description.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

### read table
dat = read_tsv(txt_fpath, show_col_types = FALSE)

### assign and show
dat_cname     = dat
vec_txt_cname = dat$Name
fun_display_table(head(dat))
Name Note
Chrom Name of the chromosome
ChromStart The starting position of the feature in the chromosome
ChromEnd The ending position of the feature in the chromosome
Name Name given to a region; Use '.' if no name is assigned.
Score Indicates how dark the peak will be displayed in the browser (0-1000).
Strand +/- to denote strand or orientation. Use '.' if no orientation is assigned.

Get metadata table

Code
### set file path
txt_folder = TXT_REGION_FOLDER
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname = "metadata.label.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

### read table
dat = read_tsv(txt_fpath, show_col_types = FALSE)

### assign and show
dat_metadata = dat
print(dim(dat))
fun_display_table(dat)
[1] 6 3
Folder FName Label
encode_open_chromatin K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz dnase_ENCFF274YGF
encode_open_chromatin K562.hg38.ENCSR000EOT.ENCFF185XRG.DNase.bed.gz dnase_ENCFF185XRG
encode_open_chromatin K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz atac_ENCFF558BLC
encode_open_chromatin K562.hg38.ENCSR483RKN.ENCFF925CYR.ATAC.bed.gz atac_ENCFF925CYR
encode_open_chromatin K562.hg38.ENCSR868FGK.ENCFF333TAT.ATAC.bed.gz atac_ENCFF333TAT
encode_open_chromatin K562.hg38.ENCSR868FGK.ENCFF948AFM.ATAC.bed.gz atac_ENCFF948AFM

Import data

Code
fun_str_map_file_label = function(txt_input){
    ### setup mapping
    dat  = dat_metadata
    vec1 = dat$FName
    vec2 = dat$Label

    ### mapping string
    txt_output = fun_str_map_match(txt_input, vec1, vec2, .default=txt)
    txt_output = gsub("^dnase", "DNase", txt_output)
    txt_output = gsub("^atac",  "ATAC",  txt_output)
    
    return(txt_output)
}
Code
### set directory
txt_folder = TXT_REGION_FOLDER
txt_fdiry  = file.path(FD_RES, "region", txt_folder)
txt_fglob  = file.path(txt_fdiry, "*bed*")

### get file names and labels
vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)
vec_txt_label = fun_str_map_file_label(vec_txt_fname)

### read tables
lst = lapply(vec_txt_fpath, function(txt_fpath){
    dat = read_tsv(txt_fpath, col_names = vec_txt_cname, show_col_types = FALSE)
    return(dat)
})
names(lst) = vec_txt_label

### assign and show
lst_dat_import = lst
print(length(lst))
[1] 6

Check data

Code
lst = lst_dat_import
vec = names(lst)
for (txt in vec){cat(txt, "\n")}
DNase_ENCFF274YGF 
DNase_ENCFF185XRG 
ATAC_ENCFF558BLC 
ATAC_ENCFF925CYR 
ATAC_ENCFF333TAT 
ATAC_ENCFF948AFM 
Code
lst = lst_dat_import
dat = lst[[1]]
fun_display_table(head(dat, 3))
Chrom ChromStart ChromEnd Name Score Strand SignalValue PValue QValue Peak
chr1 181400 181530 . 0 . 0.299874 -1 -1 75
chr1 778660 778800 . 0 . 14.138300 -1 -1 75
chr1 779137 779200 . 0 . 0.331440 -1 -1 75
Code
lst = lst_dat_import
dat = lst[[2]]
fun_display_table(head(dat, 3))
Chrom ChromStart ChromEnd Name Score Strand SignalValue PValue QValue Peak
chr1 139369 139421 . 0 . 0.0872467 -1 -1 75
chr1 180800 180871 . 0 . 0.1371020 -1 -1 75
chr1 181108 181200 . 0 . 0.1246380 -1 -1 75
Code
lst = lst_dat_import
dat = lst[[6]]
fun_display_table(head(dat, 3))
Chrom ChromStart ChromEnd Name Score Strand SignalValue PValue QValue Peak
chr1 42030 42399 . 839 . 4.94006 56.98604 54.92928 250
chr1 68963 70035 . 1000 . 3.99240 43.21727 41.22760 115
chr1 68963 70035 . 1000 . 6.60048 110.44444 108.21621 760

Explore data

Chromosome distribution (Bar plots)

Code
### count chromosomes
lst = lst_dat_import
lst = lapply(lst, function(dat){
    dat = as.data.frame(table(dat$Chrom, dnn = "Chrom"))
    return(dat)
})

### merge results
dat = bind_rows(lst, .id = "Label")
dat = dat %>% tidyr::spread(Chrom, Freq) %>% replace(is.na(.), 0)

### summarize counts to frequency
dat = dat %>%
    tidyr::gather(Chrom, Count, -Label) %>%
    dplyr::group_by(Label) %>%
    dplyr::mutate(Total = sum(Count)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(Freq = Count / Total)

### assign and show
dat_stats_chrom = dat
print(dim(dat))
fun_display_table(head(dat))
[1] 144   5
Label Chrom Count Total Freq
ATAC_ENCFF333TAT chr1 31171 269800 0.1155337
ATAC_ENCFF558BLC chr1 24045 203874 0.1179405
ATAC_ENCFF925CYR chr1 14206 123009 0.1154875
ATAC_ENCFF948AFM chr1 20798 181340 0.1146906
DNase_ENCFF185XRG chr1 16479 159277 0.1034613
DNase_ENCFF274YGF chr1 12452 118721 0.1048846
Code
### set chromosome names
vec = c(1:22, "X", "Y")
vec = paste0("chr", vec)
vec_txt_chrom = vec

### set factor levels
dat = dat_stats_chrom
dat = dat %>% dplyr::mutate(Chrom = factor(Chrom, levels = vec_txt_chrom))

### plot chromosome distribution                            
gp1 = ggplot(dat, aes(x=Count, y = Chrom, fill=Label)) + 
    geom_col() + 
    theme_cowplot() + 
    background_grid() +
    theme(legend.position = "none")

### plot chromosome distribution                            
gp2 = ggplot(dat, aes(x=Freq, y = Chrom, fill=Label)) + 
    geom_col() + 
    theme_cowplot() + 
    background_grid()

### combine plot
plt = plot_grid(gp1, gp2, nrow = 1, rel_widths = c(2, 3.1))

### show plot
options(repr.plot.height=7, repr.plot.width=12)
print(plt)

Distribution of region length (Combine)

Code
### arrange table for plot
lst = lst_dat_import
dat  = bind_rows(lst, .id = "Label")
dat = dat %>% 
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::filter(Length < 1000)

### create plot
gpt = ggplot(dat, aes(x = Length, color = Label)) + 
    geom_density() +
    theme_cowplot() + 
    background_grid()

### show plot
options(repr.plot.height=4, repr.plot.width=7)
print(gpt)

Distribution of region length (Split by ATAC and DNase)

Code
lst = lst_dat_import
vec = names(lst)
for(txt in vec){cat(txt, "\n")}
DNase_ENCFF274YGF 
DNase_ENCFF185XRG 
ATAC_ENCFF558BLC 
ATAC_ENCFF925CYR 
ATAC_ENCFF333TAT 
ATAC_ENCFF948AFM 
Code
### split data frame into ATAC and DNase
lst = lst_dat_import
df1 = bind_rows(lst[1:2], .id = "Label")
df2 = bind_rows(lst[3:6], .id = "Label")
df1 = df1 %>% dplyr::mutate(Group = "DNase")
df2 = df2 %>% dplyr::mutate(Group = "ATAC")

### assign
dat_region_dnase = df1
dat_region_atac  = df2
Code
### arrange table for plot
dat = dat_region_dnase
dat = dat %>% 
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::filter(Length < 500)

### create plot
gpt = ggplot(dat, aes(x = Length, color = Label)) + 
    geom_density() +
    theme_cowplot() + 
    background_grid()

### show plot
options(repr.plot.height=4, repr.plot.width=7)
print(gpt)

Code
### arrange table for plot
dat = dat_region_atac
dat = dat %>% 
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::filter(Length < 3000)

### create plot
gpt = ggplot(dat, aes(x = Length, color = Label)) + 
    geom_density() +
    theme_cowplot() + 
    background_grid()

### show plot
options(repr.plot.height=4, repr.plot.width=7)
print(gpt)

Save figures

Code
### set text size
theme_text = theme(
    title      = element_text(size = 16),
    axis.title = element_text(size = 16),
    axis.text  = element_text(size = 14)
)

ENCODE OCR (ATAC)

Code
### arrange table for plot
dat = dat_region_atac
dat = dat %>% 
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::filter(Length < 3000)
dat_region_arrange = dat

### summarize the number of regions (total)
dat = dat_region_arrange
vec = dat %>%
    group_by(Label) %>%
    summarise(Count = n(), .groups="drop") %>%
    dplyr::mutate(
        line1 = Label,
        line2 = paste("Count =", scales::comma(Count)),
        combined = paste0(line1, "\n", line2)
    ) %>%
    { set_names(.$combined, .$Label) }
vec_txt_label = vec

### create plot
dat = dat_region_arrange
gpt = ggplot(dat, aes(x = Length/1000, colour = Label)) +
    geom_density(linewidth = 1.5) +
    scale_x_continuous(breaks = seq(0, 3, by = 0.5)) +
    scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5)) +
    scale_color_discrete(
        ### replace each original level by our multi‐line version
        labels = vec_txt_label
    ) +
    labs(x = "Length (kb)", y = "Density", title = "Distribution of Region Length") +
    theme_cowplot() + 
    background_grid() +
    theme_text +
    theme(
        legend.position = "inside", 
        legend.position.inside = c(0.6, 0.6),
        legend.background = element_rect(fill = "white"),
        legend.key.spacing.y = unit(0.3, 'cm')
    )

### assign plot
gpt_density_length_atac = gpt

### show plot
options(repr.plot.height=5, repr.plot.width=7)
print(gpt)

ENCODE OCR (DNase)

Code
### arrange table for plot
dat = dat_region_dnase
dat = dat %>% 
    dplyr::mutate(Length = ChromEnd - ChromStart) %>%
    dplyr::filter(Length < 500)
dat_region_arrange = dat

### summarize the number of regions (total)
dat = dat_region_arrange
vec = dat %>%
    group_by(Label) %>%
    summarise(Count = n(), .groups="drop") %>%
    dplyr::mutate(
        line1 = Label,
        line2 = paste("Count =", scales::comma(Count)),
        combined = paste0(line1, "\n", line2)
    ) %>%
    { set_names(.$combined, .$Label) }
vec_txt_label = vec

### create plot
dat = dat_region_arrange
gpt = ggplot(dat, aes(x = Length/1000, colour = Label)) +
    geom_density(linewidth = 1.5) +
    scale_x_continuous(breaks = seq(0, 0.5, by = 0.1)) +
    #scale_y_continuous(breaks = c(0, 0.5, 1.0)) +
    scale_color_discrete(
        ### replace each original level by our multi‐line version
        labels = vec_txt_label
    ) +
    labs(x = "Length (kb)", y = "Density", title = "Distribution of Region Length") +
    theme_cowplot() + 
    background_grid() +
    theme_text +
    theme(
        legend.position = "inside", 
        legend.position.inside = c(0.6, 0.6),
        legend.background = element_rect(fill = "white"),
        legend.key.spacing.y = unit(0.3, 'cm')
    )

### assign plot
gpt_density_length_dnase = gpt

### show plot
options(repr.plot.height=5, repr.plot.width=7)
print(gpt)

Code
### combine plot
plt = plot_grid(
    gpt_density_length_atac, 
    gpt_density_length_dnase,
    nrow = 1
)
plt_export = plt

### show plot
options(repr.plot.height=5, repr.plot.width=14)
print(plt)

Code
txt_fdiry = "./"
txt_fname = "fig.region.encode_ocr.distribution.region_length.png"
txt_fpath = file.path(txt_fdiry, txt_fname)
ggsave(txt_fpath, plt_export, height = 5, width = 14, units = "in")

txt_fdiry = "./"
txt_fname = "fig.region.encode_ocr.distribution.region_length.svg"
txt_fpath = file.path(txt_fdiry, txt_fname)
ggsave(txt_fpath, plt_export, height = 5, width = 14, units = "in")