Prepare ENCODE ATAC/DNase Peaks 06

Comparison-region counts

Set environment

Code
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Import data

Helper function: loading data

Code
fun_load_data = function(txt_region_fdiry){
    ### set file directory
    txt_fdiry  = file.path(txt_region_fdiry, "summary")
    txt_fname = "description.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)
    
    ### get column names
    dat = read_tsv(txt_fpath, show_col_types = FALSE)
    vec_txt_cname = dat$Name

    ### set file directory
    txt_fdiry  = file.path(txt_region_fdiry, "summary")
    txt_fname = "metadata.label.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)
    
    ### get file labels
    dat_metadata = read_tsv(txt_fpath, show_col_types = FALSE)
    
    ### set directory
    txt_fdiry  = txt_region_fdiry
    txt_fglob  = file.path(txt_fdiry, "*bed*")
    
    ### get file names and labels
    vec_txt_fpath = Sys.glob(txt_fglob)
    vec_txt_fname = basename(vec_txt_fpath)
    vec_txt_label = fun_str_map_match(
        vec_txt_fname, 
        dat_metadata$FName, 
        dat_metadata$Label, 
        .default=vec_txt_fname)

    ### further modification of labels
    vec_txt_label = gsub("^dnase", "DNase", vec_txt_label)
    vec_txt_label = gsub("^atac",  "ATAC",  vec_txt_label)
    vec_txt_label = gsub(
        "fcc_astarr_macs_input_union", 
        "ASTARR Input (Union)",  
        vec_txt_label)
    vec_txt_label = gsub(
        "fcc_astarr_macs_input_overlap", 
        "ASTARR Input (Overlap)",  
        vec_txt_label)
    
    ### read all region files
    lst = lapply(vec_txt_fpath, function(txt_fpath){
        dat = read_tsv(txt_fpath, col_names = vec_txt_cname, show_col_types = FALSE)
        return(dat)
    })
    names(lst) = vec_txt_label
    
    return(lst)
}

Load data

Code
### set file directory
txt_folder = "fcc_astarr_macs_merge"
txt_fdiry  = file.path(FD_RES, "region", txt_folder)

### read tables
lst = fun_load_data(txt_fdiry)

### assign and show
lst_dat_region_astarr_input = lst
print(lapply(lst, nrow))
$`ASTARR Input (Overlap)`
[1] 150042

$`ASTARR Input (Union)`
[1] 246852
Code
### set file directory
txt_folder = "encode_open_chromatin"
txt_fdiry  = file.path(FD_RES, "region", txt_folder)

### read tables
lst = fun_load_data(txt_fdiry)

### assign and show
lst_dat_region_encode_ocr = lst
print(lapply(lst, nrow))
$DNase_ENCFF274YGF
[1] 118721

$DNase_ENCFF185XRG
[1] 159277

$ATAC_ENCFF558BLC
[1] 203874

$ATAC_ENCFF925CYR
[1] 123009

$ATAC_ENCFF333TAT
[1] 269800

$ATAC_ENCFF948AFM
[1] 181340

Import metadata

Import metadata

Code
### set file directory
txt_folder = "encode_open_chromatin"
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname  = "metadata.tsv"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### read table
dat = read_tsv(txt_fpath, show_col_types = FALSE)

### assign and show
dat_meta_info = dat
print(dim(dat))
fun_display_table(head(dat))
[1] 10 12
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Bio_Replicates Analysis md5sum File_Name File_URL
DNase-seq ENCSR000EKS ENCFF972GVB bigWig bigWig read-depth normalized signal GRCh38 1 ENCODE4 v3.0.0 GRCh38 1b0432087f9087c0a9e4f0f5a9d08deb K562.hg38.ENCSR000EKS.ENCFF972GVB.DNase.bw https://www.encodeproject.org/files/ENCFF972GVB/@@download/ENCFF972GVB.bigWig
DNase-seq ENCSR000EKS ENCFF274YGF bed narrowPeak bed peaks GRCh38 1 ENCODE4 v3.0.0 GRCh38 9262ab2b89cd60d05deb831cdd6b509e K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz https://www.encodeproject.org/files/ENCFF274YGF/@@download/ENCFF274YGF.bed.gz
DNase-seq ENCSR000EOT ENCFF414OGC bigWig bigWig read-depth normalized signal GRCh38 1 ENCODE4 v3.0.0-alpha.2 GRCh38 ac6a28c1889b241d0f4f9ba28a1e514d K562.hg38.ENCSR000EOT.ENCFF414OGC.DNase.bw https://www.encodeproject.org/files/ENCFF414OGC/@@download/ENCFF414OGC.bigWig
DNase-seq ENCSR000EOT ENCFF185XRG bed narrowPeak bed peaks GRCh38 1 ENCODE4 v3.0.0-alpha.2 GRCh38 04653af177917b3dda96b9454fd8f90e K562.hg38.ENCSR000EOT.ENCFF185XRG.DNase.bed.gz https://www.encodeproject.org/files/ENCFF185XRG/@@download/ENCFF185XRG.bed.gz
ATAC-seq ENCSR483RKN ENCFF600FDO bigWig bigWig signal p-value GRCh38 1, 2 ENCODE4 v1.9.1 GRCh38 044f8568fb4c7735cf01e4f8d0d3e5b9 K562.hg38.ENCSR483RKN.ENCFF600FDO.ATAC.bw https://www.encodeproject.org/files/ENCFF600FDO/@@download/ENCFF600FDO.bigWig
ATAC-seq ENCSR483RKN ENCFF558BLC bed narrowPeak bed pseudoreplicated peaks GRCh38 1, 2 ENCODE4 v1.9.1 GRCh38 3cd5e262b185a5459cfeaa5a2f1a2f18 K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz https://www.encodeproject.org/files/ENCFF558BLC/@@download/ENCFF558BLC.bed.gz

Select column needs for information

Code
### select the columns 
dat = dat_meta_info
dat = dat %>% 
    dplyr::mutate(Method = Output_Type) %>%
    dplyr::select(Assay, Index_Experiment, Index_File, Method)

### assign and show
dat_meta_simplify = dat
fun_display_table(dat)
Assay Index_Experiment Index_File Method
DNase-seq ENCSR000EKS ENCFF972GVB read-depth normalized signal
DNase-seq ENCSR000EKS ENCFF274YGF peaks
DNase-seq ENCSR000EOT ENCFF414OGC read-depth normalized signal
DNase-seq ENCSR000EOT ENCFF185XRG peaks
ATAC-seq ENCSR483RKN ENCFF600FDO signal p-value
ATAC-seq ENCSR483RKN ENCFF558BLC pseudoreplicated peaks
ATAC-seq ENCSR483RKN ENCFF925CYR IDR thresholded peaks
ATAC-seq ENCSR868FGK ENCFF357GNC signal p-value
ATAC-seq ENCSR868FGK ENCFF948AFM IDR thresholded peaks
ATAC-seq ENCSR868FGK ENCFF333TAT pseudoreplicated peaks

Arrange table

Code
lst = lst_dat_region_encode_ocr
lst = lapply(lst, function(dat){
    dat = dat %>% dplyr::mutate(Region = fun_gen_region(dat$Chrom, dat$ChromStart, dat$ChromEnd))
    return(dat)
})

### assign and show
lst_dat_region_encode_ocr_arrange = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))
$DNase_ENCFF274YGF
[1] 118721     11

$DNase_ENCFF185XRG
[1] 159277     11

$ATAC_ENCFF558BLC
[1] 203874     11

$ATAC_ENCFF925CYR
[1] 123009     11

$ATAC_ENCFF333TAT
[1] 269800     11

$ATAC_ENCFF948AFM
[1] 181340     11
Chrom ChromStart ChromEnd Name Score Strand SignalValue PValue QValue Peak Region
chr1 181400 181530 . 0 . 0.299874 -1 -1 75 chr1:181400-181530
chr1 778660 778800 . 0 . 14.138300 -1 -1 75 chr1:778660-778800
chr1 779137 779200 . 0 . 0.331440 -1 -1 75 chr1:779137-779200

Explore: Region count

Code
lst = lst_dat_region_astarr_input
dat = lst[[1]]
fun_display_table(head(dat, 3))
Chrom ChromStart ChromEnd Region
chr1 10038 10405 chr1:10038-10405
chr1 14282 14614 chr1:14282-14614
chr1 16025 16338 chr1:16025-16338
Code
lst = lst_dat_region_encode_ocr_arrange
dat = lst[[1]]
fun_display_table(head(dat, 3))
Chrom ChromStart ChromEnd Name Score Strand SignalValue PValue QValue Peak Region
chr1 181400 181530 . 0 . 0.299874 -1 -1 75 chr1:181400-181530
chr1 778660 778800 . 0 . 14.138300 -1 -1 75 chr1:778660-778800
chr1 779137 779200 . 0 . 0.331440 -1 -1 75 chr1:779137-779200

Duplicates in ATAC peak calls - https://tinyurl.com/3c6up2kr

after looking at the track for a while, now I am guessing it is because there are multiple summits within many regions.

Code
lst = lst_dat_region_encode_ocr_arrange
dat = lst[["ATAC_ENCFF925CYR"]]
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Score Strand SignalValue PValue QValue Peak Region
chr1 778328 779235 . 1000 . 2.37705 8.90270 6.89830 74 chr1:778328-779235
chr1 778328 779235 . 1000 . 28.93443 749.94922 745.95093 458 chr1:778328-779235
chr1 778328 779235 . 1000 . 3.36066 20.42077 18.19629 788 chr1:778328-779235
chr1 817237 818202 . 1000 . 2.88577 8.24392 6.26027 608 chr1:817237-818202
chr1 817237 818202 . 1000 . 3.53618 12.45234 10.35933 844 chr1:817237-818202
chr1 817237 818202 . 1000 . 8.38235 66.38497 63.80160 275 chr1:817237-818202
Code
lst = lst_dat_region_encode_ocr_arrange
dat = lst[["ATAC_ENCFF925CYR"]]
dat = dat %>% dplyr::filter(Chrom == "chr2")
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Score Strand SignalValue PValue QValue Peak Region
chr2 224451 224707 . 927 . 9.35378 40.79785 38.37319 130 chr2:224451-224707
chr2 264346 265449 . 1000 . 14.91811 361.37784 358.03406 484 chr2:264346-265449
chr2 264346 265449 . 1000 . 1.85032 5.14065 3.29151 87 chr2:264346-265449
chr2 264346 265449 . 1000 . 8.50589 150.79123 147.89729 820 chr2:264346-265449
chr2 289727 290464 . 1000 . 11.69504 112.29874 109.52348 308 chr2:289727-290464
chr2 289727 290464 . 1000 . 12.37189 123.78361 120.97000 599 chr2:289727-290464

Summarize count of rows and region

Code
lst = lst_dat_region_encode_ocr_arrange
lst = lapply(lst, function(dat){
    vec = dat$Region
    res = c(length(vec), length(unique(vec)))
    res = scales::comma(res)
    names(res) = c("Number of Rows", "Number of Region")
    return(res)
})

dat = bind_rows(lst, .id = "Region")
fun_display_table(dat)
Region Number of Rows Number of Region
DNase_ENCFF274YGF 118,721 118,721
DNase_ENCFF185XRG 159,277 159,277
ATAC_ENCFF558BLC 203,874 107,082
ATAC_ENCFF925CYR 123,009 51,861
ATAC_ENCFF333TAT 269,800 161,693
ATAC_ENCFF948AFM 181,340 90,015

Summarize count and arrange table

ASTARR Input MACS Peaks

Code
### arrange count table
lst = lst_dat_region_astarr_input
lst = lapply(lst, function(dat){
    vec = dat$Region
    res = c(length(vec), length(unique(vec)))
    names(res) = c("Count_Row", "Count_Region")
    return(res)
})
dat = bind_rows(lst, .id = "Label")

### assign and show
dat_count_astarr_input = dat
fun_display_table(dat)
Label Count_Row Count_Region
ASTARR Input (Overlap) 150042 150042
ASTARR Input (Union) 246852 246852
Code
### arrange info table
dat = tibble(
    Label  = dat$Label, 
    Assay  = "ATAC-STARR-Input",
    Index_Experiment = "ENCSR312UQM",
    Index_File = ".",
    Method = c(
        "Overlap peaks by bp across replicates", 
        "Union peaks by bp across replicates"
    )
)

### join table
dat = dplyr::left_join(dat, dat_count_astarr_input, by = "Label")

### assign and show
dat_meta_astarr_input = dat
fun_display_table(dat)
Label Assay Index_Experiment Index_File Method Count_Row Count_Region
ASTARR Input (Overlap) ATAC-STARR-Input ENCSR312UQM . Overlap peaks by bp across replicates 150042 150042
ASTARR Input (Union) ATAC-STARR-Input ENCSR312UQM . Union peaks by bp across replicates 246852 246852

ENCODE ATAC/DNase

Code
### arrange count table
lst = lst_dat_region_encode_ocr_arrange
lst = lapply(lst, function(dat){
    vec = dat$Region
    res = c(length(vec), length(unique(vec)))
    names(res) = c("Count_Row", "Count_Region")
    return(res)
})
dat = bind_rows(lst, .id = "Label")

### assign and show
dat_count_encode_ocr = dat
fun_display_table(dat)
Label Count_Row Count_Region
DNase_ENCFF274YGF 118721 118721
DNase_ENCFF185XRG 159277 159277
ATAC_ENCFF558BLC 203874 107082
ATAC_ENCFF925CYR 123009 51861
ATAC_ENCFF333TAT 269800 161693
ATAC_ENCFF948AFM 181340 90015
Code
### arrange table
dat = dat_count_encode_ocr
dat = dat %>% 
    tidyr::separate(
        col  = "Label", 
        into = c("Assay", "Index_File"),
        remove = FALSE
    ) %>%
    dplyr::select(Label, Index_File, Count_Row, Count_Region)

### join table
dat = dplyr::left_join(dat, dat_meta_simplify, by = "Index_File")
    
### assign and show
dat_meta_encode_ocr = dat
fun_display_table(dat)
Label Index_File Count_Row Count_Region Assay Index_Experiment Method
DNase_ENCFF274YGF ENCFF274YGF 118721 118721 DNase-seq ENCSR000EKS peaks
DNase_ENCFF185XRG ENCFF185XRG 159277 159277 DNase-seq ENCSR000EOT peaks
ATAC_ENCFF558BLC ENCFF558BLC 203874 107082 ATAC-seq ENCSR483RKN pseudoreplicated peaks
ATAC_ENCFF925CYR ENCFF925CYR 123009 51861 ATAC-seq ENCSR483RKN IDR thresholded peaks
ATAC_ENCFF333TAT ENCFF333TAT 269800 161693 ATAC-seq ENCSR868FGK pseudoreplicated peaks
ATAC_ENCFF948AFM ENCFF948AFM 181340 90015 ATAC-seq ENCSR868FGK IDR thresholded peaks

Merge tables

Code
### concatenate tables
dat = bind_rows(dat_meta_astarr_input , dat_meta_encode_ocr)

### assign and show
dat_count_merge = dat
fun_display_table(dat)
Label Assay Index_Experiment Index_File Method Count_Row Count_Region
ASTARR Input (Overlap) ATAC-STARR-Input ENCSR312UQM . Overlap peaks by bp across replicates 150042 150042
ASTARR Input (Union) ATAC-STARR-Input ENCSR312UQM . Union peaks by bp across replicates 246852 246852
DNase_ENCFF274YGF DNase-seq ENCSR000EKS ENCFF274YGF peaks 118721 118721
DNase_ENCFF185XRG DNase-seq ENCSR000EOT ENCFF185XRG peaks 159277 159277
ATAC_ENCFF558BLC ATAC-seq ENCSR483RKN ENCFF558BLC pseudoreplicated peaks 203874 107082
ATAC_ENCFF925CYR ATAC-seq ENCSR483RKN ENCFF925CYR IDR thresholded peaks 123009 51861
ATAC_ENCFF333TAT ATAC-seq ENCSR868FGK ENCFF333TAT pseudoreplicated peaks 269800 161693
ATAC_ENCFF948AFM ATAC-seq ENCSR868FGK ENCFF948AFM IDR thresholded peaks 181340 90015

Show results

Code
dat = dat_count_merge
dat = dat %>% 
    dplyr::select(-Label) %>% 
    dplyr::mutate(Count_Row    = scales::comma(Count_Row)) %>%
    dplyr::mutate(Count_Region = scales::comma(Count_Region))
dat %>% kableExtra::kable("markdown")


|Assay            |Index_Experiment |Index_File  |Method                                |Count_Row |Count_Region |
|:----------------|:----------------|:-----------|:-------------------------------------|:---------|:------------|
|ATAC-STARR-Input |ENCSR312UQM      |.           |Overlap peaks by bp across replicates |150,042   |150,042      |
|ATAC-STARR-Input |ENCSR312UQM      |.           |Union peaks by bp across replicates   |246,852   |246,852      |
|DNase-seq        |ENCSR000EKS      |ENCFF274YGF |peaks                                 |118,721   |118,721      |
|DNase-seq        |ENCSR000EOT      |ENCFF185XRG |peaks                                 |159,277   |159,277      |
|ATAC-seq         |ENCSR483RKN      |ENCFF558BLC |pseudoreplicated peaks                |203,874   |107,082      |
|ATAC-seq         |ENCSR483RKN      |ENCFF925CYR |IDR thresholded peaks                 |123,009   |51,861       |
|ATAC-seq         |ENCSR868FGK      |ENCFF333TAT |pseudoreplicated peaks                |269,800   |161,693      |
|ATAC-seq         |ENCSR868FGK      |ENCFF948AFM |IDR thresholded peaks                 |181,340   |90,015       |