Prepare ENCODE ATAC/DNase Peaks 04

Generate metadata for the peak files

Set environment

Code
### load the project configuration; messages and warnings raised while
### sourcing are muffled
suppressWarnings(suppressMessages(source("../run_config_project_sing.R")))

### report the working environment
### (show_env is not defined in this notebook; presumably provided by the sourced config)
show_env()
You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
### name of the region folder (under FD_RES/region) used by the cells below
TXT_REGION_FOLDER <- "encode_open_chromatin"

Define column description

The peak files are in narrowPeak format: the six standard BED fields followed by four additional fields (BED6+4 format).

Code
### ENCODE narrowPeak: Narrow (or Point-Source) Peaks format
### one row per column of the peak file: column name and its description
dat_cnames <- tribble(
    ~Name, ~Note,
    ### standard BED6 fields
    "Chrom", "Name of the chromosome",
    "ChromStart", "The starting position of the feature in the chromosome",
    "ChromEnd", "The ending position of the feature in the chromosome",
    "Name", "Name given to a region; Use '.' if no name is assigned.",
    "Score", "Indicates how dark the peak will be displayed in the browser (0-1000).",
    "Strand", "+/- to denote strand or orientation. Use '.' if no orientation is assigned.",
    ### four additional narrowPeak fields
    "SignalValue", "Measurement of overall (usually, average) enrichment for the region.",
    "PValue", "Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.",
    "QValue", "Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.",
    "Peak", "Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called."
)

### keep the working copy in `dat` and show the table
dat <- dat_cnames
fun_display_table(dat)
Name Note
Chrom Name of the chromosome
ChromStart The starting position of the feature in the chromosome
ChromEnd The ending position of the feature in the chromosome
Name Name given to a region; Use '.' if no name is assigned.
Score Indicates how dark the peak will be displayed in the browser (0-1000).
Strand +/- to denote strand or orientation. Use '.' if no orientation is assigned.
SignalValue Measurement of overall (usually, average) enrichment for the region.
PValue Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
QValue Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
Peak Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.

Define file labeling

Code
### set directory: peak files are searched under <FD_RES>/region/<folder>
txt_folder = TXT_REGION_FOLDER
txt_fdiry  = file.path(FD_RES, "region", txt_folder)
txt_fglob  = file.path(txt_fdiry, "*bed*")

### get file names
vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)

### fail early with a clear message when nothing matches; otherwise
### data.frame() below stops with an unrelated "differing number of rows" error
if (length(vec_txt_fname) == 0) {
    stop("No peak files found matching: ", txt_fglob, call. = FALSE)
}

### init info table: one row per peak file
dat = data.frame(
    "Folder" = txt_folder,
    "FName"  = vec_txt_fname
)

### arrange table
### split the file name on "." into seven fields
### (e.g. K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz),
### then build the label as <lowercased Assay>_<Index_File>
### NOTE(review): the rendered table in this notebook shows labels in the
### original case (e.g. DNase_ENCFF274YGF) -- confirm that lowercasing is
### intended and re-render the output
dat = dat %>% tidyr::separate(
        FName, 
        c("Biosample", "Genome", "Index_Experiment", "Index_File", "Assay", "File_Type", "File_Ext"),
        sep = "\\.",
        remove = FALSE
    ) %>%
    dplyr::mutate(Label = paste(tolower(Assay), Index_File, sep="_")) %>%
    dplyr::select(Folder, FName, Label) 

### assign and show
dat_region_label = dat
fun_display_table(dat)
Folder FName Label
encode_open_chromatin K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz DNase_ENCFF274YGF
encode_open_chromatin K562.hg38.ENCSR000EOT.ENCFF185XRG.DNase.bed.gz DNase_ENCFF185XRG
encode_open_chromatin K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz ATAC_ENCFF558BLC
encode_open_chromatin K562.hg38.ENCSR483RKN.ENCFF925CYR.ATAC.bed.gz ATAC_ENCFF925CYR
encode_open_chromatin K562.hg38.ENCSR868FGK.ENCFF333TAT.ATAC.bed.gz ATAC_ENCFF333TAT
encode_open_chromatin K562.hg38.ENCSR868FGK.ENCFF948AFM.ATAC.bed.gz ATAC_ENCFF948AFM

Save results

Code
### output file: <FD_RES>/region/<folder>/summary/description.tsv
txt_folder = TXT_REGION_FOLDER
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname  = "description.tsv"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### create the summary directory if needed; recursive = TRUE also creates any
### missing parent directory instead of failing silently (warnings are off)
dir.create(txt_fdiry, showWarnings = FALSE, recursive = TRUE)

### write the column description table
dat = dat_cnames
write_tsv(dat, txt_fpath)
Code
### output file: <FD_RES>/region/<folder>/summary/metadata_region_label.tsv
txt_folder = TXT_REGION_FOLDER
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname  = "metadata_region_label.tsv"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### create the summary directory if needed; recursive = TRUE also creates any
### missing parent directory instead of failing silently (warnings are off)
dir.create(txt_fdiry, showWarnings = FALSE, recursive = TRUE)

### write the file-to-label table
dat = dat_region_label
write_tsv(dat, txt_fpath)