Region annotation 11 (ASTARR MACS peaks)

Summarize annotations (Main)

Set environment

Code
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
FP_REGION_LABEL_A = file.path(
    FD_RES, 
    "region/summary/metadata.label.astarr_macs.tsv"
)
FP_REGION_LABEL_B = file.path(
    FD_RES,
    "region/genome_tss/summary/metadata.label.tsv"
)

Import data

Code
dat_region_label_A = read_tsv(FP_REGION_LABEL_A, show_col_types = FALSE)
dat_region_label_A
A spec_tbl_df: 2 × 3
Folder FName Label
<chr> <chr> <chr>
fcc_astarr_macs K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.bed.gz fcc_astarr_macs_input_overlap
fcc_astarr_macs K562.hg38.ASTARR.macs.KS91.input.rep_all.union.q5.bed.gz fcc_astarr_macs_input_union
Code
dat_region_label_B = read_tsv(FP_REGION_LABEL_B, show_col_types = FALSE)
dat_region_label_B
A spec_tbl_df: 2 × 3
Folder FName Label
<chr> <chr> <chr>
genome_tss K562.hg38.TSS.selected_by_highest_Pol2_signal.bed.gz genome_tss_pol2
genome_tss K562.hg38.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz genome_tss_pol2_rnaseq

Main

init

Code
dat = dat_region_label_A
lst = split(dat, 1:nrow(dat))
lst = lapply(lst, as.list)
lst_region_label_A = lst
lst
$`1`
$Folder
'fcc_astarr_macs'
$FName
'K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.bed.gz'
$Label
'fcc_astarr_macs_input_overlap'
$`2`
$Folder
'fcc_astarr_macs'
$FName
'K562.hg38.ASTARR.macs.KS91.input.rep_all.union.q5.bed.gz'
$Label
'fcc_astarr_macs_input_union'
Code
dat = dat_region_label_B
vec = unique(dat$Folder)
vec_txt_region_folder_B = vec
for(txt in vec){cat(txt, "\n")}
genome_tss 

Loop

Code
### loop through region A
for (idx in seq_along(lst_region_label_A)){
    
    ### get region A
    lst = lst_region_label_A[[idx]]
    txt_region_folder_A = lst$Folder
    txt_region_label_A  = lst$Label

    ### get column names for region A
    txt_fdiry = file.path(FD_RES, "region", txt_region_folder_A, "summary")
    txt_fname = "description.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)
    dat_cname = read_tsv(txt_fpath, show_col_types = FALSE)
    vec_txt_cname_A = dat_cname$Name

    ### loop through region B
    for (txt_region_folder_B in vec_txt_region_folder_B){
        
        ### get column names for region B
        txt_fdiry = file.path(FD_RES, "region", txt_region_folder_B, "summary")
        txt_fname = "description.tsv"
        txt_fpath = file.path(txt_fdiry, txt_fname)
        dat_cname = read_tsv(txt_fpath, show_col_types = FALSE)
        vec_txt_cname_B = dat_cname$Name

        ### get total column names for annotation table
        vec_txt_cname = c(
            paste0(vec_txt_cname_A, "_A"),
            paste0(vec_txt_cname_B, "_B"),
            "Distance"
        )

        ### set file directory
        txt_fdiry = file.path(
            FD_RES, 
            "region_closest",
            txt_region_label_A, 
            txt_region_folder_B
        )
        txt_fname = "*bed.gz"
        txt_fglob = file.path(txt_fdiry, txt_fname)

        ### get region pair table files
        vec_txt_fpath = Sys.glob(txt_fglob)
        vec_txt_fname = basename(vec_txt_fpath)

        ### show progress
        cat("===========================================\n")
        cat("Read region pairs...", "\n")
        cat("Region A:", txt_region_folder_A, "|", txt_region_label_A, "\n")
        cat("Region B:", txt_region_folder_B, "\n")
        cat("FDiry:   ", "\n")
        print(txt_fdiry)
        cat("FName:   ", "\n")
        print(vec_txt_fname)
        cat("\n")
        flush.console()

        ### import region pair tables
        lst = lapply(vec_txt_fname, function(txt_fname){
            ### set file directory
            txt_fpath = file.path(txt_fdiry, txt_fname)
            
            ### get annotation region A and region B
            vec = str_split(txt_fname, "\\.")[[1]]
            txt_annot_A = vec[1]
            txt_annot_B = vec[2]

            ### read table and add annotation labels
            dat = read_tsv(txt_fpath, col_names = vec_txt_cname, show_col_types = FALSE)
            dat = dat %>% 
                dplyr::mutate(
                    Region_A = fun_gen_region(Chrom_A, ChromStart_A, ChromEnd_A),
                    Region_B = fun_gen_region(Chrom_B, ChromStart_B, ChromEnd_B),
                    Annotation_A = txt_annot_A,
                    Annotation_B = txt_annot_B
                )
            return(dat)
        }) 

        ### concatenate tables and assign
        dat = bind_rows(lst)
        dat_region_annot_pair = dat
        
        ### set directory
        txt_fdiry = file.path(
            FD_RES, 
            "region_closest",
            txt_region_label_A,
            "summary"
        )
        txt_fname = paste("region", "pair",  txt_region_folder_B, "tsv", sep = ".")
        txt_fpath = file.path(txt_fdiry, txt_fname)
        
        ### show progress
        cat("Save region pairs...",  "\n")
        cat("FDiry:", txt_fdiry, "\n")
        cat("FName:", txt_fname, "\n")
        cat("\n")
        flush.console()

        ### write tables
        dir.create(txt_fdiry, showWarnings = FALSE)
        write_tsv(dat_region_annot_pair, txt_fpath)
    }
}
===========================================
Read region pairs... 
Region A: fcc_astarr_macs | fcc_astarr_macs_input_overlap 
Region B: genome_tss 
FDiry:    
[1] "/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/genome_tss"
FName:    
[1] "fcc_astarr_macs_input_overlap.genome_tss_pol2.bed.gz"       
[2] "fcc_astarr_macs_input_overlap.genome_tss_pol2_rnaseq.bed.gz"

Save region pairs... 
FDiry: /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/summary 
FName: region.pair.genome_tss.tsv 

===========================================
Read region pairs... 
Region A: fcc_astarr_macs | fcc_astarr_macs_input_union 
Region B: genome_tss 
FDiry:    
[1] "/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/genome_tss"
FName:    
[1] "fcc_astarr_macs_input_union.genome_tss_pol2.bed.gz"       
[2] "fcc_astarr_macs_input_union.genome_tss_pol2_rnaseq.bed.gz"

Save region pairs... 
FDiry: /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/summary 
FName: region.pair.genome_tss.tsv