Region annotation 11 (ASTARR MACS peaks)

Summarize annotations (Main)

Set environment

Code
### load the project configuration script; any messages and warnings raised
### while sourcing it are suppressed
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
### print the environment / directory summary
### NOTE(review): show_env is not defined in this notebook; presumably it is
### provided by the sourced configuration script -- confirm
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Prepare

Set global variable

Code
### folder names of the region sets to process;
### every element is named by its own value so lapply() results keep the names
vec = setNames(nm = c(
    "fcc_astarr_macs_input_overlap",
    "fcc_astarr_macs_input_union"
))
VEC_TXT_FOLDER = vec

### print one folder name per line
for (txt in VEC_TXT_FOLDER) {
    cat(txt, "\n")
}
fcc_astarr_macs_input_overlap 
fcc_astarr_macs_input_union 
Code
### name of the input table; the same file name is looked up inside the
### "summary" directory of every region folder
TXT_FNAME_INP = "region.pair.genome_tss.tsv"

View files

Code
### build a glob pattern that matches the input table in the
### "summary" directory of any folder under region_closest
txt_fname = TXT_FNAME_INP
txt_fdiry = file.path(FD_RES, "region_closest", "*", "summary")
txt_fglob = file.path(txt_fdiry, txt_fname)

### expand the pattern and print every matching file path
vec = Sys.glob(txt_fglob)
for (txt in vec) {
    cat(txt, "\n")
}
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/summary/region.pair.genome_tss.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/summary/region.pair.genome_tss.tsv 

Import data

Import ATAC-TSS region distance

Code
### read the input table of every region folder into a named list
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### full path: <FD_RES>/region_closest/<folder>/summary/<input file>
    txt_fpath = file.path(
        FD_RES, "region_closest", txt_folder, "summary", TXT_FNAME_INP
    )

    ### the imported table is the return value
    read_tsv(txt_fpath, show_col_types = FALSE)
})

### keep the imported tables under a descriptive name
lst_dat_region_annot_import = lst

### report the dimension of each table
res = lapply(lst, dim)
print(res)

### preview the first rows of the first table
dat = lst[[1]]
fun_display_table(head(dat, 3))
$fcc_astarr_macs_input_overlap
[1] 304915     15

$fcc_astarr_macs_input_union
[1] 499336     15
Chrom_A ChromStart_A ChromEnd_A Region_A Chrom_B ChromStart_B ChromEnd_B Region_B Gene_B Score_B Group_B Label_B Distance Annotation_A Annotation_B
chr1 10038 10405 chr1:10038-10405 chr1 11873 11874 chr1:11873-11874 DDX11L1 0.00023 TSS_Pol2 DDX11L1 1469 fcc_astarr_macs_input_overlap genome_tss_pol2
chr1 14282 14614 chr1:14282-14614 chr1 11873 11874 chr1:11873-11874 DDX11L1 0.00023 TSS_Pol2 DDX11L1 2409 fcc_astarr_macs_input_overlap genome_tss_pol2
chr1 16025 16338 chr1:16025-16338 chr1 17436 17437 chr1:17436-17437 MIR6859-1 9.43812 TSS_Pol2 MIR6859-1 1099 fcc_astarr_macs_input_overlap genome_tss_pol2

Arrange table

Code
### setup column name mapping (new name = original name);
### the order of the entries is the column order of the arranged table
vec_txt_lookup = c(
    "Chrom"        = "Chrom_A",
    "ChromStart"   = "ChromStart_A",
    "ChromEnd"     = "ChromEnd_A",
    "Region"       = "Region_A",
    "Annotation_A" = "Annotation_A",
    "Annotation_B" = "Annotation_B",
    "Region_TSS"   = "Region_B",
    "Score_Pol2"   = "Score_B",
    "Gene"         = "Gene_B",
    "Distance2TSS" = "Distance"
)

### original and new column names as plain vectors
vec_txt_cname1 = unname(vec_txt_lookup)
vec_txt_cname2 = names(vec_txt_lookup)
vec = vec_txt_lookup

### for each table: keep the mapped columns, then rename them
lst = lapply(lst_dat_region_annot_import, function(dat){
    dat_select = dplyr::select(dat, all_of(vec_txt_cname1))
    dat_rename = dplyr::rename(dat_select, all_of(vec_txt_lookup))
    dat_rename
})

### keep the arranged tables under a descriptive name
lst_dat_region_annot_arrange = lst

### report the dimension of each table
res = lapply(lst, dim)
print(res)

### preview the first rows of the first table
dat = lst[[1]]
fun_display_table(head(dat, 3))
$fcc_astarr_macs_input_overlap
[1] 304915     10

$fcc_astarr_macs_input_union
[1] 499336     10
Chrom ChromStart ChromEnd Region Annotation_A Annotation_B Region_TSS Score_Pol2 Gene Distance2TSS
chr1 10038 10405 chr1:10038-10405 fcc_astarr_macs_input_overlap genome_tss_pol2 chr1:11873-11874 0.00023 DDX11L1 1469
chr1 14282 14614 chr1:14282-14614 fcc_astarr_macs_input_overlap genome_tss_pol2 chr1:11873-11874 0.00023 DDX11L1 2409
chr1 16025 16338 chr1:16025-16338 fcc_astarr_macs_input_overlap genome_tss_pol2 chr1:17436-17437 9.43812 MIR6859-1 1099

Define TSS Proximity

Helper function for labeling regions by their TSS proximity

Code
### Label each distance as "Proximal" or "Distal" relative to a cutoff.
###
### Arguments:
###   vec_num_distance  numeric vector of distances to the closest TSS
###   num_threshold     single numeric cutoff; distances less than or equal to
###                     it are labeled "Proximal", larger ones "Distal"
###                     (default 2000, the value that used to be hard-coded)
###
### Returns:
###   vector of the same length as vec_num_distance containing "Proximal" or
###   "Distal"; NA wherever the distance is NA
fun_label_tss_proximity = function(vec_num_distance, num_threshold = 2000){
    ### reject non-numeric distances: a character column would be compared
    ### as strings and silently produce wrong labels
    stopifnot(
        is.numeric(vec_num_distance) || all(is.na(vec_num_distance)),
        is.numeric(num_threshold),
        length(num_threshold) == 1,
        !is.na(num_threshold)
    )

    vec_txt_label = ifelse(
        vec_num_distance <= num_threshold,
        "Proximal",
        "Distal"
    )
    return(vec_txt_label)
}

Define TSS proximity

Code
### add a TSS_Proximity column to every arranged table,
### derived from the Distance2TSS column
lst = lapply(lst_dat_region_annot_arrange, function(dat){
    dplyr::mutate(
        dat,
        TSS_Proximity = fun_label_tss_proximity(Distance2TSS)
    )
})

### keep the labeled tables under a descriptive name
lst_dat_region_annot_label_tss_proximity = lst

### report the dimension of each table
res = lapply(lst, dim)
print(res)

### preview the first rows of the first table
dat = lst[[1]]
fun_display_table(head(dat, 3))
$fcc_astarr_macs_input_overlap
[1] 304915     11

$fcc_astarr_macs_input_union
[1] 499336     11
Chrom ChromStart ChromEnd Region Annotation_A Annotation_B Region_TSS Score_Pol2 Gene Distance2TSS TSS_Proximity
chr1 10038 10405 chr1:10038-10405 fcc_astarr_macs_input_overlap genome_tss_pol2 chr1:11873-11874 0.00023 DDX11L1 1469 Proximal
chr1 14282 14614 chr1:14282-14614 fcc_astarr_macs_input_overlap genome_tss_pol2 chr1:11873-11874 0.00023 DDX11L1 2409 Distal
chr1 16025 16338 chr1:16025-16338 fcc_astarr_macs_input_overlap genome_tss_pol2 chr1:17436-17437 9.43812 MIR6859-1 1099 Proximal

Export results

Code
### output file name is identical for every folder, so build it once
txt_fname = paste("region.summary", "genome_tss", "tss_proximity", "tsv", sep = ".")

### write one sorted, de-duplicated table per region folder
for (txt_folder in VEC_TXT_FOLDER){

    ### get table
    dat_region_annot_result = lst_dat_region_annot_label_tss_proximity[[txt_folder]]
    
    ### set file directory;
    ### recursive = TRUE so that missing parent folders are created as well
    ### (a non-recursive dir.create fails silently with showWarnings = FALSE)
    txt_fdiry = file.path(FD_RES, "region_closest", txt_folder, "summary")
    dir.create(txt_fdiry, showWarnings = FALSE, recursive = TRUE)

    ### set file path
    txt_fpath = file.path(txt_fdiry, txt_fname)

    ### save table: sort by genomic coordinate and drop duplicated rows
    dat = dat_region_annot_result %>% 
        dplyr::arrange(Chrom, ChromStart, ChromEnd) %>%
        dplyr::distinct()
    write_tsv(dat, txt_fpath)
    
    ### show progress
    cat("Save file:", "\n")
    cat("Folder:", txt_folder, "\n")
    cat(txt_fpath, "\n")
    cat("\n")
    flush.console()
}
Save file: 
Folder: fcc_astarr_macs_input_overlap 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/summary/region.summary.genome_tss.tss_proximity.tsv 

Save file: 
Folder: fcc_astarr_macs_input_union 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/summary/region.summary.genome_tss.tss_proximity.tsv