Region annotation 11 (ASTARR MACS peaks)

Summarize annotations (Main)

Set environment

Code
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Prepare

Set global variable

Code
vec = c(
    "fcc_astarr_macs_input_overlap",
    "fcc_astarr_macs_input_union"
)
names(vec) = vec

VEC_TXT_FOLDER = vec
for(txt in vec){cat(txt, "\n")}
fcc_astarr_macs_input_overlap 
fcc_astarr_macs_input_union 
Code
TXT_FNAME_INP = "region.summary.genome_tss.tss_proximity.tsv"

View files

Code
txt_fdiry = file.path(FD_RES, "region_closest", "*", "summary")
txt_fname = TXT_FNAME_INP
txt_fglob = file.path(txt_fdiry, txt_fname)

vec = Sys.glob(txt_fglob)
for(txt in vec){cat(txt, "\n")}
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/summary/region.summary.genome_tss.tss_proximity.tsv 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/summary/region.summary.genome_tss.tss_proximity.tsv 

Import data

Import ATAC-TSS region distance

Code
### loop to import data
lst = lapply(VEC_TXT_FOLDER, function(txt_folder){
    ### set file directory
    txt_fdiry = file.path(FD_RES, "region_closest", txt_folder, "summary")
    txt_fname = TXT_FNAME_INP
    txt_fpath = file.path(txt_fdiry, txt_fname)

    ### read table
    dat = read_tsv(txt_fpath, show_col_types = FALSE)
    return(dat)
})

### assign and show
lst_dat_region_annot_import = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))
$fcc_astarr_macs_input_overlap
[1] 304915     11

$fcc_astarr_macs_input_union
[1] 499336     11
Chrom ChromStart ChromEnd Region Annotation_A Annotation_B Region_TSS Score_Pol2 Gene Distance2TSS TSS_Proximity
chr1 10038 10405 chr1:10038-10405 fcc_astarr_macs_input_overlap genome_tss_pol2 chr1:11873-11874 0.00023 DDX11L1 1469 Proximal
chr1 10038 10405 chr1:10038-10405 fcc_astarr_macs_input_overlap genome_tss_pol2_rnaseq chr1:29370-29371 0.00023 WASH7P 18966 Distal
chr1 14282 14614 chr1:14282-14614 fcc_astarr_macs_input_overlap genome_tss_pol2 chr1:11873-11874 0.00023 DDX11L1 2409 Distal

Check count

Code
lst = lst_dat_region_annot_import
lst = lapply(lst, function(dat){
    res = table(dat$Annotation_B, dnn = "Annotation")
    dat = as.data.frame(res)
    return(dat)
})

dat = bind_rows(lst, .id = "Region")
dat = dat %>% tidyr::spread(Region, Freq)
fun_display_table(head(dat))
Annotation fcc_astarr_macs_input_overlap fcc_astarr_macs_input_union
genome_tss_pol2 153357 250787
genome_tss_pol2_rnaseq 151558 248549

Import gene essential label

Code
### set file directory
txt_fdiry = file.path(FD_RES, "annotation", "gene_essential")
txt_fname = "demap.v24Q2.AchillesCommonEssentialControls.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

### read table
dat = read_tsv(txt_fpath, show_col_types = FALSE)

### assign and show
dat_gene_essential_import = dat
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 1247    4
Gene Source Version File
AAMP demap v24Q2 AchillesCommonEssentialControls
AARS1 demap v24Q2 AchillesCommonEssentialControls
AASDHPPT demap v24Q2 AchillesCommonEssentialControls

Label TSS essential

Code
### init
dat = dat_gene_essential_import
vec = dat$Gene
vec_txt_gene_essential = vec

### loop and arrange table
lst = lst_dat_region_annot_import
lst = lapply(lst, function(dat){
    dat = dat %>%
        dplyr::mutate(TSS_Essential_Label = (Gene %in% vec_txt_gene_essential)) %>%
        dplyr::mutate(TSS_Essential_Label = ifelse(TSS_Essential_Label, 1, 0))

    dat = dat %>% 
        dplyr::group_by(
            Chrom, ChromStart, ChromEnd, Region, 
            Annotation_A, 
            Annotation_B,
            TSS_Proximity
        ) %>%
        dplyr::summarise(
            TSS_Essential_Count = sum(TSS_Essential_Label),
            .groups = "drop"
        )
    return(dat)
})

### assign and show
lst_dat_region_annot_count_tss_essential = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))
$fcc_astarr_macs_input_overlap
[1] 300084      8

$fcc_astarr_macs_input_union
[1] 493704      8
Chrom ChromStart ChromEnd Region Annotation_A Annotation_B TSS_Proximity TSS_Essential_Count
chr1 10038 10405 chr1:10038-10405 fcc_astarr_macs_input_overlap genome_tss_pol2 Proximal 0
chr1 10038 10405 chr1:10038-10405 fcc_astarr_macs_input_overlap genome_tss_pol2_rnaseq Distal 0
chr1 14282 14614 chr1:14282-14614 fcc_astarr_macs_input_overlap genome_tss_pol2 Distal 0

Explore: count

Code
lst = lst_dat_region_annot_count_tss_essential
lst = lapply(lst, function(dat){
    res = table(
        dat$Annotation_B,
        dat$TSS_Proximity, 
        dat$TSS_Essential_Count, 
        dnn = c(
            "TSS_Annotation",
            "TSS_Proximity", 
            "TSS_Essential_Count"
        )
    )
    dat = as.data.frame(res)
    return(dat)
})

dat = bind_rows(lst, .id = "Region")
dat = dat %>% dplyr::filter(Freq != 0)
dat = dat %>% tidyr::spread(Region, Freq)
fun_display_table(dat)
TSS_Annotation TSS_Proximity TSS_Essential_Count fcc_astarr_macs_input_overlap fcc_astarr_macs_input_union
genome_tss_pol2 Distal 0 124224 213239
genome_tss_pol2 Distal 1 5242 9493
genome_tss_pol2 Proximal 0 19283 22819
genome_tss_pol2 Proximal 1 1251 1258
genome_tss_pol2 Proximal 2 42 43
genome_tss_pol2_rnaseq Distal 0 125411 212114
genome_tss_pol2_rnaseq Distal 1 12816 22595
genome_tss_pol2_rnaseq Proximal 0 10515 10836
genome_tss_pol2_rnaseq Proximal 1 1258 1264
genome_tss_pol2_rnaseq Proximal 2 42 43

Summary

Helper function

Code
fun_label_region_category = function(txt_tss_proximity, num_tss_essential){
    if (txt_tss_proximity == "Distal"){
        return("Distal")
    }
    if (num_tss_essential > 0){
        return("Proximal:Essential")
    } else {
        return("Proximal:Non-Essential")
    }
}

Summarize TSS essential

Code
lst = lst_dat_region_annot_count_tss_essential
lst = lapply(lst, function(dat){
    dat = dat %>%
        dplyr::mutate(
            TSS_Category = purrr::pmap_chr(
                list(TSS_Proximity, TSS_Essential_Count),
                fun_label_region_category
            )
        )
    return(dat)
})

### assign and show
lst_dat_region_annot_summary_tss_essential = lst

res = lapply(lst, dim)
print(res)

dat = lst[[1]]
fun_display_table(head(dat, 3))
$fcc_astarr_macs_input_overlap
[1] 300084      9

$fcc_astarr_macs_input_union
[1] 493704      9
Chrom ChromStart ChromEnd Region Annotation_A Annotation_B TSS_Proximity TSS_Essential_Count TSS_Category
chr1 10038 10405 chr1:10038-10405 fcc_astarr_macs_input_overlap genome_tss_pol2 Proximal 0 Proximal:Non-Essential
chr1 10038 10405 chr1:10038-10405 fcc_astarr_macs_input_overlap genome_tss_pol2_rnaseq Distal 0 Distal
chr1 14282 14614 chr1:14282-14614 fcc_astarr_macs_input_overlap genome_tss_pol2 Distal 0 Distal

Check: unique region?

Code
lst = lst_dat_region_annot_summary_tss_essential
lst = lapply(lst, function(dat){
    lst = split(dat$Region, dat$Annotation_B)
    lst = lapply(lst, function(vec){
        res = any(duplicated(vec))
        return(res)
    })
    res = unlist(lst)
    return(res)
})

print(lst)
$fcc_astarr_macs_input_overlap
       genome_tss_pol2 genome_tss_pol2_rnaseq 
                 FALSE                  FALSE 

$fcc_astarr_macs_input_union
       genome_tss_pol2 genome_tss_pol2_rnaseq 
                 FALSE                  FALSE 

Export results

Code
for (txt_folder in VEC_TXT_FOLDER){

    ### get table
    dat_region_annot_result = lst_dat_region_annot_summary_tss_essential[[txt_folder]]
    
    ### set file directory
    txt_fdiry = file.path(FD_RES, "region_closest", txt_folder, "summary")
    dir.create(txt_fdiry, showWarnings = FALSE)

    ### set file directory
    txt_fname = paste("region.summary", "genome_tss", "tss_essential", "tsv", sep = ".")
    txt_fpath = file.path(txt_fdiry, txt_fname)

    ### save table
    dat = dat_region_annot_result
    dat = dat %>% 
        dplyr::arrange(Chrom, ChromStart, ChromEnd) %>%
        dplyr::distinct()
    write_tsv(dat, txt_fpath)
    
    ### show progress
    cat("Save file:", "\n")
    cat("Folder:", txt_folder, "\n")
    cat(txt_fpath, "\n")
    cat("\n")
    flush.console()
}
Save file: 
Folder: fcc_astarr_macs_input_overlap 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/summary/region.summary.genome_tss.tss_essential.tsv 

Save file: 
Folder: fcc_astarr_macs_input_union 
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/summary/region.summary.genome_tss.tss_essential.tsv