Prepare TF Modules 01

Explore the data

Set environment

Code
### load the project configuration silently (warnings and messages muted),
### then print the environment summary
suppressWarnings(suppressMessages(source("../run_config_project_sing.R")))
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Import data

Code
### folder to inspect
txt_fdiry = file.path(FD_RES, "region_nuc", "fcc_astarr_macs")

### print every entry of the folder, one per line
vec = list.files(txt_fdiry)
cat(sprintf("%s \n", vec), sep = "")
K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.bed.gz 
K562.hg38.ASTARR.macs.KS91.input.rep_all.union.q5.bed.gz 
summary 
Code
### glob pattern matching every *bed.gz file in the folder
txt_fdiry = file.path(FD_RES, "region_nuc", "fcc_astarr_macs")
txt_fname = "*bed.gz"
txt_fglob = file.path(txt_fdiry, txt_fname)

### resolve the matching paths and keep their base names as labels
vec_txt_fpath = Sys.glob(txt_fglob)
vec_txt_fname = basename(vec_txt_fpath)

### read each file as a table; list elements are named by file name
lst = setNames(
    lapply(vec_txt_fpath, read_tsv, show_col_types = FALSE),
    vec_txt_fname
)

### keep the imported tables for the following cells
lst_dat_import = lst

Check data

Code
### preview the first rows of the first imported table
lst = lst_dat_import
dat = lst[[1]]
dat %>% head() %>% fun_display_table()
#1_usercol 2_usercol 3_usercol 4_usercol 5_pct_at 6_pct_gc 7_num_A 8_num_C 9_num_G 10_num_T 11_num_N 12_num_oth 13_seq_len
chr1 10038 10405 chr1:10038-10405 0.476839 0.523161 122 192 0 53 0 0 367
chr1 14282 14614 chr1:14282-14614 0.421687 0.578313 65 102 90 75 0 0 332
chr1 16025 16338 chr1:16025-16338 0.412141 0.587859 62 89 95 67 0 0 313
chr1 17288 17689 chr1:17288-17689 0.374065 0.625935 69 144 107 81 0 0 401
chr1 28934 29499 chr1:28934-29499 0.228319 0.771681 66 247 189 63 0 0 565
chr1 115429 115969 chr1:115429-115969 0.618519 0.381481 160 119 87 174 0 0 540
Code
### preview the first rows of the second imported table
lst = lst_dat_import
dat = lst[[2]]
dat %>% head() %>% fun_display_table()
#1_usercol 2_usercol 3_usercol 4_usercol 5_pct_at 6_pct_gc 7_num_A 8_num_C 9_num_G 10_num_T 11_num_N 12_num_oth 13_seq_len
chr1 10015 10442 chr1:10015-10442 0.477752 0.522248 141 223 0 63 0 0 427
chr1 14253 14645 chr1:14253-14645 0.426020 0.573980 75 124 101 92 0 0 392
chr1 16015 16477 chr1:16015-16477 0.458874 0.541126 102 124 126 110 0 0 462
chr1 17237 17772 chr1:17237-17772 0.385047 0.614953 94 182 147 112 0 0 535
chr1 28903 29613 chr1:28903-29613 0.240845 0.759155 85 303 236 86 0 0 710
chr1 30803 31072 chr1:30803-31072 0.498141 0.501859 47 84 51 87 0 0 269

Arrange table

Code
### reshape every imported table into a six-column region table
lst = lst_dat_import
lst = lapply(lst, function(dat){
    ### add a region label built from the three coordinate columns
    dat = dplyr::mutate(
        dat,
        Region = fun_gen_region(`#1_usercol`, `2_usercol`, `3_usercol`)
    )

    ### keep the coordinates, the region label, and the pct_gc / seq_len columns
    dat = dplyr::select(
        dat,
        `#1_usercol`, `2_usercol`, `3_usercol`,
        Region,
        contains("pct_gc"),
        contains("seq_len")
    )

    ### give the kept columns their final names (positional, in select order)
    names(dat) = c("Chrom", "ChromStart", "ChromEnd", "Region", "pGC", "Length")
    dat
})

### keep the arranged tables for the following cells
lst_dat_arrange = lst

Show data

Code
### report the dimensions and first rows of the first arranged table
lst = lst_dat_arrange
dat = lst[[1]]
dat %>% dim() %>% print()
dat %>% head() %>% fun_display_table()
[1] 150042      6
Chrom ChromStart ChromEnd Region pGC Length
chr1 10038 10405 chr1:10038-10405 0.523161 367
chr1 14282 14614 chr1:14282-14614 0.578313 332
chr1 16025 16338 chr1:16025-16338 0.587859 313
chr1 17288 17689 chr1:17288-17689 0.625935 401
chr1 28934 29499 chr1:28934-29499 0.771681 565
chr1 115429 115969 chr1:115429-115969 0.381481 540
Code
### report the dimensions and first rows of the second arranged table
lst = lst_dat_arrange
dat = lst[[2]]
dat %>% dim() %>% print()
dat %>% head() %>% fun_display_table()
[1] 246852      6
Chrom ChromStart ChromEnd Region pGC Length
chr1 10015 10442 chr1:10015-10442 0.522248 427
chr1 14253 14645 chr1:14253-14645 0.573980 392
chr1 16015 16477 chr1:16015-16477 0.541126 462
chr1 17237 17772 chr1:17237-17772 0.614953 535
chr1 28903 29613 chr1:28903-29613 0.759155 710
chr1 30803 31072 chr1:30803-31072 0.501859 269

Export results

Code
### export every arranged table as a TSV file into the "summary" subfolder
lst = lst_dat_arrange

### set file directory (identical for every table, so resolve it once)
txt_fdiry = file.path(FD_RES, "region_nuc", "fcc_astarr_macs", "summary")

### make sure the output folder exists before writing (no-op if it already does)
dir.create(txt_fdiry, showWarnings = FALSE, recursive = TRUE)

for (idx in names(lst)){
    ### get data
    dat = lst[[idx]]

    ### process file name: swap the trailing "bed.gz" extension for "tsv";
    ### the dot is escaped and the pattern anchored to the end so that only
    ### the literal extension is replaced, not an earlier look-alike substring
    txt_fname = str_replace(idx, "bed\\.gz$", "tsv")
    txt_fpath = file.path(txt_fdiry, txt_fname)

    ### write data
    write_tsv(dat, txt_fpath)

    ### show progress
    cat("Save table:", txt_fname, "\n")
}
Save table: K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.tsv 
Save table: K562.hg38.ASTARR.macs.KS91.input.rep_all.union.q5.tsv