Set environment
Code
suppressMessages (suppressWarnings (source ("../run_config_project_sing.R" )))
show_env ()
You are working on Singularity: singularity_proj_encode_fcc
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data
You are working with ENCODE FCC
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts
PROJECT DATA (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data
PROJECT NOTE (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks
PROJECT DOCS (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs
PROJECT LOG (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log
PROJECT REF (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references
Import data
Code
txt_fdiry = file.path (
FD_RES,
"region_nuc" ,
"fcc_astarr_macs" ,
"summary"
)
vec = dir (txt_fdiry)
for (txt in vec){cat (txt, " \n " )}
K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.tsv
K562.hg38.ASTARR.macs.KS91.input.rep_all.union.q5.tsv
Code
### set file directory
txt_fdiry = file.path (FD_RES, "region_nuc" , "fcc_astarr_macs" , "summary" )
txt_fname = "*tsv"
txt_fglob = file.path (txt_fdiry, txt_fname)
### get files
vec_txt_fpath = Sys.glob (txt_fglob)
vec_txt_fname = basename (vec_txt_fpath)
### read files
lst = lapply (vec_txt_fpath, function (txt_fpath){
dat = read_tsv (txt_fpath, show_col_types = FALSE )
return (dat)
})
names (lst) = vec_txt_fname
### assign data
lst_dat_region_import = lst
Check data
Code
lst = lst_dat_region_import
dat = lst[[1 ]]
print (dim (dat))
fun_display_table (head (dat))
chr1
10038
10405
chr1:10038-10405
0.523161
367
chr1
14282
14614
chr1:14282-14614
0.578313
332
chr1
16025
16338
chr1:16025-16338
0.587859
313
chr1
17288
17689
chr1:17288-17689
0.625935
401
chr1
28934
29499
chr1:28934-29499
0.771681
565
chr1
115429
115969
chr1:115429-115969
0.381481
540
Code
lst = lst_dat_region_import
dat = lst[[2 ]]
print (dim (dat))
fun_display_table (head (dat))
chr1
10015
10442
chr1:10015-10442
0.522248
427
chr1
14253
14645
chr1:14253-14645
0.573980
392
chr1
16015
16477
chr1:16015-16477
0.541126
462
chr1
17237
17772
chr1:17237-17772
0.614953
535
chr1
28903
29613
chr1:28903-29613
0.759155
710
chr1
30803
31072
chr1:30803-31072
0.501859
269
Arrange table
Helper function to map label
Code
fun_str_map_label = function (vec_txt_input){
vec_txt_pattern = c ("overlap" , "union" )
vec_txt_replace = c ("ATAC (Overlap)" , "ATAC (Union)" )
vec_txt_output = fun_str_map_detect (
vec_txt_input,
vec_txt_pattern,
vec_txt_replace
)
return (vec_txt_output)
}
### test function
fun_str_map_label ("ASTARR_overlap" )
'ATAC (Overlap)'
Combine tables
Code
### Concatenate tables
lst = lst_dat_region_import
dat = bind_rows (lst, .id = "Label" )
### Update labels
dat = dat %>% dplyr:: mutate (Label = fun_str_map_label (Label))
### assign and show
dat_region_arrange = dat
print (dim (dat))
print (table (dat$ Label))
fun_display_table (head (dat, 3 ))
[1] 396894 7
ATAC (Overlap) ATAC (Union)
150042 246852
ATAC (Overlap)
chr1
10038
10405
chr1:10038-10405
0.523161
367
ATAC (Overlap)
chr1
14282
14614
chr1:14282-14614
0.578313
332
ATAC (Overlap)
chr1
16025
16338
chr1:16025-16338
0.587859
313
Explore data
Distribution of region length (double check)
Code
### arrange table
dat = dat_region_arrange
dat = dat %>%
dplyr:: mutate (Length = ChromEnd - ChromStart) %>%
dplyr:: filter (Length < 3000 )
### generate plot
gpt = ggplot (dat, aes (x = Length, color = Label)) +
geom_density (linewidth= 2 ) +
theme_cowplot () +
background_grid () +
theme (
legend.position = "inside" ,
legend.position.inside = c (0.5 , 0.7 ),
legend.background = element_rect (fill = "white" )
)
### show plot
options (repr.plot.height= 4 , repr.plot.width= 7 )
print (gpt)
Distribution of GC content
Code
### init
dat = dat_region_arrange
### generate plot
gpt = ggplot (dat, aes (x = pGC, color = Label)) +
geom_density (linewidth= 2 ) +
theme_cowplot () +
background_grid () +
theme (
legend.position = "inside" ,
legend.position.inside = c (0.6 , 0.7 ),
legend.background = element_rect (fill = "white" )
)
### show plot
options (repr.plot.height= 4 , repr.plot.width= 7 )
print (gpt)
Export results
Summarize information
Code
fun_mode_continuous = function (x, ...) {
obj = density (x, ...)
num = obj$ x[which.max (obj$ y)]
return (num)
}
Code
### compute counts and info
dat = dat_region_arrange
dat = dat %>%
group_by (Label) %>%
summarise (
Mean = mean (pGC),
Mode = fun_mode_continuous (pGC),
.groups= "drop" )
dat_region_annot = dat
### build a named vector of new legend labels, each with two lines:
### original label
### Count = 12345
dat = dat_region_annot
vec = dat %>%
dplyr:: mutate (
line1 = Label,
line2 = paste ("Mean =" , round (Mean, 2 ), " " ),
line3 = paste ("Mode =" , round (Mode, 2 ), " " ),
combined = paste0 (line1, " \n " , line2, " \n " , line3)
) %>%
{ set_names (.$ combined, .$ Label) }
vec_txt_region_label = vec
print (vec_txt_region_label)
ATAC (Overlap)
"ATAC (Overlap)\nMean = 0.53 \nMode = 0.49 "
ATAC (Union)
"ATAC (Union)\nMean = 0.51 \nMode = 0.48 "
Create plot
Code
### set text size
theme_text <- theme (
title = element_text (size = 16 ),
axis.title = element_text (size = 16 ),
axis.text = element_text (size = 14 ),
legend.title = element_text (size = 14 ),
legend.text = element_text (size = 12 )
)
Code
### init
dat = dat_region_arrange
### generate plot
gpt = ggplot (dat, aes (x = pGC, color = Label)) +
geom_density (linewidth= 2 ) +
theme_cowplot () +
background_grid () +
labs (x = "GC%" , y = "Density" , title = "Distribution of GC content" ) +
scale_x_continuous (breaks = seq (0 , 1 , by = 0.1 )) +
scale_y_continuous (breaks = seq (0 , 5 , by = 2 )) +
scale_color_discrete (labels = vec_txt_region_label) +
theme_text +
theme (
legend.position = "inside" ,
legend.position.inside = c (0.65 , 0.65 ),
legend.background = element_rect (fill = "white" ),
legend.key.spacing.y = unit (0.3 , 'cm' )
)
### assign plot
gpt_export = gpt
### show plot
options (repr.plot.height= 4 , repr.plot.width= 7 )
print (gpt)
Save plot
Code
txt_fdiry = "./"
txt_fname = "fig.region.fcc_astarr_macs_input.distribution.gc_content.png"
txt_fpath = file.path (txt_fdiry, txt_fname)
ggsave (txt_fpath, gpt_export, height = 4 , width = 7 , units = "in" )
txt_fdiry = "./"
txt_fname = "fig.region.fcc_astarr_macs_input.distribution.gc_content.svg"
txt_fpath = file.path (txt_fdiry, txt_fname)
ggsave (txt_fpath, gpt_export, height = 4 , width = 7 , units = "in" )