Set environment
Code
suppressMessages (suppressWarnings (source ("../run_config_project_sing.R" )))
show_env ()
You are working on Singularity: singularity_proj_encode_fcc
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data
You are working with ENCODE FCC
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts
PROJECT DATA (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data
PROJECT NOTE (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks
PROJECT DOCS (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs
PROJECT LOG (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log
PROJECT REF (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references
Set global variables
Code
TXT_FOLDER_INP = "encode_chipseq_histone"
TXT_FOLDER_OUT = "encode_chipseq_histone_250120"
Code
txt_folder = TXT_FOLDER_INP
txt_fdiry = file.path (FD_REF, txt_folder)
vec = dir (txt_fdiry)
for (txt in vec){cat (txt, " \n " )}
files.chipseq_histone.default_analysis.250120.txt
files.chipseq_histone.default_files.250120.txt
metadata.chipseq_histone.default_analysis.250120.tsv
metadata.chipseq_histone.default_files.250120.tsv
Explore
File type
Code
dat = dat_metadata_chipseq_v1
table (dat$ Assay, dat$ ` File type ` )
bed bigBed bigWig
Histone ChIP-seq 19 19 19
Code
dat = dat_metadata_chipseq_v2
table (dat$ Assay, dat$ ` File type ` )
bam bed bigBed bigWig
Histone ChIP-seq 84 88 88 120
Output type
Code
dat = dat_metadata_chipseq_v1
table (dat$ ` File type ` , dat$ ` Output type ` )
pseudoreplicated peaks replicated peaks signal p-value
bed 14 5 0
bigBed 14 5 0
bigWig 0 0 19
Code
dat = dat_metadata_chipseq_v2
table (dat$ ` File type ` , dat$ ` Output type ` )
alignments fold change over control pseudoreplicated peaks
bam 42 0 0
bed 0 0 60
bigBed 0 0 60
bigWig 0 60 0
replicated peaks signal p-value unfiltered alignments
bam 0 0 42
bed 28 0 0
bigBed 28 0 0
bigWig 0 60 0
Genome assembly
Code
dat = dat_metadata_chipseq_v1
table (dat$ ` File assembly ` )
Code
dat = dat_metadata_chipseq_v2
table (dat$ ` File assembly ` )
Biosample
Code
dat = dat_metadata_chipseq_v1
table (dat$ ` Biosample term name ` )
Code
dat = dat_metadata_chipseq_v2
table (dat$ ` Biosample term name ` )
Target
Code
dat = dat_metadata_chipseq_v1
table (dat$ ` Experiment target ` )
H2AFZ-human H3K27ac-human H3K27me3-human H3K36me3-human H3K4me1-human
3 3 6 6 6
H3K4me2-human H3K4me3-human H3K79me2-human H3K9ac-human H3K9me1-human
3 12 3 6 3
H3K9me3-human H4K20me1-human
3 3
Code
dat = dat_metadata_chipseq_v2
table (dat$ ` Experiment target ` )
H2AFZ-human H3K27ac-human H3K27me3-human H3K36me3-human H3K4me1-human
18 28 46 46 46
H3K4me2-human H3K4me3-human H3K79me2-human H3K9ac-human H3K9me1-human
18 72 28 36 6
H3K9me3-human H4K20me1-human
18 18
Prepare download files
NarrowPeak files
Code
### filter table
dat = dat_metadata_chipseq_simplify_v1
dat = dat %>% dplyr:: filter (File_Type == "bed" , File_Format == "bed narrowPeak" )
### assign and show
dat_metadata_chipseq_bed_narrowpeak = dat
print (dim (dat))
fun_display_table (head (dat, 3 ))
Histone ChIP-seq
ENCSR000AKU
ENCFF689QIJ
bed narrowPeak
bed
pseudoreplicated peaks
GRCh38
H3K4me3
1, 2
ENCODE4 v1.5.1 GRCh38
5dea2993c0831ae344a989d601c09178
ENCFF689QIJ.bed.gz
https://www.encodeproject.org/files/ENCFF689QIJ/@@download/ENCFF689QIJ.bed.gz
Histone ChIP-seq
ENCSR000AKQ
ENCFF323WOT
bed narrowPeak
bed
pseudoreplicated peaks
GRCh38
H3K27me3
1, 2, 3
ENCODE4 v1.8.0 GRCh38
4422969d0b63260e2fcb83e10fdcc02f
ENCFF323WOT.bed.gz
https://www.encodeproject.org/files/ENCFF323WOT/@@download/ENCFF323WOT.bed.gz
Histone ChIP-seq
ENCSR000EWC
ENCFF540NGG
bed narrowPeak
bed
pseudoreplicated peaks
GRCh38
H3K4me1
1, 2
ENCODE4 v1.5.1 GRCh38
63db47e5b9b98dbebff2ce20df066106
ENCFF540NGG.bed.gz
https://www.encodeproject.org/files/ENCFF540NGG/@@download/ENCFF540NGG.bed.gz
Signal p-value
Code
### filter table
dat = dat_metadata_chipseq_simplify_v1
dat = dat %>% dplyr:: filter (File_Type == "bigWig" , Output_Type == "signal p-value" )
### assign and show
dat_metadata_chipseq_bigwig_pvalue = dat
print (dim (dat))
fun_display_table (head (dat, 3 ))
Histone ChIP-seq
ENCSR000AKU
ENCFF767UON
bigWig
bigWig
signal p-value
GRCh38
H3K4me3
1, 2
ENCODE4 v1.5.1 GRCh38
4c102d45be8326062895ed0a03d4ded7
ENCFF767UON.bigWig
https://www.encodeproject.org/files/ENCFF767UON/@@download/ENCFF767UON.bigWig
Histone ChIP-seq
ENCSR000AKQ
ENCFF582IMB
bigWig
bigWig
signal p-value
GRCh38
H3K27me3
1, 2, 3
ENCODE4 v1.8.0 GRCh38
2ca48f44075eef7118a387260f2f95b9
ENCFF582IMB.bigWig
https://www.encodeproject.org/files/ENCFF582IMB/@@download/ENCFF582IMB.bigWig
Histone ChIP-seq
ENCSR000EWC
ENCFF287LBI
bigWig
bigWig
signal p-value
GRCh38
H3K4me1
1, 2
ENCODE4 v1.5.1 GRCh38
28df1a757a2e5517209c10d57f0ce03e
ENCFF287LBI.bigWig
https://www.encodeproject.org/files/ENCFF287LBI/@@download/ENCFF287LBI.bigWig
Signal fold change
Count the replicates for each file
Code
### helper function
fun = function (txt){
lst = str_split (txt, "," )
lst = lapply (lst, length)
vec = unlist (lst)
return (vec)
}
### count the replicates for each file
dat = dat_metadata_chipseq_simplify_v2
dat = dat %>% dplyr:: filter (File_Type == "bigWig" , Output_Type == "fold change over control" )
dat = dat %>% dplyr:: mutate (Count_Replicates = fun (Bio_Replicates))
dat = dat %>% dplyr:: select (
Index_Experiment,
Index_File,
Bio_Replicates,
Count_Replicates)
### assign and show
dat_count_replicates = dat
print (dim (dat))
fun_display_table (head (dat))
ENCSR000APD
ENCFF112LWK
3
1
ENCSR000APD
ENCFF544AVW
1, 2, 3
3
ENCSR000APD
ENCFF767ECL
1
1
ENCSR000APD
ENCFF457RYD
2
1
ENCSR000AKV
ENCFF286WRJ
1, 2
2
ENCSR000AKV
ENCFF236VCK
1
1
Get the metatable of file with maximum replicates
Code
### get the file with most replicates
dat = dat_count_replicates
dat = dat %>%
dplyr:: group_by (Index_Experiment) %>%
dplyr:: slice_max (Count_Replicates)
vec_txt_file_subset = dat$ Index_File
### filter the table
dat = dat_metadata_chipseq_simplify_v2
dat = dat %>% dplyr:: filter (Index_File %in% vec_txt_file_subset)
### assign and show
dat_metadata_chipseq_bigwig_fold_change = dat
print (dim (dat))
fun_display_table (head (dat, 3 ))
Histone ChIP-seq
ENCSR000APD
ENCFF544AVW
bigWig
bigWig
fold change over control
GRCh38
H3K79me2
1, 2, 3
ENCODE4 v1.8.0 GRCh38
61dc50179ae8d880b972c3697a6a2fc2
ENCFF544AVW.bigWig
https://www.encodeproject.org/files/ENCFF544AVW/@@download/ENCFF544AVW.bigWig
Histone ChIP-seq
ENCSR000AKV
ENCFF286WRJ
bigWig
bigWig
fold change over control
GRCh38
H3K9ac
1, 2
ENCODE4 v1.6.1 GRCh38
ccd7b8c413fdb998ffd799ec52dd5098
ENCFF286WRJ.bigWig
https://www.encodeproject.org/files/ENCFF286WRJ/@@download/ENCFF286WRJ.bigWig
Histone ChIP-seq
ENCSR000APC
ENCFF621DJP
bigWig
bigWig
fold change over control
GRCh38
H2AFZ
1, 2
ENCODE4 v1.6.0 GRCh38
3492c0e4a64e29231558f9e1e2fe520e
ENCFF621DJP.bigWig
https://www.encodeproject.org/files/ENCFF621DJP/@@download/ENCFF621DJP.bigWig
Check results
Code
dat = dat_metadata_chipseq_bigwig_fold_change
table (dat$ Output_Type)
fold change over control
19
Save results
Check equal size
Code
lst = list (
"region_narrowPeak" = dat_metadata_chipseq_bed_narrowpeak,
"signal_fold_change" = dat_metadata_chipseq_bigwig_fold_change,
"signal_pvalue" = dat_metadata_chipseq_bigwig_pvalue
)
for (idx in names (lst)){
### show
cat (idx, " \n " )
### extract data
dat_metadata = lst[[idx]]
dat = dat_metadata
print (dim (dat))
}
region_narrowPeak
[1] 19 13
signal_fold_change
[1] 19 13
signal_pvalue
[1] 19 13
Check equal experiment
Code
lst = list (
"region_narrowPeak" = dat_metadata_chipseq_bed_narrowpeak,
"signal_fold_change" = dat_metadata_chipseq_bigwig_fold_change,
"signal_pvalue" = dat_metadata_chipseq_bigwig_pvalue
)
lst = lapply (lst, function (dat){
vec = dat$ Index_Experiment
vec = sort (vec)
})
print (all (lst[[1 ]] == lst[[2 ]]))
print (all (lst[[1 ]] == lst[[3 ]]))
Export merged metadata
Code
### combine the metatables
lst_dat_metadata = list (
"region_narrowPeak" = dat_metadata_chipseq_bed_narrowpeak,
"signal_fold_change" = dat_metadata_chipseq_bigwig_fold_change,
"signal_pvalue" = dat_metadata_chipseq_bigwig_pvalue
)
### merged table
dat = bind_rows (lst_dat_metadata)
### assign and show
dat_metadata_chipseq_merge = dat
print (dim (dat))
fun_display_table (head (dat, 3 ))
Histone ChIP-seq
ENCSR000AKU
ENCFF689QIJ
bed narrowPeak
bed
pseudoreplicated peaks
GRCh38
H3K4me3
1, 2
ENCODE4 v1.5.1 GRCh38
5dea2993c0831ae344a989d601c09178
ENCFF689QIJ.bed.gz
https://www.encodeproject.org/files/ENCFF689QIJ/@@download/ENCFF689QIJ.bed.gz
Histone ChIP-seq
ENCSR000AKQ
ENCFF323WOT
bed narrowPeak
bed
pseudoreplicated peaks
GRCh38
H3K27me3
1, 2, 3
ENCODE4 v1.8.0 GRCh38
4422969d0b63260e2fcb83e10fdcc02f
ENCFF323WOT.bed.gz
https://www.encodeproject.org/files/ENCFF323WOT/@@download/ENCFF323WOT.bed.gz
Histone ChIP-seq
ENCSR000EWC
ENCFF540NGG
bed narrowPeak
bed
pseudoreplicated peaks
GRCh38
H3K4me1
1, 2
ENCODE4 v1.5.1 GRCh38
63db47e5b9b98dbebff2ce20df066106
ENCFF540NGG.bed.gz
https://www.encodeproject.org/files/ENCFF540NGG/@@download/ENCFF540NGG.bed.gz
Code
### set directory
txt_fdiry = file.path (FD_DAT, "external" , TXT_FOLDER_OUT)
txt_fname = "metadata.merged.tsv"
txt_fpath = file.path (txt_fdiry, txt_fname)
### write table and show
dat = dat_metadata_chipseq_merge
write_tsv (dat, txt_fpath)
cat ("Save table:" , txt_fpath, " \n " )
Save table: /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/metadata.merged.tsv
Code
### set directory
txt_fdiry = "."
txt_fname = "table.chipseq_histone.metadata.files.tsv"
txt_fpath = file.path (txt_fdiry, txt_fname)
### write table and show
dat = dat_metadata_chipseq_merge
write_tsv (dat, txt_fpath)
cat ("Save table:" , txt_fpath, " \n " )
Save table: ./table.chipseq_histone.metadata.files.tsv
Export: metadata table, file list, and checksum: split for parallel execution
Code
### combine multiple metadata
lst = list (
"region_narrowPeak" = dat_metadata_chipseq_bed_narrowpeak,
"signal_fold_change" = dat_metadata_chipseq_bigwig_fold_change,
"signal_pvalue" = dat_metadata_chipseq_bigwig_pvalue
)
### loop through each metadata
for (idx in names (lst)){
### show progress
cat (idx, " \n " )
### create folder
txt_fdiry = file.path (FD_DAT, "external" , TXT_FOLDER_OUT, idx)
txt_cmd = paste ("mkdir -p" , txt_fdiry)
system (txt_cmd)
### extract data
dat_metadata = lst[[idx]]
### get file url
dat = dat_metadata
dat = dat %>% dplyr:: select (File_URL)
dat_download_furl = dat
### get md5sum for each file
dat = dat_metadata
dat = dat %>% dplyr:: select (md5sum, File_Name)
dat_download_md5sum = dat
### write file list
txt_fname = "files.txt"
txt_fpath = file.path (txt_fdiry, txt_fname)
dat = dat_download_furl
write_tsv (dat, txt_fpath, col_names = FALSE )
### write checksum file
txt_fname = "checksum_md5sum.txt"
txt_fpath = file.path (txt_fdiry, txt_fname)
dat = dat_download_md5sum
write_tsv (dat, txt_fpath, col_names = FALSE )
### write metadata info
txt_fname = "metadata.tsv"
txt_fpath = file.path (txt_fdiry, txt_fname)
dat = dat_metadata
write_tsv (dat, txt_fpath)
}
region_narrowPeak
signal_fold_change
signal_pvalue
Code
DT:: datatable (
dat_metadata_chipseq_merge,
options = list (
pageLength = 10 , # rows per page
lengthMenu = c (5 ,15 ,30 ),
searchHighlight = TRUE
),
class = "stripe hover" ,
rownames = FALSE
)