Prepare ENCODE ChIP-seq Histone 03

Check downloaded files

Set environment

Code
### load the project configuration script; messages and warnings raised while
### sourcing it are muted
suppressWarnings(
    suppressMessages(
        source("../run_config_project_sing.R")
    )
)

### print the environment summary (show_env is presumably provided by the
### sourced configuration -- it is not defined in this notebook)
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
### folder names used by this notebook; TXT_FOLDER_OUT is the one used below
### to locate the downloaded files
TXT_FOLDER_INP <- "encode_chipseq_histone"
TXT_FOLDER_OUT <- "encode_chipseq_histone_250120"

Import data

Code
### download directory: <FD_DAT>/external/<TXT_FOLDER_OUT>
txt_foldr <- TXT_FOLDER_OUT
txt_fdiry <- file.path(FD_DAT, "external", TXT_FOLDER_OUT)

### names of the entries directly under the download directory,
### followed by their full paths
vec_txt_foldr <- list.files(path = txt_fdiry)
vec_txt_fdiry <- file.path(txt_fdiry, vec_txt_foldr)

Get all downloaded files

Code
### for every subfolder, list the files ending in .bigWig or .bed.gz that sit
### directly inside it (base names only, no recursion)
lst <- lapply(
    vec_txt_fdiry,
    list.files,
    pattern    = "\\.(bigWig|bed\\.gz)$",
    full.names = FALSE,
    recursive  = FALSE
)

### key the listing by subfolder name
lst <- setNames(lst, vec_txt_foldr)

### keep the listing under a descriptive name and print it
lst_vec_txt_files <- lst
lst
$region_narrowPeak
  1. 'ENCFF122CSI.bed.gz'
  2. 'ENCFF135ZLM.bed.gz'
  3. 'ENCFF148UQI.bed.gz'
  4. 'ENCFF193ERO.bed.gz'
  5. 'ENCFF209OQD.bed.gz'
  6. 'ENCFF213OTI.bed.gz'
  7. 'ENCFF323WOT.bed.gz'
  8. 'ENCFF462AVD.bed.gz'
  9. 'ENCFF540NGG.bed.gz'
  10. 'ENCFF544LXB.bed.gz'
  11. 'ENCFF561OUZ.bed.gz'
  12. 'ENCFF689QIJ.bed.gz'
  13. 'ENCFF706WUF.bed.gz'
  14. 'ENCFF749KLQ.bed.gz'
  15. 'ENCFF801AHF.bed.gz'
  16. 'ENCFF885FQN.bed.gz'
  17. 'ENCFF891CHI.bed.gz'
  18. 'ENCFF909RKY.bed.gz'
  19. 'ENCFF963GZJ.bed.gz'
$signal_fold_change
  1. 'ENCFF139KZL.bigWig'
  2. 'ENCFF242ENK.bigWig'
  3. 'ENCFF253TOF.bigWig'
  4. 'ENCFF286WRJ.bigWig'
  5. 'ENCFF317VHO.bigWig'
  6. 'ENCFF347YYH.bigWig'
  7. 'ENCFF381NDD.bigWig'
  8. 'ENCFF399SGM.bigWig'
  9. 'ENCFF544AVW.bigWig'
  10. 'ENCFF583BKU.bigWig'
  11. 'ENCFF601JGK.bigWig'
  12. 'ENCFF605FAF.bigWig'
  13. 'ENCFF607SUJ.bigWig'
  14. 'ENCFF621DJP.bigWig'
  15. 'ENCFF654SLZ.bigWig'
  16. 'ENCFF660WUG.bigWig'
  17. 'ENCFF806YEZ.bigWig'
  18. 'ENCFF911JVK.bigWig'
  19. 'ENCFF959YJV.bigWig'
$signal_pvalue
  1. 'ENCFF054RSU.bigWig'
  2. 'ENCFF071GML.bigWig'
  3. 'ENCFF178QDA.bigWig'
  4. 'ENCFF202EVH.bigWig'
  5. 'ENCFF220RGS.bigWig'
  6. 'ENCFF239EBH.bigWig'
  7. 'ENCFF287LBI.bigWig'
  8. 'ENCFF334HSS.bigWig'
  9. 'ENCFF405ZDL.bigWig'
  10. 'ENCFF457URZ.bigWig'
  11. 'ENCFF461RKK.bigWig'
  12. 'ENCFF465GBD.bigWig'
  13. 'ENCFF582IMB.bigWig'
  14. 'ENCFF605EVL.bigWig'
  15. 'ENCFF632NQA.bigWig'
  16. 'ENCFF633OZC.bigWig'
  17. 'ENCFF694ODT.bigWig'
  18. 'ENCFF767UON.bigWig'
  19. 'ENCFF847BFA.bigWig'

Import metadata tables

Code
### each subfolder is read for a metadata table with this file name
txt_fname <- "metadata.tsv"
vec_txt_fpath <- file.path(vec_txt_fdiry, txt_fname)

### read every metadata table (column-type messages muted) and key the
### resulting list by subfolder name
lst <- lapply(vec_txt_fpath, read_tsv, show_col_types = FALSE)
lst <- setNames(lst, vec_txt_foldr)

### keep the tables under a descriptive name, then preview the first
### three rows of each one
lst_dat_metadata <- lst
for (idx in names(lst_dat_metadata)) {
    dat <- lst_dat_metadata[[idx]]
    cat(idx, "\n")
    flush.console()
    fun_display_table(head(dat, 3))
    cat("\n")
}
region_narrowPeak 
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000AKU ENCFF689QIJ bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K4me3 1, 2 ENCODE4 v1.5.1 GRCh38 5dea2993c0831ae344a989d601c09178 ENCFF689QIJ.bed.gz https://www.encodeproject.org/files/ENCFF689QIJ/@@download/ENCFF689QIJ.bed.gz
Histone ChIP-seq ENCSR000AKQ ENCFF323WOT bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K27me3 1, 2, 3 ENCODE4 v1.8.0 GRCh38 4422969d0b63260e2fcb83e10fdcc02f ENCFF323WOT.bed.gz https://www.encodeproject.org/files/ENCFF323WOT/@@download/ENCFF323WOT.bed.gz
Histone ChIP-seq ENCSR000EWC ENCFF540NGG bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K4me1 1, 2 ENCODE4 v1.5.1 GRCh38 63db47e5b9b98dbebff2ce20df066106 ENCFF540NGG.bed.gz https://www.encodeproject.org/files/ENCFF540NGG/@@download/ENCFF540NGG.bed.gz

signal_fold_change 
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000APD ENCFF544AVW bigWig bigWig fold change over control GRCh38 H3K79me2 1, 2, 3 ENCODE4 v1.8.0 GRCh38 61dc50179ae8d880b972c3697a6a2fc2 ENCFF544AVW.bigWig https://www.encodeproject.org/files/ENCFF544AVW/@@download/ENCFF544AVW.bigWig
Histone ChIP-seq ENCSR000AKV ENCFF286WRJ bigWig bigWig fold change over control GRCh38 H3K9ac 1, 2 ENCODE4 v1.6.1 GRCh38 ccd7b8c413fdb998ffd799ec52dd5098 ENCFF286WRJ.bigWig https://www.encodeproject.org/files/ENCFF286WRJ/@@download/ENCFF286WRJ.bigWig
Histone ChIP-seq ENCSR000APC ENCFF621DJP bigWig bigWig fold change over control GRCh38 H2AFZ 1, 2 ENCODE4 v1.6.0 GRCh38 3492c0e4a64e29231558f9e1e2fe520e ENCFF621DJP.bigWig https://www.encodeproject.org/files/ENCFF621DJP/@@download/ENCFF621DJP.bigWig

signal_pvalue 
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000AKU ENCFF767UON bigWig bigWig signal p-value GRCh38 H3K4me3 1, 2 ENCODE4 v1.5.1 GRCh38 4c102d45be8326062895ed0a03d4ded7 ENCFF767UON.bigWig https://www.encodeproject.org/files/ENCFF767UON/@@download/ENCFF767UON.bigWig
Histone ChIP-seq ENCSR000AKQ ENCFF582IMB bigWig bigWig signal p-value GRCh38 H3K27me3 1, 2, 3 ENCODE4 v1.8.0 GRCh38 2ca48f44075eef7118a387260f2f95b9 ENCFF582IMB.bigWig https://www.encodeproject.org/files/ENCFF582IMB/@@download/ENCFF582IMB.bigWig
Histone ChIP-seq ENCSR000EWC ENCFF287LBI bigWig bigWig signal p-value GRCh38 H3K4me1 1, 2 ENCODE4 v1.5.1 GRCh38 28df1a757a2e5517209c10d57f0ce03e ENCFF287LBI.bigWig https://www.encodeproject.org/files/ENCFF287LBI/@@download/ENCFF287LBI.bigWig

Check data

Check whether the downloaded files match the metadata tables

Code
### For every subfolder, compare the file names recorded in the metadata table
### (column File_Name) with the file names found on disk, and report the
### counts and whether the two sets agree.
for (txt_foldr in vec_txt_foldr) {
    ### file names expected according to the metadata table
    lst  <- lst_dat_metadata
    dat  <- lst[[txt_foldr]]
    vec1 <- sort(dat$File_Name)

    ### file names actually present in the download folder
    lst  <- lst_vec_txt_files
    vec2 <- sort(lst[[txt_foldr]])

    ### identical() rather than all(vec1 == vec2): `==` recycles the shorter
    ### vector, so vectors of different lengths could still yield TRUE
    ### (e.g. 2 names vs. the same 2 names repeated twice); identical() also
    ### always returns a single TRUE/FALSE
    is_matched <- identical(vec1, vec2)

    cat(txt_foldr, "\n")
    cat("#Files (Info):    ", length(vec1), "\n")
    cat("#Files (Download):", length(vec2), "\n")
    cat("Matched?", is_matched, "\n")

    ### on a mismatch, name the offending files so the problem can be traced
    if (!is_matched) {
        cat("In metadata, not downloaded:", setdiff(vec1, vec2), "\n")
        cat("Downloaded, not in metadata:", setdiff(vec2, vec1), "\n")
    }
    cat("\n")
}
region_narrowPeak 
#Files (Info):     19 
#Files (Download): 19 
Matched? TRUE 

signal_fold_change 
#Files (Info):     19 
#Files (Download): 19 
Matched? TRUE 

signal_pvalue 
#Files (Info):     19 
#Files (Download): 19 
Matched? TRUE