Prepare ENCODE ChIP-seq Histone 01

Generate the file lists, checksums, and metadata tables needed to download the data

Set environment

Code
### load the project configuration without printing messages or warnings
suppressMessages(
    suppressWarnings(
        source("../run_config_project_sing.R")
    )
)

### print the environment summary (show_env is presumably defined by the sourced configuration)
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
### name of the folder under FD_REF that holds the reference file lists and metadata
TXT_FOLDER_INP = "encode_chipseq_histone"
### name of the folder under FD_DAT/external where the exported tables are written
TXT_FOLDER_OUT = "encode_chipseq_histone_250120"
Code
### locate the reference folder and collect the names of the files it contains
txt_folder = TXT_FOLDER_INP
txt_fdiry  = file.path(FD_REF, txt_folder)
vec        = list.files(txt_fdiry)

### print one file name per line
for (txt in vec) {
    cat(txt, "\n")
}
files.chipseq_histone.default_analysis.250120.txt 
files.chipseq_histone.default_files.250120.txt 
metadata.chipseq_histone.default_analysis.250120.tsv 
metadata.chipseq_histone.default_files.250120.tsv 

Import metadata from reference

Code
### locate the metadata table of the default files
txt_fname  = "metadata.chipseq_histone.default_files.250120.tsv"
txt_folder = TXT_FOLDER_INP
txt_fdiry  = file.path(FD_REF, txt_folder)
txt_fpath  = file.path(txt_fdiry, txt_fname)

### import the table and keep a named copy
dat_metadata_chipseq_v1 = read_tsv(txt_fpath, show_col_types = FALSE)
dat = dat_metadata_chipseq_v1

### report the dimensions and preview the first three rows
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 57 59
File accession File format File type File format type Output type File assembly Experiment accession Assay Donor(s) Biosample term id Biosample term name Biosample type Biosample organism Biosample treatments Biosample treatments amount Biosample treatments duration Biosample genetic modifications methods Biosample genetic modifications categories Biosample genetic modifications targets Biosample genetic modifications gene targets Biosample genetic modifications site coordinates Biosample genetic modifications zygosity Experiment target Library made from Library depleted in Library extraction method Library lysis method Library crosslinking method Library strand specific Experiment date released Project RBNS protein concentration Library fragmentation method Library size range Biological replicate(s) Technical replicate(s) Read length Mapped read length Run type Paired end Paired with Index of Derived from Size Lab md5sum dbxrefs File download URL Genome annotation Platform Controlled by File Status s3_uri Azure URL File analysis title File analysis status Audit WARNING Audit NOT_COMPLIANT Audit ERROR
ENCFF767UON bigWig bigWig NA signal p-value GRCh38 ENCSR000AKU Histone ChIP-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA H3K4me3-human DNA NA NA NA NA NA 2011-02-10 ENCODE NA see document NA 1, 2 1_1, 2_1 NA NA NA NA NA NA /files/ENCFF420ARY/, /files/ENCFF666YXZ/, /files/ENCFF564SVK/, /files/ENCFF685PPQ/, /files/ENCFF226FKB/ 797629786 ENCODE Processing Pipeline 4c102d45be8326062895ed0a03d4ded7 NA https://www.encodeproject.org/files/ENCFF767UON/@@download/ENCFF767UON.bigWig NA NA NA released s3://encode-public/2020/09/30/ed53d64d-7a8c-4423-a9cd-7c5d80e80b20/ENCFF767UON.bigWig https://datasetencode.blob.core.windows.net/dataset/2020/09/30/ed53d64d-7a8c-4423-a9cd-7c5d80e80b20/ENCFF767UON.bigWig?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.5.1 GRCh38 released low read depth, control low read depth, mild to moderate bottlenecking, inconsistent platforms insufficient read depth NA
ENCFF689QIJ bed narrowPeak bed narrowPeak pseudoreplicated peaks GRCh38 ENCSR000AKU Histone ChIP-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA H3K4me3-human DNA NA NA NA NA NA 2011-02-10 ENCODE NA see document NA 1, 2 1_1, 2_1 NA NA NA NA NA NA /files/ENCFF420ARY/, /files/ENCFF666YXZ/, /files/ENCFF564SVK/, /files/ENCFF685PPQ/, /files/ENCFF356LFX/, /files/ENCFF226FKB/ 1111370 ENCODE Processing Pipeline 5dea2993c0831ae344a989d601c09178 NA https://www.encodeproject.org/files/ENCFF689QIJ/@@download/ENCFF689QIJ.bed.gz NA NA NA released s3://encode-public/2020/09/30/e8359d31-b2ab-44d2-8443-75a04a57723e/ENCFF689QIJ.bed.gz https://datasetencode.blob.core.windows.net/dataset/2020/09/30/e8359d31-b2ab-44d2-8443-75a04a57723e/ENCFF689QIJ.bed.gz?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.5.1 GRCh38 released low read depth, control low read depth, mild to moderate bottlenecking, inconsistent platforms insufficient read depth NA
ENCFF217BRB bigBed narrowPeak bigBed narrowPeak pseudoreplicated peaks GRCh38 ENCSR000AKU Histone ChIP-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA H3K4me3-human DNA NA NA NA NA NA 2011-02-10 ENCODE NA see document NA 1, 2 1_1, 2_1 NA NA NA NA NA NA /files/ENCFF689QIJ/ 2416308 ENCODE Processing Pipeline 9cea2c6388211c15a9dd800ed86b8bab NA https://www.encodeproject.org/files/ENCFF217BRB/@@download/ENCFF217BRB.bigBed NA NA NA released s3://encode-public/2020/09/30/c480ed4a-14f0-4188-be39-adb818e5c5fa/ENCFF217BRB.bigBed https://datasetencode.blob.core.windows.net/dataset/2020/09/30/c480ed4a-14f0-4188-be39-adb818e5c5fa/ENCFF217BRB.bigBed?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.5.1 GRCh38 released low read depth, control low read depth, mild to moderate bottlenecking, inconsistent platforms insufficient read depth NA
Code
### locate the metadata table of the default analysis
txt_fname  = "metadata.chipseq_histone.default_analysis.250120.tsv"
txt_folder = TXT_FOLDER_INP
txt_fdiry  = file.path(FD_REF, txt_folder)
txt_fpath  = file.path(txt_fdiry, txt_fname)

### import the table and keep a named copy
dat_metadata_chipseq_v2 = read_tsv(txt_fpath, show_col_types = FALSE)
dat = dat_metadata_chipseq_v2

### report the dimensions and preview the first three rows
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 380  59
File accession File format File type File format type Output type File assembly Experiment accession Assay Donor(s) Biosample term id Biosample term name Biosample type Biosample organism Biosample treatments Biosample treatments amount Biosample treatments duration Biosample genetic modifications methods Biosample genetic modifications categories Biosample genetic modifications targets Biosample genetic modifications gene targets Biosample genetic modifications site coordinates Biosample genetic modifications zygosity Experiment target Library made from Library depleted in Library extraction method Library lysis method Library crosslinking method Library strand specific Experiment date released Project RBNS protein concentration Library fragmentation method Library size range Biological replicate(s) Technical replicate(s) Read length Mapped read length Run type Paired end Paired with Index of Derived from Size Lab md5sum dbxrefs File download URL Genome annotation Platform Controlled by File Status s3_uri Azure URL File analysis title File analysis status Audit WARNING Audit NOT_COMPLIANT Audit ERROR
ENCFF677CFG bam bam NA unfiltered alignments GRCh38 ENCSR000APD Histone ChIP-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA H3K79me2-human DNA NA NA NA NA NA 2011-02-10 ENCODE NA see document 200-600 1 1_1 NA 36 NA NA NA NA /files/ENCFF110MCL/, /files/ENCFF000BYO/ 1454235591 ENCODE Processing Pipeline 4376f4b6f7bc508b2359556756403670 NA https://www.encodeproject.org/files/ENCFF677CFG/@@download/ENCFF677CFG.bam NA NA NA released s3://encode-public/2022/06/16/827755a0-0780-44b9-b857-46fad0a4df36/ENCFF677CFG.bam https://datasetencode.blob.core.windows.net/dataset/2022/06/16/827755a0-0780-44b9-b857-46fad0a4df36/ENCFF677CFG.bam?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.8.0 GRCh38 released low read length, inconsistent platforms control insufficient read depth, insufficient read depth NA
ENCFF112LWK bigWig bigWig NA fold change over control GRCh38 ENCSR000APD Histone ChIP-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA H3K79me2-human DNA NA NA NA NA NA 2011-02-10 ENCODE NA see document 200-600 3 3_1 NA NA NA NA NA NA /files/ENCFF132YSI/, /files/ENCFF226FKB/, /files/ENCFF666YXZ/, /files/ENCFF420ARY/ 484259878 ENCODE Processing Pipeline 106649b0439fa81bd50cbddcbb0e4736 NA https://www.encodeproject.org/files/ENCFF112LWK/@@download/ENCFF112LWK.bigWig NA NA NA released s3://encode-public/2022/06/16/e05ed78a-b5e7-490a-98ae-ba7da1de26b1/ENCFF112LWK.bigWig https://datasetencode.blob.core.windows.net/dataset/2022/06/16/e05ed78a-b5e7-490a-98ae-ba7da1de26b1/ENCFF112LWK.bigWig?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.8.0 GRCh38 released low read length, inconsistent platforms control insufficient read depth, insufficient read depth NA
ENCFF544AVW bigWig bigWig NA fold change over control GRCh38 ENCSR000APD Histone ChIP-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA H3K79me2-human DNA NA NA NA NA NA 2011-02-10 ENCODE NA see document 200-600 1, 2, 3 1_1, 2_1, 3_1 NA NA NA NA NA NA /files/ENCFF711PLM/, /files/ENCFF132YSI/, /files/ENCFF649PBO/, /files/ENCFF226FKB/, /files/ENCFF666YXZ/, /files/ENCFF420ARY/ 927611153 ENCODE Processing Pipeline 61dc50179ae8d880b972c3697a6a2fc2 NA https://www.encodeproject.org/files/ENCFF544AVW/@@download/ENCFF544AVW.bigWig NA NA NA released s3://encode-public/2022/06/16/b460dd47-f161-4ac0-9456-1c7dbd66ad54/ENCFF544AVW.bigWig https://datasetencode.blob.core.windows.net/dataset/2022/06/16/b460dd47-f161-4ac0-9456-1c7dbd66ad54/ENCFF544AVW.bigWig?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.8.0 GRCh38 released low read length, inconsistent platforms control insufficient read depth, insufficient read depth NA

Explore

File type

Code
### cross-tabulate assay against file type (default files)
dat = dat_metadata_chipseq_v1
table(dat[["Assay"]], dat[["File type"]])
                  
                   bed bigBed bigWig
  Histone ChIP-seq  19     19     19
Code
### cross-tabulate assay against file type (default analysis)
dat = dat_metadata_chipseq_v2
table(dat[["Assay"]], dat[["File type"]])
                  
                   bam bed bigBed bigWig
  Histone ChIP-seq  84  88     88    120

Output type

Code
### cross-tabulate file type against output type (default files)
dat = dat_metadata_chipseq_v1
table(dat[["File type"]], dat[["Output type"]])
        
         pseudoreplicated peaks replicated peaks signal p-value
  bed                        14                5              0
  bigBed                     14                5              0
  bigWig                      0                0             19
Code
### cross-tabulate file type against output type (default analysis)
dat = dat_metadata_chipseq_v2
table(dat[["File type"]], dat[["Output type"]])
        
         alignments fold change over control pseudoreplicated peaks
  bam            42                        0                      0
  bed             0                        0                     60
  bigBed          0                        0                     60
  bigWig          0                       60                      0
        
         replicated peaks signal p-value unfiltered alignments
  bam                   0              0                    42
  bed                  28              0                     0
  bigBed               28              0                     0
  bigWig                0             60                     0

Genome assembly

Code
### count files per genome assembly (default files)
dat = dat_metadata_chipseq_v1
table(dat[["File assembly"]])

GRCh38 
    57 
Code
### count files per genome assembly (default analysis)
dat = dat_metadata_chipseq_v2
table(dat[["File assembly"]])

GRCh38 
   380 

Biosample

Code
### count files per biosample (default files)
dat = dat_metadata_chipseq_v1
table(dat[["Biosample term name"]])

K562 
  57 
Code
### count files per biosample (default analysis)
dat = dat_metadata_chipseq_v2
table(dat[["Biosample term name"]])

K562 
 380 

Target

Code
### count files per experiment target (default files)
dat = dat_metadata_chipseq_v1
table(dat[["Experiment target"]])

   H2AFZ-human  H3K27ac-human H3K27me3-human H3K36me3-human  H3K4me1-human 
             3              3              6              6              6 
 H3K4me2-human  H3K4me3-human H3K79me2-human   H3K9ac-human  H3K9me1-human 
             3             12              3              6              3 
 H3K9me3-human H4K20me1-human 
             3              3 
Code
### count files per experiment target (default analysis)
dat = dat_metadata_chipseq_v2
table(dat[["Experiment target"]])

   H2AFZ-human  H3K27ac-human H3K27me3-human H3K36me3-human  H3K4me1-human 
            18             28             46             46             46 
 H3K4me2-human  H3K4me3-human H3K79me2-human   H3K9ac-human  H3K9me1-human 
            18             72             28             36              6 
 H3K9me3-human H4K20me1-human 
            18             18 

Arrange metadata tables

Code
### Simplify an ENCODE metadata table.
### Keeps Assay and md5sum, renames the ENCODE columns listed below to short
### identifiers, removes the first "-human" match from the experiment target,
### and derives the file name from the download URL.
### Input : dat_input - table containing the ENCODE columns referenced below
### Output: table with 13 columns ordered from Assay to File_URL
fun = function(dat_input){
    dat_input %>%
        ### pick the needed columns and rename them in one step
        dplyr::select(
            Assay,
            Index_Experiment = `Experiment accession`,
            Index_File       = `File accession`,
            File_Format      = `File format`,
            File_Type        = `File type`,
            Output_Type      = `Output type`,
            Genome           = `File assembly`,
            Target           = `Experiment target`,
            Bio_Replicates   = `Biological replicate(s)`,
            Analysis         = `File analysis title`,
            md5sum,
            File_URL         = `File download URL`
        ) %>%
        ### derive the target name and the file name
        dplyr::mutate(
            Target    = str_remove(Target, "-human"),
            File_Name = basename(File_URL)
        ) %>%
        ### place File_Name right before File_URL
        dplyr::select(Assay:md5sum, File_Name, File_URL)
}
Code
### simplify the metadata of the default files and keep a named copy
dat_metadata_chipseq_simplify_v1 = fun(dat_metadata_chipseq_v1)
dat = dat_metadata_chipseq_simplify_v1

### report the dimensions and preview the first three rows
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 57 13
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000AKU ENCFF767UON bigWig bigWig signal p-value GRCh38 H3K4me3 1, 2 ENCODE4 v1.5.1 GRCh38 4c102d45be8326062895ed0a03d4ded7 ENCFF767UON.bigWig https://www.encodeproject.org/files/ENCFF767UON/@@download/ENCFF767UON.bigWig
Histone ChIP-seq ENCSR000AKU ENCFF689QIJ bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K4me3 1, 2 ENCODE4 v1.5.1 GRCh38 5dea2993c0831ae344a989d601c09178 ENCFF689QIJ.bed.gz https://www.encodeproject.org/files/ENCFF689QIJ/@@download/ENCFF689QIJ.bed.gz
Histone ChIP-seq ENCSR000AKU ENCFF217BRB bigBed narrowPeak bigBed pseudoreplicated peaks GRCh38 H3K4me3 1, 2 ENCODE4 v1.5.1 GRCh38 9cea2c6388211c15a9dd800ed86b8bab ENCFF217BRB.bigBed https://www.encodeproject.org/files/ENCFF217BRB/@@download/ENCFF217BRB.bigBed
Code
### simplify the metadata of the default analysis and keep a named copy
dat_metadata_chipseq_simplify_v2 = fun(dat_metadata_chipseq_v2)
dat = dat_metadata_chipseq_simplify_v2

### report the dimensions and preview the first three rows
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 380  13
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000APD ENCFF677CFG bam bam unfiltered alignments GRCh38 H3K79me2 1 ENCODE4 v1.8.0 GRCh38 4376f4b6f7bc508b2359556756403670 ENCFF677CFG.bam https://www.encodeproject.org/files/ENCFF677CFG/@@download/ENCFF677CFG.bam
Histone ChIP-seq ENCSR000APD ENCFF112LWK bigWig bigWig fold change over control GRCh38 H3K79me2 3 ENCODE4 v1.8.0 GRCh38 106649b0439fa81bd50cbddcbb0e4736 ENCFF112LWK.bigWig https://www.encodeproject.org/files/ENCFF112LWK/@@download/ENCFF112LWK.bigWig
Histone ChIP-seq ENCSR000APD ENCFF544AVW bigWig bigWig fold change over control GRCh38 H3K79me2 1, 2, 3 ENCODE4 v1.8.0 GRCh38 61dc50179ae8d880b972c3697a6a2fc2 ENCFF544AVW.bigWig https://www.encodeproject.org/files/ENCFF544AVW/@@download/ENCFF544AVW.bigWig

Prepare download files

NarrowPeak files

Code
### keep the narrowPeak BED files among the default files
dat_metadata_chipseq_bed_narrowpeak = dplyr::filter(
    dat_metadata_chipseq_simplify_v1,
    File_Type == "bed" & File_Format == "bed narrowPeak"
)
dat = dat_metadata_chipseq_bed_narrowpeak

### report the dimensions and preview the first three rows
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 19 13
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000AKU ENCFF689QIJ bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K4me3 1, 2 ENCODE4 v1.5.1 GRCh38 5dea2993c0831ae344a989d601c09178 ENCFF689QIJ.bed.gz https://www.encodeproject.org/files/ENCFF689QIJ/@@download/ENCFF689QIJ.bed.gz
Histone ChIP-seq ENCSR000AKQ ENCFF323WOT bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K27me3 1, 2, 3 ENCODE4 v1.8.0 GRCh38 4422969d0b63260e2fcb83e10fdcc02f ENCFF323WOT.bed.gz https://www.encodeproject.org/files/ENCFF323WOT/@@download/ENCFF323WOT.bed.gz
Histone ChIP-seq ENCSR000EWC ENCFF540NGG bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K4me1 1, 2 ENCODE4 v1.5.1 GRCh38 63db47e5b9b98dbebff2ce20df066106 ENCFF540NGG.bed.gz https://www.encodeproject.org/files/ENCFF540NGG/@@download/ENCFF540NGG.bed.gz

Signal p-value

Code
### keep the signal p-value bigWig files among the default files
dat_metadata_chipseq_bigwig_pvalue = dplyr::filter(
    dat_metadata_chipseq_simplify_v1,
    File_Type == "bigWig" & Output_Type == "signal p-value"
)
dat = dat_metadata_chipseq_bigwig_pvalue

### report the dimensions and preview the first three rows
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 19 13
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000AKU ENCFF767UON bigWig bigWig signal p-value GRCh38 H3K4me3 1, 2 ENCODE4 v1.5.1 GRCh38 4c102d45be8326062895ed0a03d4ded7 ENCFF767UON.bigWig https://www.encodeproject.org/files/ENCFF767UON/@@download/ENCFF767UON.bigWig
Histone ChIP-seq ENCSR000AKQ ENCFF582IMB bigWig bigWig signal p-value GRCh38 H3K27me3 1, 2, 3 ENCODE4 v1.8.0 GRCh38 2ca48f44075eef7118a387260f2f95b9 ENCFF582IMB.bigWig https://www.encodeproject.org/files/ENCFF582IMB/@@download/ENCFF582IMB.bigWig
Histone ChIP-seq ENCSR000EWC ENCFF287LBI bigWig bigWig signal p-value GRCh38 H3K4me1 1, 2 ENCODE4 v1.5.1 GRCh38 28df1a757a2e5517209c10d57f0ce03e ENCFF287LBI.bigWig https://www.encodeproject.org/files/ENCFF287LBI/@@download/ENCFF287LBI.bigWig

Signal fold change

Count the replicates for each file

Code
### helper function
### Count the comma-separated entries in each element of a vector
### (e.g. "1, 2, 3" -> 3 and "2" -> 1).
### Input : txt - vector of comma-separated strings
### Output: integer vector with one count per element of txt;
###         integer(0) when txt is empty, so the result can always be
###         used as a column. An NA element counts as one entry and an
###         empty string counts as zero entries.
fun = function(txt){
    ### split on literal commas; as.character guards against non-character input
    lst = strsplit(as.character(txt), ",", fixed = TRUE)
    ### lengths() returns an integer vector even when the list is empty
    vec = lengths(lst, use.names = FALSE)
    return(vec)
}

### restrict to the fold-change bigWig files, count the replicates listed
### for each file, and keep only the identifying columns
dat_count_replicates = dat_metadata_chipseq_simplify_v2 %>%
    dplyr::filter(File_Type == "bigWig" & Output_Type == "fold change over control") %>%
    dplyr::mutate(Count_Replicates = fun(Bio_Replicates)) %>%
    dplyr::select(Index_Experiment, Index_File, Bio_Replicates, Count_Replicates)
dat = dat_count_replicates

### report the dimensions and preview the first rows
print(dim(dat))
fun_display_table(head(dat))
[1] 60  4
Index_Experiment Index_File Bio_Replicates Count_Replicates
ENCSR000APD ENCFF112LWK 3 1
ENCSR000APD ENCFF544AVW 1, 2, 3 3
ENCSR000APD ENCFF767ECL 1 1
ENCSR000APD ENCFF457RYD 2 1
ENCSR000AKV ENCFF286WRJ 1, 2 2
ENCSR000AKV ENCFF236VCK 1 1

Get the metadata table of the files with the maximum number of replicates

Code
### per experiment, keep the file(s) covering the most replicates
### (slice_max keeps ties by default, so tied files are all retained)
dat = dat_count_replicates %>%
    dplyr::group_by(Index_Experiment) %>%
    dplyr::slice_max(Count_Replicates)
vec_txt_file_subset = dat[["Index_File"]]

### subset the simplified analysis metadata to the selected files
dat_metadata_chipseq_bigwig_fold_change = dplyr::filter(
    dat_metadata_chipseq_simplify_v2,
    Index_File %in% vec_txt_file_subset
)
dat = dat_metadata_chipseq_bigwig_fold_change

### report the dimensions and preview the first three rows
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 19 13
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000APD ENCFF544AVW bigWig bigWig fold change over control GRCh38 H3K79me2 1, 2, 3 ENCODE4 v1.8.0 GRCh38 61dc50179ae8d880b972c3697a6a2fc2 ENCFF544AVW.bigWig https://www.encodeproject.org/files/ENCFF544AVW/@@download/ENCFF544AVW.bigWig
Histone ChIP-seq ENCSR000AKV ENCFF286WRJ bigWig bigWig fold change over control GRCh38 H3K9ac 1, 2 ENCODE4 v1.6.1 GRCh38 ccd7b8c413fdb998ffd799ec52dd5098 ENCFF286WRJ.bigWig https://www.encodeproject.org/files/ENCFF286WRJ/@@download/ENCFF286WRJ.bigWig
Histone ChIP-seq ENCSR000APC ENCFF621DJP bigWig bigWig fold change over control GRCh38 H2AFZ 1, 2 ENCODE4 v1.6.0 GRCh38 3492c0e4a64e29231558f9e1e2fe520e ENCFF621DJP.bigWig https://www.encodeproject.org/files/ENCFF621DJP/@@download/ENCFF621DJP.bigWig

Check results

Code
### tabulate the output types of the selected fold-change files
dat = dat_metadata_chipseq_bigwig_fold_change
table(dat[["Output_Type"]])

fold change over control 
                      19 

Save results

Check equal size

Code
### gather the three download tables under their category names
lst = list(
    "region_narrowPeak"  = dat_metadata_chipseq_bed_narrowpeak,
    "signal_fold_change" = dat_metadata_chipseq_bigwig_fold_change,
    "signal_pvalue"      = dat_metadata_chipseq_bigwig_pvalue
)

### print the category name followed by the dimensions of its table
for (idx in names(lst)){
    dat_metadata = lst[[idx]]
    dat          = dat_metadata

    cat(idx, "\n")
    print(dim(dat))
}
region_narrowPeak 
[1] 19 13
signal_fold_change 
[1] 19 13
signal_pvalue 
[1] 19 13

Check that the tables cover the same experiments

Code
### gather the three download tables under their category names
lst = list(
    "region_narrowPeak"  = dat_metadata_chipseq_bed_narrowpeak,
    "signal_fold_change" = dat_metadata_chipseq_bigwig_fold_change,
    "signal_pvalue"      = dat_metadata_chipseq_bigwig_pvalue
)

### extract the sorted experiment accessions of each table
lst = lapply(lst, function(dat){
    sort(dat$Index_Experiment)
})

### identical() compares the vectors as a whole, so tables with a different
### number of experiments yield FALSE instead of being recycled by `==`
print(identical(lst[[1]], lst[[2]]))
print(identical(lst[[1]], lst[[3]]))
[1] TRUE
[1] TRUE

Export merged metadata

Code
### gather the three download tables under their category names
lst_dat_metadata = list(
    "region_narrowPeak"  = dat_metadata_chipseq_bed_narrowpeak,
    "signal_fold_change" = dat_metadata_chipseq_bigwig_fold_change,
    "signal_pvalue"      = dat_metadata_chipseq_bigwig_pvalue
)

### stack them into a single table and keep a named copy
dat_metadata_chipseq_merge = bind_rows(lst_dat_metadata)
dat = dat_metadata_chipseq_merge

### report the dimensions and preview the first three rows
print(dim(dat))
fun_display_table(head(dat, 3))
[1] 57 13
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Target Bio_Replicates Analysis md5sum File_Name File_URL
Histone ChIP-seq ENCSR000AKU ENCFF689QIJ bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K4me3 1, 2 ENCODE4 v1.5.1 GRCh38 5dea2993c0831ae344a989d601c09178 ENCFF689QIJ.bed.gz https://www.encodeproject.org/files/ENCFF689QIJ/@@download/ENCFF689QIJ.bed.gz
Histone ChIP-seq ENCSR000AKQ ENCFF323WOT bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K27me3 1, 2, 3 ENCODE4 v1.8.0 GRCh38 4422969d0b63260e2fcb83e10fdcc02f ENCFF323WOT.bed.gz https://www.encodeproject.org/files/ENCFF323WOT/@@download/ENCFF323WOT.bed.gz
Histone ChIP-seq ENCSR000EWC ENCFF540NGG bed narrowPeak bed pseudoreplicated peaks GRCh38 H3K4me1 1, 2 ENCODE4 v1.5.1 GRCh38 63db47e5b9b98dbebff2ce20df066106 ENCFF540NGG.bed.gz https://www.encodeproject.org/files/ENCFF540NGG/@@download/ENCFF540NGG.bed.gz
Code
### set directory
txt_fdiry = file.path(FD_DAT, "external", TXT_FOLDER_OUT)
txt_fname = "metadata.merged.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

### make sure the output folder exists before writing;
### nothing happens when it is already there
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)

### write table and show
dat = dat_metadata_chipseq_merge
write_tsv(dat, txt_fpath)
cat("Save table:", txt_fpath, "\n")
Save table: /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/metadata.merged.tsv 
Code
### save a copy of the merged metadata in the current working directory
txt_fname = "table.chipseq_histone.metadata.files.tsv"
txt_fdiry = "."
txt_fpath = file.path(txt_fdiry, txt_fname)

### write the table and report where it was saved
dat = dat_metadata_chipseq_merge
write_tsv(dat, txt_fpath)
cat("Save table:", txt_fpath, "\n")
Save table: ./table.chipseq_histone.metadata.files.tsv 

Export the metadata table, file list, and checksums for each file category (split for parallel execution)

Code
### combine multiple metadata
lst = list(
    "region_narrowPeak"  = dat_metadata_chipseq_bed_narrowpeak,
    "signal_fold_change" = dat_metadata_chipseq_bigwig_fold_change, 
    "signal_pvalue"      = dat_metadata_chipseq_bigwig_pvalue
)

### loop through each metadata: every category gets its own folder holding
### the list of download URLs, the md5 checksums, and the metadata table
for (idx in names(lst)){
    ### show progress
    cat(idx, "\n")
    
    ### create folder (including missing parents) without going through the
    ### shell, so the path needs no quoting; no-op when it already exists
    txt_fdiry = file.path(FD_DAT, "external", TXT_FOLDER_OUT, idx)
    dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)
    
    ### extract data
    dat_metadata = lst[[idx]]

    ### get file url
    dat_download_furl = dat_metadata %>% dplyr::select(File_URL)
    
    ### get md5sum for each file
    dat_download_md5sum = dat_metadata %>% dplyr::select(md5sum, File_Name)

    ### write file list (one URL per line, no header)
    txt_fname = "files.txt"
    txt_fpath = file.path(txt_fdiry, txt_fname)
    
    dat = dat_download_furl
    write_tsv(dat, txt_fpath, col_names = FALSE)

    ### write checksum file (checksum and file name, tab separated, no header)
    txt_fname = "checksum_md5sum.txt"
    txt_fpath = file.path(txt_fdiry, txt_fname)

    dat = dat_download_md5sum
    write_tsv(dat, txt_fpath, col_names = FALSE)

    ### write metadata info (with header)
    txt_fname = "metadata.tsv"
    txt_fpath = file.path(txt_fdiry, txt_fname)

    dat = dat_metadata
    write_tsv(dat, txt_fpath)  
}
region_narrowPeak 
signal_fold_change 
signal_pvalue 
Code
### show the merged metadata as an interactive, searchable table
DT::datatable(
    dat_metadata_chipseq_merge,
    options = list(
        pageLength = 10,               # rows per page
        lengthMenu = c(5, 10, 15, 30), # page-size choices; includes the initial page length
        searchHighlight = TRUE
    ),
    class = "stripe hover",
    rownames = FALSE
)