Prepare ENCODE Chromatin State 01

Generate a shell script to download the ENCODE chromatin state data

Set environment

Code
### load the project configuration quietly: messages and warnings raised while sourcing are hidden
suppressWarnings(suppressMessages(source("../run_config_project_sing.R")))

### print the environment summary (show_env is defined outside this notebook)
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
### folder names used below: TXT_FOLDER_REF is joined to FD_REF (reference tables),
### TXT_FOLDER_OUT is joined to FD_DAT/external (generated download files)
TXT_FOLDER_OUT = "encode_chromatin_states"
TXT_FOLDER_REF = "encode_chromatin_states"
Code
### list the files available in the reference folder, one name per line
txt_fdiry = file.path(FD_REF, TXT_FOLDER_REF)
vec = list.files(txt_fdiry)
for (txt in vec) {
    cat(txt, "\n")
}
ccres_v4.silencer.rest.tsv 
ccres_v4.silencer.starr.tsv 
ENCODE_K562_hg38_chromatin_states.tsv 
Human epigenomes with ChromHMM state (DAC, Kaili Fan).xlsx 
K562.ENCSR365YNI.ENCAN395TNA.metadata.tsv 
K562.ENCSR913HQX.ENCAN130HDM.metadata.tsv 

Import metadata from reference file

Code
### locate the table that lists the selected ENCODE files
txt_fname  = "ENCODE_K562_hg38_chromatin_states.tsv"
txt_folder = TXT_FOLDER_REF
txt_fdiry  = file.path(FD_REF, txt_folder)
txt_fpath  = file.path(txt_fdiry, txt_fname)

### import the table and keep it as the list of selected files
dat_metadata_selected = read_tsv(txt_fpath, show_col_types = FALSE)
dat = dat_metadata_selected

### report the table dimensions and render the table
print(dim(dat_metadata_selected))
fun_display_table(dat_metadata_selected)
[1]  2 11
Assay Biosample Index_Experiment Index_Process Index_File File_Type Output_Type Genome Encyclopedia version Lab Description
cCREs K562 ENCSR913HQX Lab custom GRCh38 (ENCAN130HDM) processed data ENCFF286VQG bed bed9+ candidate Cis-Regulatory Elements hg38 ENCODE v4 Zhiping Weng, Umass candidate regulatory elements for GRCh38 in K562
ChromHMM K562 ENCSR365YNI Lab custom GRCh38 (ENCAN395TNA) processed data ENCFF106BGJ bed bed9 semi-automated genome annotation hg38 ENCODE v4 Zhiping Weng, Umass ChromHMM 15-state model of K562
Code
### collect every metadata export (*metadata.tsv) found in the reference folder
txt_folder = TXT_FOLDER_REF
txt_fdiry  = file.path(FD_REF, txt_folder)
txt_fname  = "*metadata.tsv"
txt_fglob  = file.path(txt_fdiry, txt_fname)
vec_txt_fpath = Sys.glob(txt_fglob)

### read each file and stack the tables by row
lst = lapply(vec_txt_fpath, read_tsv, show_col_types = FALSE)
dat = bind_rows(lst)

### keep the combined metadata, report its dimensions, and preview the first rows
dat_metadata_import = dat
print(dim(dat_metadata_import))
fun_display_table(head(dat_metadata_import))
[1]  4 32
File accession File format Output type Assay term name Dataset accession Annotation type Software used Encyclopedia Version Biosample term id Biosample term name Biosample type Life stage Age Age units Organism Targets Dataset date released Project Lab md5sum dbxrefs File download URL Assembly Controlled by File Status Derived from S3 URL Azure URL Size Audit WARNING Audit NOT_COMPLIANT Audit ERROR
ENCFF106BGJ bed bed9 semi-automated genome annotation NA ENCSR365YNI chromatin state NA ENCODE v4 EFO:0002067 K562 cell line NA NA NA Homo sapiens NA 2023-05-18 ENCODE Zhiping Weng, UMass 354acb4f74c90c666679706d7685d271 NA https://www.encodeproject.org/files/ENCFF106BGJ/@@download/ENCFF106BGJ.bed.gz GRCh38 NA released /files/ENCFF747HEB/, /files/ENCFF181ANT/, /files/ENCFF121RHF/, /files/ENCFF907MNY/, /files/ENCFF089KHK/, /files/ENCFF229LAF/, /files/ENCFF816ECC/, /files/ENCFF035SOZ/, /files/ENCFF652TXG/, /files/ENCFF508LLH/, /files/ENCFF155UQU/, /files/ENCFF104THG/ https://encode-public.s3.amazonaws.com/2023/04/10/7afe8ded-9f34-45d5-9668-929479c00348/ENCFF106BGJ.bed.gz https://datasetencode.blob.core.windows.net/dataset/2023/04/10/7afe8ded-9f34-45d5-9668-929479c00348/ENCFF106BGJ.bed.gz?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D 4013023 NA NA NA
ENCFF644IFL bigBed bed9 semi-automated genome annotation NA ENCSR365YNI chromatin state NA ENCODE v4 EFO:0002067 K562 cell line NA NA NA Homo sapiens NA 2023-05-18 ENCODE Zhiping Weng, UMass 25dbef1ad57395142d47714109a9f293 NA https://www.encodeproject.org/files/ENCFF644IFL/@@download/ENCFF644IFL.bigBed GRCh38 NA released /files/ENCFF106BGJ/ https://encode-public.s3.amazonaws.com/2023/04/10/158df12e-e8e5-4e08-99a4-426715054cea/ENCFF644IFL.bigBed https://datasetencode.blob.core.windows.net/dataset/2023/04/10/158df12e-e8e5-4e08-99a4-426715054cea/ENCFF644IFL.bigBed?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D 6228846 NA NA NA
ENCFF286VQG bed bed9+ candidate Cis-Regulatory Elements NA ENCSR913HQX candidate Cis-Regulatory Elements NA ENCODE v4 EFO:0002067 K562 cell line NA NA NA Homo sapiens NA 2022-12-20 ENCODE Zhiping Weng, UMass 344a9e65f749a3e5703de96a3577101b NA https://www.encodeproject.org/files/ENCFF286VQG/@@download/ENCFF286VQG.bed.gz GRCh38 NA released /files/ENCFF414OGC/, /files/ENCFF806YEZ/, /files/ENCFF849TDM/, /files/ENCFF736UDR/ https://encode-public.s3.amazonaws.com/2022/11/23/179cba53-f3d4-47af-af56-c4e5a3cabeac/ENCFF286VQG.bed.gz https://datasetencode.blob.core.windows.net/dataset/2022/11/23/179cba53-f3d4-47af-af56-c4e5a3cabeac/ENCFF286VQG.bed.gz?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D 32698783 NA NA NA
ENCFF144YZB bigBed bed9+ candidate Cis-Regulatory Elements NA ENCSR913HQX candidate Cis-Regulatory Elements NA ENCODE v4 EFO:0002067 K562 cell line NA NA NA Homo sapiens NA 2022-12-20 ENCODE Zhiping Weng, UMass d0d6e398da1d883418f6732869b86f9f NA https://www.encodeproject.org/files/ENCFF144YZB/@@download/ENCFF144YZB.bigBed GRCh38 NA released /files/ENCFF286VQG/ https://encode-public.s3.amazonaws.com/2022/11/23/03b1647b-8c32-4708-bcd0-3853d49fb395/ENCFF144YZB.bigBed https://datasetencode.blob.core.windows.net/dataset/2022/11/23/03b1647b-8c32-4708-bcd0-3853d49fb395/ENCFF144YZB.bigBed?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D 72324869 NA NA NA

Explore table

Check genome assembly

Code
### count the imported files per genome assembly
dat = dat_metadata_import
table(dat[["Assembly"]])

GRCh38 
     4 

Check biosample

Code
### count the imported files per biosample
dat = dat_metadata_import
table(dat[["Biosample term name"]])

K562 
   4 

Check that every selected file is listed in the imported metadata

Code
### TRUE when every selected file accession also appears in the imported metadata
vec1 = dat_metadata_selected$Index_File
vec2 = dat_metadata_import$`File accession`
all(is.element(vec1, vec2))
TRUE

Arrange metadata tables

Helper function

Code
### Earlier version of fun_simplify_table: adds short column names to a metadata
### table and keeps a fixed set of columns.
### NOTE(review): this definition is replaced by the definition with the same name
### in the next cell. It reads `File type`, Index_Process, Assay, Index_Experiment,
### and Genome, none of which appear among the columns of the imported metadata
### printed above, so it presumably targets a different table layout -- confirm
### before reuse. Mappings for experiment accession, file assembly, target, and
### biological replicates were disabled (commented out) in this version.
fun_simplify_table = function(dat){

    res = dat %>%
        ### add the short column names
        dplyr::mutate(
            Index_Dataset = `Dataset accession`,
            Index_File    = `File accession`,
            File_Format   = `File format`,
            File_Type     = `File type`,
            Output_Type   = `Output type`,
            Analysis      = Index_Process,
            Derived_From  = `Derived from`,
            File_Name     = basename(`File download URL`),
            File_URL      = `File download URL`
        ) %>%
        ### keep the needed columns in this order
        dplyr::select(
            Assay, Index_Experiment, Index_File,
            File_Format, File_Type, Output_Type,
            Genome, Analysis, Derived_From,
            md5sum, File_Name, File_URL
        )

    return(res)
}
Code
### Reduce a metadata table to the identifiers and download information.
### Input : dat - table with the columns `Dataset accession`, `File accession`,
###               `Derived from`, `File download URL`, and md5sum
### Output: table with the columns Index_Experiment, Index_File, Derived_From,
###         md5sum, File_Name (base name of the download URL), and File_URL
fun_simplify_table = function(dat){

    ### columns to keep, in output order
    vec_txt_cname = c(
        "Index_Experiment",
        "Index_File",
        "Derived_From",
        "md5sum",
        "File_Name",
        "File_URL"
    )

    ### add the short column names
    res = dplyr::mutate(
        dat,
        Index_Experiment = `Dataset accession`,
        Index_File       = `File accession`,
        Derived_From     = `Derived from`,
        File_Name        = basename(`File download URL`),
        File_URL         = `File download URL`
    )

    ### drop everything else
    res = dplyr::select(res, dplyr::all_of(vec_txt_cname))
    return(res)
}

Simplify the metadata table

Code
### arrange and simplify the table
dat = fun_simplify_table(dat_metadata_import)

### preview the first rows of the simplified table
head(dat)
A tibble: 4 × 6
Index_Experiment Index_File Derived_From md5sum File_Name File_URL
<chr> <chr> <chr> <chr> <chr> <chr>
ENCSR365YNI ENCFF106BGJ /files/ENCFF747HEB/, /files/ENCFF181ANT/, /files/ENCFF121RHF/, /files/ENCFF907MNY/, /files/ENCFF089KHK/, /files/ENCFF229LAF/, /files/ENCFF816ECC/, /files/ENCFF035SOZ/, /files/ENCFF652TXG/, /files/ENCFF508LLH/, /files/ENCFF155UQU/, /files/ENCFF104THG/ 354acb4f74c90c666679706d7685d271 ENCFF106BGJ.bed.gz https://www.encodeproject.org/files/ENCFF106BGJ/@@download/ENCFF106BGJ.bed.gz
ENCSR365YNI ENCFF644IFL /files/ENCFF106BGJ/ 25dbef1ad57395142d47714109a9f293 ENCFF644IFL.bigBed https://www.encodeproject.org/files/ENCFF644IFL/@@download/ENCFF644IFL.bigBed
ENCSR913HQX ENCFF286VQG /files/ENCFF414OGC/, /files/ENCFF806YEZ/, /files/ENCFF849TDM/, /files/ENCFF736UDR/ 344a9e65f749a3e5703de96a3577101b ENCFF286VQG.bed.gz https://www.encodeproject.org/files/ENCFF286VQG/@@download/ENCFF286VQG.bed.gz
ENCSR913HQX ENCFF144YZB /files/ENCFF286VQG/ d0d6e398da1d883418f6732869b86f9f ENCFF144YZB.bigBed https://www.encodeproject.org/files/ENCFF144YZB/@@download/ENCFF144YZB.bigBed
Code
### arrange and simplify the table
dat = fun_simplify_table(dat_metadata_import)

### keep only the selected files: each row of the selected table is matched to
### its simplified metadata by experiment and file accession
vec = c("Index_Experiment", "Index_File")
dat = dplyr::left_join(x = dat_metadata_selected, y = dat, by = vec)

### store the joined table, report its dimensions, and render it
dat_metadata_simplify = dat
print(dim(dat_metadata_simplify))
fun_display_table(dat_metadata_simplify)
[1]  2 15
Assay Biosample Index_Experiment Index_Process Index_File File_Type Output_Type Genome Encyclopedia version Lab Description Derived_From md5sum File_Name File_URL
cCREs K562 ENCSR913HQX Lab custom GRCh38 (ENCAN130HDM) processed data ENCFF286VQG bed bed9+ candidate Cis-Regulatory Elements hg38 ENCODE v4 Zhiping Weng, Umass candidate regulatory elements for GRCh38 in K562 /files/ENCFF414OGC/, /files/ENCFF806YEZ/, /files/ENCFF849TDM/, /files/ENCFF736UDR/ 344a9e65f749a3e5703de96a3577101b ENCFF286VQG.bed.gz https://www.encodeproject.org/files/ENCFF286VQG/@@download/ENCFF286VQG.bed.gz
ChromHMM K562 ENCSR365YNI Lab custom GRCh38 (ENCAN395TNA) processed data ENCFF106BGJ bed bed9 semi-automated genome annotation hg38 ENCODE v4 Zhiping Weng, Umass ChromHMM 15-state model of K562 /files/ENCFF747HEB/, /files/ENCFF181ANT/, /files/ENCFF121RHF/, /files/ENCFF907MNY/, /files/ENCFF089KHK/, /files/ENCFF229LAF/, /files/ENCFF816ECC/, /files/ENCFF035SOZ/, /files/ENCFF652TXG/, /files/ENCFF508LLH/, /files/ENCFF155UQU/, /files/ENCFF104THG/ 354acb4f74c90c666679706d7685d271 ENCFF106BGJ.bed.gz https://www.encodeproject.org/files/ENCFF106BGJ/@@download/ENCFF106BGJ.bed.gz

Prepare download files

Helper function

Code
### Map an assay name to a short label via fun_str_map_detect, pairing
### "ATAC-seq" with "ATAC" and "DNase-seq" with "DNase"; the input itself is
### supplied as `.default`.
### NOTE(review): fun_str_map_detect is defined outside this notebook -- confirm
### its matching rules there.
fun_map_assay_label = function(txt){
    vec_txt_from = c("ATAC-seq", "DNase-seq")
    vec_txt_to   = c("ATAC",     "DNase")
    txt_label    = fun_str_map_detect(txt, vec_txt_from, vec_txt_to, .default=txt)
    return(txt_label)
}
Code
### Map a file type to a file extension via fun_str_map_detect, pairing
### "bigWig" with "bw" and "bed narrowPeak" with "bed.gz"; the input itself is
### supplied as `.default`.
### NOTE(review): fun_str_map_detect is defined outside this notebook -- confirm
### its matching rules there.
fun_map_file_ext = function(txt){
    vec_txt_from = c("bigWig", "bed narrowPeak")
    vec_txt_to   = c("bw",     "bed.gz")
    txt_ext      = fun_str_map_detect(txt, vec_txt_from, vec_txt_to, .default=txt)
    return(txt_ext)
}

Rename filename

Code
### replace File_Name with a descriptive, dot-separated name:
### K562.hg38.<experiment accession>.<file accession>.<assay>.bed.gz
dat = dplyr::mutate(
    dat_metadata_simplify,
    File_Name = paste("K562", "hg38", Index_Experiment, Index_File, Assay, "bed.gz", sep = ".")
)

### store the table with the new file names and render it
dat_metadata_arrange = dat
fun_display_table(dat_metadata_arrange)
Assay Biosample Index_Experiment Index_Process Index_File File_Type Output_Type Genome Encyclopedia version Lab Description Derived_From md5sum File_Name File_URL
cCREs K562 ENCSR913HQX Lab custom GRCh38 (ENCAN130HDM) processed data ENCFF286VQG bed bed9+ candidate Cis-Regulatory Elements hg38 ENCODE v4 Zhiping Weng, Umass candidate regulatory elements for GRCh38 in K562 /files/ENCFF414OGC/, /files/ENCFF806YEZ/, /files/ENCFF849TDM/, /files/ENCFF736UDR/ 344a9e65f749a3e5703de96a3577101b K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.bed.gz https://www.encodeproject.org/files/ENCFF286VQG/@@download/ENCFF286VQG.bed.gz
ChromHMM K562 ENCSR365YNI Lab custom GRCh38 (ENCAN395TNA) processed data ENCFF106BGJ bed bed9 semi-automated genome annotation hg38 ENCODE v4 Zhiping Weng, Umass ChromHMM 15-state model of K562 /files/ENCFF747HEB/, /files/ENCFF181ANT/, /files/ENCFF121RHF/, /files/ENCFF907MNY/, /files/ENCFF089KHK/, /files/ENCFF229LAF/, /files/ENCFF816ECC/, /files/ENCFF035SOZ/, /files/ENCFF652TXG/, /files/ENCFF508LLH/, /files/ENCFF155UQU/, /files/ENCFF104THG/ 354acb4f74c90c666679706d7685d271 K562.hg38.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz https://www.encodeproject.org/files/ENCFF106BGJ/@@download/ENCFF106BGJ.bed.gz

Check results

Code
### show the accessions next to the new file names
dat = dplyr::select(dat_metadata_arrange, Index_Experiment, Index_File, File_Name)
fun_display_table(dat)
Index_Experiment Index_File File_Name
ENCSR913HQX ENCFF286VQG K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.bed.gz
ENCSR365YNI ENCFF106BGJ K562.hg38.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz

Checksum table

Code
### pair the md5sum of each file with its new file name
dat = dplyr::select(dat_metadata_arrange, md5sum, File_Name)

### store the checksum table and render it
dat_download_checksum = dat
fun_display_table(dat_download_checksum)
md5sum File_Name
344a9e65f749a3e5703de96a3577101b K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.bed.gz
354acb4f74c90c666679706d7685d271 K562.hg38.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz

Generate download scripts

wget -O FILE URL
Code
### build one wget command per file: the download is saved under File_Name and
### wget messages are appended to run_download.log.txt
dat = dat_metadata_arrange %>%
    dplyr::mutate(
        CMD = paste(
            "wget",
            "--append-output=run_download.log.txt",
            "-O",
            File_Name,
            File_URL
        )
    ) %>%
    dplyr::select(CMD)

### put the command that empties the log in the first row; the column name
### holds the shebang line
dat = rbind('echo -n "" > run_download.log.txt', dat)
colnames(dat) = "#!/bin/bash"

### store the script table and render it
dat_download_script = dat
fun_display_table(dat_download_script)
#!/bin/bash
echo -n "" > run_download.log.txt
wget --append-output=run_download.log.txt -O K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.bed.gz https://www.encodeproject.org/files/ENCFF286VQG/@@download/ENCFF286VQG.bed.gz
wget --append-output=run_download.log.txt -O K562.hg38.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz https://www.encodeproject.org/files/ENCFF106BGJ/@@download/ENCFF106BGJ.bed.gz

Save results

Code
### set output path
txt_folder = TXT_FOLDER_OUT
txt_fdiry  = file.path(FD_DAT, "external", txt_folder)

### create the output directory if it does not exist
### recursive = TRUE also creates missing parent folders (e.g. "external");
### showWarnings = FALSE silences the "already exists" warning but would also
### hide a real failure, so confirm the directory is there before writing
dir.create(txt_fdiry, showWarnings = FALSE, recursive = TRUE)
if (!dir.exists(txt_fdiry)) {
    stop("Cannot create output directory: ", txt_fdiry, call. = FALSE)
}

### write download script
### the table is written with its header, i.e. its single column name comes first
txt_fname  = "run_download_files.sh"
txt_fpath  = file.path(txt_fdiry, txt_fname)

dat = dat_download_script
write_tsv(dat, txt_fpath)

### write checksum table (tab-separated, without a header line)
txt_fname  = "checksum_md5sum.txt"
txt_fpath  = file.path(txt_fdiry, txt_fname)

dat = dat_download_checksum
write_tsv(dat, txt_fpath, col_names = FALSE)