Prepare Hi-C data 01 (in situ Hi-C)

Generate a shell script that downloads the data

Set environment

Code
### load the project configuration quietly (messages and warnings hidden),
### then print the resolved environment
suppressWarnings(
    suppressMessages(
        source("../run_config_project_sing.R")
    )
)
show_env()
You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 

Import metadata from reference file

Code
### list the files in the ENCODE Hi-C reference folder
txt = file.path(FD_REF, "encode_hic")
list.files(path = txt)
'ENCODE_K562_hg38_hic.tsv'
Code
### locate the metadata table inside the reference folder
txt_fdiry = file.path(FD_REF, "encode_hic")
txt_fname = "ENCODE_K562_hg38_hic.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

### import the table without printing the column specification
dat_metadata_import = read_tsv(file = txt_fpath, show_col_types = FALSE)

### keep a working copy and display it
dat = dat_metadata_import
fun_display_table(dat)
Assay Biosample Index_Experiment Index_Process Index_File File_Type Output_Type Genome Lab
HiC (in situ Hi-C) K562 ENCSR545YBD ENCODE4 v1.15.0 GRCh38 (ENCAN133QZO) processed data ENCFF616PUW hic mapping quality thresholded contact matrix hg38 Erez Aiden, Baylor
HiC (in situ Hi-C) K562 ENCSR545YBD ENCODE4 v1.15.0 GRCh38 (ENCAN133QZO) processed data ENCFF693XIL bedpe loops hg38 Erez Aiden, Baylor
HiC (in situ Hi-C) K562 ENCSR545YBD ENCODE4 v1.15.0 GRCh38 (ENCAN133QZO) processed data ENCFF271SAF bedpe contact domains hg38 Erez Aiden, Baylor
HiC (intact Hi-C) K562 ENCSR479XDG ENCODE4 v1.14.2 GRCh38 (ENCAN923OBU) processed data ENCFF621AIY hic mapping quality thresholded contact matrix hg38 Erez Aiden, Baylor
HiC (intact Hi-C) K562 ENCSR479XDG ENCODE4 v1.14.2 GRCh38 (ENCAN923OBU) processed data ENCFF256ZMD bedpe loops hg38 Erez Aiden, Baylor
HiC (intact Hi-C) K562 ENCSR479XDG ENCODE4 v1.14.2 GRCh38 (ENCAN923OBU) processed data ENCFF126GED bedpe contact domains hg38 Erez Aiden, Baylor
Code
### keep only the rows whose Assay mentions "in situ"
dat = dplyr::filter(dat_metadata_import, str_detect(Assay, "in situ"))

### store the filtered metadata and display it
dat_metadata = dat
fun_display_table(dat)
Assay Biosample Index_Experiment Index_Process Index_File File_Type Output_Type Genome Lab
HiC (in situ Hi-C) K562 ENCSR545YBD ENCODE4 v1.15.0 GRCh38 (ENCAN133QZO) processed data ENCFF616PUW hic mapping quality thresholded contact matrix hg38 Erez Aiden, Baylor
HiC (in situ Hi-C) K562 ENCSR545YBD ENCODE4 v1.15.0 GRCh38 (ENCAN133QZO) processed data ENCFF693XIL bedpe loops hg38 Erez Aiden, Baylor
HiC (in situ Hi-C) K562 ENCSR545YBD ENCODE4 v1.15.0 GRCh38 (ENCAN133QZO) processed data ENCFF271SAF bedpe contact domains hg38 Erez Aiden, Baylor

Generate download commands

Each download command has the form: wget -O FILE URL
Code
### Helper: translate a file type into the extension used for the download
###   txt: character vector of file types (e.g. "hic", "bedpe")
###   returns the mapped extension ("hic" -> "hic", "bedpe" -> "bedpe.gz")
### The lookup is delegated to fun_str_map_detect; the input is passed as
### .default (presumably the fallback for unmatched values -- confirm there)
fun_get_file_ext = function(txt){
    vec_pattern = c("hic", "bedpe")
    vec_replace = c("hic", "bedpe.gz")
    out = fun_str_map_detect(txt, vec_pattern, vec_replace, .default=txt)
    out
}

### Helper: translate an output type into a short label for the file name
###   txt: character vector of output types
###        (e.g. "mapping quality thresholded contact matrix", "loops")
###   returns the mapped label ("matrix", "loops", or "contact_domain")
### The lookup is delegated to fun_str_map_detect; the input is passed as
### .default (presumably the fallback for unmatched values -- confirm there)
fun_get_file_label = function(txt){
    vec_pattern = c("matrix", "loops", "contact domain")
    vec_replace = c("matrix", "loops", "contact_domain")
    out = fun_str_map_detect(txt, vec_pattern, vec_replace, .default=txt)
    out
}


### test the helpers: print an Input/Output table for each one,
### applied to the metadata column it is meant to translate
dat = dat_metadata

lst_check = list(
    list(fun = fun_get_file_ext,   txt = dat$File_Type),
    list(fun = fun_get_file_label, txt = dat$Output_Type)
)

for (lst in lst_check) {
    res = lst$fun(lst$txt)
    print(data.frame("Input" = lst$txt, "Output" = res))
    cat("\n")
}
  Input   Output
1   hic      hic
2 bedpe bedpe.gz
3 bedpe bedpe.gz

                                       Input         Output
1 mapping quality thresholded contact matrix         matrix
2                                      loops          loops
3                            contact domains contact_domain
Code
### build one wget command per metadata row
### (mutate evaluates its expressions in order, so later columns can use
###  the ones defined just above them)
dat = dat_metadata %>%
    dplyr::mutate(
        ### pieces derived from the metadata
        File_Label = fun_get_file_label(Output_Type),
        File_Ext1  = fun_get_file_ext(File_Type),
        File_Ext2  = fun_get_file_ext(File_Type),
        ### local file name: dot-separated identifiers + label + extension
        File_Name = paste(
            Biosample,
            Genome,
            Index_Experiment,
            Index_File,
            "hic_insitu",
            File_Label,
            File_Ext1,
            sep="."),
        ### ENCODE download URL of the file
        File_URL_Download = file.path(
            "https://www.encodeproject.org/files",
            Index_File,
            "@@download",
            paste(Index_File, File_Ext2, sep = ".")
        ),
        ### wget call that appends its log to run_download.log.txt
        CMD = paste(
            "wget",
            "--append-output=run_download.log.txt",
            "-O",
            File_Name,
            File_URL_Download)
    ) %>%
    dplyr::select(CMD)
### (alternative for inspection: also keep
###  Assay, Biosample, Index_Experiment, Index_File, File_Name)

### prepend the command that empties the log file and use the Shebang
### as the column name, so it ends up as the first line of the script
dat = rbind('echo -n "" > run_download.log.txt', dat)
names(dat) = "#!/bin/bash"

### store the command table and display it
dat_cmd = dat
fun_display_table(dat)
#!/bin/bash
echo -n "" > run_download.log.txt
wget --append-output=run_download.log.txt -O K562.hg38.ENCSR545YBD.ENCFF616PUW.hic_insitu.matrix.hic https://www.encodeproject.org/files/ENCFF616PUW/@@download/ENCFF616PUW.hic
wget --append-output=run_download.log.txt -O K562.hg38.ENCSR545YBD.ENCFF693XIL.hic_insitu.loops.bedpe.gz https://www.encodeproject.org/files/ENCFF693XIL/@@download/ENCFF693XIL.bedpe.gz
wget --append-output=run_download.log.txt -O K562.hg38.ENCSR545YBD.ENCFF271SAF.hic_insitu.contact_domain.bedpe.gz https://www.encodeproject.org/files/ENCFF271SAF/@@download/ENCFF271SAF.bedpe.gz

Save to script

Write the command lines, one per row, to a bash script

Code
### set output path
txt_fdiry = file.path(FD_DAT, "external", "hic_insitu_K562_ENCSR545YBD")
txt_fname = "run_download.sh"
txt_fpath = file.path(txt_fdiry, txt_fname)

### create the output folder together with any missing parent folder;
### without recursive = TRUE a missing "external" folder makes dir.create
### fail silently (warnings are off) and the write below would then error
dir.create(txt_fdiry, showWarnings = FALSE, recursive = TRUE)

### save table: the single column of dat_cmd is named "#!/bin/bash", so the
### header row written by write_tsv is the Shebang line of the script
dat = dat_cmd
write_tsv(dat, txt_fpath)