Prepare ENCODE RNA-seq 01

Generate a shell script that downloads the data, together with an md5 checksum table for the downloaded files

Set environment

Code
### source the project configuration script (path is relative to this notebook);
### messages and warnings raised while sourcing are suppressed
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
### print the environment / directory settings
### (show_env is presumably defined by the sourced script -- TODO confirm)
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
### sub-folder of the project reference directory (FD_REF) that holds the
### ENCODE metadata table
TXT_FOLDER_REF = "encode_rnaseq"
### sub-folder of FD_DAT/external that receives the generated download script
### and checksum table
TXT_FOLDER_OUT = "encode_rnaseq"
Code
### list the files available in the reference folder
txt_fdiry = file.path(FD_REF, TXT_FOLDER_REF)
list.files(txt_fdiry)
  1. 'ENCODE_K562_hg38_RNAseq_tmp.tsv'
  2. 'ENCODE_K562_hg38_RNAseq.tsv'

Import metadata from reference file

Code
### locate the metadata table inside the reference folder
txt_fname  = "ENCODE_K562_hg38_RNAseq.tsv"
txt_folder = TXT_FOLDER_REF
txt_fdiry  = file.path(FD_REF, txt_folder)
txt_fpath  = file.path(txt_fdiry, txt_fname)

### import the tab-separated table without printing the column-type summary
dat = read_tsv(file = txt_fpath, show_col_types = FALSE)

### keep the imported table under a dedicated name, then display it
dat_metadata_import = dat
fun_display_table(dat)
File accession File format File type File format type Output type File assembly Experiment accession Assay Donor(s) Biosample term id Biosample term name Biosample type Biosample organism Biosample treatments Biosample treatments amount Biosample treatments duration Biosample genetic modifications methods Biosample genetic modifications categories Biosample genetic modifications targets Biosample genetic modifications gene targets Biosample genetic modifications site coordinates Biosample genetic modifications zygosity Experiment target Library made from Library depleted in Library extraction method Library lysis method Library crosslinking method Library strand specific Experiment date released Project RBNS protein concentration Library fragmentation method Library size range Biological replicate(s) Technical replicate(s) Read length Mapped read length Run type Paired end Paired with Index of Derived from Size Lab md5sum dbxrefs File download URL Genome annotation Platform Controlled by File Status s3_uri Azure URL File analysis title File analysis status Audit WARNING Audit NOT_COMPLIANT Audit ERROR
ENCFF421TJX tsv tsv NA gene quantifications GRCh38 ENCSR615EEK total RNA-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA NA RNA NA NA NA NA reverse 1/4/2021 ENCODE NA NA NA 1 1_1 NA NA NA NA NA NA /files/ENCFF285DRD/, /files/ENCFF820EYJ/ 11061127 ENCODE Processing Pipeline a7875635bd3e8a70902a43d4b814832c NA https://www.encodeproject.org/files/ENCFF421TJX/@@download/ENCFF421TJX.tsv V29 NA NA released s3://encode-public/2020/10/30/2033273a-286f-4c94-a652-9d75098cdfb5/ENCFF421TJX.tsv https://datasetencode.blob.core.windows.net/dataset/2020/10/30/2033273a-286f-4c94-a652-9d75098cdfb5/ENCFF421TJX.tsv?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.2.1 GRCh38 V29 released NA unreplicated experiment NA
ENCFF585HTZ bigWig bigWig NA plus strand signal of unique reads GRCh38 ENCSR615EEK total RNA-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA NA RNA NA NA NA NA reverse 1/4/2021 ENCODE NA NA NA 1 1_1 NA NA NA NA NA NA /files/ENCFF833WFD/, /files/GRCh38_EBV.chrom.sizes/ 112512459 ENCODE Processing Pipeline cc22df00ccf3af13fd208828480a7f04 NA https://www.encodeproject.org/files/ENCFF585HTZ/@@download/ENCFF585HTZ.bigWig V29 NA NA released s3://encode-public/2020/10/30/2b5767d4-57fc-4a6f-b50b-660d164f77b4/ENCFF585HTZ.bigWig https://datasetencode.blob.core.windows.net/dataset/2020/10/30/2b5767d4-57fc-4a6f-b50b-660d164f77b4/ENCFF585HTZ.bigWig?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.2.1 GRCh38 V29 released NA unreplicated experiment NA
ENCFF876JOV bigWig bigWig NA minus strand signal of unique reads GRCh38 ENCSR615EEK total RNA-seq /human-donors/ENCDO000AAD/ EFO:0002067 K562 cell line Homo sapiens NA NA NA NA NA NA NA NA NA NA RNA NA NA NA NA reverse 1/4/2021 ENCODE NA NA NA 1 1_1 NA NA NA NA NA NA /files/ENCFF833WFD/, /files/GRCh38_EBV.chrom.sizes/ 109644032 ENCODE Processing Pipeline cd9f7432892b5d9355ff405220508915 NA https://www.encodeproject.org/files/ENCFF876JOV/@@download/ENCFF876JOV.bigWig V29 NA NA released s3://encode-public/2020/10/30/92b9cac8-9587-417b-b3bc-abc7059bf075/ENCFF876JOV.bigWig https://datasetencode.blob.core.windows.net/dataset/2020/10/30/92b9cac8-9587-417b-b3bc-abc7059bf075/ENCFF876JOV.bigWig?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D ENCODE4 v1.2.1 GRCh38 V29 released NA unreplicated experiment NA

Explore table

Check genome assembly

Code
### count the files per genome assembly in the imported metadata
dat = dat_metadata_import
table(dat$`File assembly`)

GRCh38 
     3 

Check biosample

Code
### count the files per biosample name in the imported metadata
dat = dat_metadata_import
table(dat$`Biosample term name`)

K562 
   3 

Arrange metadata tables

Helper function

Code
### Reduce an ENCODE metadata table to the columns used downstream.
###
### dat:     table with the ENCODE metadata columns referenced below
###          (`Experiment accession`, `File accession`, ..., `md5sum`)
### returns: table with the columns Assay, Index_Experiment, Index_File,
###          File_Format, File_Type, Output_Type, Genome, Bio_Replicates,
###          Analysis, md5sum, File_Name, File_URL (in this order)
fun_simplify_table = function(dat){

    ### pick the needed columns and give them short names in one step
    ### (a `Target` column based on `Experiment target` is currently disabled)
    tbl = dplyr::select(
        dat,
        Assay,
        Index_Experiment = `Experiment accession`,
        Index_File       = `File accession`,
        File_Format      = `File format`,
        File_Type        = `File type`,
        Output_Type      = `Output type`,
        Genome           = `File assembly`,
        Bio_Replicates   = `Biological replicate(s)`,
        Analysis         = `File analysis title`,
        md5sum,
        File_URL         = `File download URL`
    )

    ### the file name is the last path component of the download URL;
    ### place it right before the URL column
    dplyr::mutate(tbl, File_Name = basename(File_URL), .before = File_URL)
}

Simplify the metadata table

Code
### arrange and simplify the table
dat = dat_metadata_import
dat = fun_simplify_table(dat)

### assign and show
dat_metadata_simplify = dat
fun_display_table(dat)
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Bio_Replicates Analysis md5sum File_Name File_URL
total RNA-seq ENCSR615EEK ENCFF421TJX tsv tsv gene quantifications GRCh38 1 ENCODE4 v1.2.1 GRCh38 V29 a7875635bd3e8a70902a43d4b814832c ENCFF421TJX.tsv https://www.encodeproject.org/files/ENCFF421TJX/@@download/ENCFF421TJX.tsv
total RNA-seq ENCSR615EEK ENCFF585HTZ bigWig bigWig plus strand signal of unique reads GRCh38 1 ENCODE4 v1.2.1 GRCh38 V29 cc22df00ccf3af13fd208828480a7f04 ENCFF585HTZ.bigWig https://www.encodeproject.org/files/ENCFF585HTZ/@@download/ENCFF585HTZ.bigWig
total RNA-seq ENCSR615EEK ENCFF876JOV bigWig bigWig minus strand signal of unique reads GRCh38 1 ENCODE4 v1.2.1 GRCh38 V29 cd9f7432892b5d9355ff405220508915 ENCFF876JOV.bigWig https://www.encodeproject.org/files/ENCFF876JOV/@@download/ENCFF876JOV.bigWig

Prepare download files

Helper function

Code
### Translate an ENCODE output type into the suffix used in the file name.
###
### txt:     character vector of output types
### returns: result of fun_str_map_detect(); in this notebook
###          "gene quantifications" yields "tsv" and the plus / minus strand
###          signals yield "strand_pos.bw" / "strand_neg.bw"
### NOTE(review): fun_str_map_detect is defined outside this notebook; it
### presumably detects each pattern in txt and falls back to .default when
### nothing matches -- confirm against its definition
fun_map_file_label = function(txt){
    patterns = c("quantifications", "plus strand",   "minus strand")
    suffixes = c("tsv",             "strand_pos.bw", "strand_neg.bw")
    fun_str_map_detect(txt, patterns, suffixes, .default=txt)
}

Rename filename

Code
### rename filename
dat = dat_metadata_simplify
dat = dat %>% dplyr::mutate(
    File_Name = paste(
        "K562",
        "hg38",
        Index_Experiment,
        Index_File,
        "RNAseq_total",
        fun_map_file_label(Output_Type),
    sep = ".")
)

### assign and show
dat_metadata_arrange = dat
fun_display_table(dat)
Assay Index_Experiment Index_File File_Format File_Type Output_Type Genome Bio_Replicates Analysis md5sum File_Name File_URL
total RNA-seq ENCSR615EEK ENCFF421TJX tsv tsv gene quantifications GRCh38 1 ENCODE4 v1.2.1 GRCh38 V29 a7875635bd3e8a70902a43d4b814832c K562.hg38.ENCSR615EEK.ENCFF421TJX.RNAseq_total.tsv https://www.encodeproject.org/files/ENCFF421TJX/@@download/ENCFF421TJX.tsv
total RNA-seq ENCSR615EEK ENCFF585HTZ bigWig bigWig plus strand signal of unique reads GRCh38 1 ENCODE4 v1.2.1 GRCh38 V29 cc22df00ccf3af13fd208828480a7f04 K562.hg38.ENCSR615EEK.ENCFF585HTZ.RNAseq_total.strand_pos.bw https://www.encodeproject.org/files/ENCFF585HTZ/@@download/ENCFF585HTZ.bigWig
total RNA-seq ENCSR615EEK ENCFF876JOV bigWig bigWig minus strand signal of unique reads GRCh38 1 ENCODE4 v1.2.1 GRCh38 V29 cd9f7432892b5d9355ff405220508915 K562.hg38.ENCSR615EEK.ENCFF876JOV.RNAseq_total.strand_neg.bw https://www.encodeproject.org/files/ENCFF876JOV/@@download/ENCFF876JOV.bigWig

Check results

Code
### show only the identifying columns next to the new file names
dat = dplyr::select(
    dat_metadata_arrange,
    Index_Experiment, Index_File, File_Format, File_Name
)
fun_display_table(dat)
Index_Experiment Index_File File_Format File_Name
ENCSR615EEK ENCFF421TJX tsv K562.hg38.ENCSR615EEK.ENCFF421TJX.RNAseq_total.tsv
ENCSR615EEK ENCFF585HTZ bigWig K562.hg38.ENCSR615EEK.ENCFF585HTZ.RNAseq_total.strand_pos.bw
ENCSR615EEK ENCFF876JOV bigWig K562.hg38.ENCSR615EEK.ENCFF876JOV.RNAseq_total.strand_neg.bw

Checksum table

Code
### get md5sum for each file
dat = dat_metadata_arrange
dat = dat %>% dplyr::select(md5sum, File_Name)

### assign and show
dat_download_checksum = dat
fun_display_table(dat)
md5sum File_Name
a7875635bd3e8a70902a43d4b814832c K562.hg38.ENCSR615EEK.ENCFF421TJX.RNAseq_total.tsv
cc22df00ccf3af13fd208828480a7f04 K562.hg38.ENCSR615EEK.ENCFF585HTZ.RNAseq_total.strand_pos.bw
cd9f7432892b5d9355ff405220508915 K562.hg38.ENCSR615EEK.ENCFF876JOV.RNAseq_total.strand_neg.bw

Generate download scripts

Command template: wget -O FILE URL (download URL and save it under the name FILE)
Code
### setup download file wget command
dat = dat_metadata_arrange
dat = dat %>% dplyr::mutate(
        CMD = paste(
            "wget", "--append-output=run_download.log.txt", "-O", File_Name, File_URL
        )
    )

### add Shebang and initial commands
dat = dat %>% dplyr::select(CMD)
dat = rbind('echo -n "" > run_download.log.txt', dat)
colnames(dat) = "#!/bin/bash"

### assign and show
dat_download_script = dat
fun_display_table(dat)
#!/bin/bash
echo -n "" > run_download.log.txt
wget --append-output=run_download.log.txt -O K562.hg38.ENCSR615EEK.ENCFF421TJX.RNAseq_total.tsv https://www.encodeproject.org/files/ENCFF421TJX/@@download/ENCFF421TJX.tsv
wget --append-output=run_download.log.txt -O K562.hg38.ENCSR615EEK.ENCFF585HTZ.RNAseq_total.strand_pos.bw https://www.encodeproject.org/files/ENCFF585HTZ/@@download/ENCFF585HTZ.bigWig
wget --append-output=run_download.log.txt -O K562.hg38.ENCSR615EEK.ENCFF876JOV.RNAseq_total.strand_neg.bw https://www.encodeproject.org/files/ENCFF876JOV/@@download/ENCFF876JOV.bigWig

Save results

Code
### set output path
txt_folder = TXT_FOLDER_OUT
txt_fdiry  = file.path(FD_DAT, "external", txt_folder)

### create directory if not exist
dir.create(txt_fdiry, showWarnings = FALSE)

### write checksum file
txt_fname  = "run_download_files.sh"
txt_fpath  = file.path(txt_fdiry, txt_fname)

dat = dat_download_script
write_tsv(dat, txt_fpath)

### save table
txt_fname  = "checksum_md5sum.txt"
txt_fpath  = file.path(txt_fdiry, txt_fname)

dat = dat_download_checksum
write_tsv(dat, txt_fpath, col_names = FALSE)