Prepare ENCODE ChIP-seq Histone 02

Download the data

Set environment

Code
# Load the project configuration; the path is relative to the current working
# directory, so this cell must run from the notebook's own folder.
source ../run_config_project.sh
# Print the server and project directory variables (FD_*, FP_*) as a sanity
# check. NOTE(review): show_env is presumably defined by the sourced config.
show_env
You are working on             Duke Server: RCC
BASE DIRECTORY (FD_BASE):      /data/reddylab/Kuei
REPO DIRECTORY (FD_REPO):      /data/reddylab/Kuei/repo
WORK DIRECTORY (FD_WORK):      /data/reddylab/Kuei/work
DATA DIRECTORY (FD_DATA):      /data/reddylab/Kuei/data
CONTAINER DIR. (FD_SING):      /data/reddylab/Kuei/container

You are working with           ENCODE FCC
PATH OF PROJECT (FD_PRJ):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC
PROJECT RESULTS (FD_RES):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results
PROJECT SCRIPTS (FD_EXE):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts
PROJECT DATA    (FD_DAT):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data
PROJECT NOTE    (FD_NBK):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks
PROJECT DOCS    (FD_DOC):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs
PROJECT LOG     (FD_LOG):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log
PROJECT REF     (FD_REF):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references
PROJECT IMAGE   (FP_PRJ_SIF):  /data/reddylab/Kuei/container/project/singularity_proj_encode_fcc.sif
PROJECT CONF.   (FP_CNF):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts/config_project.sh

Set global variables

Code
# Name of this download batch; the cells below address it as
# ${FD_DAT}/external/${TXT_FOLDER}.
TXT_FOLDER="encode_chipseq_histone_250120"

Execute

Preview scripts

Code
# Locate the download helper inside the project scripts folder (FD_EXE is
# expected to be set by the sourced project config).
FN_EXE=run_download_files.sh
FP_EXE="${FD_EXE}/${FN_EXE}"

# Make the helper executable, then print it for review. The expansions are
# quoted so a path with spaces or glob characters stays a single argument.
chmod +x -- "${FP_EXE}"
cat -- "${FP_EXE}"
#!/bin/bash

### print start message
timer_start=`date +%s`
echo "Hostname:          " $(hostname)
echo "Slurm Array Index: " ${SLURM_ARRAY_TASK_ID}
echo "Time Stamp:        " $(date +"%m-%d-%y+%T")
echo

### get arguments
FP_CNF=${1}
FP_DOWNLOAD_FILE_LIST=${2}
FD_DOWNLOAD_DIRECTORY=${3}

### set environment
source ${FP_CNF}

### show I/O
cd   ${FD_DOWNLOAD_DIRECTORY}
echo "Change directory:"
echo $(pwd)
echo

### execute: download file
echo "Download files..."
xargs -L 1 curl -O -J -L < ${FP_DOWNLOAD_FILE_LIST}
echo

### print end message
timer=`date +%s`
runtime=$(echo "${timer} - ${timer_start}" | bc -l)
echo
echo 'Done!'
echo "Run Time: $(displaytime ${runtime})"
echo

Preview folders and files

Code
# Top-level folder of this download batch; list what it contains.
# Quoted so the path is passed to ls as exactly one argument.
FD_OUT="${FD_DAT}/external/${TXT_FOLDER}"
ls "${FD_OUT}"
region_narrowPeak  signal_fold_change  signal_pvalue
Code
# Print every entry of the batch folder: its full path and its base name.
# A glob replaces the parsed `ls -d` output, so names containing whitespace
# remain single array elements.
FD_OUTS=("${FD_DAT}/external/${TXT_FOLDER}"/*)
for FD_OUT in "${FD_OUTS[@]}"; do
    # An unmatched glob stays literal; skip it rather than print a bogus path.
    [[ -e "${FD_OUT}" ]] || continue
    FOLDER_SUB=$(basename "${FD_OUT}")
    echo "${FD_OUT}"
    echo "Folder:" "${FOLDER_SUB}"
    echo
done
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/region_narrowPeak
Folder: region_narrowPeak

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/signal_fold_change
Folder: signal_fold_change

/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/signal_pvalue
Folder: signal_pvalue
Code
# For each sub-folder, report how many lines its download list (files.txt)
# has. The list files are collected with a glob instead of parsing `ls`.
FP_INPS=("${FD_DAT}/external/${TXT_FOLDER}"/*/files.txt)
for FP_INP in "${FP_INPS[@]}"; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    [[ -f "${FP_INP}" ]] || continue
    FD_OUT=$(dirname "${FP_INP}")
    FOLDER_SUB=$(basename "${FD_OUT}")
    # Reading via redirection keeps wc from printing the file name.
    NUM=$(wc -l < "${FP_INP}")

    echo "Folder:" "${FOLDER_SUB}"
    echo "Count: " "${NUM}"
    echo
done
Folder: region_narrowPeak
Count:  19

Folder: signal_fold_change
Count:  19

Folder: signal_pvalue
Count:  19
Code
# For each sub-folder, report how many lines its checksum list
# (checksum_md5sum.txt) has; this should match the files.txt count.
# The list files are collected with a glob instead of parsing `ls`.
FP_INPS=("${FD_DAT}/external/${TXT_FOLDER}"/*/checksum_md5sum.txt)
for FP_INP in "${FP_INPS[@]}"; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    [[ -f "${FP_INP}" ]] || continue
    FD_OUT=$(dirname "${FP_INP}")
    FOLDER_SUB=$(basename "${FD_OUT}")
    # Reading via redirection keeps wc from printing the file name.
    NUM=$(wc -l < "${FP_INP}")

    echo "Folder:" "${FOLDER_SUB}"
    echo "Count: " "${NUM}"
    echo
done
Folder: region_narrowPeak
Count:  19

Folder: signal_fold_change
Count:  19

Folder: signal_pvalue
Count:  19

Run download script

Code
### Submit one Slurm download job per sub-folder of the batch.
### Each job runs run_download_files.sh with: config file, URL list, target dir.

### init: exe
FN_EXE=run_download_files.sh
FP_EXE="${FD_EXE}/${FN_EXE}"

### init: directory (glob instead of parsing `ls -d`)
FD_OUTS=("${FD_DAT}/external/${TXT_FOLDER}"/*)

### loop and execute
for FD_OUT in "${FD_OUTS[@]}"; do

    ### skip non-directories (also covers an unmatched, literal glob)
    [[ -d "${FD_OUT}" ]] || continue

    ### init: file list; do not submit a job that has nothing to download
    FP_INP="${FD_OUT}/files.txt"
    if [[ ! -f "${FP_INP}" ]]; then
        echo "Skip (no files.txt): ${FD_OUT}" >&2
        continue
    fi
    NUM=$(wc -l < "${FP_INP}")

    ### init: log file (named after the batch and the sub-folder)
    TXT_FOLDER_SUB=$(basename "${FD_OUT}")
    FN_LOG="download.${TXT_FOLDER}.${TXT_FOLDER_SUB}.txt"
    FP_LOG="${FD_LOG}/${FN_LOG}"

    ### show progress
    ### (report the folder of THIS iteration; the previous code echoed the
    ###  stale FOLDER_SUB left over from an earlier cell)
    echo "Folder:" "${TXT_FOLDER_SUB}"
    echo "Count: " "${NUM}"
    echo

    ### execute (submit from inside the target folder; skip if it is unreachable)
    cd "${FD_OUT}" || continue
    sbatch \
        --cpus-per-task 4 \
        --mem 4G \
        --output "${FP_LOG}" \
        "${FP_EXE}" "${FP_CNF}" "${FP_INP}" "${FD_OUT}"
    echo
done
Folder: signal_pvalue
Count:  19

Submitted batch job 275736
Folder: signal_pvalue
Count:  19

Submitted batch job 275737
Folder: signal_pvalue
Count:  19

Submitted batch job 275738

Run checksum

Code
### Submit one Slurm checksum job per sub-folder of the batch.
### Each job runs run_checksum_files.sh with: config file, target dir,
### checksum list (input) and checksum results file (output).

### init: exe
FN_EXE=run_checksum_files.sh
FP_EXE="${FD_EXE}/${FN_EXE}"

### init: directory (glob instead of parsing `ls -d`)
FD_OUTS=("${FD_DAT}/external/${TXT_FOLDER}"/*)

### loop and execute
for FD_OUT in "${FD_OUTS[@]}"; do

    ### skip non-directories (also covers an unmatched, literal glob)
    [[ -d "${FD_OUT}" ]] || continue

    ### init: I/O file; do not submit a job that has nothing to verify
    FP_INP="${FD_OUT}/checksum_md5sum.txt"
    FP_OUT="${FD_OUT}/checksum_results.txt"
    if [[ ! -f "${FP_INP}" ]]; then
        echo "Skip (no checksum_md5sum.txt): ${FD_OUT}" >&2
        continue
    fi
    NUM=$(wc -l < "${FP_INP}")

    ### init: log file (named after the batch and the sub-folder)
    TXT_FOLDER_SUB=$(basename "${FD_OUT}")
    FN_LOG="checksum.${TXT_FOLDER}.${TXT_FOLDER_SUB}.txt"
    FP_LOG="${FD_LOG}/${FN_LOG}"

    ### show progress
    ### (report the folder of THIS iteration; the previous code echoed the
    ###  stale FOLDER_SUB left over from an earlier cell)
    echo "Folder:" "${TXT_FOLDER_SUB}"
    echo "Count: " "${NUM}"
    echo

    ### execute (submit from inside the target folder; skip if it is unreachable)
    cd "${FD_OUT}" || continue
    sbatch \
        --cpus-per-task 4 \
        --mem 4G \
        --output "${FP_LOG}" \
        "${FP_EXE}" "${FP_CNF}" "${FD_OUT}" "${FP_INP}" "${FP_OUT}"
    echo
done
Folder: signal_pvalue
Count:  19

Submitted batch job 275742

Folder: signal_pvalue
Count:  19

Submitted batch job 275743

Folder: signal_pvalue
Count:  19

Submitted batch job 275744

Review

Check output files

Code
# List the sub-folders of the batch (path quoted so it stays one argument).
ls "${FD_DAT}/external/${TXT_FOLDER}"
region_narrowPeak  signal_fold_change  signal_pvalue
Code
# Count and list the downloaded peak files plus the metadata table.
FD_OUT="${FD_DAT}/external/${TXT_FOLDER}/region_narrowPeak"

# Only list when the directory change succeeded; otherwise ls would silently
# report on whatever directory the kernel happened to be in.
if cd "${FD_OUT}"; then
    # When piped, ls prints one entry per line, so wc -l gives the file count.
    ls -sh -- *bed.gz *.tsv | wc -l
    ls -sh -- *bed.gz *.tsv
fi
20
 648K ENCFF122CSI.bed.gz   204K ENCFF462AVD.bed.gz   3.1M ENCFF801AHF.bed.gz
 2.3M ENCFF135ZLM.bed.gz   2.7M ENCFF540NGG.bed.gz   596K ENCFF885FQN.bed.gz
 2.7M ENCFF148UQI.bed.gz   1.4M ENCFF544LXB.bed.gz  1004K ENCFF891CHI.bed.gz
 3.6M ENCFF193ERO.bed.gz   1.4M ENCFF561OUZ.bed.gz   804K ENCFF909RKY.bed.gz
 2.0M ENCFF209OQD.bed.gz   1.1M ENCFF689QIJ.bed.gz   504K ENCFF963GZJ.bed.gz
 2.8M ENCFF213OTI.bed.gz   752K ENCFF706WUF.bed.gz   8.0K metadata.tsv
 2.3M ENCFF323WOT.bed.gz   1.6M ENCFF749KLQ.bed.gz
Code
# Count and list the downloaded fold-change bigWig files plus the metadata table.
FD_OUT="${FD_DAT}/external/${TXT_FOLDER}/signal_fold_change"

# Only list when the directory change succeeded; otherwise ls would silently
# report on whatever directory the kernel happened to be in.
if cd "${FD_OUT}"; then
    # When piped, ls prints one entry per line, so wc -l gives the file count.
    ls -sh -- *bigWig *.tsv | wc -l
    ls -sh -- *bigWig *.tsv
fi
20
610M ENCFF139KZL.bigWig  434M ENCFF399SGM.bigWig  652M ENCFF654SLZ.bigWig
1.4G ENCFF242ENK.bigWig  885M ENCFF544AVW.bigWig  510M ENCFF660WUG.bigWig
1.1G ENCFF253TOF.bigWig  558M ENCFF583BKU.bigWig  345M ENCFF806YEZ.bigWig
593M ENCFF286WRJ.bigWig  970M ENCFF601JGK.bigWig  302M ENCFF911JVK.bigWig
1.2G ENCFF317VHO.bigWig  658M ENCFF605FAF.bigWig  380M ENCFF959YJV.bigWig
399M ENCFF347YYH.bigWig  1.4G ENCFF607SUJ.bigWig  8.0K metadata.tsv
1.5G ENCFF381NDD.bigWig  770M ENCFF621DJP.bigWig
Code
# Count and list the downloaded p-value bigWig files plus the metadata table.
FD_OUT="${FD_DAT}/external/${TXT_FOLDER}/signal_pvalue"

# Only list when the directory change succeeded; otherwise ls would silently
# report on whatever directory the kernel happened to be in.
if cd "${FD_OUT}"; then
    # When piped, ls prints one entry per line, so wc -l gives the file count.
    ls -sh -- *bigWig *.tsv | wc -l
    ls -sh -- *bigWig *.tsv
fi
20
685M ENCFF054RSU.bigWig  857M ENCFF334HSS.bigWig  899M ENCFF632NQA.bigWig
337M ENCFF071GML.bigWig  1.1G ENCFF405ZDL.bigWig  369M ENCFF633OZC.bigWig
568M ENCFF178QDA.bigWig  965M ENCFF457URZ.bigWig  675M ENCFF694ODT.bigWig
837M ENCFF202EVH.bigWig  487M ENCFF461RKK.bigWig  761M ENCFF767UON.bigWig
732M ENCFF220RGS.bigWig  1.3G ENCFF465GBD.bigWig  613M ENCFF847BFA.bigWig
642M ENCFF239EBH.bigWig  1.4G ENCFF582IMB.bigWig  8.0K metadata.tsv
496M ENCFF287LBI.bigWig  1.3G ENCFF605EVL.bigWig

Check checksum results

Code
# List the sub-folders of the batch (path quoted so it stays one argument).
ls "${FD_DAT}/external/${TXT_FOLDER}"
region_narrowPeak  signal_fold_change  signal_pvalue
Code
# Show the checksum results for region_narrowPeak and summarise them.
FD_OUT="${FD_DAT}/external/${TXT_FOLDER}/region_narrowPeak"
FP_OUT="${FD_OUT}/checksum_results.txt"

# A missing or empty results file must not be reported as a pass: with no
# lines to match, the FAILED search below would find nothing.
if [[ ! -s "${FP_OUT}" ]]; then
    echo "Missing or empty checksum results: ${FP_OUT}" >&2
else
    cat "${FP_OUT}"
    # grep prints any FAILED lines; an explicit if/else replaces the
    # `A && B || C` chain, which is not a true if/else.
    if grep "FAILED" "${FP_OUT}"; then
        echo "FAILED"
    else
        echo "All PASSED"
    fi
fi
ENCFF689QIJ.bed.gz: OK
ENCFF323WOT.bed.gz: OK
ENCFF540NGG.bed.gz: OK
ENCFF462AVD.bed.gz: OK
ENCFF749KLQ.bed.gz: OK
ENCFF909RKY.bed.gz: OK
ENCFF209OQD.bed.gz: OK
ENCFF891CHI.bed.gz: OK
ENCFF148UQI.bed.gz: OK
ENCFF706WUF.bed.gz: OK
ENCFF122CSI.bed.gz: OK
ENCFF213OTI.bed.gz: OK
ENCFF193ERO.bed.gz: OK
ENCFF801AHF.bed.gz: OK
ENCFF544LXB.bed.gz: OK
ENCFF561OUZ.bed.gz: OK
ENCFF885FQN.bed.gz: OK
ENCFF963GZJ.bed.gz: OK
ENCFF135ZLM.bed.gz: OK
All PASSED
Code
# Show the checksum results for signal_fold_change and summarise them.
FD_OUT="${FD_DAT}/external/${TXT_FOLDER}/signal_fold_change"
FP_OUT="${FD_OUT}/checksum_results.txt"

# A missing or empty results file must not be reported as a pass: with no
# lines to match, the FAILED search below would find nothing.
if [[ ! -s "${FP_OUT}" ]]; then
    echo "Missing or empty checksum results: ${FP_OUT}" >&2
else
    cat "${FP_OUT}"
    # grep prints any FAILED lines; an explicit if/else replaces the
    # `A && B || C` chain, which is not a true if/else.
    if grep "FAILED" "${FP_OUT}"; then
        echo "FAILED"
    else
        echo "All PASSED"
    fi
fi
ENCFF544AVW.bigWig: OK
ENCFF286WRJ.bigWig: OK
ENCFF621DJP.bigWig: OK
ENCFF959YJV.bigWig: OK
ENCFF911JVK.bigWig: OK
ENCFF242ENK.bigWig: OK
ENCFF806YEZ.bigWig: OK
ENCFF583BKU.bigWig: OK
ENCFF660WUG.bigWig: OK
ENCFF605FAF.bigWig: OK
ENCFF654SLZ.bigWig: OK
ENCFF317VHO.bigWig: OK
ENCFF139KZL.bigWig: OK
ENCFF399SGM.bigWig: OK
ENCFF347YYH.bigWig: OK
ENCFF381NDD.bigWig: OK
ENCFF607SUJ.bigWig: OK
ENCFF601JGK.bigWig: OK
ENCFF253TOF.bigWig: OK
All PASSED
Code
# Show the checksum results for signal_pvalue and summarise them.
FD_OUT="${FD_DAT}/external/${TXT_FOLDER}/signal_pvalue"
FP_OUT="${FD_OUT}/checksum_results.txt"

# A missing or empty results file must not be reported as a pass: with no
# lines to match, the FAILED search below would find nothing.
if [[ ! -s "${FP_OUT}" ]]; then
    echo "Missing or empty checksum results: ${FP_OUT}" >&2
else
    cat "${FP_OUT}"
    # grep prints any FAILED lines; an explicit if/else replaces the
    # `A && B || C` chain, which is not a true if/else.
    if grep "FAILED" "${FP_OUT}"; then
        echo "FAILED"
    else
        echo "All PASSED"
    fi
fi
ENCFF767UON.bigWig: OK
ENCFF582IMB.bigWig: OK
ENCFF287LBI.bigWig: OK
ENCFF220RGS.bigWig: OK
ENCFF054RSU.bigWig: OK
ENCFF694ODT.bigWig: OK
ENCFF334HSS.bigWig: OK
ENCFF239EBH.bigWig: OK
ENCFF178QDA.bigWig: OK
ENCFF071GML.bigWig: OK
ENCFF461RKK.bigWig: OK
ENCFF202EVH.bigWig: OK
ENCFF605EVL.bigWig: OK
ENCFF847BFA.bigWig: OK
ENCFF465GBD.bigWig: OK
ENCFF633OZC.bigWig: OK
ENCFF405ZDL.bigWig: OK
ENCFF632NQA.bigWig: OK
ENCFF457URZ.bigWig: OK
All PASSED

Check execution log

Code
# List the sub-folders of the batch (path quoted so it stays one argument).
ls "${FD_DAT}/external/${TXT_FOLDER}"
region_narrowPeak  signal_fold_change  signal_pvalue
Code
# Show the first lines of the download job log for region_narrowPeak.
# The name is built from TXT_FOLDER so it follows the batch set at the top
# of the notebook instead of repeating the batch name by hand.
FN_LOG="download.${TXT_FOLDER}.region_narrowPeak.txt"
FP_LOG="${FD_LOG}/${FN_LOG}"

head -n 20 "${FP_LOG}"
Hostname:           plp-rcc-node-03
Slurm Array Index: 
Time Stamp:         05-22-25+16:25:55

Change directory:
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/region_narrowPeak

Download files...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1935  100  1935    0     0    497      0  0:00:03  0:00:03 --:--:--   497
Warning: Failed to create the file ENCFF689QIJ.bed.gz: File exists
  0     0    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0
curl: (23) Failed writing header
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1933  100  1933    0     0   5154      0 --:--:-- --:--:-- --:--:--  5140
Warning: Failed to create the file ENCFF323WOT.bed.gz: File exists
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
curl: (23) Failed writing header
Code
# Show the first lines of the checksum job log for region_narrowPeak.
# The name is built from TXT_FOLDER so it follows the batch set at the top
# of the notebook instead of repeating the batch name by hand.
FN_LOG="checksum.${TXT_FOLDER}.region_narrowPeak.txt"
FP_LOG="${FD_LOG}/${FN_LOG}"

head -n 20 "${FP_LOG}"
Hostname:           plp-rcc-node-03
Slurm Array Index: 
Time Stamp:         05-22-25+16:37:50

Change directory:
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/region_narrowPeak

Checksum files...


Done!
Run Time: 1 seconds