Prepare ENCODE ATAC/DNase Peaks 03

Prepare the data

Set environment

Code
# Load the project configuration from the parent directory.
# NOTE(review): presumably this defines the FD_* path variables used by the
# cells below (FD_DAT, FD_RES, ...) — confirm in run_config_project.sh.
source ../run_config_project.sh
# Print the environment summary shown below.
# NOTE(review): show_env is expected to be provided by the sourced config — confirm.
show_env
You are working on             Duke Server: HARDAC
BASE DIRECTORY (FD_BASE):      /data/reddylab/Kuei
REPO DIRECTORY (FD_REPO):      /data/reddylab/Kuei/repo
WORK DIRECTORY (FD_WORK):      /data/reddylab/Kuei/work
DATA DIRECTORY (FD_DATA):      /data/reddylab/Kuei/data
CONTAINER DIR. (FD_SING):      /data/reddylab/Kuei/container

You are working with           ENCODE FCC
PATH OF PROJECT (FD_PRJ):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC
PROJECT RESULTS (FD_RES):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results
PROJECT SCRIPTS (FD_EXE):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts
PROJECT DATA    (FD_DAT):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data
PROJECT NOTE    (FD_NBK):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks
PROJECT DOCS    (FD_DOC):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs
PROJECT LOG     (FD_LOG):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log
PROJECT APP     (FD_APP):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/app
PROJECT REF     (FD_REF):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references
PROJECT IMAGE   (FP_PRJ_SIF):  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/app/singularity_proj_encode_fcc.sif

Prepare

Code
### directory holding the ENCODE open-chromatin peak files (*.bed.gz)
FDIRY="${FD_DAT}/external/encode_open_chromatin"

### collect the files with a glob instead of parsing `ls` output, so paths
### containing whitespace survive and an empty directory is not an error
FPATHS=("${FDIRY}"/*.bed.gz)
if [[ ! -e "${FPATHS[0]}" ]]; then
    ### an unmatched pattern is left as a literal string; treat it as "no files"
    FPATHS=()
fi

### show the directory, the number of files, and the name of each file
printf '%s\n' "${FDIRY}"
printf '%s\n' "${#FPATHS[@]}"
for FP in "${FPATHS[@]}"; do
    basename -- "${FP}"
done
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_open_chromatin
6
K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz
K562.hg38.ENCSR000EOT.ENCFF185XRG.DNase.bed.gz
K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz
K562.hg38.ENCSR483RKN.ENCFF925CYR.ATAC.bed.gz
K562.hg38.ENCSR868FGK.ENCFF333TAT.ATAC.bed.gz
K562.hg38.ENCSR868FGK.ENCFF948AFM.ATAC.bed.gz
Code
### preview the first 10 records of one DNase peak file
FDIRY="${FD_DAT}/external/encode_open_chromatin"
FNAME="K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz"
FPATH="${FDIRY}/${FNAME}"
### quoted so a path with whitespace or glob characters is passed as one argument
zcat -- "${FPATH}" | head
chr1    181400  181530  .   0   .   0.299874    -1  -1  75
chr1    778660  778800  .   0   .   14.1383 -1  -1  75
chr1    779137  779200  .   0   .   0.33144 -1  -1  75
chr1    827460  827554  .   0   .   3.38384 -1  -1  75
chr1    842880  843060  .   0   .   1.64457 -1  -1  75
chr1    869800  869980  .   0   .   2.23485 -1  -1  75
chr1    875760  875920  .   0   .   0.356692    -1  -1  75
chr1    898828  899004  .   0   .   0.363005    -1  -1  75
chr1    900030  900100  .   0   .   0.249369    -1  -1  75
chr1    901580  901645  .   0   .   0.217803    -1  -1  75

Copy and sort the files

Symlink syntax, for reference only (the files below are sorted and rewritten, not linked): ln -s file link
zcat ${FP_INP} | sort -k 1,1 -k2,2n | gzip -c > ${FP_OUT}
Code
### set directory
FD_INP="${FD_DAT}/external/encode_open_chromatin"
FD_OUT="${FD_RES}/region/encode_open_chromatin"
mkdir -p -- "${FD_OUT}"

### Sort one gzipped BED file by field 1 (chromosome), then numerically by
### field 2 (start), and write the result gzipped.
###   $1 - input  *.bed.gz
###   $2 - output *.bed.gz (overwritten if it exists)
### Returns non-zero if any stage of the pipeline fails.
### NOTE(review): the ordering of field 1 follows the current locale; consider
### LC_ALL=C if byte-wise, reproducible ordering is required downstream.
sort_bed_gz() {
    ### pipefail is enabled inside a subshell so the notebook's own shell
    ### options are left untouched
    (
        set -o pipefail
        zcat -- "$1" | sort -k 1,1 -k2,2n | gzip -c > "$2"
    )
}

### loop and sort files (glob directly instead of parsing `ls` output)
for FP_INP in "${FD_INP}"/*bed.gz; do
    ### an unmatched pattern stays literal; skip it when no file exists
    [[ -e "${FP_INP}" ]] || continue

    ### get input file name
    FN_INP=$(basename -- "${FP_INP}")
    echo "${FN_INP}"

    ### set output file name (same name, different directory)
    FN_OUT=${FN_INP}
    FP_OUT="${FD_OUT}/${FN_OUT}"

    ### sort and output to a new file; report failures instead of ignoring them
    sort_bed_gz "${FP_INP}" "${FP_OUT}" \
        || echo "ERROR: failed to sort ${FN_INP}" >&2
done
K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz
K562.hg38.ENCSR000EOT.ENCFF185XRG.DNase.bed.gz
K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz
K562.hg38.ENCSR483RKN.ENCFF925CYR.ATAC.bed.gz
K562.hg38.ENCSR868FGK.ENCFF333TAT.ATAC.bed.gz
K562.hg38.ENCSR868FGK.ENCFF948AFM.ATAC.bed.gz

Review

Check output files

Code
### set directory (same locations used by the sorting step)
FD_INP="${FD_DAT}/external/encode_open_chromatin"
FD_OUT="${FD_RES}/region/encode_open_chromatin"

### list the output directory; quoted so the path is passed as one argument
ls -- "${FD_OUT}"
description.tsv
K562.hg38.ENCSR000EKS.ENCFF274YGF.DNase.bed.gz
K562.hg38.ENCSR000EOT.ENCFF185XRG.DNase.bed.gz
K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz
K562.hg38.ENCSR483RKN.ENCFF925CYR.ATAC.bed.gz
K562.hg38.ENCSR868FGK.ENCFF333TAT.ATAC.bed.gz
K562.hg38.ENCSR868FGK.ENCFF948AFM.ATAC.bed.gz
Code
### preview the first 10 records of one ATAC peak file in the input
### directory (FD_INP is set in the preceding cell)
FNAME="K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz"
FPATH="${FD_INP}/${FNAME}"
### quoted so a path with whitespace or glob characters is passed as one argument
zcat -- "${FPATH}" | head
chr1    100028923   100029241   Peak_202759 52  .   2.91347 5.24652 3.38358 74
chr1    100037575   100038998   Peak_172664 68  .   1.83651 6.83702 4.90355 60
chr1    100037575   100038998   Peak_30781  934 .   5.32597 93.45621    90.75060    988
chr1    100037575   100038998   Peak_37596  722 .   4.66917 72.27560    69.66279    791
chr1    100037575   100038998   Peak_38860  692 .   4.55797 69.21054    66.61273    1182
chr1    100037575   100038998   Peak_7320   1000    .   12.39880    394.75714   391.35434   513
chr1    100046904   100047187   Peak_125191 111 .   3.44067 11.11133    9.04548 94
chr1    100048739   100048974   Peak_163268 74  .   2.90807 7.42227 5.46578 145
chr1    100050176   100050470   Peak_153800 81  .   3.11401 8.14941 6.16809 117
chr1    100090679   100091339   Peak_149202 84  .   3.47044 8.41934 6.42649 269
Code
### preview the first 10 records of the same ATAC peak file in the output
### directory (FD_OUT is set in an earlier cell) to compare with the input
FNAME="K562.hg38.ENCSR483RKN.ENCFF558BLC.ATAC.bed.gz"
FPATH="${FD_OUT}/${FNAME}"
### quoted so a path with whitespace or glob characters is passed as one argument
zcat -- "${FPATH}" | head
chr1    10509   10674   Peak_125662 110 .   4.29354 11.01368    8.94965 94
chr1    41987   42385   Peak_116641 124 .   4.60022 12.47548    10.37921    281
chr1    41987   42385   Peak_178161 63  .   3.22015 6.39968 4.47989 99
chr1    46755   47025   Peak_217457 47  .   2.76013 4.70157 2.87144 182
chr1    68969   69964   Peak_105604 147 .   5.06024 14.77241    12.63228    733
chr1    68969   69964   Peak_203155 52  .   2.91347 5.24652 3.38358 77
chr1    68969   69964   Peak_80922  232 .   6.59365 23.21283    20.95212    310
chr1    78951   79407   Peak_150566 82  .   3.68018 8.27430 6.28501 335
chr1    778328  779235  Peak_145438 89  .   2.37705 8.90270 6.89830 74
chr1    778328  779235  Peak_2263   1000    .   28.93443    749.94922   745.95093   458