Prepare ENCODE ChIP-seq Histone 11

Export region data to the results folder

Set environment

Code
# Load the project configuration; the path is relative to the directory this
# notebook is run from.
source ../run_config_project.sh
# Print the server/project directory settings (the FD_* / FP_* values listed
# in the output); show_env is presumably provided by the sourced config.
show_env
You are working on             Duke Server: RCC
BASE DIRECTORY (FD_BASE):      /data/reddylab/Kuei
REPO DIRECTORY (FD_REPO):      /data/reddylab/Kuei/repo
WORK DIRECTORY (FD_WORK):      /data/reddylab/Kuei/work
DATA DIRECTORY (FD_DATA):      /data/reddylab/Kuei/data
CONTAINER DIR. (FD_SING):      /data/reddylab/Kuei/container

You are working with           ENCODE FCC
PATH OF PROJECT (FD_PRJ):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC
PROJECT RESULTS (FD_RES):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results
PROJECT SCRIPTS (FD_EXE):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts
PROJECT DATA    (FD_DAT):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data
PROJECT NOTE    (FD_NBK):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks
PROJECT DOCS    (FD_DOC):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs
PROJECT LOG     (FD_LOG):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log
PROJECT REF     (FD_REF):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references
PROJECT IMAGE   (FP_PRJ_SIF):  /data/reddylab/Kuei/container/project/singularity_proj_encode_fcc.sif
PROJECT CONF.   (FP_CNF):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts/config_project.sh

Set global variable

Code
# Input folder name under ${FD_DAT}/external/ (source of the narrowPeak files).
REGION_FOLDER_DAT="encode_chipseq_histone_250120"
# Output folder name under ${FD_RES}/region/ (destination of the sorted files).
REGION_FOLDER_RES="encode_chipseq_histone"

Prepare

Code
# List the datasets available under the project's external data directory.
# The path is quoted so it is passed to ls as a single argument.
ls "${FD_DAT}/external/"
chrom.hg38.main.bed              encode_crispri_hcrff
chrom.hg38.total.bed             encode_open_chromatin
chrom.sizes.hg19                 encode_rnaseq
chrom.sizes.hg38                 genome_tss
encode_chipseq_agarwal2023       hic_insitu_GM12878
encode_chipseq_flagship          hic_insitu_K562_ENCSR545YBD
encode_chipseq_histone_241223    hic_intact_K562_deep
encode_chipseq_histone_250120    hic_intact_K562_ENCSR479XDG
encode_chipseq_histone_previous  protein_interpro
encode_chipseq_subset            RNAseq
encode_chipseq_tf_250120         tmp
encode_chromatin_states

Check data

Code
### directory holding the input narrowPeak files
FDIRY="${FD_DAT}/external/${REGION_FOLDER_DAT}/region_narrowPeak"

### show the directory, count the peak files, then list their file names
### (the directory is quoted; only the *.bed.gz pattern is left to glob)
echo "${FDIRY}"
ls   "${FDIRY}"/*.bed.gz | wc -l
ls   "${FDIRY}"/*.bed.gz | xargs -n 1 basename
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/region_narrowPeak
19
ENCFF122CSI.bed.gz
ENCFF135ZLM.bed.gz
ENCFF148UQI.bed.gz
ENCFF193ERO.bed.gz
ENCFF209OQD.bed.gz
ENCFF213OTI.bed.gz
ENCFF323WOT.bed.gz
ENCFF462AVD.bed.gz
ENCFF540NGG.bed.gz
ENCFF544LXB.bed.gz
ENCFF561OUZ.bed.gz
ENCFF689QIJ.bed.gz
ENCFF706WUF.bed.gz
ENCFF749KLQ.bed.gz
ENCFF801AHF.bed.gz
ENCFF885FQN.bed.gz
ENCFF891CHI.bed.gz
ENCFF909RKY.bed.gz
ENCFF963GZJ.bed.gz
Code
### preview the first lines of one input file to check its column layout
FDIRY="${FD_DAT}/external/${REGION_FOLDER_DAT}/region_narrowPeak"
FNAME=ENCFF148UQI.bed.gz
FPATH="${FDIRY}/${FNAME}"
zcat "${FPATH}" | head
chr1    100036789   100037192   Peak_17399  250 .   8.03489 25.00575    22.50131    208
chr1    100037429   100039302   Peak_3318   1000    .   20.81212    122.65933   118.96193   861
chr1    100049537   100049764   Peak_142494 43  .   3.45416 4.33257 2.45926 16
chr1    10005813    10005968    Peak_142495 43  .   3.45416 4.33257 2.45926 135
chr1    100064740   100064990   Peak_81885  64  .   4.52586 6.41818 4.37584 118
chr1    100132263   100134079   Peak_3836   1000    .   19.85187    115.72178   112.11715   495
chr1    100248742   100249893   Peak_1109   1000    .   37.83720    168.92155   164.59120   845
chr1    100265488   100267236   Peak_5461   974 .   13.71470    97.44978    94.07813    933
chr1    1002870 1003877 Peak_26394  151 .   6.39187 15.12382    12.78725    716
chr1    100299841   100300073   Peak_74576  68  .   4.23764 6.87195 4.80571 134

Execute

Copy and sort the files

ln -s file link
zcat ${FP_INP} | sort -k 1,1 -k2,2n | gzip -c > ${FP_OUT}
Code
### set directory
FD_INP="${FD_DAT}/external/${REGION_FOLDER_DAT}/region_narrowPeak"
FD_OUT="${FD_RES}/region/${REGION_FOLDER_RES}"
mkdir -p "${FD_OUT}"

### loop and generate sorted files
### (expand the glob directly instead of parsing `ls`, so paths are never
###  word-split; the pattern matches the *.bed.gz used by the check cells)
FP_INPS=("${FD_INP}"/*.bed.gz)
for FP_INP in "${FP_INPS[@]}"; do
    ### skip the literal pattern that remains when no file matches
    [[ -e "${FP_INP}" ]] || continue

    ### get input file name
    FN_INP=$(basename "${FP_INP}")
    echo "${FN_INP}"

    ### set output file name
    FN_OUT=${FN_INP}
    FP_OUT="${FD_OUT}/${FN_OUT}"

    ### sort by chromosome, then numerically by start, and output to a new file
    ### pipefail is enabled only inside a subshell: a failing zcat/sort is no
    ### longer hidden behind gzip's exit status, and the shell options of the
    ### calling session are left untouched
    if ! ( set -o pipefail
           zcat "${FP_INP}" | sort -k 1,1 -k2,2n | gzip -c > "${FP_OUT}" ); then
        ### a partially written file would look like a valid result; drop it
        echo "ERROR: failed to sort ${FN_INP}; removing partial output" >&2
        rm -f -- "${FP_OUT}"
    fi
done
ENCFF122CSI.bed.gz
ENCFF135ZLM.bed.gz
ENCFF148UQI.bed.gz
ENCFF193ERO.bed.gz
ENCFF209OQD.bed.gz
ENCFF213OTI.bed.gz
ENCFF323WOT.bed.gz
ENCFF462AVD.bed.gz
ENCFF540NGG.bed.gz
ENCFF544LXB.bed.gz
ENCFF561OUZ.bed.gz
ENCFF689QIJ.bed.gz
ENCFF706WUF.bed.gz
ENCFF749KLQ.bed.gz
ENCFF801AHF.bed.gz
ENCFF885FQN.bed.gz
ENCFF891CHI.bed.gz
ENCFF909RKY.bed.gz
ENCFF963GZJ.bed.gz

Copy metadata

Code
### set directory
FD_INP="${FD_DAT}/external/${REGION_FOLDER_DAT}/region_narrowPeak"
FD_OUT="${FD_RES}/region/${REGION_FOLDER_RES}/summary"
mkdir -p "${FD_OUT}"

### show the metadata file (ls reports an error if it is missing), then copy
### it into the summary folder next to the sorted region files
ls "${FD_INP}/metadata.tsv"
cp -- "${FD_INP}/metadata.tsv" "${FD_OUT}/metadata.tsv"
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data/external/encode_chipseq_histone_250120/region_narrowPeak/metadata.tsv

Review

Check output files

Code
### set directory
### (FD_OUT keeps its trailing slash so its value is unchanged for any
###  later cell that reads it)
FD_INP="${FD_DAT}/external/${REGION_FOLDER_DAT}/region_narrowPeak"
FD_OUT="${FD_RES}/region/${REGION_FOLDER_RES}/"

### list the file names of the sorted outputs, one per line
ls -1 "${FD_OUT}"/*.bed.gz | xargs -n 1 basename
ENCFF122CSI.bed.gz
ENCFF135ZLM.bed.gz
ENCFF148UQI.bed.gz
ENCFF193ERO.bed.gz
ENCFF209OQD.bed.gz
ENCFF213OTI.bed.gz
ENCFF323WOT.bed.gz
ENCFF462AVD.bed.gz
ENCFF540NGG.bed.gz
ENCFF544LXB.bed.gz
ENCFF561OUZ.bed.gz
ENCFF689QIJ.bed.gz
ENCFF706WUF.bed.gz
ENCFF749KLQ.bed.gz
ENCFF801AHF.bed.gz
ENCFF885FQN.bed.gz
ENCFF891CHI.bed.gz
ENCFF909RKY.bed.gz
ENCFF963GZJ.bed.gz
Code
### preview the original (unsorted) input file
### FD_INP must already be set in this session (see the directory-setting cells)
FDIRY="${FD_INP}"
FNAME=ENCFF122CSI.bed.gz
FPATH="${FDIRY}/${FNAME}"
zcat "${FPATH}" | head
chr1    100035996   100036405   Peak_23348  63  .   3.98933 6.37021 4.27210 233
chr1    100036685   100039654   Peak_4007   1000    .   58.10942    571.11139   567.66040   1640
chr1    100131924   100134195   Peak_5504   1000    .   43.58243    497.47467   494.24628   842
chr1    100248817   100250176   Peak_67 1000    .   126.88165   912.40326   906.58197   791
chr1    100265444   100267279   Peak_2692   1000    .   52.96255    643.15088   639.43225   1005
chr1    10032383    10032660    Peak_15319  296 .   8.03387 29.66819    27.40563    136
chr1    10032894    10035748    Peak_1033   1000    .   63.18290    751.14288   746.85895   484
chr1    100350792   100354011   Peak_2248   1000    .   89.99366    668.75806   664.92468   2475
chr1    10035799    10036151    Peak_16359  221 .   8.02052 22.18122    19.95345    178
chr1    100363088   100364901   Peak_10572  1000    .   34.24813    173.46451   170.88153   909
Code
### preview the sorted output file of the same name to confirm the ordering
### FD_OUT must already be set in this session (see the directory-setting cells)
FDIRY="${FD_OUT}"
FNAME=ENCFF122CSI.bed.gz
FPATH="${FDIRY}/${FNAME}"
zcat "${FPATH}" | head
chr1    137633  138169  Peak_12165  933 .   27.65562    93.36147    90.92577    340
chr1    138355  139639  Peak_11436  1000    .   34.56117    125.43503   122.93681   1105
chr1    777524  779517  Peak_8984   1000    .   37.83845    281.30594   278.54050   1009
chr1    779548  780064  Peak_18128  142 .   7.26780 14.20008    12.02068    165
chr1    826539  827913  Peak_9649   1000    .   55.93549    234.42130   231.73557   895
chr1    903939  905007  Peak_12342  864 .   23.00623    86.46342    84.04183    651
chr1    905089  905431  Peak_16314  225 .   7.46203 22.55154    20.32200    132
chr1    923801  925695  Peak_13322  594 .   18.20666    59.49374    57.13548    408
chr1    940312  940899  Peak_14365  406 .   12.27774    40.68121    38.37808    272
chr1    941307  941988  Peak_17047  186 .   8.26568 18.64017    16.43216    512