Find the closest TSS for ATAC regions 01

Run bedtools closest

set environment

Code
### Load the project configuration; the path is relative to the current working directory
source ../run_config_project.sh
### Print the environment summary (show_env is presumably defined by the sourced config -- confirm)
show_env
You are working on             Duke Server: RCC
BASE DIRECTORY (FD_BASE):      /data/reddylab/Kuei
REPO DIRECTORY (FD_REPO):      /data/reddylab/Kuei/repo
WORK DIRECTORY (FD_WORK):      /data/reddylab/Kuei/work
DATA DIRECTORY (FD_DATA):      /data/reddylab/Kuei/data
CONTAINER DIR. (FD_SING):      /data/reddylab/Kuei/container

You are working with           ENCODE FCC
PATH OF PROJECT (FD_PRJ):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC
PROJECT RESULTS (FD_RES):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results
PROJECT SCRIPTS (FD_EXE):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts
PROJECT DATA    (FD_DAT):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data
PROJECT NOTE    (FD_NBK):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks
PROJECT DOCS    (FD_DOC):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs
PROJECT LOG     (FD_LOG):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log
PROJECT REF     (FD_REF):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references
PROJECT IMAGE   (FP_PRJ_SIF):  /data/reddylab/Kuei/container/project/singularity_proj_encode_fcc.sif
PROJECT CONF.   (FP_CNF):      /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts/config_project.sh

Preview

Check region 01 file

Code
### Label table for the ATAC (ASTARR MACS) regions: columns Folder, FName, Label
FP_REGION_LABEL="${FD_RES}/region/summary/metadata.label.astarr_macs.tsv"
### Quoted so a path containing spaces or glob characters stays one argument
cat "${FP_REGION_LABEL}"
Folder  FName   Label
fcc_astarr_macs K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.bed.gz fcc_astarr_macs_input_overlap
fcc_astarr_macs K562.hg38.ASTARR.macs.KS91.input.rep_all.union.q5.bed.gz    fcc_astarr_macs_input_union
Code
### Build the full path of the region 01 file from its folder and file name
FD_INP="${FD_RES}/region/fcc_astarr_macs"
FN_INP="K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.bed.gz"
FP_INP="${FD_INP}/${FN_INP}"

### Confirm the file exists; quoted so the path is passed as a single argument
ls "${FP_INP}"
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/fcc_astarr_macs/K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.bed.gz

Check region 02 file

Code
### List the TSS region files; quoted so the path is passed as a single argument
ls "${FD_RES}/region/genome_tss"
K562.hg38.TSS.selected_by_highest_Pol2_signal.bed.gz
K562.hg38.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz
K562.hg38.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.tsv
K562.hg38.TSS.selected_by_highest_Pol2_signal.tsv
summary
Code
### List the summary files of the TSS regions; quoted so the path is passed as a single argument
ls "${FD_RES}/region/genome_tss/summary"
description.tsv  metadata.label.tsv
Code
### Label table for the TSS regions: columns Folder, FName, Label
FP_REGION_LABEL="${FD_RES}/region/genome_tss/summary/metadata.label.tsv"
### Quoted so a path containing spaces or glob characters stays one argument
cat "${FP_REGION_LABEL}"
Folder  FName   Label
genome_tss  K562.hg38.TSS.selected_by_highest_Pol2_signal.bed.gz    genome_tss_pol2
genome_tss  K562.hg38.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz genome_tss_pol2_rnaseq

Check loop

Code
### Dry run: walk every (region A, region B) pair and print the paths that
### would be used, without creating directories or submitting jobs.

### init
### Label tables; the first row is a header, the columns are Folder, FName, Label
FP_REGION_LABEL_A="${FD_RES}/region/summary/metadata.label.astarr_macs.tsv"
FP_REGION_LABEL_B="${FD_RES}/region/genome_tss/summary/metadata.label.tsv"

### Loop region A
### read -r keeps backslashes literal; awk drops the header row
while read -r FOLDER_REG_A FNAME_REG_A LABEL_REG_A; do

    ### Skip blank or incomplete rows (no label means no usable output name)
    [[ -n "${LABEL_REG_A}" ]] || continue

    ### Set input A
    FD_INP_A="${FD_RES}/region/${FOLDER_REG_A}"
    FN_INP_A="${FNAME_REG_A}"
    FP_INP_A="${FD_INP_A}/${FN_INP_A}"
    ### The output sub-folder for A is named after its label, not its source folder
    FOLDER_A="${LABEL_REG_A}"

    ### Loop region B
    while read -r FOLDER_REG_B FNAME_REG_B LABEL_REG_B; do

        ### Skip blank or incomplete rows
        [[ -n "${LABEL_REG_B}" ]] || continue

        ### Set input B
        FD_INP_B="${FD_RES}/region/${FOLDER_REG_B}"
        FN_INP_B="${FNAME_REG_B}"
        FP_INP_B="${FD_INP_B}/${FN_INP_B}"
        FOLDER_B="${FOLDER_REG_B}"

        ### Set output
        FD_OUT="${FD_RES}/region_closest/${FOLDER_A}/${FOLDER_B}"
        FN_OUT="${LABEL_REG_A}.${LABEL_REG_B}.bed.gz"
        FP_OUT="${FD_OUT}/${FN_OUT}"

        ### setup log file
        FN_LOG="region.closest.${LABEL_REG_A}.${LABEL_REG_B}.txt"
        FP_LOG="${FD_LOG}/${FN_LOG}"

        ### Set script
        FP_EXE="${FD_EXE}/run_bedtools_closest.sh"

        ### show progress
        echo ==============================
        echo "Input  Label A:" "${LABEL_REG_A}"
        echo "Input  Label B:" "${LABEL_REG_B}"
        echo
        echo "Output FDiry:  " "${FD_OUT}"
        echo "Output FName:  " "${FN_OUT}"
        ### Single quotes keep ${FD_LOG} unexpanded, so the placeholder is printed
        ### instead of the full log directory
        echo "Log    FPath:  " '${FD_LOG}/'"${FN_LOG}"
        echo
    done < <(awk 'NR >= 2 {print}' "${FP_REGION_LABEL_B}")
done < <(awk 'NR >= 2 {print}' "${FP_REGION_LABEL_A}")
==============================
Input  Label A: fcc_astarr_macs_input_overlap
Input  Label B: genome_tss_pol2

Output FDiry:   /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/genome_tss
Output FName:   fcc_astarr_macs_input_overlap.genome_tss_pol2.bed.gz
Log    FPath:   ${FD_LOG}/region.closest.fcc_astarr_macs_input_overlap.genome_tss_pol2.txt

==============================
Input  Label A: fcc_astarr_macs_input_overlap
Input  Label B: genome_tss_pol2_rnaseq

Output FDiry:   /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/genome_tss
Output FName:   fcc_astarr_macs_input_overlap.genome_tss_pol2_rnaseq.bed.gz
Log    FPath:   ${FD_LOG}/region.closest.fcc_astarr_macs_input_overlap.genome_tss_pol2_rnaseq.txt

==============================
Input  Label A: fcc_astarr_macs_input_union
Input  Label B: genome_tss_pol2

Output FDiry:   /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/genome_tss
Output FName:   fcc_astarr_macs_input_union.genome_tss_pol2.bed.gz
Log    FPath:   ${FD_LOG}/region.closest.fcc_astarr_macs_input_union.genome_tss_pol2.txt

==============================
Input  Label A: fcc_astarr_macs_input_union
Input  Label B: genome_tss_pol2_rnaseq

Output FDiry:   /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/genome_tss
Output FName:   fcc_astarr_macs_input_union.genome_tss_pol2_rnaseq.bed.gz
Log    FPath:   ${FD_LOG}/region.closest.fcc_astarr_macs_input_union.genome_tss_pol2_rnaseq.txt

Check config file

Code
### Show which config file will be handed to the job script; quoted to print the value verbatim
echo "${FP_CNF}"
/data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts/config_project.sh

Execute

Code
### Submit one Slurm job per (region A, region B) pair. Each job runs
### run_bedtools_closest.sh with the config file, input A, input B and the
### output path, in that order.

### init
### Label tables; the first row is a header, the columns are Folder, FName, Label
FP_REGION_LABEL_A="${FD_RES}/region/summary/metadata.label.astarr_macs.tsv"
FP_REGION_LABEL_B="${FD_RES}/region/genome_tss/summary/metadata.label.tsv"

### Loop region A
### read -r keeps backslashes literal; awk drops the header row
while read -r FOLDER_REG_A FNAME_REG_A LABEL_REG_A; do

    ### Skip blank or incomplete rows (no label means no usable output name)
    [[ -n "${LABEL_REG_A}" ]] || continue

    ### Set input A
    FD_INP_A="${FD_RES}/region/${FOLDER_REG_A}"
    FN_INP_A="${FNAME_REG_A}"
    FP_INP_A="${FD_INP_A}/${FN_INP_A}"
    ### The output sub-folder for A is named after its label, not its source folder
    FOLDER_A="${LABEL_REG_A}"

    ### Loop region B
    while read -r FOLDER_REG_B FNAME_REG_B LABEL_REG_B; do

        ### Skip blank or incomplete rows
        [[ -n "${LABEL_REG_B}" ]] || continue

        ### Set input B
        FD_INP_B="${FD_RES}/region/${FOLDER_REG_B}"
        FN_INP_B="${FNAME_REG_B}"
        FP_INP_B="${FD_INP_B}/${FN_INP_B}"
        FOLDER_B="${FOLDER_REG_B}"

        ### Set output
        FD_OUT="${FD_RES}/region_closest/${FOLDER_A}/${FOLDER_B}"
        FN_OUT="${LABEL_REG_A}.${LABEL_REG_B}.bed.gz"
        FP_OUT="${FD_OUT}/${FN_OUT}"

        ### setup log file
        FN_LOG="region.closest.${LABEL_REG_A}.${LABEL_REG_B}.txt"
        FP_LOG="${FD_LOG}/${FN_LOG}"

        ### Set script
        FP_EXE="${FD_EXE}/run_bedtools_closest.sh"

        ### show progress
        echo ==============================
        echo "Input  Label A:" "${LABEL_REG_A}"
        echo "Input  Label B:" "${LABEL_REG_B}"
        echo
        echo "Output FDiry:  " "${FD_OUT}"
        echo "Output FName:  " "${FN_OUT}"
        ### Single quotes keep ${FD_LOG} unexpanded, so the placeholder is printed
        ### instead of the full log directory
        echo "Log    FPath:  " '${FD_LOG}/'"${FN_LOG}"
        echo

        ### execute
        ### Do not submit a job for an input that is not there
        if [[ ! -f "${FP_INP_A}" || ! -f "${FP_INP_B}" ]]; then
            echo "Skipped: input file not found (${FP_INP_A} or ${FP_INP_B})" >&2
            echo
            continue
        fi
        ### Do not submit a job when its output directory cannot be created
        if ! mkdir -p "${FD_OUT}"; then
            echo "Skipped: cannot create output directory ${FD_OUT}" >&2
            echo
            continue
        fi
        ### stdin is redirected so the submitted command cannot read from the
        ### label stream that feeds the enclosing while-read loop
        sbatch \
            --cpus-per-task 4 \
            --mem 4G \
            --output "${FP_LOG}" \
            "${FP_EXE}" "${FP_CNF}" "${FP_INP_A}" "${FP_INP_B}" "${FP_OUT}" \
            < /dev/null
        echo
    done < <(awk 'NR >= 2 {print}' "${FP_REGION_LABEL_B}")
done < <(awk 'NR >= 2 {print}' "${FP_REGION_LABEL_A}")
==============================
Input  Label A: fcc_astarr_macs_input_overlap
Input  Label B: genome_tss_pol2

Output FDiry:   /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/genome_tss
Output FName:   fcc_astarr_macs_input_overlap.genome_tss_pol2.bed.gz
Log    FPath:   ${FD_LOG}/region.closest.fcc_astarr_macs_input_overlap.genome_tss_pol2.txt

Submitted batch job 275496

==============================
Input  Label A: fcc_astarr_macs_input_overlap
Input  Label B: genome_tss_pol2_rnaseq

Output FDiry:   /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/genome_tss
Output FName:   fcc_astarr_macs_input_overlap.genome_tss_pol2_rnaseq.bed.gz
Log    FPath:   ${FD_LOG}/region.closest.fcc_astarr_macs_input_overlap.genome_tss_pol2_rnaseq.txt

Submitted batch job 275497

==============================
Input  Label A: fcc_astarr_macs_input_union
Input  Label B: genome_tss_pol2

Output FDiry:   /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/genome_tss
Output FName:   fcc_astarr_macs_input_union.genome_tss_pol2.bed.gz
Log    FPath:   ${FD_LOG}/region.closest.fcc_astarr_macs_input_union.genome_tss_pol2.txt

Submitted batch job 275498

==============================
Input  Label A: fcc_astarr_macs_input_union
Input  Label B: genome_tss_pol2_rnaseq

Output FDiry:   /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/genome_tss
Output FName:   fcc_astarr_macs_input_union.genome_tss_pol2_rnaseq.bed.gz
Log    FPath:   ${FD_LOG}/region.closest.fcc_astarr_macs_input_union.genome_tss_pol2_rnaseq.txt

Submitted batch job 275499

Review

Code
### Show the job log for the (input_overlap, pol2) pair; quoted so the path is one argument
cat "${FD_LOG}/region.closest.fcc_astarr_macs_input_overlap.genome_tss_pol2.txt"
Hostname:           plp-rcc-node-02
Slurm Array Index: 
Time Stamp:         05-20-25+08:51:06

Input:  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/fcc_astarr_macs/K562.hg38.ASTARR.macs.KS91.input.rep_all.max_overlaps.q5.bed.gz

show first few lines of input
chr1    10038   10405   chr1:10038-10405
chr1    14282   14614   chr1:14282-14614
chr1    16025   16338   chr1:16025-16338
chr1    17288   17689   chr1:17288-17689
chr1    28934   29499   chr1:28934-29499
chr1    115429  115969  chr1:115429-115969
chr1    136201  137353  chr1:136201-137353
chr1    137748  138049  chr1:137748-138049
chr1    138321  139517  chr1:138321-139517
chr1    181005  181854  chr1:181005-181854

Input:  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/genome_tss/K562.hg38.TSS.selected_by_highest_Pol2_signal.bed.gz

show first few lines of input
chr1    11873   11874   chr1:11873-11874    DDX11L1 2.3e-4  TSS_Pol2    DDX11L1
chr1    17436   17437   chr1:17436-17437    MIR6859-1   9.43812 TSS_Pol2    MIR6859-1
chr1    17436   17437   chr1:17436-17437    MIR6859-2   9.43812 TSS_Pol2    MIR6859-2
chr1    17436   17437   chr1:17436-17437    MIR6859-3   9.43812 TSS_Pol2    MIR6859-3
chr1    17436   17437   chr1:17436-17437    MIR6859-4   9.43812 TSS_Pol2    MIR6859-4
chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2    WASH7P
chr1    30365   30366   chr1:30365-30366    MIR1302-10  0.562995    TSS_Pol2    MIR1302-10
chr1    30365   30366   chr1:30365-30366    MIR1302-11  0.562995    TSS_Pol2    MIR1302-11
chr1    30365   30366   chr1:30365-30366    MIR1302-2   0.562995    TSS_Pol2    MIR1302-2
chr1    30365   30366   chr1:30365-30366    MIR1302-9   0.562995    TSS_Pol2    MIR1302-9


Output:  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_overlap/genome_tss/fcc_astarr_macs_input_overlap.genome_tss_pol2.bed.gz

show first few lines of output:
chr1    10038   10405   chr1:10038-10405    chr1    11873   11874   chr1:11873-11874    DDX11L1 2.3e-4  TSS_Pol2    DDX11L1 1469
chr1    14282   14614   chr1:14282-14614    chr1    11873   11874   chr1:11873-11874    DDX11L1 2.3e-4  TSS_Pol2    DDX11L1 2409
chr1    16025   16338   chr1:16025-16338    chr1    17436   17437   chr1:17436-17437    MIR6859-1   9.43812 TSS_Pol2    MIR6859-1   1099
chr1    16025   16338   chr1:16025-16338    chr1    17436   17437   chr1:17436-17437    MIR6859-2   9.43812 TSS_Pol2    MIR6859-2   1099
chr1    16025   16338   chr1:16025-16338    chr1    17436   17437   chr1:17436-17437    MIR6859-3   9.43812 TSS_Pol2    MIR6859-3   1099
chr1    16025   16338   chr1:16025-16338    chr1    17436   17437   chr1:17436-17437    MIR6859-4   9.43812 TSS_Pol2    MIR6859-4   1099
chr1    17288   17689   chr1:17288-17689    chr1    17436   17437   chr1:17436-17437    MIR6859-1   9.43812 TSS_Pol2    MIR6859-1   0
chr1    17288   17689   chr1:17288-17689    chr1    17436   17437   chr1:17436-17437    MIR6859-2   9.43812 TSS_Pol2    MIR6859-2   0
chr1    17288   17689   chr1:17288-17689    chr1    17436   17437   chr1:17436-17437    MIR6859-3   9.43812 TSS_Pol2    MIR6859-3   0
chr1    17288   17689   chr1:17288-17689    chr1    17436   17437   chr1:17436-17437    MIR6859-4   9.43812 TSS_Pol2    MIR6859-4   0


Done!
Run Time: 1 seconds
Code
### Show the job log for the (input_union, pol2_rnaseq) pair; quoted so the path is one argument
cat "${FD_LOG}/region.closest.fcc_astarr_macs_input_union.genome_tss_pol2_rnaseq.txt"
Hostname:           plp-rcc-node-02
Slurm Array Index: 
Time Stamp:         05-20-25+08:51:06

Input:  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/fcc_astarr_macs/K562.hg38.ASTARR.macs.KS91.input.rep_all.union.q5.bed.gz

show first few lines of input
chr1    10015   10442   chr1:10015-10442
chr1    14253   14645   chr1:14253-14645
chr1    16015   16477   chr1:16015-16477
chr1    17237   17772   chr1:17237-17772
chr1    28903   29613   chr1:28903-29613
chr1    30803   31072   chr1:30803-31072
chr1    101603  101849  chr1:101603-101849
chr1    115411  115986  chr1:115411-115986
chr1    118518  118743  chr1:118518-118743
chr1    136071  137429  chr1:136071-137429

Input:  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region/genome_tss/K562.hg38.TSS.selected_by_highest_Pol2_signal.filtered_by_RNAseq_TPM.bed.gz

show first few lines of input
chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P
chr1    827522  827523  chr1:827522-827523  LINC00115   64.4656 TSS_Pol2_RNAseq LINC00115
chr1    827590  827591  chr1:827590-827591  LINC01128   64.4603 TSS_Pol2_RNAseq LINC01128
chr1    876802  876803  chr1:876802-876803  FAM41C  0.00788399  TSS_Pol2_RNAseq FAM41C
chr1    959256  959257  chr1:959256-959257  NOC2L   104.866 TSS_Pol2_RNAseq NOC2L
chr1    960583  960584  chr1:960583-960584  KLHL17  8.22571 TSS_Pol2_RNAseq KLHL17
chr1    1000097 1000098 chr1:1000097-1000098    HES4    50.5814 TSS_Pol2_RNAseq HES4
chr1    1013496 1013497 chr1:1013496-1013497    ISG15   42.9708 TSS_Pol2_RNAseq ISG15
chr1    1020119 1020120 chr1:1020119-1020120    AGRN    2.71433 TSS_Pol2_RNAseq AGRN
chr1    1116089 1116090 chr1:1116089-1116090    C1orf159    16.4374 TSS_Pol2_RNAseq C1orf159


Output:  /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results/region_closest/fcc_astarr_macs_input_union/genome_tss/fcc_astarr_macs_input_union.genome_tss_pol2_rnaseq.bed.gz

show first few lines of output:
chr1    10015   10442   chr1:10015-10442    chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  18929
chr1    14253   14645   chr1:14253-14645    chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  14726
chr1    16015   16477   chr1:16015-16477    chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  12894
chr1    17237   17772   chr1:17237-17772    chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  11599
chr1    28903   29613   chr1:28903-29613    chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  0
chr1    30803   31072   chr1:30803-31072    chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  1433
chr1    101603  101849  chr1:101603-101849  chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  72233
chr1    115411  115986  chr1:115411-115986  chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  86041
chr1    118518  118743  chr1:118518-118743  chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  89148
chr1    136071  137429  chr1:136071-137429  chr1    29370   29371   chr1:29370-29371    WASH7P  2.3e-4  TSS_Pol2_RNAseq WASH7P  106701


Done!
Run Time: 1 seconds