Prepare ENCODE Chromatin State 03

Prepare region files

Set environment

Code
### load the project configuration script with messages and warnings muted,
### then call show_env() from that configuration
suppressWarnings(
    suppressMessages(
        source("../run_config_project_sing.R")
    )
)
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
### folder name shared by the external input data and the region results
TXT_FOLDER_REGION <- "encode_chromatin_states"

Import data

Check data

Code
### list every file found in the external data folder, one name per line
txt_fdiry <- file.path(FD_DAT, "external", TXT_FOLDER_REGION)
vec <- list.files(txt_fdiry)
for (txt in vec) {
    cat(txt, "\n")
}
ccres_v4.silencer.rest.tsv 
ccres_v4.silencer.starr.tsv 
checksum_md5sum.txt 
checksum_results.txt 
K562.hg38.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz 
K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.bed.gz 
run_download_files.sh 
run_download.log.txt 

Read table: ENCODE cCREs v4

Code
### locate the K562 cCREs BED file in the external data folder
txt_fdiry <- file.path(FD_DAT, "external", TXT_FOLDER_REGION)
txt_fname <- "K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.bed.gz"
txt_fpath <- file.path(txt_fdiry, txt_fname)

### column names are supplied explicitly, so the first line is read as data
vec_txt_cname <- c(
    "Chrom",
    "ChromStart",
    "ChromEnd",
    "Name",
    "Score",
    "Strand",
    "ThickStart",
    "ThickEnd",
    "ItemRgb",
    "Label",
    "Note"
)
dat <- read_tsv(
    file           = txt_fpath,
    col_names      = vec_txt_cname,
    show_col_types = FALSE
)

### keep the raw table under a dedicated name and preview the first rows
dat_region_ccres <- dat
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Score Strand ThickStart ThickEnd ItemRgb Label Note
chr1 10033 10250 EH38E2776516 0 . 10033 10250 225,225,225 Low-DNase All-data/Full-classification
chr1 10385 10713 EH38E2776517 0 . 10385 10713 225,225,225 Low-DNase All-data/Full-classification
chr1 16097 16381 EH38E3951272 0 . 16097 16381 225,225,225 Low-DNase All-data/Full-classification
chr1 17343 17642 EH38E3951273 0 . 17343 17642 225,225,225 Low-DNase All-data/Full-classification
chr1 29320 29517 EH38E3951274 0 . 29320 29517 225,225,225 Low-DNase All-data/Full-classification
chr1 66350 66509 EH38E3951275 0 . 66350 66509 225,225,225 Low-DNase All-data/Full-classification

Read table: ChromHMM

Code
### locate the K562 ChromHMM BED file in the external data folder
txt_fdiry <- file.path(FD_DAT, "external", TXT_FOLDER_REGION)
txt_fname <- "K562.hg38.ENCSR365YNI.ENCFF106BGJ.ChromHMM.bed.gz"
txt_fpath <- file.path(txt_fdiry, txt_fname)

### column names are supplied explicitly, so the first line is read as data
vec_txt_cname <- c(
    "Chrom",
    "ChromStart",
    "ChromEnd",
    "Name",
    "Score",
    "Strand",
    "ThickStart",
    "ThickEnd",
    "ItemRgb"
)
dat <- read_tsv(
    file           = txt_fpath,
    col_names      = vec_txt_cname,
    show_col_types = FALSE
)

### keep the raw table under a dedicated name and preview the first rows
dat_region_chromhmm <- dat
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Score Strand ThickStart ThickEnd ItemRgb
chr1 0 16000 Quies 1 . 0 16000 220,220,220
chr1 16000 16200 TxWk 1 . 16000 16200 63,154,80
chr1 16200 17400 Quies 1 . 16200 17400 220,220,220
chr1 17400 17600 TxWk 1 . 17400 17600 63,154,80
chr1 17600 118400 Quies 1 . 17600 118400 220,220,220
chr1 118400 120200 Enh1 1 . 118400 120200 255,223,0

Read table: ENCODE cCREs v4 Silencer (REST)

Code
### locate the cCREs v4 REST silencer table in the external data folder
txt_fdiry <- file.path(FD_DAT, "external", TXT_FOLDER_REGION)
txt_fname <- "ccres_v4.silencer.rest.tsv"
txt_fpath <- file.path(txt_fdiry, txt_fname)

### the header row provides the column names; "#" starts a comment
dat <- read_tsv(
    file           = txt_fpath,
    comment        = "#",
    show_col_types = FALSE
)

### keep the raw table under a dedicated name and preview the first rows
dat_region_ccres_silencer_rest <- dat
fun_display_table(head(dat))
Chr Start End cCRE accession cCRE class Silencer class
chr10 100680786 100681128 EH38E4018829 CA-H3K4me3 REST+ silencer
chr10 101081011 101081271 EH38E4018984 CA-TF REST+ silencer
chr10 102361781 102362038 EH38E4019348 CA-CTCF REST+ silencer
chr10 103265839 103266036 EH38E1495742 CA-CTCF REST+ silencer
chr10 104548071 104548416 EH38E4019821 CA-TF REST+ silencer
chr10 104750418 104750581 EH38E4019865 CA-TF REST+ silencer

Read table: ENCODE cCREs v4 Silencer (STARR)

Code
### locate the cCREs v4 STARR silencer table in the external data folder
txt_fdiry <- file.path(FD_DAT, "external", TXT_FOLDER_REGION)
txt_fname <- "ccres_v4.silencer.starr.tsv"
txt_fpath <- file.path(txt_fdiry, txt_fname)

### the header row provides the column names; "#" starts a comment
dat <- read_tsv(
    file           = txt_fpath,
    comment        = "#",
    show_col_types = FALSE
)

### keep the raw table under a dedicated name and preview the first rows
dat_region_ccres_silencer_starr <- dat
fun_display_table(head(dat))
Chr Start End cCRE accession cCRE class CAPRA quantification P-value Threshold
chr13 22970420 22970595 EH38E4082602 CA -2.7481 0.00e+00 Stringent
chr1 9356890 9357240 EH38E3954005 TF -2.3927 9.00e-07 Stringent
chr14 92678182 92678371 EH38E4113580 TF -2.3075 5.20e-06 Stringent
chr6 131563578 131563769 EH38E4417327 CA -2.3047 5.70e-06 Stringent
chr9 75977404 75977605 EH38E3886686 dELS -2.2951 6.60e-06 Stringent
chr16 26844012 26844260 EH38E4140223 CA -2.2464 1.03e-05 Stringent

Prepare regions: cCREs and ChromHMM

cCREs

Code
### sort the cCREs by genomic position, tag every row with its group,
### and keep only the six columns of the simplified region table
dat <- dat_region_ccres %>%
    dplyr::arrange(Chrom, ChromStart, ChromEnd) %>%
    dplyr::mutate(Group = "cCREs") %>%
    dplyr::select(Chrom, ChromStart, ChromEnd, Name, Group, Label)

### assign and show
dat_region_ccres_arrange <- dat
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Group Label
chr1 10033 10250 EH38E2776516 cCREs Low-DNase
chr1 10385 10713 EH38E2776517 cCREs Low-DNase
chr1 16097 16381 EH38E3951272 cCREs Low-DNase
chr1 17343 17642 EH38E3951273 cCREs Low-DNase
chr1 29320 29517 EH38E3951274 cCREs Low-DNase
chr1 66350 66509 EH38E3951275 cCREs Low-DNase

ChromHMM

Code
### sort the ChromHMM segments by genomic position, keep the four leading
### BED columns, then append the group tag and a label copied from Name
dat <- dat_region_chromhmm %>%
    dplyr::arrange(Chrom, ChromStart, ChromEnd) %>%
    dplyr::select(Chrom, ChromStart, ChromEnd, Name) %>%
    dplyr::mutate(
        Group = "ChromHMM",
        Label = Name
    )

### assign and show
dat_region_chromhmm_arrange <- dat
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Group Label
chr1 0 16000 Quies ChromHMM Quies
chr1 16000 16200 TxWk ChromHMM TxWk
chr1 16200 17400 Quies ChromHMM Quies
chr1 17400 17600 TxWk ChromHMM TxWk
chr1 17600 118400 Quies ChromHMM Quies
chr1 118400 120200 Enh1 ChromHMM Enh1

Prepare regions: cCREs silencers

Code
### rename the silencer coordinates/accession to the BED-style key columns
dat <- dat_region_ccres_silencer_rest %>%
    dplyr::select(
        Chrom      = Chr,
        ChromStart = Start,
        ChromEnd   = End,
        Name       = `cCRE accession`
    )

### look up the K562 cCREs label of each silencer; the match is on
### position and accession together
dat <- dat %>%
    dplyr::left_join(
        dplyr::select(dat_region_ccres, Chrom, ChromStart, ChromEnd, Name, Label),
        by = c("Chrom", "ChromStart", "ChromEnd", "Name")
    ) %>%
    dplyr::mutate(Group = "cCREs:REST+ silencer") %>%
    dplyr::select(Chrom, ChromStart, ChromEnd, Name, Group, Label) %>%
    dplyr::arrange(Chrom, ChromStart, ChromEnd)

### assign and show
dat_region_ccres_silencer_rest_arrange <- dat
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Group Label
chr1 911951 912144 EH38E3951341 cCREs:REST+ silencer Low-DNase
chr1 1298966 1299316 EH38E2777310 cCREs:REST+ silencer CA-TF
chr1 1300420 1300762 EH38E2777313 cCREs:REST+ silencer Low-DNase
chr1 1334920 1335266 EH38E2777380 cCREs:REST+ silencer Low-DNase
chr1 1375250 1375598 EH38E2777455 cCREs:REST+ silencer pELS
chr1 1407146 1407496 EH38E2777492 cCREs:REST+ silencer PLS
Code
### rename the silencer coordinates/accession to the BED-style key columns
dat <- dat_region_ccres_silencer_starr %>%
    dplyr::select(
        Chrom      = Chr,
        ChromStart = Start,
        ChromEnd   = End,
        Name       = `cCRE accession`
    )

### look up the K562 cCREs label of each silencer; the match is on
### position and accession together
dat <- dat %>%
    dplyr::left_join(
        dplyr::select(dat_region_ccres, Chrom, ChromStart, ChromEnd, Name, Label),
        by = c("Chrom", "ChromStart", "ChromEnd", "Name")
    ) %>%
    dplyr::mutate(Group = "cCREs:STARR silencer") %>%
    dplyr::select(Chrom, ChromStart, ChromEnd, Name, Group, Label) %>%
    dplyr::arrange(Chrom, ChromStart, ChromEnd)

### assign and show
dat_region_ccres_silencer_starr_arrange <- dat
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Group Label
chr1 898458 898659 EH38E2776591 cCREs:STARR silencer Low-DNase
chr1 1493739 1493964 EH38E3951581 cCREs:STARR silencer Low-DNase
chr1 1777547 1777889 EH38E2777930 cCREs:STARR silencer pELS
chr1 2172722 2173053 EH38E2778456 cCREs:STARR silencer Low-DNase
chr1 2468066 2468391 EH38E3951891 cCREs:STARR silencer CA-TF
chr1 3065437 3065776 EH38E2779554 cCREs:STARR silencer Low-DNase

Prepare regions for genome track plots

Set label to name

Code
### sort by genomic position and overwrite the Name column with the
### cCREs label; all other columns are kept as they are
dat <- dat_region_ccres %>%
    dplyr::arrange(Chrom, ChromStart, ChromEnd) %>%
    dplyr::mutate(Name = Label)

### assign and show
dat_region_ccres_label2name <- dat
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Score Strand ThickStart ThickEnd ItemRgb Label Note
chr1 10033 10250 Low-DNase 0 . 10033 10250 225,225,225 Low-DNase All-data/Full-classification
chr1 10385 10713 Low-DNase 0 . 10385 10713 225,225,225 Low-DNase All-data/Full-classification
chr1 16097 16381 Low-DNase 0 . 16097 16381 225,225,225 Low-DNase All-data/Full-classification
chr1 17343 17642 Low-DNase 0 . 17343 17642 225,225,225 Low-DNase All-data/Full-classification
chr1 29320 29517 Low-DNase 0 . 29320 29517 225,225,225 Low-DNase All-data/Full-classification
chr1 66350 66509 Low-DNase 0 . 66350 66509 225,225,225 Low-DNase All-data/Full-classification

Extract PLS/ELS only

Code
### labels to retain: promoter-like and proximal/distal enhancer-like
vec <- c("PLS", "pELS", "dELS")

### keep only the rows carrying one of those labels, sorted by position
dat <- dat_region_ccres_label2name %>%
    dplyr::filter(is.element(Label, vec)) %>%
    dplyr::arrange(Chrom, ChromStart, ChromEnd)

### assign and show
dat_region_ccres_label2name_subset <- dat
fun_display_table(head(dat))
Chrom ChromStart ChromEnd Name Score Strand ThickStart ThickEnd ItemRgb Label Note
chr1 138917 139112 pELS 0 . 138917 139112 255,167,0 pELS All-data/Full-classification
chr1 778570 778919 PLS 0 . 778570 778919 255,0,0 PLS All-data/Full-classification
chr1 779023 779182 PLS 0 . 779023 779182 255,0,0 PLS All-data/Full-classification
chr1 825846 826068 pELS 0 . 825846 826068 255,167,0 pELS All-data/Full-classification
chr1 826734 826887 PLS 0 . 826734 826887 255,0,0 PLS All-data/Full-classification
chr1 827417 827767 PLS 0 . 827417 827767 255,0,0 PLS All-data/Full-classification

Define column description

The simplified region files are BED-like tables with six columns: the first four standard BED fields (Chrom, ChromStart, ChromEnd, Name) followed by two additional fields (Group, Label).

Code
### create metadata: one row per column of the simplified region tables
### (Name = column name, Note = human-readable description)
### fix: corrected the misspelling "annotaiton" in the Group description,
### since this text is written out to the description file
dat = tribble(
    ~Name,        ~Note,
    "Chrom",      "Name of the chromosome",
    "ChromStart", "The starting position of the feature in the chromosome",
    "ChromEnd",   "The ending position of the feature in the chromosome",
    "Name",       "Name given to a region; Use '.' if no name is assigned.",
    "Group",      "Type of chromatin states annotation",
    "Label",      "cCREs/ChromHMM labels"
)

### assign and show
dat_cname = dat
fun_display_table(dat)
Name Note
Chrom Name of the chromosome
ChromStart The starting position of the feature in the chromosome
ChromEnd The ending position of the feature in the chromosome
Name Name given to a region; Use '.' if no name is assigned.
Group Type of chromatin states annotaiton
Label cCREs/ChromHMM labels

Save results

Write description table of columns

Code
### set the output path of the column description table
txt_folder = TXT_FOLDER_REGION
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
txt_fname  = "description.tsv"
txt_fpath  = file.path(txt_fdiry, txt_fname)

### create the output folder including any missing parent folders;
### without recursive = TRUE the call fails when "region/<folder>" does not
### exist yet, and showWarnings = FALSE would hide that failure
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)

### write the column description table
dat = dat_cname
write_tsv(dat, txt_fpath)

Write tables

Code
### set directory
### use dir.create() instead of a shell "mkdir -p" string: it also creates
### missing parent folders but does not depend on a shell and is not broken
### by spaces or special characters in the path
txt_folder = TXT_FOLDER_REGION
txt_fdiry  = file.path(FD_RES, "region", txt_folder)
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)
stopifnot(dir.exists(txt_fdiry))

### map each output file name to the table it stores
### (alternative short names kept for reference:
###  "K562.hg38.cCREs.bed.gz" and "K562.hg38.ChromHMM.bed.gz")
lst_dat = list(
    ### cCREs
    "K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.simplified.bed.gz"    = dat_region_ccres_arrange,
    ### ChromHMM
    "K562.hg38.ENCSR365YNI.ENCFF106BGJ.ChromHMM.simplified.bed.gz" = dat_region_chromhmm_arrange,
    ### cCREs Silencer (REST)
    "K562.hg38.cCREs.silencer_rest.bed.gz"                         = dat_region_ccres_silencer_rest_arrange,
    ### cCREs Silencer (STARR)
    "K562.hg38.cCREs.silencer_starr.bed.gz"                        = dat_region_ccres_silencer_starr_arrange
)

### write tables: tab-separated, without a header row
for (txt_fname in names(lst_dat)) {
    txt_fpath = file.path(txt_fdiry, txt_fname)
    dat = lst_dat[[txt_fname]]
    write_tsv(dat, txt_fpath, col_names = FALSE)
}
Code
### set directory
### use dir.create() instead of a shell "mkdir -p" string: it also creates
### missing parent folders but does not depend on a shell and is not broken
### by spaces or special characters in the path
txt_folder = TXT_FOLDER_REGION
txt_fdiry  = file.path(FD_RES, "region", txt_folder, "summary")
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)
stopifnot(dir.exists(txt_fdiry))

### map each output file name to the table it stores
lst_dat = list(
    ### cCREs (total; for genome plot)
    "K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.label2name.bed.gz"         = dat_region_ccres_label2name,
    ### cCREs (subset; for genome plot)
    "K562.hg38.ENCSR913HQX.ENCFF286VQG.cCREs.label2name.PLS_ELS.bed.gz" = dat_region_ccres_label2name_subset
)

### write tables: tab-separated, without a header row
for (txt_fname in names(lst_dat)) {
    txt_fpath = file.path(txt_fdiry, txt_fname)
    dat = lst_dat[[txt_fname]]
    write_tsv(dat, txt_fpath, col_names = FALSE)
}