Prepare TF Modules 01

Export the data

Set environment

Code
### load the project configuration; messages and warnings raised while
### sourcing are muted
suppressWarnings(suppressMessages(source("../run_config_project_sing.R")))

### print the environment summary (helper is not defined in this notebook;
### presumably it comes from the sourced configuration)
show_env()
You are working on        Singularity: singularity_proj_encode_fcc 
BASE DIRECTORY (FD_BASE): /data/reddylab/Kuei 
REPO DIRECTORY (FD_REPO): /data/reddylab/Kuei/repo 
WORK DIRECTORY (FD_WORK): /data/reddylab/Kuei/work 
DATA DIRECTORY (FD_DATA): /data/reddylab/Kuei/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/log 
PROJECT REF     (FD_REF): /data/reddylab/Kuei/repo/Proj_ENCODE_FCC/references 

Set global variables

Code
### name of the input folder, used below as file.path(FD_DAT, "processed", ...)
TXT_FOLDER_INP <- "TF_modules_Shannon"

### misspelled alias of TXT_FOLDER_INP; kept so that code referring to the
### old name keeps working
TXT_FODLER_INP <- TXT_FOLDER_INP

### name of the output folder, used below as file.path(FD_RES, "region", ...)
TXT_FOLDER_OUT <- "module_tf_shannon"
Code
### list the files found in the input folder, one name per line
txt_folder <- TXT_FODLER_INP
txt_fdiry  <- file.path(FD_DAT, "processed", txt_folder)

vec <- dir(txt_fdiry)
cat(paste0(vec, " \n"), sep = "")
K562.full.region.assignments.txt 
K562.TFzscore.txt 

Import data

Code
### path of the region-to-module assignment table
txt_fpath <- file.path(
    FD_DAT, "processed", TXT_FODLER_INP,
    "K562.full.region.assignments.txt"
)

### read the tab-separated table; the column type message is turned off
dat_module_full_import <- read_tsv(txt_fpath, show_col_types = FALSE)

### show the dimensions and the first rows
dat <- dat_module_full_import
print(dim(dat))
fun_display_table(head(dat))
[1] 173997     74
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 33 34 35 36 37 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 61 63 64 65 66 67 68 71 72 73 74 75 77 chr start end TFs
0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 115702 115751 NONO,RNF2,FOXK2,NR2F1,SOX6,ESRRA,ATF4,GMEB1,NFIC,TRIM24,HDAC1,TCF12,NCOR1,NFE2,ATF7,ZNF24,GABPB1,ZBTB2,MTA3,FOXM1,ZBTB40,DPF2,NFRKB,HDAC2,GATAD2A,IKZF1,ARID3A,NBN,EP300,SMARCE1,ZNF281,KDM1A,NCOA1,MAFG,POLR2A,EGR1,REST,TCF3,SMARCC2,MTA1,MEIS2
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 118585 118665 RNF2,MAFG,NFE2L1,NFE2
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 136446 136510 CEBPB,ESRRA,NR2F1,EGR1
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 139031 139110 ZFX,CTCF,RFX1
0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 268005 268051 SMC3,CTCF,MAZ,ATF7,ARID2,ZNF281,REST,RAD21
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 586187 586203 CTCF,MAZ,SMC3
Code
### path of the module-to-TF table
txt_fpath <- file.path(
    FD_DAT, "processed", TXT_FODLER_INP,
    "K562.TFzscore.txt"
)

### read the tab-separated table; the column type message is turned off
dat_module_tfs_import <- read_tsv(txt_fpath, show_col_types = FALSE)

### show the dimensions and the first rows
dat <- dat_module_tfs_import
print(dim(dat))
fun_display_table(head(dat))
[1] 2503    2
module TF
1 ZNF589
1 JUNB
1 FOSL1
1 MEIS2
1 EGR1
1 MEF2D

Rename columns

Code
### rename the coordinate columns (chr/start/end -> Chrom/ChromStart/ChromEnd)
dat <- dplyr::rename(
    dat_module_full_import,
    Chrom      = "chr",
    ChromStart = "start",
    ChromEnd   = "end"
)

### add a Region column built from the three coordinate columns
dat <- dplyr::mutate(
    dat,
    Region = fun_gen_region(Chrom, ChromStart, ChromEnd)
)

### assign and show
dat_module_full <- dat
print(dim(dat))
fun_display_table(head(dat))
[1] 173997     75
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 33 34 35 36 37 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 61 63 64 65 66 67 68 71 72 73 74 75 77 Chrom ChromStart ChromEnd TFs Region
0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 115702 115751 NONO,RNF2,FOXK2,NR2F1,SOX6,ESRRA,ATF4,GMEB1,NFIC,TRIM24,HDAC1,TCF12,NCOR1,NFE2,ATF7,ZNF24,GABPB1,ZBTB2,MTA3,FOXM1,ZBTB40,DPF2,NFRKB,HDAC2,GATAD2A,IKZF1,ARID3A,NBN,EP300,SMARCE1,ZNF281,KDM1A,NCOA1,MAFG,POLR2A,EGR1,REST,TCF3,SMARCC2,MTA1,MEIS2 chr1:115702-115751
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 118585 118665 RNF2,MAFG,NFE2L1,NFE2 chr1:118585-118665
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 136446 136510 CEBPB,ESRRA,NR2F1,EGR1 chr1:136446-136510
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 139031 139110 ZFX,CTCF,RFX1 chr1:139031-139110
0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 268005 268051 SMC3,CTCF,MAZ,ATF7,ARID2,ZNF281,REST,RAD21 chr1:268005-268051
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 chr1 586187 586203 CTCF,MAZ,SMC3 chr1:586187-586203

Check missing values

Code
### keep only the coordinate, region and TF columns
dat <- dplyr::select(
    dat_module_full,
    Chrom, ChromStart, ChromEnd, Region, TFs
)

### count the missing values in each of those columns
lst <- lapply(dat, function(col) sum(is.na(col)))
print(lst)
$Chrom
[1] 1

$ChromStart
[1] 1

$ChromEnd
[1] 1

$Region
[1] 0

$TFs
[1] 1
Code
### display the rows whose chromosome is missing
dat <- dat_module_full %>%
    dplyr::filter(is.na(Chrom)) %>%
    dplyr::select(Chrom, ChromStart, ChromEnd, Region, TFs)
dat
A tibble: 1 × 5
Chrom ChromStart ChromEnd Region TFs
<chr> <dbl> <dbl> <chr> <chr>
NA NA NA NA:NA-NA NA

Remove NA

Code
### drop the rows without a chromosome
dat_before <- dat_module_full
dat_after  <- dplyr::filter(dat_before, !is.na(Chrom))

### report NA presence and dimensions, first before and then after filtering
for (tbl in list(dat_before, dat_after)) {
    print(any(is.na(tbl)))
    print(dim(tbl))
}

### assign
dat_module_full_rmna <- dat_after
dat <- dat_after
[1] TRUE
[1] 173997     75
[1] FALSE
[1] 173996     75

Arrange TF-Module mapping

Code
### rename columns
dat <- dat_module_tfs_import
colnames(dat) <- c("Module", "TF")

### turn the module ids into labels: left-pad to width 2 with "0" and
### prefix with "Module_" (e.g. 1 -> Module_01)
dat <- dplyr::mutate(
    dat,
    Module = paste("Module", stringr::str_pad(Module, 2, pad = "0"), sep = "_")
)

### assign and show
dat_module2tf_list <- dat
print(dim(dat))
fun_display_table(head(dat))
[1] 2503    2
Module TF
Module_01 ZNF589
Module_01 JUNB
Module_01 FOSL1
Module_01 MEIS2
Module_01 EGR1
Module_01 MEF2D
Code
### convert the TF-Module list into a TF x Module indicator matrix
### NOTE(review): tidyr::spread() is superseded by tidyr::pivot_wider();
### it is kept here so that row/column ordering stays as it is
dat <- dplyr::mutate(dat_module2tf_list, Value = 1)
dat <- tidyr::spread(dat, Module, Value)

### TF-Module pairs absent from the list become 0
dat[is.na(dat)] <- 0

### assign and show
dat_module2tf_matrix <- dat
print(dim(dat))
fun_display_table(head(dat))
[1] 234  71
TF Module_01 Module_02 Module_03 Module_04 Module_05 Module_06 Module_07 Module_08 Module_09 Module_10 Module_11 Module_12 Module_13 Module_14 Module_15 Module_16 Module_17 Module_18 Module_19 Module_20 Module_21 Module_22 Module_23 Module_24 Module_25 Module_26 Module_27 Module_28 Module_29 Module_30 Module_31 Module_33 Module_34 Module_35 Module_36 Module_37 Module_39 Module_40 Module_41 Module_42 Module_43 Module_44 Module_45 Module_46 Module_47 Module_48 Module_49 Module_50 Module_51 Module_52 Module_53 Module_54 Module_55 Module_56 Module_57 Module_58 Module_59 Module_61 Module_63 Module_64 Module_65 Module_66 Module_67 Module_68 Module_71 Module_72 Module_73 Module_74 Module_75 Module_77
ADNP 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
AGO1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
ARHGAP35 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ARID2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
ARID3A 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
ARID4B 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0

Arrange TF regions

Region-TFs

Code
### keep the region coordinates, the region id and the comma-separated TFs
### from the NA-filtered table
dat <- dplyr::select(
    dat_module_full_rmna,
    Chrom, ChromStart, ChromEnd, Region, TFs
)

### assign and show
dat_region2tf_list <- dat
print(dim(dat))
fun_display_table(head(dat))
[1] 173996      5
Chrom ChromStart ChromEnd Region TFs
chr1 115702 115751 chr1:115702-115751 NONO,RNF2,FOXK2,NR2F1,SOX6,ESRRA,ATF4,GMEB1,NFIC,TRIM24,HDAC1,TCF12,NCOR1,NFE2,ATF7,ZNF24,GABPB1,ZBTB2,MTA3,FOXM1,ZBTB40,DPF2,NFRKB,HDAC2,GATAD2A,IKZF1,ARID3A,NBN,EP300,SMARCE1,ZNF281,KDM1A,NCOA1,MAFG,POLR2A,EGR1,REST,TCF3,SMARCC2,MTA1,MEIS2
chr1 118585 118665 chr1:118585-118665 RNF2,MAFG,NFE2L1,NFE2
chr1 136446 136510 chr1:136446-136510 CEBPB,ESRRA,NR2F1,EGR1
chr1 139031 139110 chr1:139031-139110 ZFX,CTCF,RFX1
chr1 268005 268051 chr1:268005-268051 SMC3,CTCF,MAZ,ATF7,ARID2,ZNF281,REST,RAD21
chr1 586187 586203 chr1:586187-586203 CTCF,MAZ,SMC3

Region-TFs matrix

Code
### one row per (region, TF): split the comma-separated TFs column,
### then drop duplicated rows and rows with missing values
dat <- tidyr::separate_rows(dat_region2tf_list, TFs, sep = ",")
dat <- dplyr::distinct(dat)
dat <- na.omit(dat)

### convert into Region-TF matrix; region/TF pairs not observed become 0
dat <- dplyr::mutate(dat, Value = 1)
dat <- tidyr::spread(dat, TFs, Value)
dat[is.na(dat)] <- 0

### assign and show
dat_region2tf_matrix <- dat
print(dim(dat))
fun_display_table(head(dat))
[1] 173996    269
Chrom ChromStart ChromEnd Region ADNP AGO1 ARHGAP35 ARID2 ARID3A ARID4B ARNT ATF1 ATF2 ATF3 ATF4 ATF6 ATF7 BCL6 BHLHE40 BRD4 BRF2 CAMTA2 CBFB CBX1 CBX2 CBX5 CCAR2 CEBPB CEBPG CEBPZ CHCHD3 CHD2 CHD4 CREB1 CREB3 CREM CTCF DDIT3 DMTF1 DPF2 E2F1 E2F4 E2F5 E2F8 EGR1 EHMT2 ELF1 ELF4 ELK1 EP300 ERF ESRRA ETS1 ETV5 ETV6 EZH2 FIP1L1 FOSL1 FOXA1 FOXA3 FOXJ3 FOXK1 FOXK2 FOXM1 FOXP1 FOXP4 FUS GABPA GABPB1 GATA2 GATAD2A GMEB1 GTF2F1 HBP1 HCFC1 HDAC1 HDAC2 HDAC6 HINFP HIVEP1 HMG20A HMG20B HNRNPH1 HNRNPK HNRNPL HNRNPLL HNRNPUL1 HOMEZ IKZF1 IRF1 IRF2 IRF9 JUN JUNB JUND KAT2B KAT7 KAT8 KDM1A KDM5B KLF13 KLF16 KLF6 LCOR MAFF MAFG MAFK MAX MAZ MBD1 MEF2A MEF2D MEIS2 MGA MLX MNT MTA1 MTA3 MTF1 MXD1 MXI1 MYBL2 MYC MYNN NBN NCOA1 NCOA2 NCOR1 NFATC3 NFE2 NFE2L1 NFIC NFRKB NFYA NONO NR1H2 NR2C2 NR2F1 NR2F2 NR2F6 NR3C1 NRF1 PATZ1 PBX2 PCBP1 PCBP2 PHB2 PHF20 PHF21A PHF8 POLR2A POLR2AphosphoS2 POLR2AphosphoS5 POLR2G PRDM10 PRPF4 PTBP1 RAD21 RAD51 RBFOX2 RBM22 RBM39 RBPJ RCOR1 RELA REST RFX1 RFX5 RNF2 RNF219 RREB1 SAFB2 SETDB1 SFPQ SIN3A SIN3B SKIL SMAD1 SMAD3 SMAD4 SMARCC2 SMARCE1 SMC3 SNRNP70 SOX6 SP1 SRF SRSF1 SRSF9 STAG1 STAT5B SUZ12 TAF15 TARDBP TBL1XR1 TBP TCF12 TCF3 TCF7 TCF7L2 TEAD1 TEAD2 TEAD4 TFAP4 TFDP1 TFE3 TGIF2 THRA THRB TOE1 TRIM24 U2AF1 U2AF2 UBTF USF1 USF2 XRCC5 YY1 ZBTB2 ZBTB26 ZBTB33 ZBTB34 ZBTB40 ZBTB49 ZBTB7A ZBTB8A ZC3H4 ZC3H8 ZFP1 ZFP36 ZFP91 ZFX ZHX1 ZKSCAN1 ZKSCAN8 ZMYM3 ZNF12 ZNF121 ZNF124 ZNF143 ZNF146 ZNF217 ZNF224 ZNF232 ZNF24 ZNF253 ZNF263 ZNF274 ZNF280B ZNF281 ZNF282 ZNF3 ZNF318 ZNF354B ZNF384 ZNF431 ZNF511 ZNF512 ZNF589 ZNF639 ZNF644 ZNF7 ZNF707 ZNF740 ZNF766 ZNF780A ZNF83 ZSCAN29 ZZZ3
chr1 115702 115751 chr1:115702-115751 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 1 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 118585 118665 chr1:118585-118665 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 136446 136510 chr1:136446-136510 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 139031 139110 chr1:139031-139110 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 268005 268051 chr1:268005-268051 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 586187 586203 chr1:586187-586203 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Arrange module regions

Region-Module matrix

Code
### split the NA-filtered table into region columns and module columns
### (module columns = everything except the region columns and TFs)
dat_info <- dplyr::select(
    dat_module_full_rmna,
    Chrom, ChromStart, ChromEnd, Region
)
dat_flag <- dplyr::select(
    dat_module_full_rmna,
    -Chrom, -ChromStart, -ChromEnd, -Region, -TFs
)

### relabel the module columns: left-pad to width 2 with "0" and
### prefix with "Module_"
colnames(dat_flag) <- paste(
    "Module",
    stringr::str_pad(colnames(dat_flag), 2, pad = "0"),
    sep = "_"
)

### re-combine (region columns first, then module columns)
dat <- cbind(dat_info, dat_flag)

### assign and show
dat_region2module_matrix <- dat
print(dim(dat))
fun_display_table(head(dat))
[1] 173996     74
Chrom ChromStart ChromEnd Region Module_01 Module_02 Module_03 Module_04 Module_05 Module_06 Module_07 Module_08 Module_09 Module_10 Module_11 Module_12 Module_13 Module_14 Module_15 Module_16 Module_17 Module_18 Module_19 Module_20 Module_21 Module_22 Module_23 Module_24 Module_25 Module_26 Module_27 Module_28 Module_29 Module_30 Module_31 Module_33 Module_34 Module_35 Module_36 Module_37 Module_39 Module_40 Module_41 Module_42 Module_43 Module_44 Module_45 Module_46 Module_47 Module_48 Module_49 Module_50 Module_51 Module_52 Module_53 Module_54 Module_55 Module_56 Module_57 Module_58 Module_59 Module_61 Module_63 Module_64 Module_65 Module_66 Module_67 Module_68 Module_71 Module_72 Module_73 Module_74 Module_75 Module_77
chr1 115702 115751 chr1:115702-115751 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 118585 118665 chr1:118585-118665 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 136446 136510 chr1:136446-136510 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 139031 139110 chr1:139031-139110 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 268005 268051 chr1:268005-268051 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1 586187 586203 chr1:586187-586203 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Region-Module

Code
### tag every row with the annotation group
dat <- dplyr::mutate(dat_region2module_matrix, Group = "TF_Module")

### stack the module columns into (Label, Value) pairs
dat <- tidyr::gather(
    dat, Label, Value,
    -Chrom, -ChromStart, -ChromEnd, -Region, -Group
)

### keep only the pairs flagged with 1, then drop the flag column
dat <- dplyr::filter(dat, Value == 1)
dat <- dplyr::select(dat, -Value)

### assign and show
dat_region2module_list <- dat
print(dim(dat))
head(dat)
[1] 413728      6
A data.frame: 6 × 6
Chrom ChromStart ChromEnd Region Group Label
<chr> <dbl> <dbl> <chr> <chr> <chr>
1 chr1 1079499 1080590 chr1:1079499-1080590 TF_Module Module_01
2 chr1 1282057 1282312 chr1:1282057-1282312 TF_Module Module_01
3 chr1 1305319 1306287 chr1:1305319-1306287 TF_Module Module_01
4 chr1 1307969 1309285 chr1:1307969-1309285 TF_Module Module_01
5 chr1 1318296 1318659 chr1:1318296-1318659 TF_Module Module_01
6 chr1 1344799 1345337 chr1:1344799-1345337 TF_Module Module_01

Check missing values

Code
### flag, for each column of the region-module list, whether it holds any NA
dat <- dat_region2module_list
lst <- lapply(dat, function(col) any(is.na(col)))
print(lst)
$Chrom
[1] FALSE

$ChromStart
[1] FALSE

$ChromEnd
[1] FALSE

$Region
[1] FALSE

$Group
[1] FALSE

$Label
[1] FALSE

Define column description

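The output file is a BED-like table with six columns (chromosome, start, end, region, group and label), described below. Note that it is not in narrowPeak format (a standard six-field BED with four additional fields, BED6+4).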

Code
### create metadata: one row per output column (name + short note)
### NOTE(review): the tables built above call the fourth column `Region`,
### while this description lists it as `Name` -- confirm which is intended
dat <- tibble(
    Name = c(
        "Chrom",
        "ChromStart",
        "ChromEnd",
        "Name",
        "Group",
        "Label"
    ),
    Note = c(
        "Name of the chromosome",
        "The starting position of the feature in the chromosome",
        "The ending position of the feature in the chromosome",
        "Region of the row",
        "Annotation name",
        "TF Modules"
    )
)

### assign and show
dat_cname <- dat
fun_display_table(dat)
Name Note
Chrom Name of the chromosome
ChromStart The starting position of the feature in the chromosome
ChromEnd The ending position of the feature in the chromosome
Name Region of the row
Group Annotation name
Label TF Modules

Save results

Save column description

Code
### set directory
txt_folder <- TXT_FOLDER_OUT
txt_fdiry  <- file.path(FD_RES, "region", txt_folder, "summary")
txt_fname  <- "description.tsv"
txt_fpath  <- file.path(txt_fdiry, txt_fname)

### create the output folder together with any missing parent folders;
### without `recursive = TRUE`, dir.create() cannot create the nested
### "<folder>/summary" path on a fresh results tree and, with warnings
### hidden, the failure only surfaces when the write below errors
dir.create(txt_fdiry, recursive = TRUE, showWarnings = FALSE)

### write the column description
dat <- dat_cname
write_tsv(dat, txt_fpath)

Save bed file: region2module list

Code
### set directory
txt_fdiry = file.path(FD_RES, "region", "module_tf_shannon")
txt_fname = "K562.hg38.TF_Module.bed.gz"
txt_fpath = file.path(txt_fdiry, txt_fname)

txt_cmd = paste("mkdir -p", txt_fdiry)
system(txt_cmd)

### write table
dat = dat_region2module_list
dat = dat %>% dplyr::arrange(Chrom, ChromStart, ChromEnd)
write_tsv(dat, txt_fpath, col_names = FALSE)

Save matrix summary

Code
### set directory
txt_fdiry = file.path(FD_RES, "region", "module_tf_shannon", "summary")
txt_cmd = paste("mkdir -p", txt_fdiry)
system(txt_cmd)

### write region-module (Matrix)
txt_fname = "matrix.region2module.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_region2module_matrix
dat = dat %>% dplyr::arrange(Chrom, ChromStart, ChromEnd)
write_tsv(dat, txt_fpath)

### write region-module (List)
txt_fname = "data.region2module.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_region2module_list
dat = dat %>% dplyr::arrange(Chrom, ChromStart, ChromEnd)
write_tsv(dat, txt_fpath)

### write region-tfs matrix
txt_fname = "matrix.region2TF.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_region2tf_matrix
dat = dat %>% dplyr::arrange(Chrom, ChromStart, ChromEnd)
write_tsv(dat, txt_fpath)

### write region-tfs matrix
txt_fname = "data.region2TF.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_region2tf_list
dat = dat %>% dplyr::arrange(Chrom, ChromStart, ChromEnd)
write_tsv(dat, txt_fpath)

### write TF-Module mapping (matrix)
txt_fname = "matrix.module2TF.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_module2tf_matrix
write_tsv(dat, txt_fpath)

### write TF-Module mapping (list)
txt_fname = "data.module2TF.tsv"
txt_fpath = file.path(txt_fdiry, txt_fname)

dat = dat_module2tf_list
write_tsv(dat, txt_fpath)