Prepare Tiling MPRA data 03

Extract fragment counts from Tiling MPRA data

Set environment

Code
suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()
You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references 
Code
TXT_ASSAY = "MPRA_Tiling_K562_Tewhey_Hannah"

Import data

<Data>
* OL13 (FADS)
* OL43 (GATA/MYC)
* OL45 (HBE1/LMO2/RBM38/HBA2/BCL11A)

- OL13_20220512_counts.out
- OL13_20220512_normalized_counts.out
- OL43_20211228_counts.out      
- OL43_20211228_normalized_counts.out
- OL43_20221003_counts.out 
- OL43_20221003_K562_normalized_counts.out
- OL45_20220927_counts.out 
- OL45_20220927_K562_normalized_counts.out

Check data

Code
fdiry = file.path(FD_DAT, "processed", TXT_ASSAY, "tiling_counts")
print(dir(fdiry))
 [1] "FADS_tile_snp.20190214.attributes"       
 [2] "OL13_20220512_counts.out"                
 [3] "OL13_20220512_normalized_counts.out"     
 [4] "OL43_20211228_counts.out"                
 [5] "OL43_20211228_normalized_counts.out"     
 [6] "OL43_20221003_counts.out"                
 [7] "OL43_20221003_K562_normalized_counts.out"
 [8] "OL43_K562.bed"                           
 [9] "OL43.attributes"                         
[10] "OL45_20220927_counts.out"                
[11] "OL45_20220927_K562_normalized_counts.out"
[12] "OL45_K562.bed"                           
[13] "OL45.attributes"                         
[14] "UKBB_GTEx"                               
Code
fdiry = file.path(FD_DAT, "processed", TXT_ASSAY, "tiling_counts")
fpaths = dir(fdiry)
print(grep("out", fpaths, value=TRUE))
[1] "OL13_20220512_counts.out"                
[2] "OL13_20220512_normalized_counts.out"     
[3] "OL43_20211228_counts.out"                
[4] "OL43_20211228_normalized_counts.out"     
[5] "OL43_20221003_counts.out"                
[6] "OL43_20221003_K562_normalized_counts.out"
[7] "OL45_20220927_counts.out"                
[8] "OL45_20220927_K562_normalized_counts.out"

Setup metadata

Code
dat_meta = data.frame(
    Dataset = c(
        "OL13_20220512", "OL13_20220512", 
        "OL43_20221003", "OL43_20221003", 
        "OL45_20220927", "OL45_20220927"),
    Process = c("raw", "norm", "raw", "norm", "raw", "norm"),
    Genome  = c("hg19", "hg19", "hg38", "hg38", "hg38", "hg38"),
    N_Rep_Input  = c(4, 4, 6, 6, 4, 4),  
    N_Rep_Output = c(4, 4, 5, 5, 4, 4), 
    FName = c(
        "OL13_20220512_counts.out",
        "OL13_20220512_normalized_counts.out",
        "OL43_20221003_counts.out",
        "OL43_20221003_K562_normalized_counts.out",
        "OL45_20220927_counts.out",
        "OL45_20220927_K562_normalized_counts.out")
)
dat_meta
A data.frame: 6 × 6
Dataset Process Genome N_Rep_Input N_Rep_Output FName
<chr> <chr> <chr> <dbl> <dbl> <chr>
OL13_20220512 raw hg19 4 4 OL13_20220512_counts.out
OL13_20220512 norm hg19 4 4 OL13_20220512_normalized_counts.out
OL43_20221003 raw hg38 6 5 OL43_20221003_counts.out
OL43_20221003 norm hg38 6 5 OL43_20221003_K562_normalized_counts.out
OL45_20220927 raw hg38 4 4 OL45_20220927_counts.out
OL45_20220927 norm hg38 4 4 OL45_20220927_K562_normalized_counts.out

Read data

Code
lst_dat_read = lapply(1:nrow(dat_meta), function(idx){

    ### Extract data info from the metadata table
    xrow  = dat_meta[idx,]
    fname = xrow$FName
    n_rep_input  = xrow$N_Rep_Input
    n_rep_output = xrow$N_Rep_Output
    
    ### Set file directory
    fdiry = file.path(FD_DAT, "processed", TXT_ASSAY, "tiling_counts")
    fpath = file.path(fdiry, fname)
          
    ### import data
    dat = read.table(fpath, row.names=1)
    dat = dat %>% rownames_to_column(var = "Name")

    ### rename columns & assign
    cnames = c(
        "Name", 
        paste0("Input.rep",  1:n_rep_input), 
        paste0("Output.rep", 1:n_rep_output)
    )
    colnames(dat) = cnames
    
    ### show progress and return
    cat("\n=======================\n")
    cat(fname, "\n")
    cat("Shape:", dim(dat), "\n")
    print(head(dat, 3))
    flush.console()
    return(dat)
})

names(lst_dat_read) = paste(
    paste("TMPRA_K562", dat_meta$Dataset, sep="_"), 
    dat_meta$Genome, 
    dat_meta$Process, sep=".")

=======================
OL13_20220512_counts.out 
Shape: 55229 9 
                                               Name Input.rep1 Input.rep2
1       (11:61555216-61555415;11:61555315:T:C_A_wC)       1609       1221
2 (11:61555231-61555430_RC;11:61555330:T:C_A_wC_RC)       1179        582
3 (11:61555315:T:C_A_wC_RC;11:61555216-61555415_RC)       1066        643
  Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
1       1396        798         845         283         574        1223
2       1225        721         476         416         912         956
3       1206        681         470         846         540         847

=======================
OL13_20220512_normalized_counts.out 
Shape: 55229 9 
                                               Name Input.rep1 Input.rep2
1       (11:61555216-61555415;11:61555315:T:C_A_wC)   881.8510  1196.3976
2 (11:61555231-61555430_RC;11:61555330:T:C_A_wC_RC)   646.1792   570.2730
3 (11:61555315:T:C_A_wC_RC;11:61555216-61555415_RC)   584.2468   630.0439
  Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
1   855.5317   774.9257    1807.947    569.6449    730.4747    1493.445
2   750.7352   700.1522    1018.441    837.3579   1160.6148    1167.403
3   739.0912   661.3088    1005.604   1702.8961    687.2061    1034.299

=======================
OL43_20221003_counts.out 
Shape: 99307 12 
                                                     Name Input.rep1 Input.rep2
1           (8:127381651-127381850;8:127320401-127320600)       1318       1810
2 (8:127742001-127742200;Nadav:negCtrl:seq31776.neg1.MYC)        634        730
3 (8:128045001-128045200;Nadav:posCtrl:seq31797.pos5.MYC)        416        432
  Input.rep3 Input.rep4 Input.rep5 Input.rep6 Output.rep1 Output.rep2
1       1477       1505        738        687         620         636
2        698        761        316        343         158         194
3        418        409        181        211       11164        6818
  Output.rep3 Output.rep4 Output.rep5
1         781         520         538
2         268         141         251
3       12118        9512       10450

=======================
OL43_20221003_K562_normalized_counts.out 
Shape: 99307 12 
                                                     Name Input.rep1 Input.rep2
1           (8:127381651-127381850;8:127320401-127320600)   887.4466  1078.7715
2 (8:127742001-127742200;Nadav:negCtrl:seq31776.neg1.MYC)   426.8901   435.0846
3 (8:128045001-128045200;Nadav:posCtrl:seq31797.pos5.MYC)   280.1045   257.4747
  Input.rep3 Input.rep4 Input.rep5 Input.rep6 Output.rep1 Output.rep2
1   946.8314   945.7219   962.3235   893.9085    961.7725   1136.6673
2   447.4531   478.2022   412.0518   446.3037    245.0969    346.7193
3   267.9590   257.0101   236.0170   274.5483  17318.1102  12185.2169
  Output.rep3 Output.rep4 Output.rep5
1    842.9706    896.6654    760.1443
2    289.2652    243.1343    354.6398
3  13079.5358  16402.0789  14764.8855

=======================
OL45_20220927_counts.out 
Shape: 94381 9 
                                                                  Name
1 (Nadav:posCtrl:seq33763.pos1.GATA1;Nadav:posCtrl:ENSG00000102317.270
2                                                  1:10437778:C:T:R:wC
3                                                  1:10451799:C:T:R:wC
  Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2
1        897        717        761        724       56816       37391
2        645        437        491        493        2239        1698
3        603        584        512        429         336         280
  Output.rep3 Output.rep4
1       36946       52349
2        1202        2532
3         321         333

=======================
OL45_20220927_K562_normalized_counts.out 
Shape: 94381 9 
                                                                  Name
1 (Nadav:posCtrl:seq33763.pos1.GATA1;Nadav:posCtrl:ENSG00000102317.270
2                                                  1:10437778:C:T:R:wC
3                                                  1:10451799:C:T:R:wC
  Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2
1   660.3527   622.9447   644.2283   623.2888  69961.0133  56175.0918
2   474.8355   379.6748   415.6585   424.4218   2757.0175   2551.0231
3   443.9160   507.3915   433.4361   369.3244    413.7373    420.6634
  Output.rep3 Output.rep4
1  62517.7362  58457.7541
2   2033.9501   2827.4663
3    543.1763    371.8587

Preprocess the table: Extract only tiling regions

Define helper functions for data processing

Code
get_strand = function(x){ifelse(is.na(x), "+", "-")}
add_chrom  = function(x){ifelse(str_detect(x, "chr"), x, paste0("chr", x))}

fun_extract   = function(dat) {
    ### Input:  dataframe
    ### Output: dataframe
    ### Description:
    ###     Preprocess the raw dataframe
    ###     - expand duplicates
    ###     - filter and extract only tiling oligos
    ###     - get location information
    ### ===========================================
    ### init regex pattern
    pattern = "^[chr|0-9|X]+:[0-9]*-[0-9]*"
    
    ### preprocess
    res = dat %>% 
        dplyr::mutate(Name = gsub("\\(|\\)", "", Name)) %>% 
        tidyr::separate_rows(Name, sep=";") %>%
        dplyr::filter(!grepl(pattern = "Nadav", Name)) %>%
        dplyr::filter(str_detect(string=Name, pattern=pattern)) %>%
        tidyr::separate(Name, c("Chrom", "ChromStart", "ChromEnd", "Strand"), remove=FALSE, fill="right") %>%
        dplyr::mutate(
            Chrom      = add_chrom(Chrom),
            ChromStart = as.integer(ChromStart),
            ChromEnd   = as.integer(ChromEnd),
            Name       = add_chrom(Name),
            Strand     = get_strand(Strand))
    return(res)
}

Apply processing functions

Code
### init
lst = lst_dat_read

### before preprocessed
print(lapply(lst, dim))
cat("++++++++++++++++++++++++++++++++++++++++\n")

### after proprocessed
lst = lapply(lst, fun_extract)
lst_dat_extract = lst
print(lapply(lst, dim))
$TMPRA_K562_OL13_20220512.hg19.raw
[1] 55229     9

$TMPRA_K562_OL13_20220512.hg19.norm
[1] 55229     9

$TMPRA_K562_OL43_20221003.hg38.raw
[1] 99307    12

$TMPRA_K562_OL43_20221003.hg38.norm
[1] 99307    12

$TMPRA_K562_OL45_20220927.hg38.raw
[1] 94381     9

$TMPRA_K562_OL45_20220927.hg38.norm
[1] 94381     9

++++++++++++++++++++++++++++++++++++++++
$TMPRA_K562_OL13_20220512.hg19.raw
[1] 43907    13

$TMPRA_K562_OL13_20220512.hg19.norm
[1] 43907    13

$TMPRA_K562_OL43_20221003.hg38.raw
[1] 96108    16

$TMPRA_K562_OL43_20221003.hg38.norm
[1] 96108    16

$TMPRA_K562_OL45_20220927.hg38.raw
[1] 91110    13

$TMPRA_K562_OL45_20220927.hg38.norm
[1] 91110    13

Show results

Code
set.seed(123)
lst = lst_dat_extract
lapply(lst, slice_sample, n=3)
$TMPRA_K562_OL13_20220512.hg19.raw
A tibble: 3 × 13
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
<chr> <chr> <int> <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int>
chr11:61559376-61559575_RC chr11 61559376 61559575 - 363 256 368 252 0 32 0 37
chr11:61628981-61629180_RC chr11 61628981 61629180 - 823 497 638 344 78 300 263 298
chr11:61628421-61628620 chr11 61628421 61628620 + 400 378 637 250 138 165 295 236
$TMPRA_K562_OL13_20220512.hg19.norm
A tibble: 3 × 13
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
<chr> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
chr11:61648876-61649075_RC chr11 61648876 61649075 - 303.0849 283.1768 304.58401 238.8869 179.7249 181.1592 0.0000 363.8974
chr11:61558796-61558995 chr11 61558796 61558995 + 488.8819 550.6760 591.39550 539.9232 842.9954 215.3781 674.4801 461.5880
chr11:61652526-61652725 chr11 61652526 61652725 + 118.9320 117.5821 86.41116 140.8073 0.0000 255.6357 442.8662 124.5555
$TMPRA_K562_OL43_20221003.hg38.raw
A tibble: 3 × 16
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Input.rep5 Input.rep6 Output.rep1 Output.rep2 Output.rep3 Output.rep4 Output.rep5
<chr> <chr> <int> <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
chr8:127212351-127212550 chr8 127212351 127212550 + 498 576 544 596 231 270 228 201 369 182 251
chrX:48539581-48539780 chrX 48539581 48539780 + 271 290 221 253 108 139 15877 10329 17175 14124 14530
chrX:48294251-48294450 chrX 48294251 48294450 + 181 221 277 253 96 106 38 73 128 74 105
$TMPRA_K562_OL43_20221003.hg38.norm
A tibble: 3 × 16
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Input.rep5 Input.rep6 Output.rep1 Output.rep2 Output.rep3 Output.rep4 Output.rep5
<chr> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
chr8:127441201-127441400 chr8 127441201 127441400 + 233.6449 222.3104 171.1604 221.1921 215.1536 252.4283 228.0332 121.5305 212.6315 184.5061 156.8328
chr8:127491151-127491350 chr8 127491151 127491350 + 438.3367 409.4564 412.1954 390.8565 355.9815 352.6189 642.2158 616.5884 457.6434 496.6147 508.6468
chr8:128028501-128028700 chr8 128028501 128028700 + 1120.4181 1169.3644 1135.3002 1182.6237 1090.1117 1122.9157 1002.1049 886.4576 943.3499 1046.6844 916.9771
$TMPRA_K562_OL45_20220927.hg38.raw
A tibble: 3 × 13
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
<chr> <chr> <int> <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int>
chr20:58250701-58250900 chr20 58250701 58250900 + 225 216 217 192 120 105 121 98
chr11:33783701-33783900 chr11 33783701 33783900 + 786 596 647 773 2050 1933 1917 2263
chr11:5570301-5570500 chr11 5570301 5570500 + 294 254 226 187 158 137 110 250
$TMPRA_K562_OL45_20220927.hg38.norm
A tibble: 3 × 13
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
<chr> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
chr2:60047901-60048100 chr2 60047901 60048100 + 635.3226 677.6804 700.1009 686.1342 446.9841 381.6018 419.6503 503.62848
chr11:33672501-33672700 chr11 33672501 33672700 + 711.1490 671.5987 617.9851 668.9163 507.3208 438.6918 417.9581 714.68343
chr11:34229601-34229800 chr11 34229601 34229800 + 128.8313 118.1597 126.1367 141.1870 155.1515 174.2748 135.3711 92.68551
Code
lst = lst_dat_extract
lapply(lst, function(dat){table(dat$Chrom)})
$TMPRA_K562_OL13_20220512.hg19.raw

chr11 
43907 

$TMPRA_K562_OL13_20220512.hg19.norm

chr11 
43907 

$TMPRA_K562_OL43_20221003.hg38.raw

 chr8  chrX 
41905 54203 

$TMPRA_K562_OL43_20221003.hg38.norm

 chr8  chrX 
41905 54203 

$TMPRA_K562_OL45_20220927.hg38.raw

chr11 chr16  chr2 chr20 
39857 11439 19917 19897 

$TMPRA_K562_OL45_20220927.hg38.norm

chr11 chr16  chr2 chr20 
39857 11439 19917 19897 

Filter out the fragments (Oligos) where mean input is zero

Define helper functions

Code
fun_filter = function(dat){
    ### Input:  dataframe
    ### Output: dataframe
    ### Description:
    ###     calculate the mean of input for each fragment and 
    ###     filter out the fragment w/ plasmid mean is zero
    ###     - convert wide matrix to long matrix
    ###     - group by the input and output to calculate the mean
    ###     - get the non-zero fragments from the input dataframe
    
    ### calculate the mean of input for each fragment and 
    tmp = dat %>% 
        tidyr::gather(Sample, Value, -Chrom, -ChromStart, -ChromEnd, -Name, -Strand) %>% 
        tidyr::separate(Sample, c("Group", "Rep"), sep="\\.", remove=FALSE) %>%
        dplyr::filter(Group == "Input") %>%
        dplyr::group_by(Chrom, ChromStart, ChromEnd, Name, Strand, Group) %>%
        dplyr::summarise(Value = mean(Value), .groups="drop")
    
    ### filter out the fragment w/ plasmid mean is zero
    tmp = tmp %>% dplyr::filter(Value > 0)
    idx = unique(tmp$Name)
    res = dat %>% dplyr::filter(Name %in% idx)
    return(res)
}

Filter fragments

Code
### init
lst = lst_dat_extract

### show info before processing
print(lapply(lst, dim))
cat("++++++++++++++++++++++++++++++++++++++++\n")

### processing
lst = lapply(lst, fun_filter)

### assign and show info
lst_dat_filter = lst
print(lapply(lst, dim))
$TMPRA_K562_OL13_20220512.hg19.raw
[1] 43907    13

$TMPRA_K562_OL13_20220512.hg19.norm
[1] 43907    13

$TMPRA_K562_OL43_20221003.hg38.raw
[1] 96108    16

$TMPRA_K562_OL43_20221003.hg38.norm
[1] 96108    16

$TMPRA_K562_OL45_20220927.hg38.raw
[1] 91110    13

$TMPRA_K562_OL45_20220927.hg38.norm
[1] 91110    13

++++++++++++++++++++++++++++++++++++++++
$TMPRA_K562_OL13_20220512.hg19.raw
[1] 43871    13

$TMPRA_K562_OL13_20220512.hg19.norm
[1] 43871    13

$TMPRA_K562_OL43_20221003.hg38.raw
[1] 96087    16

$TMPRA_K562_OL43_20221003.hg38.norm
[1] 96087    16

$TMPRA_K562_OL45_20220927.hg38.raw
[1] 91092    13

$TMPRA_K562_OL45_20220927.hg38.norm
[1] 91092    13

Show results

Code
lst = lst_dat_filter
dat = lst[[1]]
head(dat, 3)
A tibble: 3 × 13
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
<chr> <chr> <int> <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int>
chr11:61555216-61555415 chr11 61555216 61555415 + 1609 1221 1396 798 845 283 574 1223
chr11:61555231-61555430_RC chr11 61555231 61555430 - 1179 582 1225 721 476 416 912 956
chr11:61555216-61555415_RC chr11 61555216 61555415 - 1066 643 1206 681 470 846 540 847

Calculate mean for normalized count table

Define helper functions

Code
fun_add_mean_column = function(dat){
    ### get input and output to calculate log2FC
    x_inp = dat %>% dplyr::select(starts_with("Input"))  %>% apply(., 1, mean)
    x_out = dat %>% dplyr::select(starts_with("Output")) %>% apply(., 1, mean)
    x_lfc = log2(x_out) - log2(x_inp)
    
    ### add columns
    dat$Input.mean  = x_inp
    dat$Output.mean = x_out
    dat$Log2FC.mean = x_lfc
 
    return(dat)
}

Calculate mean and add it as an additional column for normalized count

Code
lst = lst_dat_filter
print(names(lst))
cat("=======================\n")
sid = grep("norm", names(lst), value=TRUE)
print(sid)
[1] "TMPRA_K562_OL13_20220512.hg19.raw"  "TMPRA_K562_OL13_20220512.hg19.norm"
[3] "TMPRA_K562_OL43_20221003.hg38.raw"  "TMPRA_K562_OL43_20221003.hg38.norm"
[5] "TMPRA_K562_OL45_20220927.hg38.raw"  "TMPRA_K562_OL45_20220927.hg38.norm"
=======================
[1] "TMPRA_K562_OL13_20220512.hg19.norm" "TMPRA_K562_OL43_20221003.hg38.norm"
[3] "TMPRA_K562_OL45_20220927.hg38.norm"
Code
### init
lst  = lst_dat_filter
idxs = grep("norm", names(lst), value=TRUE)

### add mean for normalization count
for (idx in idxs){
    cat(idx, "\n")
    lst[[idx]] = fun_add_mean_column(lst[[idx]])
}

### assigned
lst_dat_prep = lst
TMPRA_K562_OL13_20220512.hg19.norm 
TMPRA_K562_OL43_20221003.hg38.norm 
TMPRA_K562_OL45_20220927.hg38.norm 

Show results

Code
lst = lst_dat_prep
idx = names(lst)[1]
dat = lst_dat_prep[[1]]

cat(idx)
head(dat, 3)
TMPRA_K562_OL13_20220512.hg19.raw
A tibble: 3 × 13
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
<chr> <chr> <int> <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int>
chr11:61555216-61555415 chr11 61555216 61555415 + 1609 1221 1396 798 845 283 574 1223
chr11:61555231-61555430_RC chr11 61555231 61555430 - 1179 582 1225 721 476 416 912 956
chr11:61555216-61555415_RC chr11 61555216 61555415 - 1066 643 1206 681 470 846 540 847
Code
lst = lst_dat_prep
idx = names(lst)[2]
dat = lst_dat_prep[[2]]

cat(idx)
head(dat, 3)
TMPRA_K562_OL13_20220512.hg19.norm
A tibble: 3 × 16
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4 Input.mean Output.mean Log2FC.mean
<chr> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
chr11:61555216-61555415 chr11 61555216 61555415 + 881.8510 1196.3976 855.5317 774.9257 1807.947 569.6449 730.4747 1493.445 927.1765 1150.378 0.3111919
chr11:61555231-61555430_RC chr11 61555231 61555430 - 646.1792 570.2730 750.7352 700.1522 1018.441 837.3579 1160.6148 1167.403 666.8349 1045.954 0.6494180
chr11:61555216-61555415_RC chr11 61555216 61555415 - 584.2468 630.0439 739.0912 661.3088 1005.604 1702.8961 687.2061 1034.299 653.6727 1107.501 0.7606680
Code
lst = lst_dat_prep
idx = names(lst)[3]
dat = lst_dat_prep[[3]]

cat(idx)
head(dat, 3)
TMPRA_K562_OL43_20221003.hg38.raw
A tibble: 3 × 16
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Input.rep5 Input.rep6 Output.rep1 Output.rep2 Output.rep3 Output.rep4 Output.rep5
<chr> <chr> <int> <int> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
chr8:127381651-127381850 chr8 127381651 127381850 + 1318 1810 1477 1505 738 687 620 636 781 520 538
chr8:127320401-127320600 chr8 127320401 127320600 + 1318 1810 1477 1505 738 687 620 636 781 520 538
chr8:127742001-127742200 chr8 127742001 127742200 + 634 730 698 761 316 343 158 194 268 141 251
Code
lst = lst_dat_prep
idx = names(lst)[4]
dat = lst_dat_prep[[4]]

cat(idx)
head(dat, 3)
TMPRA_K562_OL43_20221003.hg38.norm
A tibble: 3 × 19
Name Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3 Input.rep4 Input.rep5 Input.rep6 Output.rep1 Output.rep2 Output.rep3 Output.rep4 Output.rep5 Input.mean Output.mean Log2FC.mean
<chr> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
chr8:127381651-127381850 chr8 127381651 127381850 + 887.4466 1078.7715 946.8314 945.7219 962.3235 893.9085 961.7725 1136.6673 842.9706 896.6654 760.1443 952.5006 919.6440 -0.0506444
chr8:127320401-127320600 chr8 127320401 127320600 + 887.4466 1078.7715 946.8314 945.7219 962.3235 893.9085 961.7725 1136.6673 842.9706 896.6654 760.1443 952.5006 919.6440 -0.0506444
chr8:127742001-127742200 chr8 127742001 127742200 + 426.8901 435.0846 447.4531 478.2022 412.0518 446.3037 245.0969 346.7193 289.2652 243.1343 354.6398 440.9976 295.7711 -0.5762898

Store the fragments and counts

<Data>
* OL13 (FADS)
* OL43 (GATA/MYC)
* OL45 (HBE1/LMO2/RBM38/HBA2/BCL11A)

- OL13_20220512_counts.out
- OL13_20220512_normalized_counts.out
- OL43_20211228_counts.out      
- OL43_20211228_normalized_counts.out
- OL43_20221003_counts.out 
- OL43_20221003_K562_normalized_counts.out
- OL45_20220927_counts.out 
- OL45_20220927_K562_normalized_counts.out

<Results>
PROJECT/results/assay_fcc
└── MPRA_Tiling_K562_Tewhey_Hannah
    ├── fragment_counts
    │   └── summary
    │       ├── TMPRA_K562_OL13_20220512.hg19.raw.stranded_pos.tsv 
    │       ├── TMPRA_K562_OL13_20220512.hg19.norm.stranded_pos.tsv 
    │       ├── TMPRA_K562_OL43_20221003.hg38.raw.stranded_pos.tsv 
    │       ├── TMPRA_K562_OL43_20221003.hg38.norm.stranded_pos.tsv 
    │       ├── TMPRA_K562_OL45_20220927.hg38.raw.stranded_pos.tsv 
    │       └── TMPRA_K562_OL45_20220927.hg38.norm.stranded_pos.tsv 
    │
    └── fragment
        ├── TMPRA_K562_OL13_20220512.hg19.raw.stranded_pos.bed.gz 
        ├── TMPRA_K562_OL43_20221003.hg38.raw.stranded_pos.bed.gz 
        └── TMPRA_K562_OL45_20220927.hg38.raw.stranded_pos.bed.gz 

Setup output directory

Code
### create fragment folder
txt_folder = "fragment"
txt_fdiry = file.path(FD_RES, "assay_fcc", TXT_ASSAY, txt_folder)
cat(txt_fdiry, "\n")

txt_cmd = paste("mkdir -p", txt_fdiry)
system(txt_cmd)

### create fragment count folders
txt_folder = "fragment_counts/summary"
txt_fdiry = file.path(FD_RES, "assay_fcc", TXT_ASSAY, txt_folder)
cat(txt_fdiry, "\n")

txt_cmd = paste("mkdir -p", txt_fdiry)
system(txt_cmd)
/mount/repo/Proj_ENCODE_FCC/results/assay_fcc/MPRA_Tiling_K562_Tewhey_Hannah/fragment 
/mount/repo/Proj_ENCODE_FCC/results/assay_fcc/MPRA_Tiling_K562_Tewhey_Hannah/fragment_counts/summary 
Code
### check loop names
lst  = lst_dat_prep
idxs = grep("raw", names(lst), value=TRUE)
for (idx in idxs) {cat(idx, "\n")}
TMPRA_K562_OL13_20220512.hg19.raw 
TMPRA_K562_OL43_20221003.hg38.raw 
TMPRA_K562_OL45_20220927.hg38.raw 

Store fragments

Code
txt_assay  = TXT_ASSAY
txt_folder = "fragment"
txt_strand = "stranded_pos"

lst  = lst_dat_prep
idxs = grep("raw", names(lst), value=TRUE)

txt_fdiry = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
cat("Output directory:", "\n")
cat(txt_fdiry, "\n")

for (idx in idxs){
    ### show progres
    cat("\n=======================")
    cat("\nSample:", idx, "\n")
    
    ### init
    dat = lst[[idx]]
    
    ### extract positive trands and order by positions
    dat = dat %>% 
        dplyr::filter(Strand == "+") %>% 
        dplyr::arrange(Chrom, ChromStart, ChromEnd)
    
    ### arrange the column into bed file format
    dat = dat %>%
        dplyr::mutate(Score = ".") %>%
        dplyr::select(Chrom, ChromStart, ChromEnd, Name, Score, Strand)
    
    ### save table
    txt_fname = paste(idx, txt_strand, "bed.gz", sep=".")
    txt_fpath = file.path(txt_fdiry, txt_fname)
    write_tsv(dat, txt_fpath, col_names=FALSE)
    
    ### show progress
    #print(fpath)
    print(head(dat))
    cat("\nSaved Table:", txt_fname, "\n")
}
Output directory: 
/mount/repo/Proj_ENCODE_FCC/results/assay_fcc/MPRA_Tiling_K562_Tewhey_Hannah/fragment 

=======================
Sample: TMPRA_K562_OL13_20220512.hg19.raw 
# A tibble: 6 × 6
  Chrom ChromStart ChromEnd Name                    Score Strand
  <chr>      <int>    <int> <chr>                   <chr> <chr> 
1 chr11   61554801 61555000 chr11:61554801-61555000 .     +     
2 chr11   61554806 61555005 chr11:61554806-61555005 .     +     
3 chr11   61554811 61555010 chr11:61554811-61555010 .     +     
4 chr11   61554816 61555015 chr11:61554816-61555015 .     +     
5 chr11   61554821 61555020 chr11:61554821-61555020 .     +     
6 chr11   61554826 61555025 chr11:61554826-61555025 .     +     

Saved Table: TMPRA_K562_OL13_20220512.hg19.raw.stranded_pos.bed.gz 

=======================
Sample: TMPRA_K562_OL43_20221003.hg38.raw 
# A tibble: 6 × 6
  Chrom ChromStart  ChromEnd Name                     Score Strand
  <chr>      <int>     <int> <chr>                    <chr> <chr> 
1 chr8   126735901 126736100 chr8:126735901-126736100 .     +     
2 chr8   126735951 126736150 chr8:126735951-126736150 .     +     
3 chr8   126736001 126736200 chr8:126736001-126736200 .     +     
4 chr8   126736051 126736250 chr8:126736051-126736250 .     +     
5 chr8   126736101 126736300 chr8:126736101-126736300 .     +     
6 chr8   126736151 126736350 chr8:126736151-126736350 .     +     

Saved Table: TMPRA_K562_OL43_20221003.hg38.raw.stranded_pos.bed.gz 

=======================
Sample: TMPRA_K562_OL45_20220927.hg38.raw 
# A tibble: 6 × 6
  Chrom ChromStart ChromEnd Name                  Score Strand
  <chr>      <int>    <int> <chr>                 <chr> <chr> 
1 chr11    4505501  4505700 chr11:4505501-4505700 .     +     
2 chr11    4505601  4505800 chr11:4505601-4505800 .     +     
3 chr11    4505701  4505900 chr11:4505701-4505900 .     +     
4 chr11    4505801  4506000 chr11:4505801-4506000 .     +     
5 chr11    4505901  4506100 chr11:4505901-4506100 .     +     
6 chr11    4506001  4506200 chr11:4506001-4506200 .     +     

Saved Table: TMPRA_K562_OL45_20220927.hg38.raw.stranded_pos.bed.gz 

Store table of counts

Code
txt_assay  = TXT_ASSAY
txt_folder = "fragment_counts/summary"
txt_strand = "stranded_pos"

lst  = lst_dat_prep
idxs = names(lst)

txt_fdiry = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
cat("Output directory:", "\n")
cat(txt_fdiry, "\n")

for (idx in idxs){
    ### show progres
    cat("\n=======================")
    cat("\nSample:", idx, "\n")
    
    ### init
    dat = lst[[idx]]
    
    ### extract positive trands and order by positions
    dat = dat %>% 
       dplyr::filter(Strand == "+") %>% 
       dplyr::arrange(Chrom, ChromStart, ChromEnd)
    
    ### save table
    txt_fname = paste(idx, txt_strand, "tsv", sep=".")
    txt_fpath = file.path(txt_fdiry, txt_fname)
    write_tsv(dat, txt_fpath)

    ### show progress
    print(head(dat))
    cat("\nSaved Table:", txt_fname, "\n")
}
Output directory: 
/mount/repo/Proj_ENCODE_FCC/results/assay_fcc/MPRA_Tiling_K562_Tewhey_Hannah/fragment_counts/summary 

=======================
Sample: TMPRA_K562_OL13_20220512.hg19.raw 
# A tibble: 6 × 13
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <int>      <int>      <int>
1 chr11:61554… chr11   61554801 61555000 +             971        542        785
2 chr11:61554… chr11   61554806 61555005 +            1267        562       1059
3 chr11:61554… chr11   61554811 61555010 +            1183        641       1118
4 chr11:61554… chr11   61554816 61555015 +            1020        476       1086
5 chr11:61554… chr11   61554821 61555020 +            1138        580        894
6 chr11:61554… chr11   61554826 61555025 +             702        406        745
# ℹ 5 more variables: Input.rep4 <int>, Output.rep1 <int>, Output.rep2 <int>,
#   Output.rep3 <int>, Output.rep4 <int>

Saved Table: TMPRA_K562_OL13_20220512.hg19.raw.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL13_20220512.hg19.norm 
# A tibble: 6 × 16
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <dbl>      <dbl>      <dbl>
1 chr11:61554… chr11   61554801 61555000 +            532.       531.       481.
2 chr11:61554… chr11   61554806 61555005 +            694.       551.       649.
3 chr11:61554… chr11   61554811 61555010 +            648.       628.       685.
4 chr11:61554… chr11   61554816 61555015 +            559.       466.       666.
5 chr11:61554… chr11   61554821 61555020 +            624.       568.       548.
6 chr11:61554… chr11   61554826 61555025 +            385.       398.       457.
# ℹ 8 more variables: Input.rep4 <dbl>, Output.rep1 <dbl>, Output.rep2 <dbl>,
#   Output.rep3 <dbl>, Output.rep4 <dbl>, Input.mean <dbl>, Output.mean <dbl>,
#   Log2FC.mean <dbl>

Saved Table: TMPRA_K562_OL13_20220512.hg19.norm.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL43_20221003.hg38.raw 
# A tibble: 6 × 16
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <int>      <int>      <int>
1 chr8:126735… chr8   126735901   1.27e8 +            1190       1409       1259
2 chr8:126735… chr8   126735951   1.27e8 +             194        161        161
3 chr8:126736… chr8   126736001   1.27e8 +             746        757        708
4 chr8:126736… chr8   126736051   1.27e8 +             934       1056        988
5 chr8:126736… chr8   126736101   1.27e8 +             697        887        772
6 chr8:126736… chr8   126736151   1.27e8 +            1225       1454       1264
# ℹ 8 more variables: Input.rep4 <int>, Input.rep5 <int>, Input.rep6 <int>,
#   Output.rep1 <int>, Output.rep2 <int>, Output.rep3 <int>, Output.rep4 <int>,
#   Output.rep5 <int>

Saved Table: TMPRA_K562_OL43_20221003.hg38.raw.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL43_20221003.hg38.norm 
# A tibble: 6 × 19
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <dbl>      <dbl>      <dbl>
1 chr8:126735… chr8   126735901   1.27e8 +            801.      840.        807.
2 chr8:126735… chr8   126735951   1.27e8 +            131.       96.0       103.
3 chr8:126736… chr8   126736001   1.27e8 +            502.      451.        454.
4 chr8:126736… chr8   126736051   1.27e8 +            629.      629.        633.
5 chr8:126736… chr8   126736101   1.27e8 +            469.      529.        495.
6 chr8:126736… chr8   126736151   1.27e8 +            825.      867.        810.
# ℹ 11 more variables: Input.rep4 <dbl>, Input.rep5 <dbl>, Input.rep6 <dbl>,
#   Output.rep1 <dbl>, Output.rep2 <dbl>, Output.rep3 <dbl>, Output.rep4 <dbl>,
#   Output.rep5 <dbl>, Input.mean <dbl>, Output.mean <dbl>, Log2FC.mean <dbl>

Saved Table: TMPRA_K562_OL43_20221003.hg38.norm.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL45_20220927.hg38.raw 
# A tibble: 6 × 13
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <int>      <int>      <int>
1 chr11:45055… chr11    4505501  4505700 +             811        664        917
2 chr11:45056… chr11    4505601  4505800 +             693        552        607
3 chr11:45057… chr11    4505701  4505900 +             577        414        448
4 chr11:45058… chr11    4505801  4506000 +             855        754        688
5 chr11:45059… chr11    4505901  4506100 +             317        217        313
6 chr11:45060… chr11    4506001  4506200 +             444        422        454
# ℹ 5 more variables: Input.rep4 <int>, Output.rep1 <int>, Output.rep2 <int>,
#   Output.rep3 <int>, Output.rep4 <int>

Saved Table: TMPRA_K562_OL45_20220927.hg38.raw.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL45_20220927.hg38.norm 
# A tibble: 6 × 16
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <dbl>      <dbl>      <dbl>
1 chr11:45055… chr11    4505501  4505700 +            597.       577.       776.
2 chr11:45056… chr11    4505601  4505800 +            510.       480.       514.
3 chr11:45057… chr11    4505701  4505900 +            425.       360.       379.
4 chr11:45058… chr11    4505801  4506000 +            629.       655.       582.
5 chr11:45059… chr11    4505901  4506100 +            233.       189.       265.
6 chr11:45060… chr11    4506001  4506200 +            327.       367.       384.
# ℹ 8 more variables: Input.rep4 <dbl>, Output.rep1 <dbl>, Output.rep2 <dbl>,
#   Output.rep3 <dbl>, Output.rep4 <dbl>, Input.mean <dbl>, Output.mean <dbl>,
#   Log2FC.mean <dbl>

Saved Table: TMPRA_K562_OL45_20220927.hg38.norm.stranded_pos.tsv