Prepare Tiling MPRA data 03

Extract fragment counts from Tiling MPRA data

Set environment

Code

suppressMessages(suppressWarnings(source("../run_config_project_sing.R")))
show_env()

You are working on        Singularity 
BASE DIRECTORY (FD_BASE): /mount 
REPO DIRECTORY (FD_REPO): /mount/repo 
WORK DIRECTORY (FD_WORK): /mount/work 
DATA DIRECTORY (FD_DATA): /mount/data 

You are working with      ENCODE FCC 
PATH OF PROJECT (FD_PRJ): /mount/repo/Proj_ENCODE_FCC 
PROJECT RESULTS (FD_RES): /mount/repo/Proj_ENCODE_FCC/results 
PROJECT SCRIPTS (FD_EXE): /mount/repo/Proj_ENCODE_FCC/scripts 
PROJECT DATA    (FD_DAT): /mount/repo/Proj_ENCODE_FCC/data 
PROJECT NOTE    (FD_NBK): /mount/repo/Proj_ENCODE_FCC/notebooks 
PROJECT DOCS    (FD_DOC): /mount/repo/Proj_ENCODE_FCC/docs 
PROJECT LOG     (FD_LOG): /mount/repo/Proj_ENCODE_FCC/log 
PROJECT APP     (FD_APP): /mount/repo/Proj_ENCODE_FCC/app 
PROJECT REF     (FD_REF): /mount/repo/Proj_ENCODE_FCC/references

Code

TXT_ASSAY = "MPRA_Tiling_K562_Tewhey_Hannah"

Import data

<Data>
* OL13 (FADS)
* OL43 (GATA/MYC)
* OL45 (HBE1/LMO2/RBM38/HBA2/BCL11A)

- OL13_20220512_counts.out
- OL13_20220512_normalized_counts.out
- OL43_20211228_counts.out      
- OL43_20211228_normalized_counts.out
- OL43_20221003_counts.out 
- OL43_20221003_K562_normalized_counts.out
- OL45_20220927_counts.out 
- OL45_20220927_K562_normalized_counts.out

Check data

Code

fdiry = file.path(FD_DAT, "processed", TXT_ASSAY, "tiling_counts")
print(dir(fdiry))

 [1] "FADS_tile_snp.20190214.attributes"       
 [2] "OL13_20220512_counts.out"                
 [3] "OL13_20220512_normalized_counts.out"     
 [4] "OL43_20211228_counts.out"                
 [5] "OL43_20211228_normalized_counts.out"     
 [6] "OL43_20221003_counts.out"                
 [7] "OL43_20221003_K562_normalized_counts.out"
 [8] "OL43_K562.bed"                           
 [9] "OL43.attributes"                         
[10] "OL45_20220927_counts.out"                
[11] "OL45_20220927_K562_normalized_counts.out"
[12] "OL45_K562.bed"                           
[13] "OL45.attributes"                         
[14] "UKBB_GTEx"

Code

fdiry = file.path(FD_DAT, "processed", TXT_ASSAY, "tiling_counts")
fpaths = dir(fdiry)
print(grep("out", fpaths, value=TRUE))

[1] "OL13_20220512_counts.out"                
[2] "OL13_20220512_normalized_counts.out"     
[3] "OL43_20211228_counts.out"                
[4] "OL43_20211228_normalized_counts.out"     
[5] "OL43_20221003_counts.out"                
[6] "OL43_20221003_K562_normalized_counts.out"
[7] "OL45_20220927_counts.out"                
[8] "OL45_20220927_K562_normalized_counts.out"

Setup metadata

Code

dat_meta = data.frame(
    Dataset = c(
        "OL13_20220512", "OL13_20220512", 
        "OL43_20221003", "OL43_20221003", 
        "OL45_20220927", "OL45_20220927"),
    Process = c("raw", "norm", "raw", "norm", "raw", "norm"),
    Genome  = c("hg19", "hg19", "hg38", "hg38", "hg38", "hg38"),
    N_Rep_Input  = c(4, 4, 6, 6, 4, 4),  
    N_Rep_Output = c(4, 4, 5, 5, 4, 4), 
    FName = c(
        "OL13_20220512_counts.out",
        "OL13_20220512_normalized_counts.out",
        "OL43_20221003_counts.out",
        "OL43_20221003_K562_normalized_counts.out",
        "OL45_20220927_counts.out",
        "OL45_20220927_K562_normalized_counts.out")
)
dat_meta

A data.frame: 6 × 6
Dataset	Process	Genome	N_Rep_Input	N_Rep_Output	FName
<chr>	<chr>	<chr>	<dbl>	<dbl>	<chr>
OL13_20220512	raw	hg19	4	4	OL13_20220512_counts.out
OL13_20220512	norm	hg19	4	4	OL13_20220512_normalized_counts.out
OL43_20221003	raw	hg38	6	5	OL43_20221003_counts.out
OL43_20221003	norm	hg38	6	5	OL43_20221003_K562_normalized_counts.out
OL45_20220927	raw	hg38	4	4	OL45_20220927_counts.out
OL45_20220927	norm	hg38	4	4	OL45_20220927_K562_normalized_counts.out

Read data

Code

lst_dat_read = lapply(1:nrow(dat_meta), function(idx){

    ### Extract data info from the metadata table
    xrow  = dat_meta[idx,]
    fname = xrow$FName
    n_rep_input  = xrow$N_Rep_Input
    n_rep_output = xrow$N_Rep_Output
    
    ### Set file directory
    fdiry = file.path(FD_DAT, "processed", TXT_ASSAY, "tiling_counts")
    fpath = file.path(fdiry, fname)
          
    ### import data
    dat = read.table(fpath, row.names=1)
    dat = dat %>% rownames_to_column(var = "Name")

    ### rename columns & assign
    cnames = c(
        "Name", 
        paste0("Input.rep",  1:n_rep_input), 
        paste0("Output.rep", 1:n_rep_output)
    )
    colnames(dat) = cnames
    
    ### show progress and return
    cat("\n=======================\n")
    cat(fname, "\n")
    cat("Shape:", dim(dat), "\n")
    print(head(dat, 3))
    flush.console()
    return(dat)
})

names(lst_dat_read) = paste(
    paste("TMPRA_K562", dat_meta$Dataset, sep="_"), 
    dat_meta$Genome, 
    dat_meta$Process, sep=".")


=======================
OL13_20220512_counts.out 
Shape: 55229 9 
                                               Name Input.rep1 Input.rep2
1       (11:61555216-61555415;11:61555315:T:C_A_wC)       1609       1221
2 (11:61555231-61555430_RC;11:61555330:T:C_A_wC_RC)       1179        582
3 (11:61555315:T:C_A_wC_RC;11:61555216-61555415_RC)       1066        643
  Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
1       1396        798         845         283         574        1223
2       1225        721         476         416         912         956
3       1206        681         470         846         540         847

=======================
OL13_20220512_normalized_counts.out 
Shape: 55229 9 
                                               Name Input.rep1 Input.rep2
1       (11:61555216-61555415;11:61555315:T:C_A_wC)   881.8510  1196.3976
2 (11:61555231-61555430_RC;11:61555330:T:C_A_wC_RC)   646.1792   570.2730
3 (11:61555315:T:C_A_wC_RC;11:61555216-61555415_RC)   584.2468   630.0439
  Input.rep3 Input.rep4 Output.rep1 Output.rep2 Output.rep3 Output.rep4
1   855.5317   774.9257    1807.947    569.6449    730.4747    1493.445
2   750.7352   700.1522    1018.441    837.3579   1160.6148    1167.403
3   739.0912   661.3088    1005.604   1702.8961    687.2061    1034.299

=======================
OL43_20221003_counts.out 
Shape: 99307 12 
                                                     Name Input.rep1 Input.rep2
1           (8:127381651-127381850;8:127320401-127320600)       1318       1810
2 (8:127742001-127742200;Nadav:negCtrl:seq31776.neg1.MYC)        634        730
3 (8:128045001-128045200;Nadav:posCtrl:seq31797.pos5.MYC)        416        432
  Input.rep3 Input.rep4 Input.rep5 Input.rep6 Output.rep1 Output.rep2
1       1477       1505        738        687         620         636
2        698        761        316        343         158         194
3        418        409        181        211       11164        6818
  Output.rep3 Output.rep4 Output.rep5
1         781         520         538
2         268         141         251
3       12118        9512       10450

=======================
OL43_20221003_K562_normalized_counts.out 
Shape: 99307 12 
                                                     Name Input.rep1 Input.rep2
1           (8:127381651-127381850;8:127320401-127320600)   887.4466  1078.7715
2 (8:127742001-127742200;Nadav:negCtrl:seq31776.neg1.MYC)   426.8901   435.0846
3 (8:128045001-128045200;Nadav:posCtrl:seq31797.pos5.MYC)   280.1045   257.4747
  Input.rep3 Input.rep4 Input.rep5 Input.rep6 Output.rep1 Output.rep2
1   946.8314   945.7219   962.3235   893.9085    961.7725   1136.6673
2   447.4531   478.2022   412.0518   446.3037    245.0969    346.7193
3   267.9590   257.0101   236.0170   274.5483  17318.1102  12185.2169
  Output.rep3 Output.rep4 Output.rep5
1    842.9706    896.6654    760.1443
2    289.2652    243.1343    354.6398
3  13079.5358  16402.0789  14764.8855

=======================
OL45_20220927_counts.out 
Shape: 94381 9 
                                                                  Name
1 (Nadav:posCtrl:seq33763.pos1.GATA1;Nadav:posCtrl:ENSG00000102317.270
2                                                  1:10437778:C:T:R:wC
3                                                  1:10451799:C:T:R:wC
  Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2
1        897        717        761        724       56816       37391
2        645        437        491        493        2239        1698
3        603        584        512        429         336         280
  Output.rep3 Output.rep4
1       36946       52349
2        1202        2532
3         321         333

=======================
OL45_20220927_K562_normalized_counts.out 
Shape: 94381 9 
                                                                  Name
1 (Nadav:posCtrl:seq33763.pos1.GATA1;Nadav:posCtrl:ENSG00000102317.270
2                                                  1:10437778:C:T:R:wC
3                                                  1:10451799:C:T:R:wC
  Input.rep1 Input.rep2 Input.rep3 Input.rep4 Output.rep1 Output.rep2
1   660.3527   622.9447   644.2283   623.2888  69961.0133  56175.0918
2   474.8355   379.6748   415.6585   424.4218   2757.0175   2551.0231
3   443.9160   507.3915   433.4361   369.3244    413.7373    420.6634
  Output.rep3 Output.rep4
1  62517.7362  58457.7541
2   2033.9501   2827.4663
3    543.1763    371.8587

Preprocess the table: Extract only tiling regions

Define helper functions for data processing

Code

get_strand = function(x){ifelse(is.na(x), "+", "-")}
add_chrom  = function(x){ifelse(str_detect(x, "chr"), x, paste0("chr", x))}

fun_extract   = function(dat) {
    ### Input:  dataframe
    ### Output: dataframe
    ### Description:
    ###     Preprocess the raw dataframe
    ###     - expand duplicates
    ###     - filter and extract only tiling oligos
    ###     - get location information
    ### ===========================================
    ### init regex pattern
    pattern = "^[chr|0-9|X]+:[0-9]*-[0-9]*"
    
    ### preprocess
    res = dat %>% 
        dplyr::mutate(Name = gsub("\\(|\\)", "", Name)) %>% 
        tidyr::separate_rows(Name, sep=";") %>%
        dplyr::filter(!grepl(pattern = "Nadav", Name)) %>%
        dplyr::filter(str_detect(string=Name, pattern=pattern)) %>%
        tidyr::separate(Name, c("Chrom", "ChromStart", "ChromEnd", "Strand"), remove=FALSE, fill="right") %>%
        dplyr::mutate(
            Chrom      = add_chrom(Chrom),
            ChromStart = as.integer(ChromStart),
            ChromEnd   = as.integer(ChromEnd),
            Name       = add_chrom(Name),
            Strand     = get_strand(Strand))
    return(res)
}

Apply processing functions

Code

### init
lst = lst_dat_read

### before preprocessed
print(lapply(lst, dim))
cat("++++++++++++++++++++++++++++++++++++++++\n")

### after proprocessed
lst = lapply(lst, fun_extract)
lst_dat_extract = lst
print(lapply(lst, dim))

$TMPRA_K562_OL13_20220512.hg19.raw
[1] 55229     9

$TMPRA_K562_OL13_20220512.hg19.norm
[1] 55229     9

$TMPRA_K562_OL43_20221003.hg38.raw
[1] 99307    12

$TMPRA_K562_OL43_20221003.hg38.norm
[1] 99307    12

$TMPRA_K562_OL45_20220927.hg38.raw
[1] 94381     9

$TMPRA_K562_OL45_20220927.hg38.norm
[1] 94381     9

++++++++++++++++++++++++++++++++++++++++
$TMPRA_K562_OL13_20220512.hg19.raw
[1] 43907    13

$TMPRA_K562_OL13_20220512.hg19.norm
[1] 43907    13

$TMPRA_K562_OL43_20221003.hg38.raw
[1] 96108    16

$TMPRA_K562_OL43_20221003.hg38.norm
[1] 96108    16

$TMPRA_K562_OL45_20220927.hg38.raw
[1] 91110    13

$TMPRA_K562_OL45_20220927.hg38.norm
[1] 91110    13

Show results

Code

set.seed(123)
lst = lst_dat_extract
lapply(lst, slice_sample, n=3)

$TMPRA_K562_OL13_20220512.hg19.raw

A tibble: 3 × 13
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Output.rep1	Output.rep2	Output.rep3	Output.rep4
<chr>	<chr>	<int>	<int>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
chr11:61559376-61559575_RC	chr11	61559376	61559575	-	363	256	368	252	0	32	0	37
chr11:61628981-61629180_RC	chr11	61628981	61629180	-	823	497	638	344	78	300	263	298
chr11:61628421-61628620	chr11	61628421	61628620	+	400	378	637	250	138	165	295	236

$TMPRA_K562_OL13_20220512.hg19.norm

A tibble: 3 × 13
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Output.rep1	Output.rep2	Output.rep3	Output.rep4
<chr>	<chr>	<int>	<int>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
chr11:61648876-61649075_RC	chr11	61648876	61649075	-	303.0849	283.1768	304.58401	238.8869	179.7249	181.1592	0.0000	363.8974
chr11:61558796-61558995	chr11	61558796	61558995	+	488.8819	550.6760	591.39550	539.9232	842.9954	215.3781	674.4801	461.5880
chr11:61652526-61652725	chr11	61652526	61652725	+	118.9320	117.5821	86.41116	140.8073	0.0000	255.6357	442.8662	124.5555

$TMPRA_K562_OL43_20221003.hg38.raw

A tibble: 3 × 16
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Input.rep5	Input.rep6	Output.rep1	Output.rep2	Output.rep3	Output.rep4	Output.rep5
<chr>	<chr>	<int>	<int>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
chr8:127212351-127212550	chr8	127212351	127212550	+	498	576	544	596	231	270	228	201	369	182	251
chrX:48539581-48539780	chrX	48539581	48539780	+	271	290	221	253	108	139	15877	10329	17175	14124	14530
chrX:48294251-48294450	chrX	48294251	48294450	+	181	221	277	253	96	106	38	73	128	74	105

$TMPRA_K562_OL43_20221003.hg38.norm

A tibble: 3 × 16
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Input.rep5	Input.rep6	Output.rep1	Output.rep2	Output.rep3	Output.rep4	Output.rep5
<chr>	<chr>	<int>	<int>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
chr8:127441201-127441400	chr8	127441201	127441400	+	233.6449	222.3104	171.1604	221.1921	215.1536	252.4283	228.0332	121.5305	212.6315	184.5061	156.8328
chr8:127491151-127491350	chr8	127491151	127491350	+	438.3367	409.4564	412.1954	390.8565	355.9815	352.6189	642.2158	616.5884	457.6434	496.6147	508.6468
chr8:128028501-128028700	chr8	128028501	128028700	+	1120.4181	1169.3644	1135.3002	1182.6237	1090.1117	1122.9157	1002.1049	886.4576	943.3499	1046.6844	916.9771

$TMPRA_K562_OL45_20220927.hg38.raw

A tibble: 3 × 13
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Output.rep1	Output.rep2	Output.rep3	Output.rep4
<chr>	<chr>	<int>	<int>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
chr20:58250701-58250900	chr20	58250701	58250900	+	225	216	217	192	120	105	121	98
chr11:33783701-33783900	chr11	33783701	33783900	+	786	596	647	773	2050	1933	1917	2263
chr11:5570301-5570500	chr11	5570301	5570500	+	294	254	226	187	158	137	110	250

$TMPRA_K562_OL45_20220927.hg38.norm

A tibble: 3 × 13
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Output.rep1	Output.rep2	Output.rep3	Output.rep4
<chr>	<chr>	<int>	<int>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
chr2:60047901-60048100	chr2	60047901	60048100	+	635.3226	677.6804	700.1009	686.1342	446.9841	381.6018	419.6503	503.62848
chr11:33672501-33672700	chr11	33672501	33672700	+	711.1490	671.5987	617.9851	668.9163	507.3208	438.6918	417.9581	714.68343
chr11:34229601-34229800	chr11	34229601	34229800	+	128.8313	118.1597	126.1367	141.1870	155.1515	174.2748	135.3711	92.68551

Code

lst = lst_dat_extract
lapply(lst, function(dat){table(dat$Chrom)})

$TMPRA_K562_OL13_20220512.hg19.raw

chr11 
43907 

$TMPRA_K562_OL13_20220512.hg19.norm

chr11 
43907 

$TMPRA_K562_OL43_20221003.hg38.raw

 chr8  chrX 
41905 54203 

$TMPRA_K562_OL43_20221003.hg38.norm

 chr8  chrX 
41905 54203 

$TMPRA_K562_OL45_20220927.hg38.raw

chr11 chr16  chr2 chr20 
39857 11439 19917 19897 

$TMPRA_K562_OL45_20220927.hg38.norm

chr11 chr16  chr2 chr20 
39857 11439 19917 19897

Filter out the fragments (Oligos) where mean input is zero

Define helper functions

Code

fun_filter = function(dat){
    ### Input:  dataframe
    ### Output: dataframe
    ### Description:
    ###     calculate the mean of input for each fragment and 
    ###     filter out the fragment w/ plasmid mean is zero
    ###     - convert wide matrix to long matrix
    ###     - group by the input and output to calculate the mean
    ###     - get the non-zero fragments from the input dataframe
    
    ### calculate the mean of input for each fragment and 
    tmp = dat %>% 
        tidyr::gather(Sample, Value, -Chrom, -ChromStart, -ChromEnd, -Name, -Strand) %>% 
        tidyr::separate(Sample, c("Group", "Rep"), sep="\\.", remove=FALSE) %>%
        dplyr::filter(Group == "Input") %>%
        dplyr::group_by(Chrom, ChromStart, ChromEnd, Name, Strand, Group) %>%
        dplyr::summarise(Value = mean(Value), .groups="drop")
    
    ### filter out the fragment w/ plasmid mean is zero
    tmp = tmp %>% dplyr::filter(Value > 0)
    idx = unique(tmp$Name)
    res = dat %>% dplyr::filter(Name %in% idx)
    return(res)
}

Filter fragments

Code

### init
lst = lst_dat_extract

### show info before processing
print(lapply(lst, dim))
cat("++++++++++++++++++++++++++++++++++++++++\n")

### processing
lst = lapply(lst, fun_filter)

### assign and show info
lst_dat_filter = lst
print(lapply(lst, dim))

$TMPRA_K562_OL13_20220512.hg19.raw
[1] 43907    13

$TMPRA_K562_OL13_20220512.hg19.norm
[1] 43907    13

$TMPRA_K562_OL43_20221003.hg38.raw
[1] 96108    16

$TMPRA_K562_OL43_20221003.hg38.norm
[1] 96108    16

$TMPRA_K562_OL45_20220927.hg38.raw
[1] 91110    13

$TMPRA_K562_OL45_20220927.hg38.norm
[1] 91110    13

++++++++++++++++++++++++++++++++++++++++
$TMPRA_K562_OL13_20220512.hg19.raw
[1] 43871    13

$TMPRA_K562_OL13_20220512.hg19.norm
[1] 43871    13

$TMPRA_K562_OL43_20221003.hg38.raw
[1] 96087    16

$TMPRA_K562_OL43_20221003.hg38.norm
[1] 96087    16

$TMPRA_K562_OL45_20220927.hg38.raw
[1] 91092    13

$TMPRA_K562_OL45_20220927.hg38.norm
[1] 91092    13

Show results

Code

lst = lst_dat_filter
dat = lst[[1]]
head(dat, 3)

A tibble: 3 × 13
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Output.rep1	Output.rep2	Output.rep3	Output.rep4
<chr>	<chr>	<int>	<int>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
chr11:61555216-61555415	chr11	61555216	61555415	+	1609	1221	1396	798	845	283	574	1223
chr11:61555231-61555430_RC	chr11	61555231	61555430	-	1179	582	1225	721	476	416	912	956
chr11:61555216-61555415_RC	chr11	61555216	61555415	-	1066	643	1206	681	470	846	540	847

Calculate mean for normalized count table

Define helper functions

Code

fun_add_mean_column = function(dat){
    ### get input and output to calculate log2FC
    x_inp = dat %>% dplyr::select(starts_with("Input"))  %>% apply(., 1, mean)
    x_out = dat %>% dplyr::select(starts_with("Output")) %>% apply(., 1, mean)
    x_lfc = log2(x_out) - log2(x_inp)
    
    ### add columns
    dat$Input.mean  = x_inp
    dat$Output.mean = x_out
    dat$Log2FC.mean = x_lfc
 
    return(dat)
}

Calculate mean and add it as an additional column for normalized count

Code

lst = lst_dat_filter
print(names(lst))
cat("=======================\n")
sid = grep("norm", names(lst), value=TRUE)
print(sid)

[1] "TMPRA_K562_OL13_20220512.hg19.raw"  "TMPRA_K562_OL13_20220512.hg19.norm"
[3] "TMPRA_K562_OL43_20221003.hg38.raw"  "TMPRA_K562_OL43_20221003.hg38.norm"
[5] "TMPRA_K562_OL45_20220927.hg38.raw"  "TMPRA_K562_OL45_20220927.hg38.norm"
=======================
[1] "TMPRA_K562_OL13_20220512.hg19.norm" "TMPRA_K562_OL43_20221003.hg38.norm"
[3] "TMPRA_K562_OL45_20220927.hg38.norm"

Code

### init
lst  = lst_dat_filter
idxs = grep("norm", names(lst), value=TRUE)

### add mean for normalization count
for (idx in idxs){
    cat(idx, "\n")
    lst[[idx]] = fun_add_mean_column(lst[[idx]])
}

### assigned
lst_dat_prep = lst

TMPRA_K562_OL13_20220512.hg19.norm 
TMPRA_K562_OL43_20221003.hg38.norm 
TMPRA_K562_OL45_20220927.hg38.norm

Show results

Code

lst = lst_dat_prep
idx = names(lst)[1]
dat = lst_dat_prep[[1]]

cat(idx)
head(dat, 3)

TMPRA_K562_OL13_20220512.hg19.raw

A tibble: 3 × 13
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Output.rep1	Output.rep2	Output.rep3	Output.rep4
<chr>	<chr>	<int>	<int>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
chr11:61555216-61555415	chr11	61555216	61555415	+	1609	1221	1396	798	845	283	574	1223
chr11:61555231-61555430_RC	chr11	61555231	61555430	-	1179	582	1225	721	476	416	912	956
chr11:61555216-61555415_RC	chr11	61555216	61555415	-	1066	643	1206	681	470	846	540	847

Code

lst = lst_dat_prep
idx = names(lst)[2]
dat = lst_dat_prep[[2]]

cat(idx)
head(dat, 3)

TMPRA_K562_OL13_20220512.hg19.norm

A tibble: 3 × 16
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Output.rep1	Output.rep2	Output.rep3	Output.rep4	Input.mean	Output.mean	Log2FC.mean
<chr>	<chr>	<int>	<int>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
chr11:61555216-61555415	chr11	61555216	61555415	+	881.8510	1196.3976	855.5317	774.9257	1807.947	569.6449	730.4747	1493.445	927.1765	1150.378	0.3111919
chr11:61555231-61555430_RC	chr11	61555231	61555430	-	646.1792	570.2730	750.7352	700.1522	1018.441	837.3579	1160.6148	1167.403	666.8349	1045.954	0.6494180
chr11:61555216-61555415_RC	chr11	61555216	61555415	-	584.2468	630.0439	739.0912	661.3088	1005.604	1702.8961	687.2061	1034.299	653.6727	1107.501	0.7606680

Code

lst = lst_dat_prep
idx = names(lst)[3]
dat = lst_dat_prep[[3]]

cat(idx)
head(dat, 3)

TMPRA_K562_OL43_20221003.hg38.raw

A tibble: 3 × 16
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Input.rep5	Input.rep6	Output.rep1	Output.rep2	Output.rep3	Output.rep4	Output.rep5
<chr>	<chr>	<int>	<int>	<chr>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<int>
chr8:127381651-127381850	chr8	127381651	127381850	+	1318	1810	1477	1505	738	687	620	636	781	520	538
chr8:127320401-127320600	chr8	127320401	127320600	+	1318	1810	1477	1505	738	687	620	636	781	520	538
chr8:127742001-127742200	chr8	127742001	127742200	+	634	730	698	761	316	343	158	194	268	141	251

Code

lst = lst_dat_prep
idx = names(lst)[4]
dat = lst_dat_prep[[4]]

cat(idx)
head(dat, 3)

TMPRA_K562_OL43_20221003.hg38.norm

A tibble: 3 × 19
Name	Chrom	ChromStart	ChromEnd	Strand	Input.rep1	Input.rep2	Input.rep3	Input.rep4	Input.rep5	Input.rep6	Output.rep1	Output.rep2	Output.rep3	Output.rep4	Output.rep5	Input.mean	Output.mean	Log2FC.mean
<chr>	<chr>	<int>	<int>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
chr8:127381651-127381850	chr8	127381651	127381850	+	887.4466	1078.7715	946.8314	945.7219	962.3235	893.9085	961.7725	1136.6673	842.9706	896.6654	760.1443	952.5006	919.6440	-0.0506444
chr8:127320401-127320600	chr8	127320401	127320600	+	887.4466	1078.7715	946.8314	945.7219	962.3235	893.9085	961.7725	1136.6673	842.9706	896.6654	760.1443	952.5006	919.6440	-0.0506444
chr8:127742001-127742200	chr8	127742001	127742200	+	426.8901	435.0846	447.4531	478.2022	412.0518	446.3037	245.0969	346.7193	289.2652	243.1343	354.6398	440.9976	295.7711	-0.5762898

Store the fragments and counts

<Data>
* OL13 (FADS)
* OL43 (GATA/MYC)
* OL45 (HBE1/LMO2/RBM38/HBA2/BCL11A)

- OL13_20220512_counts.out
- OL13_20220512_normalized_counts.out
- OL43_20211228_counts.out      
- OL43_20211228_normalized_counts.out
- OL43_20221003_counts.out 
- OL43_20221003_K562_normalized_counts.out
- OL45_20220927_counts.out 
- OL45_20220927_K562_normalized_counts.out

<Results>
PROJECT/results/assay_fcc
└── MPRA_Tiling_K562_Tewhey_Hannah
    ├── fragment_counts
    │   └── summary
    │       ├── TMPRA_K562_OL13_20220512.hg19.raw.stranded_pos.tsv 
    │       ├── TMPRA_K562_OL13_20220512.hg19.norm.stranded_pos.tsv 
    │       ├── TMPRA_K562_OL43_20221003.hg38.raw.stranded_pos.tsv 
    │       ├── TMPRA_K562_OL43_20221003.hg38.norm.stranded_pos.tsv 
    │       ├── TMPRA_K562_OL45_20220927.hg38.raw.stranded_pos.tsv 
    │       └── TMPRA_K562_OL45_20220927.hg38.norm.stranded_pos.tsv 
    │
    └── fragment
        ├── TMPRA_K562_OL13_20220512.hg19.raw.stranded_pos.bed.gz 
        ├── TMPRA_K562_OL43_20221003.hg38.raw.stranded_pos.bed.gz 
        └── TMPRA_K562_OL45_20220927.hg38.raw.stranded_pos.bed.gz

Setup output directory

Code

### create fragment folder
txt_folder = "fragment"
txt_fdiry = file.path(FD_RES, "assay_fcc", TXT_ASSAY, txt_folder)
cat(txt_fdiry, "\n")

txt_cmd = paste("mkdir -p", txt_fdiry)
system(txt_cmd)

### create fragment count folders
txt_folder = "fragment_counts/summary"
txt_fdiry = file.path(FD_RES, "assay_fcc", TXT_ASSAY, txt_folder)
cat(txt_fdiry, "\n")

txt_cmd = paste("mkdir -p", txt_fdiry)
system(txt_cmd)

/mount/repo/Proj_ENCODE_FCC/results/assay_fcc/MPRA_Tiling_K562_Tewhey_Hannah/fragment 
/mount/repo/Proj_ENCODE_FCC/results/assay_fcc/MPRA_Tiling_K562_Tewhey_Hannah/fragment_counts/summary

Code

### check loop names
lst  = lst_dat_prep
idxs = grep("raw", names(lst), value=TRUE)
for (idx in idxs) {cat(idx, "\n")}

TMPRA_K562_OL13_20220512.hg19.raw 
TMPRA_K562_OL43_20221003.hg38.raw 
TMPRA_K562_OL45_20220927.hg38.raw

Store fragments

Code

txt_assay  = TXT_ASSAY
txt_folder = "fragment"
txt_strand = "stranded_pos"

lst  = lst_dat_prep
idxs = grep("raw", names(lst), value=TRUE)

txt_fdiry = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
cat("Output directory:", "\n")
cat(txt_fdiry, "\n")

for (idx in idxs){
    ### show progres
    cat("\n=======================")
    cat("\nSample:", idx, "\n")
    
    ### init
    dat = lst[[idx]]
    
    ### extract positive trands and order by positions
    dat = dat %>% 
        dplyr::filter(Strand == "+") %>% 
        dplyr::arrange(Chrom, ChromStart, ChromEnd)
    
    ### arrange the column into bed file format
    dat = dat %>%
        dplyr::mutate(Score = ".") %>%
        dplyr::select(Chrom, ChromStart, ChromEnd, Name, Score, Strand)
    
    ### save table
    txt_fname = paste(idx, txt_strand, "bed.gz", sep=".")
    txt_fpath = file.path(txt_fdiry, txt_fname)
    write_tsv(dat, txt_fpath, col_names=FALSE)
    
    ### show progress
    #print(fpath)
    print(head(dat))
    cat("\nSaved Table:", txt_fname, "\n")
}

Output directory: 
/mount/repo/Proj_ENCODE_FCC/results/assay_fcc/MPRA_Tiling_K562_Tewhey_Hannah/fragment 

=======================
Sample: TMPRA_K562_OL13_20220512.hg19.raw 
# A tibble: 6 × 6
  Chrom ChromStart ChromEnd Name                    Score Strand
  <chr>      <int>    <int> <chr>                   <chr> <chr> 
1 chr11   61554801 61555000 chr11:61554801-61555000 .     +     
2 chr11   61554806 61555005 chr11:61554806-61555005 .     +     
3 chr11   61554811 61555010 chr11:61554811-61555010 .     +     
4 chr11   61554816 61555015 chr11:61554816-61555015 .     +     
5 chr11   61554821 61555020 chr11:61554821-61555020 .     +     
6 chr11   61554826 61555025 chr11:61554826-61555025 .     +     

Saved Table: TMPRA_K562_OL13_20220512.hg19.raw.stranded_pos.bed.gz 

=======================
Sample: TMPRA_K562_OL43_20221003.hg38.raw 
# A tibble: 6 × 6
  Chrom ChromStart  ChromEnd Name                     Score Strand
  <chr>      <int>     <int> <chr>                    <chr> <chr> 
1 chr8   126735901 126736100 chr8:126735901-126736100 .     +     
2 chr8   126735951 126736150 chr8:126735951-126736150 .     +     
3 chr8   126736001 126736200 chr8:126736001-126736200 .     +     
4 chr8   126736051 126736250 chr8:126736051-126736250 .     +     
5 chr8   126736101 126736300 chr8:126736101-126736300 .     +     
6 chr8   126736151 126736350 chr8:126736151-126736350 .     +     

Saved Table: TMPRA_K562_OL43_20221003.hg38.raw.stranded_pos.bed.gz 

=======================
Sample: TMPRA_K562_OL45_20220927.hg38.raw 
# A tibble: 6 × 6
  Chrom ChromStart ChromEnd Name                  Score Strand
  <chr>      <int>    <int> <chr>                 <chr> <chr> 
1 chr11    4505501  4505700 chr11:4505501-4505700 .     +     
2 chr11    4505601  4505800 chr11:4505601-4505800 .     +     
3 chr11    4505701  4505900 chr11:4505701-4505900 .     +     
4 chr11    4505801  4506000 chr11:4505801-4506000 .     +     
5 chr11    4505901  4506100 chr11:4505901-4506100 .     +     
6 chr11    4506001  4506200 chr11:4506001-4506200 .     +     

Saved Table: TMPRA_K562_OL45_20220927.hg38.raw.stranded_pos.bed.gz

Store table of counts

Code

txt_assay  = TXT_ASSAY
txt_folder = "fragment_counts/summary"
txt_strand = "stranded_pos"

lst  = lst_dat_prep
idxs = names(lst)

txt_fdiry = file.path(FD_RES, "assay_fcc", txt_assay, txt_folder)
cat("Output directory:", "\n")
cat(txt_fdiry, "\n")

for (idx in idxs){
    ### show progres
    cat("\n=======================")
    cat("\nSample:", idx, "\n")
    
    ### init
    dat = lst[[idx]]
    
    ### extract positive trands and order by positions
    dat = dat %>% 
       dplyr::filter(Strand == "+") %>% 
       dplyr::arrange(Chrom, ChromStart, ChromEnd)
    
    ### save table
    txt_fname = paste(idx, txt_strand, "tsv", sep=".")
    txt_fpath = file.path(txt_fdiry, txt_fname)
    write_tsv(dat, txt_fpath)

    ### show progress
    print(head(dat))
    cat("\nSaved Table:", txt_fname, "\n")
}

Output directory: 
/mount/repo/Proj_ENCODE_FCC/results/assay_fcc/MPRA_Tiling_K562_Tewhey_Hannah/fragment_counts/summary 

=======================
Sample: TMPRA_K562_OL13_20220512.hg19.raw 
# A tibble: 6 × 13
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <int>      <int>      <int>
1 chr11:61554… chr11   61554801 61555000 +             971        542        785
2 chr11:61554… chr11   61554806 61555005 +            1267        562       1059
3 chr11:61554… chr11   61554811 61555010 +            1183        641       1118
4 chr11:61554… chr11   61554816 61555015 +            1020        476       1086
5 chr11:61554… chr11   61554821 61555020 +            1138        580        894
6 chr11:61554… chr11   61554826 61555025 +             702        406        745
# ℹ 5 more variables: Input.rep4 <int>, Output.rep1 <int>, Output.rep2 <int>,
#   Output.rep3 <int>, Output.rep4 <int>

Saved Table: TMPRA_K562_OL13_20220512.hg19.raw.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL13_20220512.hg19.norm 
# A tibble: 6 × 16
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <dbl>      <dbl>      <dbl>
1 chr11:61554… chr11   61554801 61555000 +            532.       531.       481.
2 chr11:61554… chr11   61554806 61555005 +            694.       551.       649.
3 chr11:61554… chr11   61554811 61555010 +            648.       628.       685.
4 chr11:61554… chr11   61554816 61555015 +            559.       466.       666.
5 chr11:61554… chr11   61554821 61555020 +            624.       568.       548.
6 chr11:61554… chr11   61554826 61555025 +            385.       398.       457.
# ℹ 8 more variables: Input.rep4 <dbl>, Output.rep1 <dbl>, Output.rep2 <dbl>,
#   Output.rep3 <dbl>, Output.rep4 <dbl>, Input.mean <dbl>, Output.mean <dbl>,
#   Log2FC.mean <dbl>

Saved Table: TMPRA_K562_OL13_20220512.hg19.norm.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL43_20221003.hg38.raw 
# A tibble: 6 × 16
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <int>      <int>      <int>
1 chr8:126735… chr8   126735901   1.27e8 +            1190       1409       1259
2 chr8:126735… chr8   126735951   1.27e8 +             194        161        161
3 chr8:126736… chr8   126736001   1.27e8 +             746        757        708
4 chr8:126736… chr8   126736051   1.27e8 +             934       1056        988
5 chr8:126736… chr8   126736101   1.27e8 +             697        887        772
6 chr8:126736… chr8   126736151   1.27e8 +            1225       1454       1264
# ℹ 8 more variables: Input.rep4 <int>, Input.rep5 <int>, Input.rep6 <int>,
#   Output.rep1 <int>, Output.rep2 <int>, Output.rep3 <int>, Output.rep4 <int>,
#   Output.rep5 <int>

Saved Table: TMPRA_K562_OL43_20221003.hg38.raw.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL43_20221003.hg38.norm 
# A tibble: 6 × 19
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <dbl>      <dbl>      <dbl>
1 chr8:126735… chr8   126735901   1.27e8 +            801.      840.        807.
2 chr8:126735… chr8   126735951   1.27e8 +            131.       96.0       103.
3 chr8:126736… chr8   126736001   1.27e8 +            502.      451.        454.
4 chr8:126736… chr8   126736051   1.27e8 +            629.      629.        633.
5 chr8:126736… chr8   126736101   1.27e8 +            469.      529.        495.
6 chr8:126736… chr8   126736151   1.27e8 +            825.      867.        810.
# ℹ 11 more variables: Input.rep4 <dbl>, Input.rep5 <dbl>, Input.rep6 <dbl>,
#   Output.rep1 <dbl>, Output.rep2 <dbl>, Output.rep3 <dbl>, Output.rep4 <dbl>,
#   Output.rep5 <dbl>, Input.mean <dbl>, Output.mean <dbl>, Log2FC.mean <dbl>

Saved Table: TMPRA_K562_OL43_20221003.hg38.norm.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL45_20220927.hg38.raw 
# A tibble: 6 × 13
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <int>      <int>      <int>
1 chr11:45055… chr11    4505501  4505700 +             811        664        917
2 chr11:45056… chr11    4505601  4505800 +             693        552        607
3 chr11:45057… chr11    4505701  4505900 +             577        414        448
4 chr11:45058… chr11    4505801  4506000 +             855        754        688
5 chr11:45059… chr11    4505901  4506100 +             317        217        313
6 chr11:45060… chr11    4506001  4506200 +             444        422        454
# ℹ 5 more variables: Input.rep4 <int>, Output.rep1 <int>, Output.rep2 <int>,
#   Output.rep3 <int>, Output.rep4 <int>

Saved Table: TMPRA_K562_OL45_20220927.hg38.raw.stranded_pos.tsv 

=======================
Sample: TMPRA_K562_OL45_20220927.hg38.norm 
# A tibble: 6 × 16
  Name         Chrom ChromStart ChromEnd Strand Input.rep1 Input.rep2 Input.rep3
  <chr>        <chr>      <int>    <int> <chr>       <dbl>      <dbl>      <dbl>
1 chr11:45055… chr11    4505501  4505700 +            597.       577.       776.
2 chr11:45056… chr11    4505601  4505800 +            510.       480.       514.
3 chr11:45057… chr11    4505701  4505900 +            425.       360.       379.
4 chr11:45058… chr11    4505801  4506000 +            629.       655.       582.
5 chr11:45059… chr11    4505901  4506100 +            233.       189.       265.
6 chr11:45060… chr11    4506001  4506200 +            327.       367.       384.
# ℹ 8 more variables: Input.rep4 <dbl>, Output.rep1 <dbl>, Output.rep2 <dbl>,
#   Output.rep3 <dbl>, Output.rep4 <dbl>, Input.mean <dbl>, Output.mean <dbl>,
#   Log2FC.mean <dbl>

Saved Table: TMPRA_K562_OL45_20220927.hg38.norm.stranded_pos.tsv