2 Technical properties

2.1 Breadth of coverage

Code to generate *.covhist files. covhist files are included in the extdata/covhist/ directory.

For ACT, 100 cells were sampled from each dataset.

> headers_commands.sh
> reads_commands.sh
> delete_commands.sh
> sort_commands.sh
> genomecoveragebed_commands.sh
> mark_duplicates_commands.sh
for INBAM in /PATH/TO/BAM/FILES/*.sort.bam
do
    INFILEBASE=`basename $INBAM`
    INSAMPLEBASE=${INFILEBASE%.sort.bam}
    COVHISTFILE=${INSAMPLEBASE}.covhist.txt
    OUTBAM=$INSAMPLEBASE.sorted.bam
    INSAMPLEBASE_MD=`basename -s .sort.bam $INFILEBASE`
    OUTBAM_MD=$INSAMPLEBASE

    # marking duplicates
    touch $OUTBAM.txt
    
    printf "%s\n" "java -XX:ParallelGCThreads=4 -Xmx150g -jar picard.jar MarkDuplicates I=$INFILEBASE O=$OUTBAM_MD.marked.sort.bam M=$OUTBAM_MD.marked.sort.bam.txt  VALIDATION_STRINGENCY=SILENT AS=true REMOVE_DUPLICATES=true  MAX_RECORDS_IN_RAM=10000000 TMP_DIR=tmp" >> mark_duplicates_commands.sh

    # Downsampling
    # Save headers, which will not be included when downsampling
    printf "%s\n" "samtools view $INSAMPLEBASE.marked.sort.bam -H > $INSAMPLEBASE.sam" >> headers_commands.sh
    # Downsample by randomly selecting a subset of rows besides the header, and
    # appending them to the header
    # Use the file itself as the seed for the random number generator
    printf "%s\n" "samtools view $INSAMPLEBASE.marked.sort.bam | shuf -n 800000 --random-source=$INSAMPLEBASE.marked.sort.bam >> $INSAMPLEBASE.sam" >> reads_commands.sh
    # Sort the downsampled SAM file
    printf "%s\n" "samtools sort -o $OUTBAM $INSAMPLEBASE.sam" >> sort_commands.sh
    # Delete the unsorted SAM files
    printf "%s\n" "rm $INSAMPLEBASE.sam" >> delete_commands.sh
    # Calculate coverage maximum read size of 50 to all reads
    printf "genomeCoverageBed -ibam $OUTBAM -fs 50 -g genomes/human.hg19.genome > $COVHISTFILE\n" >> genomecoveragebed_commands.sh
done
parallel --jobs 30 < mark_duplicates_commands.sh
parallel --jobs 70 < headers_commands.sh
parallel --jobs 70 < reads_commands.sh
parallel --jobs 70 < sort_commands.sh
parallel --jobs 70 < delete_commands.sh
parallel --jobs 70 < genomecoveragebed_commands.sh
tn1_cov <- calc_coverage(path = here("extdata/covhist/TN1/")) %>% 
  mutate(sample = "TN1",
         tech = "ACT",
         cellname = str_replace(cellname, "TN28", "TN1"))

tn2_cov <- calc_coverage(path = here("extdata/covhist/TN2/")) %>% 
  mutate(sample = "TN2",
         tech = "ACT",
         cellname = str_replace(cellname, "TN20", "TN2")) 

tn3_cov <- calc_coverage(path = here("extdata/covhist/TN3/")) %>% 
  mutate(sample = "TN3",
         tech = "ACT",
         cellname = str_replace(cellname, "TN17", "TN3"))

tn4_cov <- calc_coverage(path = here("extdata/covhist/TN4/")) %>% 
  mutate(sample = "TN4",
         tech = "ACT",
         cellname = str_replace(cellname, "TN26", "TN4"))

tn1_10xcnv_cov <-
  calc_coverage(path = here("extdata/covhist/TN1_10XCNA/")) %>%
  mutate(sample = "TN1",
         tech = "10X CNA")

tn3_10xcnv_cov <- calc_coverage(path = here("extdata/covhist/TN3_10XCNA/")) %>% 
  mutate(sample = "TN3",
         tech = "10X CNA")

t2_doppcr_cov <- calc_coverage(path = here("extdata/covhist/T2_DOPPCR//")) %>% 
  mutate(sample = "T2",
         tech = "DOP-PCR")

t4_doppcr_cov <- calc_coverage(path = here("extdata/covhist/T4_DOPPCR/")) %>% 
  mutate(sample = "T4",
         tech = "DOP-PCR")

t8_doppcr_cov <- calc_coverage(path = here("extdata/covhist/T8_DOPPCR/")) %>% 
  mutate(sample = "T8",
         tech = "DOP-PCR")

t10_doppcr_cov <- calc_coverage(path = here("extdata/covhist/T10_DOPPCR/")) %>% 
  mutate(sample = "T10",
         tech = "DOP-PCR")

dlp_htert_cov <- calc_coverage(path = here("extdata/covhist/DLP_htert/")) %>% 
  mutate(sample = "hTERT",
         tech = "DLP")

dlp_xeno_cov <- calc_coverage(path = here("extdata/covhist/DLP_xeno/")) %>% 
  mutate(sample = "Xeno",
         tech = "DLP")

# bincounts
TN1_bincounts <- readRDS(here("extdata/bincounts/TN1_bincounts.rds"))

TN2_bincounts <- readRDS(here("extdata/bincounts/TN2_bincounts.rds"))

TN3_bincounts <- readRDS(here("extdata/bincounts/TN3_bincounts.rds"))

TN4_bincounts <- readRDS(here("extdata/bincounts/TN4_bincounts.rds"))

TN1_10XCNA_bincounts <- readRDS(here("extdata/bincounts/TN1_10XCNA_bincounts.rds"))

TN3_10XCNA_bincounts <- readRDS(here("extdata/bincounts/TN3_10XCNA_bincounts.rds"))

DLP_xeno_bincounts <- readRDS(here("extdata/bincounts/DLP_xeno_bincounts.rds"))

DLP_htert_bincounts <- readRDS(here("extdata/bincounts/DLP_htert_bincounts.rds"))

T2_DOPPCR_bincounts <- readRDS(here("extdata/bincounts/T2_DOPPCR_bincounts.rds"))

T4_DOPPCR_bincounts <- readRDS(here("extdata/bincounts/T4_DOPPCR_bincounts.rds"))

T8_DOPPCR_bincounts <- readRDS(here("extdata/bincounts/T8_DOPPCR_bincounts.rds"))

T10_DOPPCR_bincounts <- readRDS(here("extdata/bincounts/T10_DOPPCR_bincounts.rds"))

## # A tibble: 1 x 6
##   .y.         n statistic    df         p method        
## * <chr>   <int>     <dbl> <int>     <dbl> <chr>         
## 1 breadth   662      537.     3 4.06e-116 Kruskal-Wallis
## # A tibble: 6 x 9
##   .y.     group1  group2     n1    n2 statistic        p    p.adj p.adj.signif
## * <chr>   <chr>   <chr>   <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 breadth 10X CNA ACT       200   100      6.98 2.97e-12 5.95e-12 ****        
## 2 breadth 10X CNA DLP       200   228    -11.3  2.10e-29 6.29e-29 ****        
## 3 breadth 10X CNA DOP-PCR   200   134    -16.3  1.24e-59 6.18e-59 ****        
## 4 breadth ACT     DLP       100   228    -16.2  3.62e-59 1.45e-58 ****        
## 5 breadth ACT     DOP-PCR   100   134    -20.2  5.71e-91 3.43e-90 ****        
## 6 breadth DLP     DOP-PCR   228   134     -6.68 2.35e-11 2.35e-11 ****

2.2 Overdispersion

TN1_overdispersion <-  map_dfr(TN1_bincounts, 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "TN1",
         tech = "ACT",
         cells = names(TN1_bincounts)) %>% 
  dplyr::rename(iod = "V1")

TN2_overdispersion <-  map_dfr(TN2_bincounts, 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "TN2",
         tech = "ACT",
         cells = names(TN2_bincounts)) %>% 
  dplyr::rename(iod = "V1")

TN3_overdispersion <-  map_dfr(TN3_bincounts, 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "TN3",
         tech = "ACT",
         cells = names(TN3_bincounts)) %>% 
  dplyr::rename(iod = "V1")

TN4_overdispersion <-  map_dfr(TN4_bincounts, 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "TN4",
         tech = "ACT",
         cells = names(TN4_bincounts)) %>% 
  dplyr::rename(iod = "V1")

#10X CNA
names(TN1_10XCNA_bincounts) <- str_replace(names(TN1_10XCNA_bincounts), "\\.", "-")

TN1_10XCNA_overdispersion <- map_dfr(TN1_10XCNA_bincounts, 
                          overdispersion) %>% t() %>% as.data.frame() %>% 
  mutate(sample = "TN1",
         tech = "10X CNA",
         cells = names(TN1_10XCNA_bincounts)) %>% 
  dplyr::rename(iod = "V1")

names(TN3_10XCNA_bincounts) <- str_replace(names(TN3_10XCNA_bincounts), "\\.", "-")

TN3_10XCNA_overdispersion <- map_dfr(TN3_10XCNA_bincounts, 
                          overdispersion) %>% t() %>% as.data.frame() %>% 
  mutate(sample = "TN3",
         tech = "10X CNA",
         cells = names(TN3_10XCNA_bincounts)) %>% 
  dplyr::rename(iod = "V1")

# DLP

DLP_xeno_overdispersion <-  map_dfr(DLP_xeno_bincounts[,-c(1:3)], 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "Xeno",
         tech = "DLP",
         cells = names(DLP_xeno_bincounts)[-c(1:3)]) %>% 
  dplyr::rename(iod = "V1")

DLP_htert_overdispersion <-  map_dfr(DLP_htert_bincounts[,-c(1:3)], 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "hTERT",
         tech = "DLP",
         cells = names(DLP_htert_bincounts)[-c(1:3)]) %>% 
  dplyr::rename(iod = "V1")

#DOP-PCR
T2_DOPPCR_overdispersion <-  map_dfr(T2_DOPPCR_bincounts[,-c(1:3)], 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "T2",
         tech = "DOP-PCR",
         cells = toupper(names(T2_DOPPCR_bincounts)[-c(1:3)])) %>% 
  dplyr::rename(iod = "V1")

T4_DOPPCR_overdispersion <-  map_dfr(T4_DOPPCR_bincounts[,-c(1:3)], 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "T4",
         tech = "DOP-PCR",
         cells = toupper(names(T4_DOPPCR_bincounts)[-c(1:3)])) %>% 
  dplyr::rename(iod = "V1")

T8_DOPPCR_overdispersion <-  map_dfr(T8_DOPPCR_bincounts[,-c(1:3)], 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "T8",
         tech = "DOP-PCR",
         cells = toupper(names(T8_DOPPCR_bincounts)[-c(1:3)])) %>% 
  dplyr::rename(iod = "V1")

T10_DOPPCR_overdispersion <-  map_dfr(T10_DOPPCR_bincounts[,-c(1:3)], 
                    overdispersion) %>%
  t() %>%
  as.data.frame() %>% 
  mutate(sample = "T10",
         tech = "DOP-PCR",
         cells = toupper(names(T10_DOPPCR_bincounts)[-c(1:3)])) %>% 
  dplyr::rename(iod = "V1")