8 MDA-231 DNA & RNA

Transcript abundances for expanded clones triplicates were quantified by Salmon (v.0.14) with GENCODE transcript v30 and options -l A -1 read1 -2 read2 -p 40 –validateMappings –seqBias –gcBias. Quantified transcripts were imported into R with ‘tximport’ (v 1.14). Expanded clones e7, e39 and e71 had one technical replicate excluded due to poor RNA quality

source("R/setup.R")
source("R/run_umap.R")
source("R/run_clustering.R")
source("R/order_dataset.R")
source("R/plot_umap.R")
source("R/calculate_consensus.R")
source("R/consensus_genomic_classes.R")
source("R/run_me_tree.R")
source("R/plot_heatmap.R")
source("R/plot_moving_averages.R")

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Sat Mar  6 17:34:47 2021
# RNA
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Sat Mar  6 17:34:51 2021

# reading RNA data
dds <- readRDS(here("extdata/rna/mdamb231_rna_dds.rds"))

# Quality filtering
keep <- rowSums(counts(dds) >= 5) >= 3
dds <- dds[keep,]

# computation of size factors which normalize for 
# differences in sequencing depth among samples
dds <- estimateSizeFactors(dds)

## using 'avgTxLength' from assays(dds), correcting for library size

boxplot(log10(counts(dds,normalized=TRUE)+1), las = 2)

# variance-stabilizing transformation
vsd <- vst(dds)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Sat Mar  6 17:34:56 2021
# DNA
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Sat Mar  6 17:35:01 2021

mda_ploidy <- 2.41

# data from DNA bulk expanded clones 
# and from mda-mb-231 single-cell were merged to the same dataframe

mdamb231_sc_bulk_cna_popseg <- readRDS(here("extdata/popseg/mdamb231_sc_bulk_cna_popseg.rds"))
mdamb231_sc_bulk_cna_ml <- readRDS(here("extdata/merge_levels/mdamb231_sc_bulk_cna_ml.rds"))

8.1 Co-clustering

mdamb231_sc_bulk_umap <- run_umap(mdamb231_sc_bulk_cna_ml,
                                  umap_n_neighbors = 25,
                                  seed = 5)

## Constructing UMAP embedding.

mdamb231_sc_bulk_clustering <- run_clustering(mdamb231_sc_bulk_umap,
                                 k_snn_major = 43,
                                 k_snn_minor = 14)

## Building SNN graph.

## Running hdbscan.

##  cluster   n    percent
##       c1 176 0.19599109
##      c10 203 0.22605791
##      c11  36 0.04008909
##      c12  14 0.01559020
##      c13 197 0.21937639
##       c2  28 0.03118040
##       c3  59 0.06570156
##       c4  52 0.05790646
##       c5  32 0.03563474
##       c6  42 0.04677060
##       c7  20 0.02227171
##       c8  16 0.01781737
##       c9  23 0.02561247

## Done.

mdamb231_sc_bulk_clustering_umap <- left_join(mdamb231_sc_bulk_umap,
                                              mdamb231_sc_bulk_clustering, 
                                              by = c("cell" = "cells"))

mdamb231_sc_bulk_clustering_umap <- mdamb231_sc_bulk_clustering_umap %>%
  mutate(
    type = case_when(str_detect(cell, "gDNA") ~ "bulk",
                     TRUE ~ subclones),
    shape = case_when(str_detect(cell, "gDNA") ~ 8,
                      TRUE ~ 20),
    size = case_when(str_detect(cell, "gDNA") ~ 1.2,
                     TRUE ~ 1),
  )

p1 <- mdamb231_sc_bulk_clustering_umap %>% arrange(desc(type)) %>% 
  ggplot() + 
  geom_point(aes(V1,V2, color = superclones), size = 6) +
  geom_point(aes(V1,V2, color = type, shape = shape, size = size), stroke = 1.5) +
  scale_color_manual(values = c(colors_vector$superclones, colors_vector$subclones, "bulk" = "black")) +
  scale_shape_identity() + 
  scale_size_identity() +
  theme_classic() +
  theme(axis.title.x=element_text(colour = "gray50", size = 20),
        axis.text.x= element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.y = element_text(colour = "gray50", size = 20),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line = element_blank(),
        legend.position = "none",
        panel.border = element_rect(color = "black", 
                                    fill = NA,
                                    size = 2),
        # legend.position = "null"
  )  +
  labs(color = "") + 
  xlab("umap1") +
  ylab("umap2") 

p1

# this setup will be used to plot the heatmap later

mda_ordered <- order_dataset(popseg_long = mdamb231_sc_bulk_cna_ml,
                             clustering = mdamb231_sc_bulk_clustering)

mda_cocluster_consensus <- calculate_consensus(df = mda_ordered$dataset_ordered,
                                     clusters = mda_ordered$clustering_ordered$subclones)

mda_cocluster_me_tree <- run_me_tree(consensus_df = mda_cocluster_consensus,
                                     clusters = mdamb231_sc_bulk_clustering,
                                     ploidy_VAL = mda_ploidy)

8.2 Single-cell + Bulk Heatmap

plot_heatmap(df = mda_ordered$dataset_ordered,
             ploidy_VAL = mda_ploidy,
             ploidy_trunc = 2*(round(mda_ploidy))+2,
             clusters = mda_ordered$clustering_ordered,
             genomic_classes = NULL,
             keep_gene = NULL,
             tree_order = mda_cocluster_me_tree$cs_tree_order,
             show_legend = TRUE,
             mda_cocluster = TRUE)

8.3 Bulk Heatmap

mdamb231_bulk_cna_long <- mda_ordered$dataset_ordered[str_detect(rownames(mda_ordered$dataset_ordered), "gDNA"),]

# clustering info
mdamb231_bulk_cl_info <- mda_ordered$clustering_ordered %>% 
  filter(str_detect(cells, "gDNA")) %>% 
  arrange(superclones, subclones)

# matching order
mdamb231_bulk_ordered <- mdamb231_bulk_cna_long[mdamb231_bulk_cl_info$cells,]

plot_heatmap(df = mdamb231_bulk_ordered,
             ploidy_VAL = mda_ploidy,
             ploidy_trunc = 2*(round(mda_ploidy))+2,
             clusters = mdamb231_bulk_cl_info,
             genomic_classes = NULL,
             keep_gene = NULL,
             tree_order = NULL,
             show_legend = TRUE,
             mda_cocluster = TRUE)

8.4 DNA & RNA

# Biomart
grch37 <-
  useMart(
    biomart = "ENSEMBL_MART_ENSEMBL",
    host = "grch37.ensembl.org",
    path = "/biomart/martservice",
    dataset = "hsapiens_gene_ensembl"
  )

# hg19 chromosome positions from http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes
hg19_chrom_sizes <- readRDS(here("extdata/lib/hg19.chrom.sizes.rds"))

# transforming into a named vector and sorting by chrom
hg19_chrom_sizes <- hg19_chrom_sizes[1:23,]
hg19_chrom_sizes <- deframe(hg19_chrom_sizes)
hg19_chrom_sizes <- hg19_chrom_sizes[gtools::mixedsort(names(hg19_chrom_sizes))]

# obtaining the position for every ensembl gene id 
bms <- list()

for(i in 1:length(hg19_chrom_sizes)) {
  chrom <- str_remove(names(hg19_chrom_sizes)[i], "chr")
  
  if (chrom != "X")
    chrom <- as.numeric(chrom)
  
  length_chrom <- hg19_chrom_sizes[i]
  
  bms[[i]] <-
    getBM(
      c(
        "ensembl_gene_id",
        "hgnc_symbol",
        "start_position",
        "end_position"
      ),
      filters = c("chromosome_name", "start", "end"),
      values = list(chrom,
                    0, length_chrom),
      mart = grch37
    )
  
  bms[[i]] <- bms[[i]] %>% arrange(start_position)
  
  names(bms)[[i]] <- names(hg19_chrom_sizes)[i]
}

bms_df <- bind_rows(bms, .id = "chr") %>%
  dplyr::rename(gene = "hgnc_symbol",
                gene_id = "ensembl_gene_id")

# obtaining the count matrix of the MDAMB231 bulk RNAseq
cnt <- assay(vsd)

# gene id clean up
rownames(cnt) <- str_extract(rownames(cnt), "[A-Z]+[0-9]+")

# averaging the expression of the RNA triplicates and calculating a zscore
cnt_long <- cnt %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column("gene_id") %>% 
  tidyr::gather(key = "sample",
                value = "count",
                -gene_id) %>% 
  mutate(sample = str_remove(sample, "[A-C]$"))

cnt_long_avg <- cnt_long %>%
  group_by(gene_id, sample) %>% 
  summarise(count_avg = mean(count)) %>% 
  ungroup()  %>% 
  group_by(gene_id) %>%
  mutate(z_score = (count_avg-mean(count_avg))/sd(count_avg)) %>%
  ungroup() 

# wide df will be used later for moving average
cnt_avg <- tidyr::pivot_wider(cnt_long_avg,
                              names_from = sample,
                              values_from = z_score,
                              id_cols = gene_id)


cnt_avg <- as.data.frame(cnt_avg)
rownames(cnt_avg) <- cnt_avg$gene_id
cnt_avg <- cnt_avg %>%
  dplyr::select(-gene_id)

8.5 Moving averages

# adding segments from population segmentation information
mdamb231_bulk_cna_id <- mdamb231_sc_bulk_cna_popseg %>% 
  mutate(seg_index = 1:nrow(mdamb231_sc_bulk_cna_popseg))

# clustering information
cl_info <- mdamb231_bulk_cl_info %>% 
  mutate(sample =str_remove(cells, "gDNA"),
         sample = str_extract(sample, "MDAMB231C[0-9]+"))

seg_ids_long <- rep.int(mdamb231_bulk_cna_id$seg_index, mdamb231_sc_bulk_cna_popseg$n.probes)

# bulk integer df
mdamb231_bulk_cna_long_int <- ploidy_scale(ploidy_VAL = 2.41,
                                           df = mdamb231_bulk_ordered)

rownames(mdamb231_bulk_cna_long_int) <- str_remove(rownames(mdamb231_bulk_cna_long_int), "gDNA") %>% 
  str_remove("_S[0-9]+")

blk_long <- as.data.frame(t(mdamb231_bulk_cna_long_int))

# adding pipeline information
blk_long$chr <- bins_in_cna_pipeline$chr[1:nrow(blk_long)]
blk_long$start <- bins_in_cna_pipeline$start[1:nrow(blk_long)]
blk_long$end <- bins_in_cna_pipeline$end[1:nrow(blk_long)]
blk_long$abspos <- bins_in_cna_pipeline$abspos[1:nrow(blk_long)]
blk_long$seg_index <- seg_ids_long

# creating a long file with a position vector according to the number of rows
blk_long_g <- blk_long %>% 
  mutate(pos = 1:nrow(blk_long)) %>% 
  gather(key = "sample",
         value = "cn",
         -c(chr, start, end, abspos, pos, seg_index))

# obataining gene positions Grange
txdb <- Homo.sapiens

hg19_genes <- GenomicFeatures::genes(txdb, columns = "SYMBOL")
hg19_genes_df <- as.data.frame(hg19_genes) %>% 
  mutate(SYMBOL = as.character(SYMBOL))

bins_gr <- bins_in_cna_pipeline  %>% 
  makeGRangesFromDataFrame(keep.extra.columns = T, ignore.strand = T)  

olaps <- findOverlaps(hg19_genes, bins_gr)

# finding which index (bin in the 200kb pipeline the gene is located) 
# saving to a data frame
mk_df <- tibble(gene = hg19_genes_df$SYMBOL[queryHits(olaps)],
                pos = subjectHits(olaps)) %>% 
  dplyr::distinct(gene, .keep_all = TRUE)

mk_df_j <- inner_join(mk_df, bms_df) 

# all triplicates in long form
cnt_long_trip <- cnt %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column("gene_id") %>% 
  tidyr::gather(key = "sample_trip",
                value = "count",
                -gene_id) %>% 
  mutate(sample= str_remove(sample_trip, "[A-C]$"))

cnt_long_all_zc <- cnt_long_trip %>%
  group_by(gene_id) %>%
  mutate(z_score = (count-mean(count))/sd(count)) %>%
  ungroup()

cnt_long_gene_trip <- inner_join(cnt_long_all_zc, mk_df_j)

cnt_long_gene_trip <- inner_join(cnt_long_gene_trip, cl_info) %>% 
  arrange(chr, start_position)

# joining the copy number long data with the gene positions on their respective bins
blk_long_gj <- inner_join(blk_long_g, mk_df)

# adding the ensembl gene id information and the cluster information
blk_long_gj <- inner_join(blk_long_gj, bms_df) %>% 
  arrange(pos) %>% 
  left_join(cl_info)

# full dna rna table with counts merged by sample and ensemblgene_id
dna_rna <- inner_join(blk_long_gj,
                      cnt_long_avg)


# jitter to be able to visualize all the copy number tracks
jitter <- seq(-0.14, 0.17, 0.03) 

cls <- gtools::mixedsort(unique(levels(droplevels(as.factor(blk_long_gj$subclones)))))

blk_long_gj_jit <- blk_long_gj %>% 
  mutate(cn = case_when(
    subclones == cls[1] ~ cn + jitter[1],
    subclones == cls[2] ~ cn + jitter[2],
    subclones == cls[3] ~ cn + jitter[3],
    subclones == cls[4] ~ cn + jitter[4],
    subclones == cls[5] ~ cn + jitter[5],
    subclones == cls[6] ~ cn + jitter[6],
    subclones == cls[7] ~ cn + jitter[7],
    subclones == cls[8] ~ cn + jitter[8],
    subclones == cls[9] ~ cn + jitter[9],
    subclones == cls[10] ~ cn + jitter[10]
  ))

plot_moving_average(chr = "chr19",
                    genes = c("SH3GL1","PIK3R2"))

plot_moving_average(chr = "chr11",
                    genes = c("ATM","CBL"))

8.6 DE

samples <- as.data.frame(colData(dds)) %>% 
  mutate(sample_name = str_extract(sample, "MDAMB231C[0-9]+"))

samples_cl <- left_join(samples, cl_info, by = c("sample_name" = "sample"))
#sanity check
samples_cl <- samples_cl[match(samples_cl$sample, colData(dds)$sample),]

# removing unused levels
colData(dds)$cluster <- droplevels(as.factor(samples_cl$subclones))

design(dds) <- formula(~ cluster)
register(BatchtoolsParam(workers = 60), default = TRUE)
dds <- DESeq(dds, parallel = TRUE, betaPrior = T)

## using pre-existing normalization factors

## estimating dispersions

## gene-wise dispersion estimates: 60 workers

## mean-dispersion relationship

## final dispersion estimates, MLE betas: 60 workers

## fitting model and testing: 60 workers

## -- replacing outliers and refitting for 7 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

resultsNames(dds)

##  [1] "Intercept"  "clusterc11" "clusterc12" "clusterc13" "clusterc5"  "clusterc6"  "clusterc8"  "clusterc3"  "clusterc4" 
## [10] "clusterc1"  "clusterc2"

clusters <- samples_cl %>%
  droplevels() %>% 
  distinct(subclones) %>% 
  pull() %>%
  as.character()

8.7 RNA PCA

pcaData <- plotPCA(vst(dds), "cluster", returnData = T) %>%
  mutate(triplicate = str_extract(name, "C[0-9]+"),
         triplicate = tolower(triplicate)) %>%
  mutate(sample = str_extract(name, "MDAMB231C[0-9]+"))
percentVar <- round(100 * attr(pcaData, "percentVar"))

cluster_anno <- mdamb231_sc_bulk_clustering %>%
  filter(str_detect(cells, "gDNA")) %>%
  mutate(sample = str_remove(cells, "gDNA")) %>%
  mutate(sample = str_extract(sample, "MDAMB231C[0-9]+"))

pcaData <- pcaData %>%
  left_join(cluster_anno, by = c("group" = "subclones", 
                                 "sample" = "sample"))

p_pca <- ggplot(pcaData, aes(x = PC1,
                             y = PC2)) +
  scale_color_manual(values = c(colors_vector$subclones,
                                colors_vector$superclones)) +
  geom_point(aes(color = fct_relevel(superclones, gtools::mixedsort(unique(as.character(pcaData$superclones))))), size = 8) +
  geom_point(aes(color = fct_relevel(group, gtools::mixedsort(unique(as.character(pcaData$group))))), size = 3) +
  xlab("PC1") +
  ylab("PC2") +
  theme_cowplot() +
  theme(axis.title.x=element_text(size = 16),
        axis.text.x= element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.y = element_text(size = 16),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line = element_blank(),
        plot.title = element_text(hjust = 0.5),
        legend.position = "none",
        panel.border = element_rect(color = "black",
                                    fill = NA,
                                    size = 2)) +
  labs(color = "subclone")

p_pca

8.8 FGSEA

# Thanks to https://stephenturner.github.io/deseq-to-fgsea/

fgseaResTidy <- list()

for (x in 1:length(clusters)) {
  res <-
    results(dds,
            contrast = list(
              paste0("cluster", clusters[x]),
              paste0(
                "cluster",
                clusters[clusters %!in% clusters[x]]
              )
            ),
            listValues = c(1, -1 / 10))
  
  rownames(res) <- str_replace(rownames(res),
                               ".[0-9]+$", "")
  

  res$symbol <- mapIds(
    org.Hs.eg.db,
    keys = row.names(res),
    column = "SYMBOL",
    keytype = "ENSEMBL",
    multiVals = "first"
  )
  
  res$map <- mapIds(
    org.Hs.eg.db,
    keys = row.names(res),
    column = "MAP",
    keytype = "ENSEMBL",
    multiVals = "first"
  )
  head(res[order(res$padj), ], 20)
  
  res$gene_map <- paste0(res$symbol, " ", "(", res$map, ")")
  
  res <- drop_na(as.data.frame(res))
  
  res2_de <- res %>%
    as_tibble() %>%
    dplyr::select(symbol,
                  stat) %>%
    na.omit() %>%
    distinct() %>%
    group_by(symbol) %>%
    summarize(stat = mean(stat))
  
  ranks_de <- deframe(res2_de)

  pathways.hallmark <-
    gmtPathways(
      here("extdata/lib/h.all.v6.2.symbols.gmt")
    )
  
  fgseaRes <-
    fgsea(pathways = pathways.hallmark,
          stats = ranks_de,
          nperm = 2000)
  
  fgseaResTidy[[x]] <- fgseaRes %>%
    as_tibble()
  
  names(fgseaResTidy)[x] <- clusters[x]
  
  
}

fgseaResTidy_res <- bind_rows(fgseaResTidy, .id = "comparison")

# keeping only the significant pathways in x or more hallmarks
sig_path <- fgseaResTidy_res %>%
  filter(padj < 0.05) %>%
  group_by(pathway) %>%
  dplyr::count() %>%
  arrange(desc(n)) %>%
  filter(n > 3) %>%
  pull(pathway)


fgseaResTidy_res_f <- fgseaResTidy_res %>%
  filter(pathway %in% sig_path) %>%
  mutate(pathway = str_remove(pathway, "HALLMARK_")) %>%
  mutate(pathway = str_replace_all(pathway, "_", " ")) %>%
  mutate(pathway = str_to_lower(pathway))  %>%
  mutate(comparison = paste0("e", comparison))

fg_cl <- pivot_wider(
  fgseaResTidy_res_f,
  names_from = pathway,
  values_from = NES,
  id_cols = comparison
) %>%
  as.data.frame()

rownames(fg_cl) <- fg_cl[, 1]

fg_cl_hclust <- hclust(dist(fg_cl[, -1],
                            method = "euclidean"),
                       method = "ward.D")


fg_path <- pivot_wider(
  fgseaResTidy_res_f,
  names_from =
    comparison,
  values_from = NES,
  id_cols = pathway
) %>%
  as.data.frame()

rownames(fg_path) <- fg_path[, 1]

fg_path_hclust <- hclust(dist(fg_path[, -1],
                              method = "euclidean"),
                         method = "ward.D")


# thanks to https://mgrcbioinfo.github.io/my_GSEA_plot/

myGseaPlot <- function(gseaDataFrame, adjPCutoff = 0.05) {
  # subset data frame using p value cutoff and add status factor
  toPlot <- subset(gseaDataFrame, padj <= adjPCutoff) %>%
    mutate(status = case_when(NES > 0 ~ "Up", NES < 0 ~ "Down")) %>%
    mutate(status = factor(status, levels = c("Up", "Down")))
  
  # create plot
  p <-
    ggplot(
      data = toPlot,
      mapping = aes(
        x = fct_relevel(comparison,
                        rownames(fg_cl)[fg_cl_hclust$order]),
        y = fct_relevel(pathway,
                        rownames(fg_path)[fg_path_hclust$order]),
        color = status,
        shape = status,
        fill = NES,
        size = -log10(padj)
      )
    ) +
    geom_point() +
    scale_shape_manual(values = c(24, 25)) +
    scale_fill_gradient2(
      high = "firebrick3",
      low = "deepskyblue3",
      mid = "white",
      midpoint = 0
    ) +
    scale_color_manual(values = c(Up = "coral3", Down = "deepskyblue3")) +
    ggtitle(paste0("GSEA")) +
    theme_bw() +
    theme(
      plot.title = element_text(hjust = 0.5, size = 18),
      axis.title.x = element_text(size = 16),
      panel.grid = element_blank(),
      legend.position = "bottom",
      axis.text.x = element_text(
        angle = 90,
        hjust = 1,
        size = 16,
        vjust = .5
      ),
      axis.text.y = element_text(size = 14)
    ) +
    ylab("") +
    xlab("expanded clusters")
  
  return(p)
}

hallmarks_plot <-
  myGseaPlot(gseaDataFrame = fgseaResTidy_res_f, adjPCutoff = 0.05)

hallmarks_plot