Overview of the cell metadata

# Load the per-cell metadata table and list the columns it provides.
df <- read_rds("/volumes/USR1/yyan/project/tnbc_pre_atlas/deliver/atlas_20251023_share/df_meta.rds")
print(colnames(df))
## [1] "nCount_RNA"       "nFeature_RNA"     "celltype"         "cellstate"       
## [5] "pCR_status"       "barcodes"         "paper_patient_id"
# Categorical columns to summarise here (and to colour by further down).
col_cat <- c("paper_patient_id", "celltype", "pCR_status")
for (col in col_cat) {
  message(col)
  # `[[` always yields the column vector, whether `df` is a data.frame or a
  # tibble; `df[, col]` only does so for a plain data.frame.
  print(table(df[[col]], useNA = "ifany"))
}
## paper_patient_id
## 
##    P1   P10  P100  P101   P11   P12   P13   P14   P15   P16   P17   P18   P19 
##  4717  5023  2196 14781  3143  6800  1429  1984  1092  2048  2802  4376  2334 
##    P2   P20   P21   P22   P23   P24   P25   P26   P27   P28   P29    P3   P30 
##   516  6186  1301  2808  6255  1022  3027  8140  3866  4816  2370  4038  2835 
##   P31   P32   P33   P34   P35   P36   P37   P38   P39    P4   P40   P41   P42 
##  4701  2078  2660  6174  1608  1175  4436  5376  8156  2771  4248  6631  4824 
##   P43   P44   P45   P46   P47   P48   P49    P5   P50   P51   P52   P53   P54 
##  5458  3098  8815  6949  2464  2981  2479  1147  3551   968  1946  1528  5234 
##   P55   P56   P57   P58   P59    P6   P60   P61   P62   P63   P64   P65   P66 
##  2143  7143  4067  8206  5057  2126  6012  5083  3242  5395  5713  3186  5592 
##   P67   P68   P69    P7   P70   P71   P72   P73   P74   P75   P76   P77   P78 
##  2572  2110  1024  1193  5458  3462  2668  3537  9152  2051  3649  9802  4411 
##   P79    P8   P80   P81   P82   P83   P84   P85   P86   P87   P88   P89    P9 
##  5954  2379  5130  3181  2161  4847  5920  6967  6649  7975  5514  2130  3418 
##   P90   P91   P92   P93   P94   P95   P96   P97   P98   P99 
##  6172 10284  2204  1379  3749  8773  2042  4532  7265  3817
## celltype
## 
##      B   Endo  Fibro    Mye   Peri      T  Tumor 
##  52015   5927  14182  49004   3139 254315  49275
## pCR_status
## 
##      pCR       RD  Unknown Excluded  Removed 
##   198019   153126        0    76712        0

UMAP

# Load the 2-D embedding; columns 1 and 2 are used as the x / y coordinates.
# NOTE(review): rows are assumed to be in the same cell order as `df` — confirm.
umap <- read_rds("/volumes/USR1/yyan/project/tnbc_pre_atlas/deliver/atlas_20251023_share/umap.rds")
scattermore::scattermoreplot(umap[, 1], umap[, 2], asp = 1)

# One scatter plot per categorical column, coloured with a rainbow palette
# that has one entry per observed level.
for (col in col_cat) {
  # Extract the column once as character; `[[` keeps this a vector even if
  # `df` is a tibble.
  col_values <- as.character(df[[col]])
  col_lv <- sort(unique(col_values))
  pal_col <- structure(rainbow(n = length(col_lv)), names = col_lv)
  scattermore::scattermoreplot(
    umap[, 1], umap[, 2],
    col = pal_col[col_values],
    asp = 1, main = col
  )
}

# col <- 'cellstate'
# col_lv <- sort(unique(as.character(df[, col])))
# pal_col <- structure(rainbow(n=length(col_lv)), names=col_lv)
# 
# for (col_lv_x in col_lv) {
#   idx <- as.character(df[, col]) %in% col_lv_x
#   scattermore::scattermoreplot(
#     umap[, 1], umap[, 2], 
#     col = ifelse(idx, pal_col[as.character(df[, col])][col_lv_x], 'grey'),  
#     asp=1, main=sprintf('%s: %d cells', col_lv_x, sum(idx)))
# }

Gene expression

# Load the expression matrix; it is indexed below by gene name on the rows.
data <- read_rds('/volumes/USR1/yyan/project/tnbc_pre_atlas/deliver/atlas_20251023_share/mat_data.rds')
print(dim(data))
## [1]  33538 427857
library(colorspace)
library(ggpubr)
query_gene <- "EPCAM"
# Fail with a readable message instead of "subscript out of bounds".
if (!query_gene %in% rownames(data)) {
  stop("Gene not found in expression matrix: ", query_gene, call. = FALSE)
}
v <- data[query_gene, ]
# cbind() would silently recycle `v` if the cell counts disagreed.
stopifnot(length(v) == nrow(umap))
dfv <- data.frame(cbind(umap, v))
# Shuffle the rows reproducibly — presumably so that overplotting does not
# depend on the stored cell order. sample.int(n) returns the same permutation
# as sample(1:n, size = n) for a given seed and is also safe when n is 0.
set.seed(42)
idx <- sample.int(nrow(dfv))
dfv <- dfv[idx, ]
head(dfv)
##                                      UMAP_1     UMAP_2        v
## Mye_ARTC23_ACTATGGAGGTCATTC      -11.140078  9.4080937 0.000000
## B_ARTC20_TCAGTTTGTAGTGATA         -2.160829  6.6423953 0.000000
## T_ARTC31_TGAGGAGCAGTAGAAT         -1.433794 -0.2171205 0.000000
## Mye_ARTC27_ACGTAACAGGTCATTC       -9.071169  7.3530205 0.000000
## B_ARTC18_CAGGGCTCATGCAGCC          5.107306  8.8766730 0.434959
## Tumor_ARTC102_CCCTTAGGTCAACATC-1   7.476499  2.8980523 1.898505
# Switch the session-wide default theme to a blank canvas, then draw the
# single-gene UMAP coloured by expression on a sequential red scale.
theme_set(theme_void())
# NOTE(review): coord_equal() and theme(aspect.ratio = 1) both constrain the
# panel shape — confirm which one is meant to win.
ggplot(data = dfv, mapping = aes(x = UMAP_1, y = UMAP_2, color = v)) +
  geom_scattermore() +
  colorspace::scale_color_continuous_sequential(palette = "Reds 3") +
  labs(color = query_gene) +
  coord_equal() +
  ggpubr::border() +
  theme(legend.position = "top", aspect.ratio = 1)
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Genes to show, one UMAP panel per gene.
gene_opts <- c(
  "EPCAM",
  "LYZ", "LILRA4", "TPSAB1",
  "TRAC", "CD3D", "NKG7", "GNLY",
  "CD79A", "MS4A1",
  "FAP", "COL1A1",
  "VWF",
  "RGS5", "SYNPO2"
)
# Validate every gene up front so a typo fails before any plotting work,
# with a message that names the offending gene(s).
missing_genes <- setdiff(gene_opts, rownames(data))
if (length(missing_genes) > 0) {
  stop(
    "Genes not found in expression matrix: ",
    paste(missing_genes, collapse = ", "),
    call. = FALSE
  )
}
# cbind() below would silently recycle if the cell counts disagreed.
stopifnot(ncol(data) == nrow(umap))
# Preallocate one slot per gene instead of growing the list in the loop.
p_list <- vector("list", length(gene_opts))
theme_set(theme_pubr())
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# One shared, reproducible row permutation so every panel draws the cells in
# the same shuffled order. sample.int(n) matches sample(1:n, size = n) for a
# given seed and is safe when n is 0.
set.seed(42)
idx <- sample.int(nrow(umap))
for (gene_i in seq_along(gene_opts)) {
  g <- gene_opts[[gene_i]]
  message(g)
  v <- data[g, ]
  dfv <- data.frame(cbind(umap, v))
  dfv <- dfv[idx, ]
  p_g <- ggplot(dfv, aes(x = UMAP_1, y = UMAP_2, color = v)) +
    geom_scattermore() +
    # Alternative colour scales that were tried:
    # colorspace::scale_color_continuous_sequential(palette = 'Reds 3')+
    # scale_color_viridis_c(option='E') +
    scale_color_gradient(low = "snow2", high = "purple4") +
    labs(color = "", title = g) +
    coord_equal() +
    ggpubr::border() +
    theme(
      legend.position = "right", aspect.ratio = 1,
      legend.key.width = unit(1, "lines"),
      legend.key.height = unit(1, "lines")
    ) +
    # Strip axis titles, ticks and tick labels from each panel.
    ggpubr::rremove("x.title") + ggpubr::rremove("y.title") +
    ggpubr::rremove("x.ticks") + ggpubr::rremove("y.ticks") +
    ggpubr::rremove("x.text") + ggpubr::rremove("y.text")
  p_list[[gene_i]] <- p_g
}
## EPCAM
## LYZ
## LILRA4
## TPSAB1
## TRAC
## CD3D
## NKG7
## GNLY
## CD79A
## MS4A1
## FAP
## COL1A1
## VWF
## RGS5
## SYNPO2
# Keep a copy of the plots as built, before the legend adjustment below.
p_list0 <- p_list
# Narrow the legend key of every panel to half a line.
p_list <- lapply(p_list, function(plot_g) {
  plot_g + theme(legend.key.width = unit(0.5, "lines"))
})
length(gene_opts)
## [1] 15
# Arrange the panels three per row.
# NOTE(review): with only `ncol` given, the printed result below is a list of
# five elements rather than one combined grid — pass `nrow` as well if a
# single figure is wanted.
p <- ggpubr::ggarrange(plotlist = p_list, ncol = 3)
print(p)
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## attr(,"class")
## [1] "list"      "ggarrange"