In this Vignette we show how to create a CONGAS+ object starting from RNA and ATAC count matrices. We will use the B-cell lymphoma cells sequences via 10x genomics multiomic assay. The raw data can be foun here. Setup the directory where results are stored.


sample = 'lymphoma'

prefix = paste0(sample, "/tutorial/")

out.dir = paste0(prefix, "/congas_data/")
fig.dir = paste0(prefix, "/congas_figures/")

if (!dir.exists(out.dir)) {dir.create(out.dir, recursive = T)}
if (!dir.exists(fig.dir)) {dir.create(fig.dir, recursive = T)}

Load the ATAC counts to create the tibble needed as CONGAS+ input


data(atac_counts) 
data(rna_counts)

With the multiome assay, the feature files provided by CellRanger contains already a mapping between gene names and coordinates. Let’s read this file As an alternative, when such file not not readily available, users can exploit BiomaRt Please see the correspodning vignette

data(features)

Create the RNA and ATAC tibble required by CONGAS+

atac_featureDF = data.frame(id = rownames(atac_counts)) %>%
  tidyr::separate(id, c('chr', 'from', 'to'), '-')

atac = Rcongas::create_congas_tibble(counts = atac_counts, 
                                     modality = 'ATAC', 
                                     save_dir = NULL,
                                     features = atac_featureDF)
#>  Saving temporary counts file
#> Rows: 3100055 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: " "
#> dbl (3): 109788, 500, 3100055
#> 
#>  Use `spec()` to retrieve the full column specification for this data.
#>  Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> Joining with `by = join_by(feature_index)`
#> Joining with `by = join_by(cell_index)`
# duplicate call names across ATAC and RNA cells is not allowed by CONGAS+
# add suffix to atac cell barcodes prior to create the CONGAS+ object
atac = atac %>% mutate(cell = paste0(cell, '-ATAC'))

# Compute normalization factors
norm_atac = Rcongas:::auto_normalisation_factor(atac) %>%
  mutate(modality = 'ATAC')
#> → Computing library size factors as total counts.
#> → Median digits per factor 4.5, scaling by 31622.7766016838.
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.0309  0.1610  0.3184  0.4383  0.5927  2.1472

rna = Rcongas::create_congas_tibble(counts = rna_counts, 
                                    modality = 'RNA', 
                                    save_dir=NULL, 
                                    features = features)
#>  Saving temporary counts file
#> Rows: 1049590 Columns: 3── Column specification ────────────────────────────────────────────────────────
#> Delimiter: " "
#> dbl (3): 36601, 500, 1049590
#>  Use `spec()` to retrieve the full column specification for this data.
#>  Specify the column types or set `show_col_types = FALSE` to quiet this message.Joining with `by = join_by(feature_index)`Joining with `by = join_by(cell_index)`

rna = rna %>% mutate(cell = paste0(cell, '-RNA'))

# Compute normalization factors
norm_rna = Rcongas:::auto_normalisation_factor(rna) %>%
  mutate(modality = 'RNA')
#> → Computing library size factors as total counts.
#> → Median digits per factor 4, scaling by 10000.
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.0627  0.1671  0.2724  0.3964  0.4966  2.4000

atac = atac %>% mutate(value = as.integer(value))
rna = rna %>% mutate(value = as.integer(value))

Apply some basic filters on known genes

rna = rna %>% filter_known_genes(what='r')
#> 
#> ── Ribosomal
#>  n = 185 genes found.
#> RPL22, RPL11, RPS6KA1, RPS8, MRPL37, RPL5 , ...
#> 
#> ── Specials
#>  n = 1 genes found.

all_genes = rna$gene %>% unique
mito = all_genes %>% str_starts(pattern = 'MT-')
all_genes = setdiff(all_genes, all_genes[mito])
rna = rna %>% dplyr::filter(gene %in% all_genes)

Read chromosome arm coordinated that will be used as segment breakpoints

data(example_segments)

# Remove these chromosomes
segments = example_segments %>% dplyr::filter(chr != 'chrX', chr != 'chrY')

Now init the CONGAS+ object

x = init(
  rna = rna,
  atac = atac,
  segmentation = segments, #%>% mutate(copies = as.integer(round(copies))),
  rna_normalisation_factors = norm_rna,
  atac_normalisation_factors = norm_atac,
  rna_likelihood = "NB", 
  atac_likelihood = "NB",
  description = paste0('Bimodal ', sample))
#>                        ┌──────────────────────────────────┐
#>                        │                                  
#>                        │   (R)CONGAS+: Bimodal lymphoma   
#>                        │                                  
#>                        └──────────────────────────────────┘
#> 
#> ── RNA modality (38.360456 Mb)
#> → Input events: 1028900
#> → Input cells: 500
#> → Input locations: 21771
#> → Cleaning outlier features from RNA input matrix
#> → Normalising RNA counts using input normalisation factors.
#> Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
#> of ggplot2 3.3.4.
#>  The deprecated feature was likely used in the Rcongas package.
#>   Please report the issue to the authors.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> → Found: 109 outliers.
#> → Entries mapped: 491441, with 537459 outside input segments that will be discarded.
#> → Using likelihood: NB.
#> 
#> ── ATAC modality (86.844704 Mb)
#> → Input events: 3100055
#> → Input cells: 500
#> → Input locations: 109645
#> → Cleaning outlier features from ATAC input matrix
#> Joining with `by = join_by(chr, from, to)`
#> → Normalising ATAC counts using input normalisation factors.
#> → Found: 581 outliers.
#> → Entries mapped: 1530455, with 1569600 outside input segments that will be
#> discarded.
#> → Using likelihood: NB.
#> 
#> ── Checking segmentation
#> ! These segments have no events associated and will be removed - we suggest you to check if these can be further smoothed.
#>   Hint  Check if the reduced segments can be further smoothed!
#> # A tibble: 3 × 11
#>   chr    from       to copies segment_id      RNA_nonzerocells ATAC_nonzerocells
#>   <chr> <int>    <int>  <int> <chr>                      <dbl>             <dbl>
#> 1 chr13     0 17700000      2 chr13:0:177000…                0                 0
#> 2 chr14     0 17200000      2 chr14:0:172000…                0                 0
#> 3 chr15     0 19000000      2 chr15:0:190000…                0                 0
#> # ℹ 4 more variables: ATAC_nonzerovals <dbl>, ATAC_peaks <dbl>,
#> #   RNA_nonzerovals <dbl>, RNA_genes <dbl>
#>  Warning RNA 0-counts cells. 374 cells have missing data in any of 23 segments, top 5 with missing data are:
#>  Cell AAACCGAAGTGGACAA-1-RNA with 1 0-segments (4%)
#>  Cell AAACGCGCAGGTTCAC-1-RNA with 1 0-segments (4%)
#>  Cell AAACGTACATAATGAG-1-RNA with 1 0-segments (4%)
#>  Cell AAAGGTTAGGCCAATT-1-RNA with 1 0-segments (4%)
#>  Cell AACATTGTCCTACCTA-1-RNA with 1 0-segments (4%)
#>  Warning ATAC 0-counts cells. 209 cells have no data in any of 23 segments, top 5 with missing data are:
#>  Cell GTCCTAGAGCCAAATC-1-ATAC with 2 0-segments (9%)
#>  Cell TCTCGCCCAAACATAG-1-ATAC with 2 0-segments (9%)
#>  Cell AAACGCGCAGGTTCAC-1-ATAC with 1 0-segments (4%)
#>  Cell AAACGTACATAATGAG-1-ATAC with 1 0-segments (4%)
#>  Cell AAAGCCGCATAGTCAT-1-ATAC with 1 0-segments (4%)

Now plot the number of nonzero cells vs the number of peaks per segment, to help identify a threshold for filtering segments.

ggplot(x$input$segmentation, aes(x=ATAC_nonzerocells, y=ATAC_peaks)) + geom_point() + theme_bw()

ggplot(x$input$segmentation, aes(x=RNA_nonzerocells, y=RNA_genes)) + geom_point() + theme_bw()

x = Rcongas::filter_segments(x, RNA_nonzerocells = 300, ATAC_nonzerocells = 300)
#> 
#> ── Segments filter
#> → 22 retained segments out of 23.

Generate plots to check out the data distribution

plot_data(
  x,
  what = 'histogram',
  segments = get_input(x, what = 'segmentation') %>%
    ungroup() %>%
    mutate(L = to - from) %>%
    dplyr::arrange(dplyr::desc(L)) %>%
    dplyr::top_n(9) %>%
    pull(segment_id)
)
#> → Normalising RNA counts using input normalisation factors.
#> → Normalising ATAC counts using input normalisation factors.
#> Selecting by L
#> → Showing all segments (this plot can be large).

Add metadata to annotate the counts distributions

data(metadata)

x$input$metadata = metadata
# Modificato plot_data in mdodo da plottare gli istogrammi colorati per celltype
plot_data(x, 
  to_plot = 'type', 
  position = 'stack',
  segments = get_input(x, what = 'segmentation') %>%
    ungroup() %>%
    mutate(L = to - from) %>%
    dplyr::arrange(dplyr::desc(L)) %>%
    dplyr::top_n(9) %>%
    pull(segment_id))
#> → Normalising RNA counts using input normalisation factors.
#> → Normalising ATAC counts using input normalisation factors.
#> Selecting by L
#> → Showing all segments (this plot can be large).
#> Joining with `by = join_by(cell)`

Please see the next vignette that shows how to perform CONGAS+ inference.