
6. Simulate data
a6_Simulate_data.Rmd
library(dplyr)
n_clocks=3
n_events=8
purity=0.9
coverage=100
epsilon=0.20
seed = 123
tolerance = 0.0001
max_attempts = 2
INIT = TRUE
min_mutations_number = 3
cat("coverage: ", coverage)
mu = 1e-4 # mutation rate
cat("mutation rate: ", mu)
w = 1e-2 # cell division rate
cat("cell division rate: ", w)
l = 2e7 # length of the segment
cat("length of the segment: ", l)
time_interval = 7
cat("time_interval: ", time_interval)
res_simulate <- tickTack::get_simulation_tickTack(number_clocks=n_clocks,
number_events=n_events,
purity=purity,
coverage=coverage,
epsilon=epsilon,
seed = seed)
data_simulation = as.data.frame(res_simulate$data_simulation) %>% mutate(chr= 1:length(res_simulate$data_simulation$taus))
x = res_simulate$x
df = x$cna %>% left_join(data_simulation)
# timing inference ticktack hierarchical
x <- tickTack::fit_h(x, max_attempts=max_attempts, INIT=INIT, tolerance = tolerance)
results_simulated <- x$results_timing
results_model_selection <- tickTack::model_selection_h(results_simulated)
## Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
best_K <- results_model_selection$best_K
data_simulation$taus
## [1] 0.1028646 0.1028646 0.4348927 0.4348927 0.9849570 0.9849570 0.9849570
## [8] 0.4348927
Results
The results
object that is returned together with the
CNAqc input object contains four components: data
,
draws_and_summary
, log_lik_matrix_list
and
elbo_iterations
.
# View summary for a specific K, here K = 2
results <- x$results
Interpreting the output
We can inspect the main output of interest to understand the timing
of clonal peaks. results$draws_and_summary
contains: -
draws
the draws from the approximate
posterior distribution of the taus and weights; -
summary
a summary with the main statistics
of the approximate posterior distributions; -
summarized_results
represents the clock
assignment, a tibble with the estimate of taus for each segment with a
copy number event that has been included in the hierarchical
inference
# View summary for a specific K, here K = 2
results$draws_and_summary[[2]]$summary
## # A tibble: 18 × 7
## variable mean median sd mad q5 q95
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 tau[1] 0.988 0.989 0.00366 0.00318 0.982 0.993
## 2 tau[2] 0.0925 0.0900 0.0184 0.0181 0.0664 0.125
## 3 w[1,1] 0.0104 0.00522 0.0180 0.00512 0.000739 0.0367
## 4 w[2,1] 0.00559 0.00230 0.0121 0.00242 0.000268 0.0217
## 5 w[3,1] 0.264 0.259 0.0613 0.0600 0.172 0.371
## 6 w[4,1] 0.256 0.255 0.0227 0.0232 0.222 0.294
## 7 w[5,1] 0.996 0.999 0.0102 0.00167 0.984 1.00
## 8 w[6,1] 0.996 0.998 0.00519 0.00212 0.988 1.00
## 9 w[7,1] 0.971 0.977 0.0237 0.0150 0.930 0.992
## 10 w[8,1] 0.249 0.249 0.0268 0.0271 0.205 0.293
## 11 w[1,2] 0.990 0.995 0.0180 0.00512 0.963 0.999
## 12 w[2,2] 0.994 0.998 0.0121 0.00242 0.978 1.00
## 13 w[3,2] 0.736 0.741 0.0613 0.0600 0.629 0.828
## 14 w[4,2] 0.744 0.745 0.0227 0.0232 0.706 0.778
## 15 w[5,2] 0.00416 0.00141 0.0102 0.00167 0.000125 0.0159
## 16 w[6,2] 0.00370 0.00212 0.00519 0.00212 0.000263 0.0122
## 17 w[7,2] 0.0292 0.0225 0.0237 0.0150 0.00752 0.0701
## 18 w[8,2] 0.751 0.751 0.0268 0.0271 0.707 0.795
# View detailed summarized results for a specific K, here K = 2
results$draws_and_summary[[2]]$summarized_results
## # A tibble: 8 × 8
## segment_original_indx segment_name segment_id karyotype chr clock_mean
## <int> <chr> <dbl> <chr> <int> <dbl>
## 1 1 1_1_2e+07 1 2:1 1 0.0900
## 2 2 2_1_2e+07 2 2:0 2 0.0900
## 3 3 3_1_2e+07 3 2:1 3 0.0900
## 4 4 4_1_2e+07 4 2:2 4 0.0900
## 5 5 5_1_2e+07 5 2:2 5 0.989
## 6 6 6_1_2e+07 6 2:2 6 0.989
## 7 7 7_1_2e+07 7 2:1 7 0.989
## 8 8 8_1_2e+07 8 2:2 8 0.0900
## # ℹ 2 more variables: clock_low <dbl>, clock_high <dbl>
Obtain the best K with model_selection_h
W e can run the model_selection_h
function to obtain the
scores for each inference performed with a different K and take the one
with best ICL score if the BIC score prefer 2 components instead of 1,
otherwise choose 1 as best K. The function takes as input the
results
and n_components
and outputs the
best_K
and the corresponding best_fit
together
with the model_selection_tibble
and the
entropy_list
used to evaluate the ICL score.
results_model_selection <- tickTack::model_selection_h(results, n_components = 0)
## Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
best_K <- results_model_selection$best_K
model_selection_tibble <- results_model_selection$model_selection_tibble
entropy <- results_model_selection$entropy_list
print(best_K)
Visulizing the output
The results can be viewed is genome-wise perspective using the
tickTack::plot_timing_h
function.
tickTack::plot_timing_h(results, best_K)
Visualize distributions of draws from the approximate posterior
The approximate posterior distributions can be viewed using the
tickTack::plot_posterior_clocks_h
and
tickTack::plot_posterior_weights_h
functions, that
internally use functions from Bayesplot.
posterior_clocks <- tickTack::plot_posterior_clocks_h(results, best_K)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
posterior_weights <- tickTack::plot_posterior_weights_h(results, best_K)
Visualize the behavior of the ELBO during the inference
K = nrow(results_model_selection$model_selection_tibble)
p_elbo <- list()
for (i in 1:K){
p_elbo[[i]] <- tickTack::plot_elbo_h(results$elbo_iterations[[i]]) + ggplot2::ggtitle(paste0("K = ", i))
}
p_elbo <- gridExtra::grid.arrange(grobs = p_elbo, ncol = 2) #add global title
p_elbo
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
Visualize all the inference results for each K
plot_model_selection_inference <- list()
for (i in 1:K){
plot_model_selection_inference[[i]] <- tickTack::plot_timing_h(results, i) + ggplot2::ggtitle(paste0("K = ", i))
}
plot_model_selection_inference <- gridExtra::grid.arrange(grobs = plot_model_selection_inference, ncol = 2) #add global title
plot_model_selection_inference
## TableGrob (2 x 2) "arrange": 4 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]