R：对多组抛硬币结果进行随机抽样

2024-04-19

我正在使用 R 编程语言。

假设：

有一枚硬币，如果它正面朝上，那么下一次抛掷正面的概率是 0.6（如果是反面，那么下一次抛掷反面的概率也是 0.6）
一个班有100名学生
每个学生抛掷硬币的次数是随机的
Student_n 的最后一次抛硬币不会影响 Student_n+1 的第一次抛硬币（即当下一个学生抛硬币时，第一次抛硬币正面或反面的概率为 0.5，但该学生的下一次抛硬币取决于上一次抛硬币）

下面是一些代表这个问题的 R 代码：

# https://stackoverflow.com/questions/76192042/r-verifying-the-results-of-coin-flips
library(tidyverse)

set.seed(123)
ids <- 1:100

# Every flip is assigned to a random student; sorting makes each student's
# flips contiguous, so consecutive rows with the same id form one sequence.
student_id <- sort(sample(ids, 100000, replace = TRUE))

# One result per flip. The vector has to be as long as `student_id`: a
# 1000-element vector would be silently recycled 100 times by data.frame(),
# repeating the same 1000 flips over and over.
n_flips <- length(student_id)
coin_result <- character(n_flips)
coin_result[1] <- sample(c("H", "T"), 1)

# seq_len(n)[-1] is 2..n and is empty (rather than c(2, 1)) when n == 1
for (i in seq_len(n_flips)[-1]) {
  if (student_id[i] != student_id[i - 1]) {
    # first flip of a new student: fair coin, independent of the previous row
    coin_result[i] <- sample(c("H", "T"), 1)
  } else if (coin_result[i - 1] == "H") {
    # the previous result is repeated with probability 0.6
    coin_result[i] <- sample(c("H", "T"), 1, prob = c(0.6, 0.4))
  } else {
    coin_result[i] <- sample(c("H", "T"), 1, prob = c(0.4, 0.6))
  }
}

#tidy up
my_data <- data.frame(student_id, coin_result)
my_data <- my_data[order(my_data$student_id), ]

# `final` is `my_data` plus the position of each flip within its student
final <- my_data %>%
  group_by(student_id) %>%
  mutate(flip_number = row_number())

从这些数据中，我们可以统计 HH、HT、TH 和 TT 序列出现的次数：

# The grouped tibble with `flip_number` printed below is `final`;
# `my_data` only has the columns student_id and coin_result.
head(final)
# A tibble: 6 x 3
# Groups:   student_id [1]
  student_id coin_result  flip_number
       <int> <chr>              <int>
1          1 H                      1
2          1 H                      2
3          1 H                      3
4          1 H                      4
5          1 T                      5
6          1 H                      6

# Count every pair of consecutive flips (HH, HT, TH, TT) within each student.
# reframe() is used because each group yields one row per flip; returning
# several rows per group from summarise() is deprecated since dplyr 1.1.0.
# reframe() always returns an ungrouped result.
my_data %>%
  group_by(student_id) %>%
  reframe(Sequence = str_c(coin_result, lead(coin_result))) %>%
  # the last flip of a student has no successor, so its pair is NA
  filter(!is.na(Sequence)) %>%
  count(Sequence)

# A tibble: 4 × 2
  Sequence     n
  <chr>    <int>
1 HH       29763
2 HT       19782
3 TH       19775
4 TT       30580

我的问题：使用这些数据，我试图完成以下任务：

步骤 1：使用放回抽样法，选择 100 名学生
步骤 2：对于步骤 1 中选择的给定学生，在其抛掷序列中随机选择一个位置（例如，将此位置称为“x”）
步骤 3：对于同一名学生，在抛掷序列中随机选择第二个位置（例如，将此位置称为“y”），使得 y > x（即“y”出现在“x”之后）。
步骤 4：对步骤 1 中选择的所有学生重复步骤 2 和步骤 3
步骤5：统计步骤1中选择的所有学生出现HH、HT、TH和TT序列的次数
步骤 6：重复步骤 1 - 步骤 5 多次（例如 1000 次）

例如，假设学生 15 抛掷了 6 次：H, H, T, H, T, T。如果 x = 2 且 y = 5，那么我们得到的子序列是 H, T, H, T。

这是我自己解决问题的尝试：

# Set the number of iterations
k <- 1000

# The four possible pairs of consecutive flips, in result-column order
pair_levels <- c("HH", "HT", "TH", "TT")

# Preallocate one row per iteration instead of growing a data frame in the
# loop. Each value is the share of one pair type among all pairs counted in
# that iteration, so the four columns of a row sum to 1.
pair_share <- matrix(
  NA_real_,
  nrow = k,
  ncol = length(pair_levels),
  dimnames = list(NULL, c("h_given_h", "h_given_t", "t_given_h", "t_given_t"))
)

# Split the flips by student once; split() keeps the row order within a student
flips_by_student <- split(my_data$coin_result, my_data$student_id, drop = TRUE)

# Set the number of students to sample
n_students <- length(flips_by_student)

# Loop over the number of iterations
for (i in seq_len(k)) {
  # Step 1: sample students with replacement (by position in the list)
  sampled_students <- sample.int(n_students, n_students, replace = TRUE)

  pair_counts <- integer(length(pair_levels))

  for (j in sampled_students) {
    flips <- flips_by_student[[j]]
    n_flips <- length(flips)

    # Step 2: random start position
    x <- sample.int(n_flips, 1)

    # A window starting on the last flip contains no pair
    if (x == n_flips) next

    # Step 3: random end position with y > x. `x + sample.int(...)` is used
    # because sample(a:b, 1) draws from 1:a when a == b, which could give
    # y < x and a reversed window.
    y <- x + sample.int(n_flips - x, 1)

    # Step 5: pairs are formed inside this window only, so two draws of the
    # same student never contribute a pair that spans both draws
    window <- flips[x:y]
    pairs <- paste0(window[-length(window)], window[-1])
    pair_counts <- pair_counts +
      as.vector(table(factor(pairs, levels = pair_levels)))
  }

  # A pair type that never occurred keeps the count 0 instead of breaking
  # the assignment with a shorter vector
  pair_share[i, ] <- pair_counts / sum(pair_counts)
}

results <- data.frame(iteration_number = seq_len(k), pair_share)

library(ggplot2)

# Reshape: one row per (iteration, condition) pair
results_long <- results %>%
  tidyr::pivot_longer(
    cols = c(h_given_h, h_given_t, t_given_h, t_given_t),
    names_to = "condition",
    values_to = "probability"
  )

# One line per condition across the iterations
ggplot(
  data = results_long,
  mapping = aes(x = iteration_number, y = probability, color = condition)
) +
  geom_line() +
  labs(color = "Condition", y = "Probability", x = "Iteration")

我的问题：虽然代码似乎已经运行，但我不确定我是否正确执行了此操作。有人可以帮我确认一下吗？

例如：HH 和 TT 的曲线不应该几乎相同吗？TH 和 HT 的曲线不也应该几乎相同吗？但在我的图表中，情况显然并非如此。在我看来，在某次迭代中，如果同一个学生在重新采样的数据集中出现 3 次，那么第一次抽取的最后一次抛掷会“泄漏”到第二次抽取的第一次抛掷，形成一个并不存在的转移，从而损害结果。

谢谢！

我相信通过重复模拟过程和采样过程，我们可以得到您想要的结果。下面我基于您编写的代码创建两个函数，分别用于模拟数据和对数据进行采样，以此来演示这一点。

模拟数据的函数

#' Simulate "sticky" coin flips for a class of students
#'
#' Every flip is assigned to a random student. The first flip of a student is
#' fair; every later flip repeats that student's previous result with
#' probability `p_repeat`.
#'
#' @param n_students Number of students; ids are `1..n_students`.
#' @param n_flips Total number of flips over all students.
#' @param p_repeat Probability that a flip repeats the previous result of the
#'   same student.
#' @return A tibble grouped by `student_id`, sorted by `student_id`, with the
#'   columns `student_id`, `coin_result` ("H" or "T") and `flip_number`
#'   (position of the flip within its student).
sim_data <- function(n_students, n_flips = 100000, p_repeat = 0.6) {
  stopifnot(n_students >= 1, n_flips >= 1, p_repeat >= 0, p_repeat <= 1)

  sides <- c("H", "T")

  # Sorting makes each student's flips contiguous
  student_id <- sort(sample.int(n_students, n_flips, replace = TRUE))

  # One result per flip. A shorter vector (the code used to allocate 1000
  # elements for 100000 flips) is silently recycled by data.frame().
  coin_result <- character(n_flips)
  coin_result[1] <- sample(sides, 1)

  # seq_len(n)[-1] is 2..n and is empty (rather than c(2, 1)) when n == 1
  for (i in seq_len(n_flips)[-1]) {
    if (student_id[i] != student_id[i - 1]) {
      # first flip of a new student does not depend on the previous row
      coin_result[i] <- sample(sides, 1)
    } else if (coin_result[i - 1] == "H") {
      coin_result[i] <- sample(sides, 1, prob = c(p_repeat, 1 - p_repeat))
    } else {
      coin_result[i] <- sample(sides, 1, prob = c(1 - p_repeat, p_repeat))
    }
  }

  data.frame(student_id, coin_result) %>%
    arrange(student_id) %>%
    group_by(student_id) %>%
    mutate(flip_number = row_number())
}

dt <- sim_data(100)

采样函数

#' Resample students and tabulate pairs of consecutive flips
#'
#' Draws `n_sample` students with replacement. For each draw a random window
#' `x..y` (with `y > x` whenever `x` is not the last flip) is cut out of that
#' student's flips, and the pairs of consecutive flips inside the window are
#' counted. Pairs are never formed across two different draws.
#'
#' @param my_data Data frame with the columns `student_id` and `coin_result`
#'   ("H"/"T"); the rows of each student must be in flip order.
#' @param n_sample Number of students to draw with replacement.
#' @return Named numeric vector `h_given_h`, `h_given_t`, `t_given_h`,
#'   `t_given_t` holding the share of the pairs HH, HT, TH and TT among all
#'   sampled pairs. Despite the names these are shares of all pairs (they sum
#'   to 1), not conditional probabilities. A pair type that was never drawn
#'   gets 0; if no pair was drawn at all every entry is NaN.
Sampling <- function(my_data, n_sample = 100) {
  stopifnot(n_sample >= 1, nrow(my_data) > 0)

  pair_levels <- c("HH", "HT", "TH", "TT")

  # Step 1
  # Split once instead of filtering the data frame for every draw; split()
  # keeps the row order within a student and works for any id values.
  flips_by_student <- split(my_data$coin_result, my_data$student_id,
                            drop = TRUE)
  sampled_students <- sample.int(length(flips_by_student), n_sample,
                                 replace = TRUE)

  pair_counts <- integer(length(pair_levels))

  for (j in sampled_students) {
    flips <- flips_by_student[[j]]
    n_flips <- length(flips)

    # Step 2
    x <- sample.int(n_flips, 1)

    # Edge case: x is the last flip, so the window holds no pair
    if (x == n_flips) next

    # Step 3
    # `x + sample.int(...)` is used because sample(a:b, 1) draws from 1:a
    # when a == b, which could give y < x and a reversed window.
    y <- x + sample.int(n_flips - x, 1)

    # Step 5: count the pairs of this window only
    window <- flips[x:y]
    pairs <- paste0(window[-length(window)], window[-1])
    pair_counts <- pair_counts +
      as.vector(table(factor(pairs, levels = pair_levels)))
  }

  prob <- pair_counts / sum(pair_counts)
  setNames(prob, c("h_given_h", "h_given_t", "t_given_h", "t_given_t"))
}

将两种方法各重复 k = 100 次并进行比较，我们得到以下结果：

n_replicate <- 100

# Method 1: resample the same simulated data set `dt` in every replicate.
# replicate() returns one column per replicate, so the matrix is transposed
# to get one row per replicate before the iteration counter is added.
results_1 <- rowid_to_column(
  as.data.frame(t(replicate(n_replicate, Sampling(dt)))),
  var = "iteration_number"
)

# Method 2: simulate a fresh data set for every replicate, then resample it
results_2 <- rowid_to_column(
  as.data.frame(t(replicate(n_replicate, Sampling(sim_data(100))))),
  var = "iteration_number"
)

 # Mean of every condition per sampling method, one column per method
 list("OP" = results_1, "Sample different data" = results_2) %>%
   bind_rows(.id = "sample_method") %>%
   tidyr::pivot_longer(
     cols = c(h_given_h, h_given_t, t_given_h, t_given_t),
     names_to = "condition",
     values_to = "probability"
   ) %>%
   summarise(prob = mean(probability), .by = c(sample_method, condition)) %>%
   pivot_wider(names_from = sample_method, values_from = prob)
#> A tibble: 4 × 3
#>  condition    OP `Sample different data`
#>  <chr>     <dbl>                   <dbl>
#> 1 h_given_h 0.283                   0.305
#> 2 h_given_t 0.203                   0.199
#> 3 t_given_h 0.203                   0.199
#> 4 t_given_t 0.311                   0.297

library(ggplot2)

  # One panel per sampling method; bind_rows() numbers the inputs "1" and "2",
  # and the labeller turns those ids into readable panel titles
  ggplot(
    data = bind_rows(results_1, results_2, .id = "sample_method") %>%
      tidyr::pivot_longer(
        cols = c(h_given_h, h_given_t, t_given_h, t_given_t),
        names_to = "condition",
        values_to = "probability"
      ),
    mapping = aes(x = iteration_number, y = probability, color = condition)
  ) +
    geom_line() +
    facet_wrap(
      ~sample_method,
      labeller = labeller(
        sample_method = c("1" = "OP", "2" = "Sample different data")
      )
    ) +
    labs(x = "Iteration", y = "Probability", color = "Condition")

Created on 2023-05-11 with reprex v2.0.2 (https://reprex.tidyverse.org)

编辑

条件概率的计算方式也不理想。要估计“已知上一次为 H 时，下一次仍为 H”的概率，最好的方法是：先数出结果为“H”且其后还有下一次抛掷的次数，再数出其中下一次仍为“H”的次数，二者相除，即 P(H|H) ≈ n(HH) / (n(HH) + n(HT))。您现在的计算方式估计的更接近组合 HH、HT 的联合概率 P(HH) 和 P(HT)，而不是条件概率 P(H|H) 和 P(T|H)。

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

r