Sampling

Sampling data: comparing the sampling distribution of the proportion of red balls for different sample sizes.

library(tidyverse)
library(moderndive) #install before loading

Question: Modify the code for comparing different sample sizes from the virtual bowl.

Segment 1: sample size = 28 1a.)

# Draw 1150 replicate samples of 28 balls each from the virtual bowl.
# NOTE(review): `virtual_sample_28` (singular) is not referenced again in the
# visible source; the later steps use `virtual_samples_28`.
virtual_sample_28 <- rep_sample_n(bowl, size = 28, reps = 1150)
virtual_sample_28
# A tibble: 32,200 x 3
# Groups:   replicate [1,150]
   replicate ball_ID color
       <int>   <int> <chr>
 1         1    2372 white
 2         1     991 white
 3         1    1475 white
 4         1    2341 white
 5         1     256 white
 6         1    1460 red  
 7         1    1026 white
 8         1     763 white
 9         1    1336 white
10         1    1630 white
# ... with 32,190 more rows
# 1150 replicate samples, each of 28 balls, drawn from `bowl`;
# the result carries a `replicate` column identifying each sample.
virtual_samples_28 <- rep_sample_n(bowl, size = 28, reps = 1150)
virtual_samples_28
# A tibble: 32,200 x 3
# Groups:   replicate [1,150]
   replicate ball_ID color
       <int>   <int> <chr>
 1         1    2229 white
 2         1    1146 white
 3         1    2266 red  
 4         1    2350 white
 5         1    1504 red  
 6         1    1377 white
 7         1    1851 red  
 8         1     576 red  
 9         1     829 red  
10         1    2206 red  
# ... with 32,190 more rows
1b.)
# For each replicate: count the red balls, then divide by the sample
# size (28) to get the proportion red in that sample.
virtual_prop_red_28 <- virtual_samples_28 %>%
  group_by(replicate) %>%
  summarize(
    red = sum(color == "red"),
    prop_red = red / 28
  )
virtual_prop_red_28
# A tibble: 1,150 x 3
   replicate   red prop_red
 *     <int> <int>    <dbl>
 1         1    11    0.393
 2         2    17    0.607
 3         3     7    0.25 
 4         4     8    0.286
 5         5     9    0.321
 6         6    15    0.536
 7         7    12    0.429
 8         8     9    0.321
 9         9     8    0.286
10        10    11    0.393
# ... with 1,140 more rows
1c.)
# Histogram of the 1150 sample proportions for n = 28.
# binwidth = 0.05 with a bin edge anchored at 0.4; white bar outlines.
# The x-axis label previously ended at "that were"; "red" is restored so it
# matches the labels used for the 53- and 118-ball plots.
ggplot(virtual_prop_red_28, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
  labs(x = "Proportion of 28 balls that were red", title = "28")

Segment 2: sample size = 53

2a.)

# 1150 replicate samples, each of 53 balls, drawn from `bowl`.
virtual_samples_53 <- rep_sample_n(bowl, size = 53, reps = 1150)

2b.) Compute the resulting replicates of the proportion red

# For each replicate: count the red balls, then divide by the sample
# size (53) to get the proportion red in that sample.
virtual_prop_red_53 <- virtual_samples_53 %>%
  group_by(replicate) %>%
  summarize(
    red = sum(color == "red"),
    prop_red = red / 53
  )
virtual_prop_red_53
# A tibble: 1,150 x 3
   replicate   red prop_red
 *     <int> <int>    <dbl>
 1         1    21    0.396
 2         2    23    0.434
 3         3    23    0.434
 4         4    12    0.226
 5         5    23    0.434
 6         6    23    0.434
 7         7    28    0.528
 8         8    25    0.472
 9         9    18    0.340
10        10    14    0.264
# ... with 1,140 more rows
FALSE
[1] FALSE

2c.) Plot the distribution of virtual_prop_red_53 via a histogram

# Histogram of the 1150 sample proportions for n = 53.
# Same binning as the other sample sizes (width 0.05, bin edge at 0.4)
# so the three plots can be compared side by side.
ggplot(data = virtual_prop_red_53, mapping = aes(x = prop_red)) +
  geom_histogram(
    binwidth = 0.05,
    boundary = 0.4,
    color = "white"
  ) +
  labs(
    x = "Proportion of 53 balls that were red",
    title = "53"
  )

Segment 3: sample size = 118 3a.)

# 1150 replicate samples, each of 118 balls, drawn from `bowl`.
virtual_samples_118 <- rep_sample_n(bowl, size = 118, reps = 1150)

3b.) Compute the resulting replicates of the proportion red

# For each replicate: count the red balls, then divide by the sample
# size (118) to get the proportion red in that sample.
virtual_prop_red_118 <- virtual_samples_118 %>%
  group_by(replicate) %>%
  summarize(
    red = sum(color == "red"),
    prop_red = red / 118
  )

3c.)

# Histogram of the 1150 sample proportions for n = 118.
# Same binning as the other sample sizes (width 0.05, bin edge at 0.4)
# so the three plots can be compared side by side.
ggplot(data = virtual_prop_red_118, mapping = aes(x = prop_red)) +
  geom_histogram(
    binwidth = 0.05,
    boundary = 0.4,
    color = "white"
  ) +
  labs(
    x = "Proportion of 118 balls that were red",
    title = "118"
  )

# Spread of the sample proportions for n = 28: standard deviation of
# `prop_red` across all replicates.
summarize(virtual_prop_red_28, sd = sd(prop_red))
# A tibble: 1 x 1
      sd
   <dbl>
1 0.0912
# Spread of the sample proportions for n = 53: standard deviation of
# `prop_red` across all replicates.
summarize(virtual_prop_red_53, sd = sd(prop_red))
# A tibble: 1 x 1
      sd
   <dbl>
1 0.0632
# Spread of the sample proportions for n = 118: standard deviation of
# `prop_red` across all replicates.
summarize(virtual_prop_red_118, sd = sd(prop_red))
# A tibble: 1 x 1
      sd
   <dbl>
1 0.0422