Skip to main content
Logo image

Answering Questions with Data Introductory Statistics for Psychology Students

Section 13.1 Correlation GIFs

Note regression lines and confidence bands can be added using geom_smooth(method=lm, se=T)

Subsection 13.1.1 N=10, both variables drawn from a uniform distribution

Figure 13.1.1. Correlation between random deviates from uniform distribution with N=10
all_df<-data.frame()
for(sim in 1:10){
  North_pole <- runif(10,1,10)
  South_pole <- runif(10,1,10)
  t_df<-data.frame(simulation=rep(sim,10),
                                  North_pole,
                                  South_pole)
  all_df<-rbind(all_df,t_df)
}


ggplot(all_df,aes(x=North_pole,y=South_pole))+
  geom_point()+
  geom_smooth(method=lm, se=FALSE)+
  theme_classic()+
  transition_states(
    simulation,
    transition_length = 2,
    state_length = 1
  )+enter_fade() + 
  exit_shrink() +
  ease_aes('sine-in-out')

Subsection 13.1.2 Correlation between random deviates from uniform distribution across four sample sizes

N= 10,50,100,1000 All values sampled from a uniform distribution
Figure 13.1.2. Correlation across four sample sizes (N=10, 50, 100, 1000) from uniform distribution
all_df<-data.frame()
for(sim in 1:10){
  for(n in c(10,50,100,1000)){
  North_pole <- runif(n,1,10)
  South_pole <- runif(n,1,10)
  t_df<-data.frame(nsize=rep(n,n),
                   simulation=rep(sim,n),
                                  North_pole,
                                  South_pole)
  all_df<-rbind(all_df,t_df)
  }
}


ggplot(all_df,aes(x=North_pole,y=South_pole))+
  geom_point()+
  geom_smooth(method=lm, se=FALSE)+
  theme_classic()+
  facet_wrap(~nsize)+
  transition_states(
    simulation,
    transition_length = 2,
    state_length = 1
  )+enter_fade() + 
  exit_shrink() +
  ease_aes('sine-in-out')

Subsection 13.1.3 Correlation between random deviates from normal distribution across four sample sizes

N= 10,50,100,1000 All values sampled from the same normal distribution (mean=0, sd=1)
Figure 13.1.3. Correlation across four sample sizes (N=10, 50, 100, 1000) from normal distribution
all_df<-data.frame()
for(sim in 1:10){
  for(n in c(10,50,100,1000)){
  North_pole <- rnorm(n,0,1)
  South_pole <- rnorm(n,0,1)
  t_df<-data.frame(nsize=rep(n,n),
                   simulation=rep(sim,n),
                                  North_pole,
                                  South_pole)
  all_df<-rbind(all_df,t_df)
  }
}


ggplot(all_df,aes(x=North_pole,y=South_pole))+
  geom_point()+
  geom_smooth(method=lm, se=FALSE)+
  theme_classic()+
  facet_wrap(~nsize)+
  transition_states(
    simulation,
    transition_length = 2,
    state_length = 1
  )+enter_fade() + 
  exit_shrink() +
  ease_aes('sine-in-out')

Subsection 13.1.4 Correlation between X and Y variables that have a true correlation as a function of sample-size

Sample-sizes = 10, 50, 100, 1000. There is a true correlation between X and Y in this case, set at r=.5.
Figure 13.1.4. True correlation (r=.5) across four sample sizes
library(MASS)

all_df<-data.frame()
for(sim in 1:10){
  for(n in c(10,50,100,1000)){
  d <- mvrnorm(n = n, mu = c(0,0), Sigma = matrix(c(1,.5,.5,1), ncol = 2),
                       empirical = FALSE)
  t_df<-data.frame(nsize=rep(n,n),
                   simulation=rep(sim,n),
                   North_pole=d[,1],
                   South_pole=d[,2])
  all_df<-rbind(all_df,t_df)
  }
}

ggplot(all_df,aes(x=North_pole,y=South_pole))+
  geom_point()+
  geom_smooth(method=lm, se=FALSE)+
  theme_classic()+
  facet_wrap(~nsize)+
  transition_states(
    simulation,
    transition_length = 2,
    state_length = 1
  )+enter_fade() + 
  exit_shrink() +
  ease_aes('sine-in-out')

Subsection 13.1.5 Type I errors, sampling random deviates from normal distribution with regression lines

Four sample-sizes, N= 10, 50, 100, 1000. All samples drawn from normal distribution (u=0, sd=1). Shows only the samples where p<.05. This is only true for about 5% of all samples (shown here for first 10 Type I errors for each N). Thus, we see Type I errors (significant correlations when X and Y are both completely random).
Figure 13.1.5. Type I errors in correlation tests across four sample sizes
all_df<-data.frame()
for(n in c(10,50,100,1000)){
  count_sims<-0
  for(sim in 1:1000){
    North_pole <- rnorm(n,0,1)
    South_pole <- rnorm(n,0,1)
    if(cor.test(North_pole,South_pole)$p.value<.05){
      count_sims<-count_sims+1
    t_df<-data.frame(nsize=rep(n,n),
                     simulation=rep(count_sims,n),
                     North_pole,
                     South_pole)
    all_df<-rbind(all_df,t_df)
    
    if(count_sims==10){
      break
    }
    }
  }
}


ggplot(all_df,aes(x=North_pole,y=South_pole))+
  geom_point()+
  geom_smooth(method=lm, se=TRUE)+
  theme_classic()+
  facet_wrap(~nsize)+
  transition_states(
    simulation,
    transition_length = 2,
    state_length = 1
  )+enter_fade() + 
  exit_shrink() +
  ease_aes('sine-in-out')

Subsection 13.1.6 Cell-size and correlation

This simulation illustrates how the behavior of correlating two random normal samples as a function of cell-size. The sample-size is always set at N=10. For each panel, the simulation uses an increasing cell-size to estimate the mean for X and Y. When cell-size is 1, 10 X and Y values are drawn from the same normal (u=0, sd=1). When cell-size is 5, for each X,Y score in the plot, 5 samples were drawn from the same normal, and then the mean of the samples is plotted. The effect of cell-size shrinks the dot cloud, as both X and Y scores provide better estimates of the population mean = 0. Cell-size has no effect on the behavior of r, which swings around because sample-size N is small. These are all random, so there is always a 5% type I error rate (alpha =.05).
Figure 13.1.6. Cell-size and correlation with N=10, Cell-size = 1, 5, 10, 100
get_sampling_means<-function(m,sd,cell_size,s_size){
  save_means<-length(s_size)
  for(i in 1:s_size){
    save_means[i]<-mean(rnorm(cell_size,m,sd))
  }
  return(save_means)
}

all_df<-data.frame()
for(n in c(1,5,10,100)){
  count_sims<-0
  for(sim in 1:10){
    North_pole <- get_sampling_means(0,1,n,10)
    South_pole <- get_sampling_means(0,1,n,10)
      count_sims<-count_sims+1
      t_df<-data.frame(nsize=rep(n,10),
                       simulation=rep(count_sims,10),
                       North_pole,
                       South_pole)
      all_df<-rbind(all_df,t_df)
  }
}


ggplot(all_df,aes(x=North_pole,y=South_pole))+
  geom_point()+
  geom_smooth(method=lm, se=TRUE)+
  theme_classic()+
  facet_wrap(~nsize)+
  ggtitle("Random scatterplots, N=10, Cell-size = 1,5,10,100")+
  transition_states(
    simulation,
    transition_length = 2,
    state_length = 1
  )+enter_fade() + 
  exit_shrink() +
  ease_aes('sine-in-out')

Subsection 13.1.7 Regression

We look at how the residuals (error from points to line) behave as the regression lines moves above and below it’s true value. The total error associated with all of the red lines is represents by the grey area. This total error is smallest (minimized) when the black line overlaps with the blue regression line (the best fit line). The total error expands as the black line moves away from the regression. That’s why the regression line is the least wrong (best fit) line to skewer the data (according to least squares definition)
Figure 13.1.7. Regression line and residuals visualization
d <- mtcars
fit <- lm(mpg ~ hp, data = d)
d$predicted <- predict(fit)   # Save the predicted values
d$residuals <- residuals(fit) # Save the residual values

coefs<-coef(lm(mpg ~ hp, data = mtcars))
coefs[1]
coefs[2]

x<-d$hp
move_line<-c(seq(-6,6,.5),seq(6,-6,-.5))
total_error<-length(length(move_line))
cnt<-0
for(i in move_line){
  cnt<-cnt+1
  predicted_y <- coefs[2]*x + coefs[1]+i
  error_y <- (predicted_y-d$mpg)^2
  total_error[cnt]<-sqrt(sum(error_y)/32)
}

move_line_sims<-rep(move_line,each=32)
total_error_sims<-rep(total_error,each=32)
sims<-rep(1:50,each=32)

d<-d %>% slice(rep(row_number(), 50))

d<-cbind(d,sims,move_line_sims,total_error_sims)


anim<-ggplot(d, aes(x = hp, y = mpg, frame=sims)) +
  geom_smooth(method = "lm", se = FALSE, color = "lightblue") +  
  geom_abline(intercept = 30.09886+move_line_sims, slope = -0.06822828)+
  lims(x = c(0,400), y = c(-10,40))+
  geom_segment(aes(xend = hp, yend = predicted+move_line_sims, color="red"), alpha = .5) + 
  geom_point() +
  geom_ribbon(aes(ymin = predicted+move_line_sims - total_error_sims, ymax = predicted+move_line_sims + total_error_sims), fill = "lightgrey", alpha=.2)+ 
  theme_classic()+
  theme(legend.position="none")+
  xlab("X")+ylab("Y")+
  transition_manual(frames=sims)+
  enter_fade() + 
  exit_fade()+
  ease_aes('sine-in-out')

animate(anim,fps=5)