###############
#Introduction to Linguistics, Fall 2020
#Psycholinguistics: The garden path effect
#Josh Weirick
#Last edit: 3/4/2021
###############

#load packages
library(tidyverse)
library(stringr)
library(readxl)
library(data.table)   #provides rbindlist(), used below

#Read in the results file
LING201_GP <- read_csv(file="LING201_GP_results.txt", comment="#", col_names = FALSE)

#check the resulting dataframe for problems
problems(LING201_GP)

#rename columns
LING201_GP <- rename(LING201_GP,
                     Time_results_received=X1, MD5_hash=X2, Controller=X3,
                     Item_number=X4, Element_number=X5, Item_code=X6, Group=X7,
                     Word_number=X8, Word=X9, Reading_time=X10, Newline=X11,
                     Sentence=X12)

#combine the columns Time_results_received and MD5_hash into a new column named par.ident.
#This will serve as the participant ID.
LING201_GP$par.ident <- paste(LING201_GP$Time_results_received, LING201_GP$MD5_hash)

#Remove the comprehension questions, practice items, and fillers, since we're not interested in those:
LING201_GP <- LING201_GP %>% filter(!str_detect(Controller, "Question"))
LING201_GP <- LING201_GP %>% filter(!str_detect(Item_code, "SPR_practice"))
LING201_GP <- LING201_GP %>% filter(!str_detect(Item_code, "SPRFiller_"))

#annotate for sentence type: a = garden path sentence, b = sentence containing overt 'that' (non-garden-path)
LING201_GP <- mutate(LING201_GP,
                     Sentence_type = case_when(str_detect(Item_code, "a$") ~ "a",
                                               str_detect(Item_code, "b$") ~ "b"))

#subset the data by the participant ID you just created, and store the resulting set of dataframes
#in a new variable called 'split'
split <- split(LING201_GP, LING201_GP$par.ident)

#save the data from each participant as a separate .csv file. These files will appear in your R working directory.
lapply(names(split), function(x){
  write_csv(split[[x]], file = paste(x, ".csv", sep = ""))
})

#In your R working directory, create a new folder called 'GardenPath'. Inside 'GardenPath', create a
#folder called 'Data'. Move the .csv files you just saved into the 'Data' folder, then compress the
#'Data' folder by saving it as a .zip file. On a Mac, this can be done by right-clicking the 'Data'
#folder and selecting 'Compress'.

#create a new variable called 'result': an empty list that will serve as a container for the output
#of the loop below (one element per participant file once the loop has run).
result <- list()
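
#Optional sanity check before running the loop (a sketch, not part of the original workflow): confirm
#that the zip archive contains one .csv per participant. This assumes the archive holds a top-level
#'Data/' folder, as described above; compressing on a Mac may also add hidden '__MACOSX' entries,
#which the pattern below ignores. The object names 'zip_contents' and 'data_csvs' are just placeholders.
zip_contents <- unzip("GardenPath/Data.zip", list = TRUE)$Name
data_csvs <- grep("^Data/[^/]+\\.csv$", zip_contents, value = TRUE)
length(data_csvs) == length(unique(LING201_GP$par.ident))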
unzip("GardenPath/Data.zip", exdir="GardenPath/") Lex.files <- dir( "GardenPath/Data", recursive=TRUE, full.names=TRUE) for (i in seq(Lex.files)) { cat(i/length(Lex.files), "\n") current.Lex.file <- read.table(Lex.files[i], header = TRUE, sep = ",", colClasses = "character", ) current.Lex.file_a <- current.Lex.file %>% filter(Sentence_type== "a") current.Lex.file_b <- current.Lex.file %>% filter(Sentence_type== "b") current.Lex.file_a$Region <- with(current.Lex.file_a, ave(Item_code, Item_code, FUN = seq_along)) current.Lex.file_b$Region <- with(current.Lex.file_b, ave(Item_code, Item_code, FUN = seq_along)) i3 <- which(current.Lex.file_b$Region %in% c(3)) i4 <- which(current.Lex.file_b$Region %in% c(4)) i5 <- which(current.Lex.file_b$Region %in% c(5)) i6 <- which(current.Lex.file_b$Region %in% c(6)) i7 <- which(current.Lex.file_b$Region %in% c(7)) i8 <- which(current.Lex.file_b$Region %in% c(8)) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i3, 2.5) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i4, 3) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i5, 4) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i6, 5) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i7, 6) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i8, 7) current.Lex.file <- rbind(current.Lex.file_b, current.Lex.file_a) result[[i]] <- current.Lex.file } #bind the resulting data frames together into a single datafram. test <- rbindlist(result) #Make sure the RT variable is numeric! test$Reading_time <- as.numeric(as.character(test$Reading_time)) #Find the mean and standard deviation for the reading times. summary(test$Reading_time) sd(test$Reading_time) #In self-paced reading studies, it is conventional to 'trim' the data to remove extremely high or low reading times. In this case, I suggest removing any observations with reading times greater than three standard deviations above the mean. For my results, that meant removing any observations with RTs above 1542.5848 ms. test <- filter(test, Reading_time < 1542.5848) #create a helper data frame containing the mean RTs grouped by sentence type and region. ST_region_mean <- test %>% group_by(Sentence_type, Region) %>% summarise(sd=sd(Reading_time, na.rm = TRUE), Reading_time = mean(Reading_time)) #visualize RT by region for HNPS sentences ggplot(ST_region_mean, aes(x=Region, y=Reading_time, group=Sentence_type))+ theme_classic()+ theme(text=element_text(size=16))+ #scale_x_discrete(labels=c("SV", "NP1", "NP2", "S1", "S2"))+ geom_line(aes(color=Sentence_type))+ geom_point(aes (color=Sentence_type))+ expand_limits(y=c(350,500)) #save the visualization ggsave("LING201_GP_figure1.png", dpi=300, height=6, width = 8, units="in")