###############
#Introduction to Linguistics, Fall 2020
#Psycholinguistics: The garden path effect
#Josh Weirick
#Last edit: 3/4/2021
###############

#load packages
library(tidyverse)
library(stringr)
library(readxl)
library(data.table)   #provides rbindlist(), used below

#Read in the results file
LING201_GP <- read_csv(file="LING201_GP_results.txt", comment="#", col_names = FALSE)

#check the resulting dataframe for problems
problems(LING201_GP)

#rename columns
LING201_GP <- rename(LING201_GP,
                     Time_results_received=X1, MD5_hash=X2, Controller=X3,
                     Item_number=X4, Element_number=X5, Item_code=X6, Group=X7,
                     Word_number=X8, Word=X9, Reading_time=X10, Newline=X11,
                     Sentence=X12)

#combine the columns Time_results_received and MD5_hash into a new column named par.ident.
#This will serve as the participant ID.
LING201_GP$par.ident <- paste(LING201_GP$Time_results_received, LING201_GP$MD5_hash)

#Remove the comprehension questions, practice items, and fillers, since we're not interested in those:
LING201_GP <- LING201_GP %>% filter(!str_detect(Controller, "Question"))
LING201_GP <- LING201_GP %>% filter(!str_detect(Item_code, "SPR_practice"))
LING201_GP <- LING201_GP %>% filter(!str_detect(Item_code, "SPRFiller_"))

#annotate for sentence type: a = garden path sentence, b = sentence containing overt 'that' (non-garden-path)
LING201_GP <- mutate(LING201_GP,
                     Sentence_type = case_when(str_detect(Item_code, "a$") ~ "a",
                                               str_detect(Item_code, "b$") ~ "b"))

#subset the data by the participant ID you just created, and store the resulting set of dataframes
#in a new variable called 'split'
split <- split(LING201_GP, LING201_GP$par.ident)

#save the data from each participant as a separate .csv file. These files will appear in your R working directory.
lapply(names(split), function(x){
  write_csv(split[[x]], file = paste(x, ".csv", sep = ""))
})

#In your R working directory, create a new folder called 'GardenPath'. Inside 'GardenPath', create a
#folder called 'Data'. Move the .csv files you just saved into the 'Data' folder, then compress the
#'Data' folder by saving it as a .zip file. On a Mac, this can be done by right-clicking the 'Data'
#folder and selecting 'Compress'.

#create a new variable called 'result': an empty list that will serve as a container for the output
#of the loop below (one element per participant file once the loop has run).
result <- list()
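
#Optional sanity check before running the loop (a sketch, not part of the original workflow): confirm
#that the zip archive contains one .csv per participant. This assumes the archive holds a top-level
#'Data/' folder, as described above; compressing on a Mac may also add hidden '__MACOSX' entries,
#which the pattern below ignores. The object names 'zip_contents' and 'data_csvs' are just placeholders.
zip_contents <- unzip("GardenPath/Data.zip", list = TRUE)$Name
data_csvs <- grep("^Data/[^/]+\\.csv$", zip_contents, value = TRUE)
length(data_csvs) == length(unique(LING201_GP$par.ident))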
unzip("GardenPath/Data.zip", exdir="GardenPath/") Lex.files <- dir( "GardenPath/Data", recursive=TRUE, full.names=TRUE) for (i in seq(Lex.files)) { cat(i/length(Lex.files), "\n") current.Lex.file <- read.table(Lex.files[i], header = TRUE, sep = ",", colClasses = "character", ) current.Lex.file_a <- current.Lex.file %>% filter(Sentence_type== "a") current.Lex.file_b <- current.Lex.file %>% filter(Sentence_type== "b") current.Lex.file_a$Region <- with(current.Lex.file_a, ave(Item_code, Item_code, FUN = seq_along)) current.Lex.file_b$Region <- with(current.Lex.file_b, ave(Item_code, Item_code, FUN = seq_along)) i3 <- which(current.Lex.file_b$Region %in% c(3)) i4 <- which(current.Lex.file_b$Region %in% c(4)) i5 <- which(current.Lex.file_b$Region %in% c(5)) i6 <- which(current.Lex.file_b$Region %in% c(6)) i7 <- which(current.Lex.file_b$Region %in% c(7)) i8 <- which(current.Lex.file_b$Region %in% c(8)) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i3, 2.5) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i4, 3) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i5, 4) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i6, 5) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i7, 6) current.Lex.file_b$Region <- replace(current.Lex.file_b$Region, i8, 7) current.Lex.file <- rbind(current.Lex.file_b, current.Lex.file_a) result[[i]] <- current.Lex.file } #bind the resulting data frames together into a single datafram. test <- rbindlist(result) #Make sure the RT variable is numeric! test$Reading_time <- as.numeric(as.character(test$Reading_time)) #Find the mean and standard deviation for the reading times. summary(test$Reading_time) sd(test$Reading_time) #In self-paced reading studies, it is conventional to 'trim' the data to remove extremely high or low reading times. In this case, I suggest removing any observations with reading times greater than three standard deviations above the mean. For my results, that meant removing any observations with RTs above 1542.5848 ms. test <- filter(test, Reading_time < 1542.5848) #create a helper data frame containing the mean RTs grouped by sentence type and region. ST_region_mean <- test %>% group_by(Sentence_type, Region) %>% summarise(sd=sd(Reading_time, na.rm = TRUE), Reading_time = mean(Reading_time)) #visualize RT by region for HNPS sentences ggplot(ST_region_mean, aes(x=Region, y=Reading_time, group=Sentence_type))+ theme_classic()+ theme(text=element_text(size=16))+ #scale_x_discrete(labels=c("SV", "NP1", "NP2", "S1", "S2"))+ geom_line(aes(color=Sentence_type))+ geom_point(aes (color=Sentence_type))+ expand_limits(y=c(350,500)) #save the visualization ggsave("LING201_GP_figure1.png", dpi=300, height=6, width = 8, units="in")