BqExAvI Survey Analysis (2017-2022). Part II

Author

Modesto

Published

May 10, 2023

Show the code
knitr::opts_chunk$set(echo = TRUE)
#Load R libraries, installing any that are missing first
paquetes <- c("deeplr", "ggplot2", "BTM", "ggraph", "igraph", "dplyr", "akc", "tidytext", "udpipe", "textrank", "ggwordcloud", "ggpubr")
unavailable <- setdiff(paquetes, rownames(installed.packages()))
if (length(unavailable) > 0) install.packages(unavailable)
invisible(lapply(paquetes, library, character.only = TRUE, quietly = TRUE))

1 Contents and Disclaimer

This page covers the student surveys of the “Experimental Advanced Biochemistry I” course (Biochemistry Degree, Universidad Autónoma de Madrid). The description of the questionnaires and the analysis of the quantitative questions can be found on the main page here. The GitHub repo contains the original files for all analyses.

The data are made available under a Creative Commons license (CC BY-NC-ND 3.0 ES).

Preliminary

This is only a preliminary analysis. Contact modesto.redrejo@uam.es for more information.

2 Free-text questions

The list of questions is available in the companion report. The free-text questions (50, 51, 52, 75, 76, 77) are processed to identify the most common keywords and term co-occurrences. First, the answers are translated into English with the deeplr package and saved into a new table.

Show the code
#load the data
data <- read.csv("merged_data.csv")
#the DeepL key is read from an environment variable rather than hardcoded
#(the variable name DEEPL_AUTH_KEY is a convention, not fixed by the package)
auth_key <- Sys.getenv("DEEPL_AUTH_KEY")
#to deal with the deeplr character limit, each question is translated independently
Q50 <- toEnglish2(data$Q50, auth_key = auth_key)
Q51 <- toEnglish2(data$Q51, auth_key = auth_key)
Q52 <- toEnglish2(data$Q52, auth_key = auth_key)
Q75 <- toEnglish2(na.omit(data$Q75), auth_key = auth_key)
Q76 <- toEnglish2(na.omit(data$Q76), auth_key = auth_key)
Q77 <- toEnglish2(na.omit(data$Q77), auth_key = auth_key)
#subset the relevant columns and replace the Spanish answers with the English translations
data_en <- data[,c(2,52,53,54,77,78,79)]
data_en$Q50 <- Q50
data_en$Q51 <- Q51
data_en$Q52 <- Q52
data_en$Q75[which(!is.na(data_en$Q75))] <- Q75
data_en$Q76[which(!is.na(data_en$Q76))] <- Q76
data_en$Q77[which(!is.na(data_en$Q77))] <- Q77

write.csv(data_en,"survey_open_en.csv")
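Because the free DeepL API enforces a monthly character quota, it can be useful to check the remaining allowance before launching the translations. A minimal sketch with deeplr (assuming, as above, that the key is stored in the DEEPL_AUTH_KEY environment variable; usage2() is the free-API counterpart of usage()):

Show the code
#check the free-tier quota for the key: returns the character count and limit
usage2(auth_key = Sys.getenv("DEEPL_AUTH_KEY"))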

Then, we process the table with the akc and udpipe R packages. The code is largely based on an akc tutorial and the udpipe extended vignette.

Beforehand, and after some rounds of trial and error, we also performed some ad hoc word substitutions to unify terms (e.g., merging lab and laboratory, removing instructors’ names, etc.). We found that the combination of the new akc package and udpipe achieved the best automatic term extraction and lemmatization, with an improved selection of key terms and reduced noise (conjunctions and other filler words). However, since akc is designed for text network-based classification, which is beyond our scope, we decided to use the udpipe network plot instead. Each plot shows the top-40 strongest keyword co-occurrences (nouns and verbs).
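As a side note, the chain of gsub() calls below could be collapsed into a single loop over a named replacement vector, which keeps the substitution order explicit in one place. A minimal sketch (the repl vector is illustrative, not the full list used below):

Show the code
#sketch: ordered substitutions from a named vector (pattern -> replacement);
#order matters, e.g. the full name must be replaced before the first name
repl <- c("barbara acosta" = "assistants", "barbara" = "assistants",
          "laboratory" = "lab", "team" = "group")
for (p in names(repl)) {
  q_en[] <- lapply(q_en, gsub, pattern = p, replacement = repl[[p]])
}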

Show the code
q_en <- read.csv("survey_open_en.csv", row.names = NULL, stringsAsFactors = FALSE)

#naive unification of equivalent terms
q_en <- as.data.frame(sapply(q_en, tolower))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('&nbsp;', '', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('laboratory', 'lab', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('script', 'protocol', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('team', 'group', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('groupwork', 'group', x))) #'teamwork' became 'groupwork' above
q_en <- data.frame(apply(q_en, 2, function(x) gsub('learning', 'learn', x)))
#replace assistants' and instructors' names with generic terms
q_en <- data.frame(apply(q_en, 2, function(x) gsub('barbara acosta', 'assistants', x))) #full name first, so no residue is left
q_en <- data.frame(apply(q_en, 2, function(x) gsub('barbara', 'assistants', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('sandra', 'assistants', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('rojo', 'instructors', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub(' ana ', ' instructors ', x)))
#since "ana" occurs within other words, "ANA" and "Ana Rojo" were also changed to "rojo" manually
q_en <- data.frame(apply(q_en, 2, function(x) gsub('luis', 'instructors', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('juan', 'instructors', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('modesto', 'instructors', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('benilde', 'instructors', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub("\\\\", "", x))) #remove stray backslashes
q_en <- data.frame(apply(q_en, 2, function(x) gsub('oscar', 'instructors', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('teachers', 'instructors', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('internships', 'practices', x))) #plural first, to avoid 'practicess'
q_en <- data.frame(apply(q_en, 2, function(x) gsub('internship', 'practices', x)))
q_en <- data.frame(apply(q_en, 2, function(x) gsub('notebooks', 'notebook', x)))

#clean and extract keywords
#from https://hope-data-science.github.io/akc/articles/tutorial_raw_text.html
# https://cran.r-project.org/web/packages/akc/vignettes/akc_vignette.html

#model with udpipe & create the dictionary

#download the model the first time
#ud_model <- udpipe_download_model(language = "english")
#ud_model <- udpipe_load_model(ud_model$file_model)
#on subsequent runs, load the model from disk
ud_model <- udpipe_load_model("/Users/modesto/Documents/GitHub/bqexav/english-ewt-ud-2.5-191206.udpipe")
x <- list()
Q <- c("Q50","Q51","Q52","Q75","Q76","Q77")
#annotate each question column (columns 3 to 8 of q_en); only the POS tagger is needed
for (i in 1:6){
  x[[i]] <- as.data.frame(udpipe_annotate(ud_model, x = q_en[,i+2], tagger = "default", parser = "none"))
  names(x)[[i]] <- Q[i]
}
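#the annotation output has one row per token, with lemma and part-of-speech
#columns that can be inspected directly, e.g.:
#head(x[[1]][, c("doc_id", "token", "lemma", "upos")])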


# get stop words from `tidytext` package
stop_words %>%
    pull(word) %>%
    unique() -> my_stopword
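#optionally extend the list with domain-specific stopwords (hypothetical
#examples, not part of the original analysis):
#my_stopword <- c(my_stopword, "course", "subject")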

#make dictionaries
my_dict <- list()
for (i in 1:6){
  my_dict[[i]] <- make_dict(x[[i]]$lemma)
  #names(my_dict)[[i]] <- Q[i]
}

#extract keywords with akc
extracted_keywords <- list()
for (i in 1:6){
  q_en %>%
    keyword_extract(id = "X", text = colnames(q_en)[i+2],
                    dict = my_dict[[i]], stopword = my_stopword) -> extracted_keywords[[i]]
  names(extracted_keywords)[[i]] <- Q[i]
}

#clean the extracted keywords
clean_data <- list()
for (i in 1:6){
  extracted_keywords[[i]] %>%
    keyword_clean() -> clean_data[[i]]
  names(clean_data)[[i]] <- Q[i]
}
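#optional sanity check (not in the original workflow): inspect the most
#frequent keywords of one question before plotting, e.g. Q50:
#head(count(clean_data[[1]], keyword, sort = TRUE), 10)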



#visualize network as in udpipe vignette

#plot from akc extracted_keywords

questions <- read.csv("questions_final.csv", header = TRUE, sep = ",") #load the question texts
titles <- questions[c(50,51,52,75,76,77), 4] #question texts used as plot titles
stats <- list()
wordnetwork <- list()
plotkey <- list()
for (i in 1:6){
  stats[[i]] <- cooccurrence(x = clean_data[[i]], term = "keyword", group = "id")
  wordnetwork[[i]] <- head(stats[[i]], 40) #top-40 co-occurring keyword pairs
  wordnetwork[[i]] <- graph_from_data_frame(wordnetwork[[i]])
  plotkey[[i]] <- ggraph(wordnetwork[[i]], layout = "fr") +
    geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "indianred2") +
    geom_node_text(aes(label = name), col = "darkgreen", size = 7, repel = TRUE, box.padding = 0.06) +
    theme_graph(base_family = "Arial", plot_margin = margin(0, 30, 0, 30)) +
    theme(legend.position = "none", plot.margin = unit(c(0, 0, 2, 0), "cm")) +
    labs(title = titles[[i]], subtitle = "Co-occurrences of keywords within the same answer")
}
#show the plots
ggarrange(plotlist=plotkey,nrow=6,ncol=1)
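The combined panel can also be exported to a file; a minimal sketch with ggsave (the file name and dimensions are illustrative):

Show the code
#save the stacked panel; the height is generous because six networks are stacked
ggsave("keyword_networks.pdf", ggarrange(plotlist = plotkey, nrow = 6, ncol = 1),
       width = 8, height = 40, limitsize = FALSE)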

3 Answers per year

Now we split the plots per year. To ease interpretation, only the top-20 keyword co-occurrences are shown.

Show the code
#extract keywords per question and per year
extracted_keywords <- list()
indice <- matrix(1:36, byrow = TRUE, nrow = 6) #slot index: rows = questions, columns = years
for (i in 1:6){
  for (j in 1:6){
    tablita <- q_en[q_en$Curso == 2016+j,] #answers of year 2016+j
    if (any(is.na(tablita[,i+2]))) next #skip question/year combinations with missing answers
    tablita %>%
      keyword_extract(id = "X", text = colnames(tablita)[i+2],
                      dict = my_dict[[i]], stopword = my_stopword) -> extracted_keywords[[indice[i,j]]]
    names(extracted_keywords)[[indice[i,j]]] <- Q[i]
  }
}


#clean the extracted keywords
clean_data <- list()
for (i in 1:36){
  if (is.null(extracted_keywords[[i]])) next
  extracted_keywords[[i]] %>%
    keyword_clean() -> clean_data[[i]]
  names(clean_data)[[i]] <- Q[i]
}



#visualize

#plot from akc extracted_keywords

year <- rep(unique(q_en$Curso), 6) #year label for each question/year slot
stats <- list()
wordnetwork <- list()
plotkey <- list()
for (i in 1:36){
  if (is.null(clean_data[[i]])) next
  stats[[i]] <- cooccurrence(x = clean_data[[i]], term = "keyword", group = "id")
  wordnetwork[[i]] <- head(stats[[i]], 20) #top-20 co-occurring keyword pairs
  wordnetwork[[i]] <- graph_from_data_frame(wordnetwork[[i]])
  plotkey[[i]] <- ggraph(wordnetwork[[i]], layout = "fr") +
    geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "indianred2") +
    geom_node_text(aes(label = name), col = "darkgreen", size = 6, repel = TRUE, box.padding = 0.06) +
    theme_graph(base_family = "Arial", plot_margin = margin(0, 15, 0, 15)) +
    theme(legend.position = "none", plot.margin = unit(c(0, 0, 1, 0), "cm")) +
    labs(title = paste0(titles[[ceiling(i/6)]], " - ", year[i]), #ceiling(i/6) maps the slot to its question
         subtitle = "Co-occurrences of keywords within the same answer")
}
#show the plots, dropping the empty slots
plotyear <- Filter(Negate(is.null), plotkey)
ggarrange(plotlist = plotyear, nrow = length(plotyear), ncol = 1)
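Besides the networks, per-year keyword frequencies can be compared at a glance with a word cloud. A minimal sketch with ggwordcloud (already loaded above) for Q50 in 2022, whose slot in clean_data is indice[1,6]:

Show the code
#sketch: word cloud of the 30 most frequent Q50 keywords in 2022
clean_data[[indice[1,6]]] %>%
  count(keyword, sort = TRUE) %>%
  head(30) %>%
  ggplot(aes(label = keyword, size = n)) +
  geom_text_wordcloud() +
  theme_minimal()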

4 Quantitative questions

The quantitative questions are analyzed on the main page, as indicated in Section 1.

4.0.1 Session Info

Show the code
sessionInfo()
R version 4.2.2 (2022-10-31)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur ... 10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggpubr_0.6.0      ggwordcloud_0.5.0 textrank_0.3.1    udpipe_0.8.11    
 [5] tidytext_0.4.1    akc_0.9.9         dplyr_1.1.1       igraph_1.4.2     
 [9] ggraph_2.1.0      BTM_0.3.7         ggplot2_3.4.2     deeplr_2.0.0     

loaded via a namespace (and not attached):
 [1] viridis_0.6.2        httr_1.4.5           tidyr_1.3.0         
 [4] tidygraph_1.2.3      jsonlite_1.8.4       viridisLite_0.4.1   
 [7] carData_3.0-5        yaml_2.3.7           tidyfst_1.7.6       
[10] ggrepel_0.9.3        pillar_1.9.0         backports_1.4.1     
[13] lattice_0.21-8       glue_1.6.2           digest_0.6.31       
[16] ggsignif_0.6.4       polyclip_1.10-4      colorspace_2.1-0    
[19] cowplot_1.1.1        htmltools_0.5.5      Matrix_1.5-4        
[22] pkgconfig_2.0.3      textstem_0.1.4       broom_1.0.4         
[25] purrr_1.0.1          scales_1.2.1         tweenr_2.0.2        
[28] ggforce_0.4.1        tibble_3.2.1         car_3.1-2           
[31] generics_0.1.3       farver_2.1.1         withr_2.5.0         
[34] fst_0.9.8            cli_3.6.1            magrittr_2.0.3      
[37] evaluate_0.20        tokenizers_0.3.0     janeaustenr_1.0.0   
[40] fansi_1.0.4          SnowballC_0.7.0      MASS_7.3-58.3       
[43] rstatix_0.7.2        koRpus_0.13-8        tools_4.2.2         
[46] data.table_1.14.8    lifecycle_1.0.3      stringr_1.5.0       
[49] munsell_0.5.0        compiler_4.2.2       rlang_1.1.0         
[52] grid_4.2.2           fstcore_0.9.14       rstudioapi_0.14     
[55] htmlwidgets_1.6.2    labeling_0.4.2       rmarkdown_2.21      
[58] koRpus.lang.en_0.1-4 sylly_0.1-6          gtable_0.3.3        
[61] abind_1.4-5          graphlayouts_0.8.4   R6_2.5.1            
[64] gridExtra_2.3        sylly.en_0.1-3       knitr_1.42          
[67] fastmap_1.1.1        utf8_1.2.3           stringi_1.7.12      
[70] parallel_4.2.2       Rcpp_1.0.10          vctrs_0.6.1         
[73] png_0.1-8            tidyselect_1.2.0     xfun_0.38