library(ggplot2)
library(lme4)
## Loading required package: Matrix
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(car)
## Loading required package: carData
library(sjPlot)
library(emmeans)
## Warning: package 'emmeans' was built under R version 4.4.3
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(lattice)
library(irr)
## Loading required package: lpSolve
library(cvms)
library(epiR)
## Loading required package: survival
## Package epiR 2.0.84 is loaded
## Type help(epi.about) for summary information
## Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
## 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# NOTE: The file contains analyses for both the raw annotator data and the data after the adjudication phase. To get access to pre- and post-adjudication data, set the appropriate flag "use_adjudicated_data" below.
# FALSE = raw annotator data; TRUE = post-adjudication ("_corrected") data.
use_adjudicated_data <- FALSE
# use_adjudicated_data <- TRUE

Inter-annotator agreement on the yes/no judgment (if there is an error, then goodSentence is 1, else it is 0). Here, errorType is discarded.

For monosemes, the data comes in three files, representing the three groups of annotators. We combine them into a single file; each pair of ck, et, rb becomes anno1 and anno2

# Base directory of the annotation CSV files.
# NOTE(review): hard-coded absolute user path — must be adapted before
# running this script on another machine.
path="/Users/zinn/Projects/EKUT/GermaNet/GermaNet_LLM/R_analyses/annotations/"

# Column names shared by the three wide data frames below; the last two
# columns are the two annotators of the respective group, renamed to the
# neutral anno1/anno2 so that the three groups can be stacked later.
mono_yn_cols <- c("id", "type", "numberWordSenses", "lemma", "frequency",
                  "hypernym", "frequencyHypernym", "semanticField", "model",
                  "sentenceLength", "sentenceStringLength", "anno1", "anno2")

# Read one monoseme annotation file (the adjudicated "_corrected" version
# when use_adjudicated_data is TRUE), drop errorType (yes/no case only),
# and reshape to one row per item with one column per annotator.
# NOTE: spread() is superseded by pivot_wider(), but is kept here because
# the positional rename via mono_yn_cols relies on spread()'s column order.
read_mono_yn <- function(stem) {
  fname <- if (use_adjudicated_data) {
    paste0(stem, "_corrected.csv")
  } else {
    paste0(stem, ".csv")
  }
  d <- read.table(paste0(path, fname), header = TRUE, sep = ";")
  message("Loaded ", fname)
  d$errorType <- NULL # yes/no case
  d_wide <- spread(d, annotator, value = goodSentence)
  colnames(d_wide) <- mono_yn_cols
  d_wide
}

# first file
data1_wide <- read_mono_yn("monosemes_x147")
## Loaded monosemes_x147.csv

# second file
data2_wide <- read_mono_yn("monosemes_x258")
## Loaded monosemes_x258.csv

# third file
data3_wide <- read_mono_yn("monosemes_x369")
## Loaded monosemes_x369.csv


# putting all three together for IAA computation
data_wide_mono_yn = rbind(data1_wide, data2_wide, data3_wide)
nrow(data1_wide)
## [1] 297
nrow(data2_wide)
## [1] 297
nrow(data3_wide)
## [1] 306
nrow(data_wide_mono_yn)
## [1] 900
# for the uncorrected data, we should obtain 900 rows, for the adjudicated data 891 (the Lemma "Zertifizierung" with its 9 example sentences was removed)

# new column accuracy: if annotators agree 1, or not 0
data_wide_mono_yn$accuracy = ifelse(data_wide_mono_yn$anno1== data_wide_mono_yn$anno2, 1, 0)
# arithmetic mean of all zero and ones (observed agreement)
mean(data_wide_mono_yn$accuracy)
## [1] 0.8422222
# getting column 12 and 13 for yes/no
relcols =data_wide_mono_yn[, 12:13]
head(relcols)
##   anno1 anno2
## 1   yes   yes
## 2    no    no
## 3    no    no
## 4    no    no
## 5   yes    no
## 6    no    no
# get the kappa, should be higher than with the kappa on error codes
# Cohen's kappa corrects the observed agreement (0.842) for chance agreement.
kappa2(relcols)
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 900 
##    Raters = 2 
##     Kappa = 0.494 
## 
##         z = 14.8 
##   p-value = 0
# rater bias: do the two annotators use the categories at different rates?
rater.bias(relcols)
##  Rater bias coefficient
## 
##  Subjects = 900 
##    Raters = 2 
##     Ratio = 0.514 
## 
##  Chisq(1) = 0.113 
##   p-value = 0.737
# RAW: Across 900 subjects rated by two raters, the estimated bias ratio (0.514) suggests that one rater assigned the target category about half as often as the other, but this difference was not statistically significant (χ²(1) = 0.113, p = 0.737). Thus, there is no evidence of systematic bias between the two raters.

# ADJ: Across 891 subjects rated by two raters, the estimated bias ratio (0.49) suggests that one rater used the category about half as often as the other. However, this difference was not statistically significant (χ²(1) = 0.0385, p = 0.845), indicating no evidence of systematic bias between the raters.


# get confusion matrix (yes/no values)
cm_yesno <- as.data.frame(table(data_wide_mono_yn$anno1, data_wide_mono_yn$anno2))
colnames(cm_yesno) <- c("anno1", "anno2", "N")

# confusion matrix plot without the row/column percentage annotations
p <- plot_confusion_matrix(
  cm_yesno,
  target_col = "anno1",
  prediction_col = "anno2",
  counts_col = "N",
  add_row_percentages = FALSE,
  add_col_percentages = FALSE
)
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'rsvg' is missing. Will not plot arrows and zero-shading.
# enlarge the count labels inside the tiles
for (i in seq_along(p$layers)) {
  if (inherits(p$layers[[i]]$geom, "GeomText")) {
    p$layers[[i]]$aes_params$size <- 12
  }
}

p_final <- p +
  xlab("Annotator 2") +
  ylab("Annotator 1") +
  theme_classic(base_size = 32)
p_final

# pass the plot object explicitly instead of relying on the last displayed
# plot, and drop the redundant "./" from the output path
ggsave(
  paste0(path, "results/cm_monosemes_yn.png"),
  plot = p_final,
  width = 15,
  height = 15,
  dpi = 300
)

# Using a second library for kappa computation, but mainly to also get PABAK values. Again, for yes/no judgement
##### EPI package working on good Sentence (yes/no)
rater1 <- data_wide_mono_yn$anno1
rater2 <- data_wide_mono_yn$anno2
confusion_matrix_epi <- table(rater1, rater2) 

kappa_result_epi <- epi.kappa(confusion_matrix_epi, method = "cohen")
cohen_kappa_epi <- kappa_result_epi$kappa$est
# Extract Cohen's kappa values and CI
cohen_kappa_epi_lower <- kappa_result_epi$kappa$lower
cohen_kappa_epi_upper <- kappa_result_epi$kappa$upper

# Extract PABAK (prevalence- and bias-adjusted kappa) values and CI
pabak <- kappa_result_epi$pabak$est
pabak_lower <- kappa_result_epi$pabak$lower
pabak_upper <- kappa_result_epi$pabak$upper

# Summary table: both agreement coefficients with their confidence intervals
kappa_epi_df <- data.frame(
  Metric = c("Cohen's Kappa", "PABAK"),
  Value = c(cohen_kappa_epi, pabak),
  Lower_CI = c(cohen_kappa_epi_lower, pabak_lower),
  Upper_CI = c(cohen_kappa_epi_upper, pabak_upper)
)

# Display the data frame
print(kappa_epi_df)
##          Metric     Value  Lower_CI  Upper_CI
## 1 Cohen's Kappa 0.4941739 0.4178222 0.5705256
## 2         PABAK 0.6844444 0.6334747 0.7308849
# log-transform and z-scale the lemma frequency (GLM predictor below)
data_wide_mono_yn$logfreq=log(data_wide_mono_yn$frequency)
data_wide_mono_yn$scalelogfreq=scale(data_wide_mono_yn$logfreq)

# +1 guards against log(0) for hypernyms with zero frequency
data_wide_mono_yn$logFrequencyHypernym = log(data_wide_mono_yn$frequencyHypernym+1)
data_wide_mono_yn$scaleloghyper = scale(data_wide_mono_yn$logFrequencyHypernym)

data_wide_mono_yn$scaleSentenceLength = scale(data_wide_mono_yn$sentenceLength)
data_wide_mono_yn$scaleSentenceStringLength = scale(data_wide_mono_yn$sentenceStringLength)

# accuracy as factor, as required for the binomial GLM below
data_wide_mono_yn$accuracy = as.factor(data_wide_mono_yn$accuracy)

# 0.842 1s vs. 0.157 0s
prop.table(table(data_wide_mono_yn$accuracy))
## 
##         0         1 
## 0.1577778 0.8422222
# the higher the frequency of the lemma, the more agreement between coders
# adj: 0.0122
# accuracy is a Bernoulli variable (0 or 1), which is a special case of the
# binomial distribution with n = 1; hence family = "binomial" below.

# logistic regression: does (scaled log) lemma frequency predict agreement?
accuracy.glm=glm(accuracy ~ scalelogfreq, data=data_wide_mono_yn, family="binomial")
Anova(accuracy.glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: accuracy
##              LR Chisq Df Pr(>Chisq)   
## scalelogfreq   7.4323  1   0.006406 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(accuracy.glm)
## 
## Call:
## glm(formula = accuracy ~ scalelogfreq, family = "binomial", data = data_wide_mono_yn)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.69493    0.09299   18.23  < 2e-16 ***
## scalelogfreq  0.24098    0.08731    2.76  0.00578 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 784.74  on 899  degrees of freedom
## Residual deviance: 777.31  on 898  degrees of freedom
## AIC: 781.31
## 
## Number of Fisher Scoring iterations: 4
# add the fitted probabilities from the logistic model
data_wide_mono_yn <- data_wide_mono_yn %>%
  mutate(predicted = predict(accuracy.glm, type = "response"))

# visualise the fitted relation; `linewidth` replaces the `size` aesthetic
# for lines, which is deprecated since ggplot2 3.4.0 (the original run
# emitted the corresponding deprecation warning)
ggplot(data_wide_mono_yn, aes(x=scalelogfreq, y=accuracy)) +
  geom_point(alpha=0.3) +
  geom_line(aes(y=predicted), color="blue", linewidth=1.2) +
  labs(title="Effect of scaled log frequency on accuracy",
       y="Predicted probability of accuracy",
       x="Scaled log frequency")

# but no effect for the frequency of the hypernym
accuracy.glm=glm(accuracy ~ scaleloghyper, data=data_wide_mono_yn, family="binomial")
Anova(accuracy.glm, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: accuracy
##               LR Chisq Df Pr(>Chisq)
## scaleloghyper 0.015603  1     0.9006
# sanity check: both scaled predictors span comparable ranges
range(data_wide_mono_yn$scalelogfreq)
## [1] -3.010601  1.423661
range(data_wide_mono_yn$scaleloghyper)
## [1] -2.443225  1.384123

IAA on error types (goodSentence is discarded). We re-read all the data.

# Re-read the three monoseme files; this time goodSentence is dropped and
# the error-type annotation is kept. The adjudicated "_corrected" version
# is chosen when use_adjudicated_data is TRUE.
read_mono_et <- function(stem) {
  fname <- if (use_adjudicated_data) {
    paste0(stem, "_corrected.csv")
  } else {
    paste0(stem, ".csv")
  }
  d <- read.table(paste0(path, fname), header = TRUE, sep = ";")
  message("Loaded ", fname)
  d$goodSentence <- NULL # going for error types
  d_wide <- spread(d, annotator, value = errorType)
  # last two columns are the two annotators of the respective group
  colnames(d_wide) <- c("id", "type", "numberWordSenses", "lemma",
                        "frequency", "hypernym", "frequencyHypernym",
                        "semanticField", "model", "sentenceLength",
                        "sentenceStringLength", "anno1", "anno2")
  d_wide
}

# first file
data1_wide <- read_mono_et("monosemes_x147")
## Loaded monosemes_x147.csv

# second file
data2_wide <- read_mono_et("monosemes_x258")
## Loaded monosemes_x258.csv

# third file
data3_wide <- read_mono_et("monosemes_x369")
## Loaded monosemes_x369.csv

# binding them together
data_wide_mono_et = rbind(data1_wide, data2_wide, data3_wide)
nrow(data_wide_mono_et)
## [1] 900
colnames(data_wide_mono_et)
##  [1] "id"                   "type"                 "numberWordSenses"    
##  [4] "lemma"                "frequency"            "hypernym"            
##  [7] "frequencyHypernym"    "semanticField"        "model"               
## [10] "sentenceLength"       "sentenceStringLength" "anno1"               
## [13] "anno2"
# adding accuracy: 1 if both annotators assigned the same error type
data_wide_mono_et$accuracy = ifelse(data_wide_mono_et$anno1== data_wide_mono_et$anno2, 1, 0)

# error codes that mark "some error was annotated"
mono_error_codes <- c("FL", "FA", "FO", "FG", "nn", "ni", "nt", "ha")

# map all error types, if existent, to 1; %in% replaces the long ==/| chain
# (and, unlike the chain, yields 0 for NA instead of propagating NA)
data_wide_mono_et$anno1_cat <- ifelse(data_wide_mono_et$anno1 %in% mono_error_codes, 1, 0)

data_wide_mono_et$anno2_cat <- ifelse(data_wide_mono_et$anno2 %in% mono_error_codes, 1, 0)

# agreement on the coarse error/no-error distinction
data_wide_mono_et$accuracyCat <- ifelse(data_wide_mono_et$anno1_cat == data_wide_mono_et$anno2_cat, 1, 0)

# exact agreement on error types vs. coarse error/no-error agreement
mean(data_wide_mono_et$accuracy)
## [1] 0.8033333
mean(data_wide_mono_et$accuracyCat)
## [1] 0.8422222
# confusion matrix over all error-type pairs (including the "0" = no error code)
cm_errorTypes=as.data.frame(table(data_wide_mono_et$anno1, data_wide_mono_et$anno2))
nrow(cm_errorTypes)
## [1] 72
colnames(cm_errorTypes) = c("anno1", "anno2", "N")

plot_confusion_matrix(cm_errorTypes, target_col= "anno1", prediction_col="anno2", counts_col = "N" )+xlab("Annotator 2")+ylab("Annotator 1")+theme(text= element_text(size= 14))
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'rsvg' is missing. Will not plot arrows and zero-shading.

ggsave(paste0(path, "./results/cm_monosemes_ynerrortypes.png"), width= 15, height=15)

#--- CONFUSION MATRIX WITH NO DATA ONLY
# let us focus on the NO data, to make the confusion matrix more readable
# (keep only items where both annotators assigned some error code != "0")
data_wide_mono_et_no = data_wide_mono_et[data_wide_mono_et$anno1!= "0" & data_wide_mono_et$anno2 != "0",]
nrow(data_wide_mono_et_no)
## [1] 103
cm_no=as.data.frame(table(data_wide_mono_et_no$anno1, data_wide_mono_et_no$anno2))
colnames(cm_no) = c("anno1", "anno2", "N")

plot_confusion_matrix(cm_no, target_col= "anno1", prediction_col="anno2", counts_col = "N" )+xlab("Annotator 2")+ylab("Annotator 1")+theme(text= element_text(size= 14))
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'rsvg' is missing. Will not plot arrows and zero-shading.

ggsave(paste0(path, "./results/cm_monosemes_errortypes_only.png"), width= 15, height=15)

# same matrix again, without row/column percentage annotations
plot_confusion_matrix(cm_no, add_row_percentages = FALSE, add_col_percentages = FALSE, target_col= "anno1", prediction_col="anno2", counts_col = "N" )+xlab("Annotator 2")+ylab("Annotator 1")+theme(text= element_text(size= 14))
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'rsvg' is missing. Will not plot arrows and zero-shading.

ggsave(paste0(path, "./results/cm_monosemes_errortypes_only_no_percentages.png"), width= 15, height=15)
#-----
#-----

# select the two annotator columns by name (more robust than the
# positional index 12:13 used previously)
refcols_mono_et <- data_wide_mono_et[, c("anno1", "anno2")]
kappa2(refcols_mono_et)
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 900 
##    Raters = 2 
##     Kappa = 0.422 
## 
##         z = 20.6 
##   p-value = 0
rater.bias(refcols_mono_et)
##  Rater bias coefficient
## 
##  Subjects = 900 
##    Raters = 2 
##     Ratio = 0.415 
## 
##  Chisq(1) = 5.58 
##   p-value = 0.0181
#RAW: The two raters show a systematic bias in their use of categories.
#Specifically, one rater assigns the target category about 2.4× less often than the other
#(1 / 0.415 \approx 2.41), and this difference is unlikely to be due to chance
#(χ²(1) = 5.58, p = 0.0181).

#ADJ: Across 891 subjects rated by two raters, the estimated bias ratio (0.516) indicates that one rater applied the category slightly less often than the other. However, this difference was not statistically significant (χ²(1) = 0.125, p = 0.724), showing no evidence of systematic bias between the raters.


# do the EPI/PABAK statistics on error codes
rater1 <- data_wide_mono_et$anno1
rater2 <- data_wide_mono_et$anno2
# rater2 never assigns "FO", so a plain table() would not be square.
# Build the table over the union of observed codes via common factor
# levels instead of overwriting the first observation of both raters with
# "FO" (the previous workaround corrupted one data point and slightly
# biased the statistics; the recorded output below still reflects that
# workaround).
all_codes <- sort(union(unique(rater1), unique(rater2)))
confusion_matrix_epi <- table(factor(rater1, levels = all_codes),
                              factor(rater2, levels = all_codes))

kappa_result_epi <- epi.kappa(confusion_matrix_epi, method = "cohen")
kappa_result_epi
## $prop.agree
##         obs       exp
## 1 0.8033333 0.6578309
## 
## $pabak
##         est     lower     upper
## 1 0.6066667 0.5516685 0.6576599
## 
## $kappa
##         est         se     lower     upper
## 1 0.4252355 0.03872141 0.3493429 0.5011281
## 
## $z
##   test.statistic      p.value
## 1       10.98192 4.668844e-28
cohen_kappa_epi <- kappa_result_epi$kappa$est
# Extract Cohen's kappa values and CI
cohen_kappa_epi_lower <- kappa_result_epi$kappa$lower
cohen_kappa_epi_upper <- kappa_result_epi$kappa$upper

# Extract PABAK values and CI
pabak <- kappa_result_epi$pabak$est
pabak_lower <- kappa_result_epi$pabak$lower
pabak_upper <- kappa_result_epi$pabak$upper

# Summary table: both agreement coefficients with their confidence intervals
kappa_epi_df <- data.frame(
  Metric = c("Cohen's Kappa", "PABAK"),
  Value = c(cohen_kappa_epi, pabak),
  Lower_CI = c(cohen_kappa_epi_lower, pabak_lower),
  Upper_CI = c(cohen_kappa_epi_upper, pabak_upper)
)

# Display the data frame
print(kappa_epi_df)
##          Metric     Value  Lower_CI  Upper_CI
## 1 Cohen's Kappa 0.4252355 0.3493429 0.5011281
## 2         PABAK 0.6066667 0.5516685 0.6576599
# same transforms as for the yes/no data: log + z-scale the frequencies
data_wide_mono_et$logfreq=log(data_wide_mono_et$frequency)
data_wide_mono_et$scalelogfreq=scale(data_wide_mono_et$logfreq)

# +1 guards against log(0) for hypernyms with zero frequency
data_wide_mono_et$logFrequencyHypernym = log(data_wide_mono_et$frequencyHypernym+1)
data_wide_mono_et$scaleloghyper = scale(data_wide_mono_et$logFrequencyHypernym)
range(data_wide_mono_et$logFrequencyHypernym)
## [1]  0.00000 16.08331
range(data_wide_mono_et$scaleloghyper)
## [1] -2.443225  1.384123
# p=0 scaled log frequency of the lemma has impact on the IAA
data_wide_mono_et$accuracy = as.factor(data_wide_mono_et$accuracy)
# logistic regression: lemma frequency as predictor of error-type agreement
accuracy.glm=glm(accuracy ~ scalelogfreq, data=data_wide_mono_et, family="binomial")
Anova(accuracy.glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: accuracy
##              LR Chisq Df Pr(>Chisq)    
## scalelogfreq   11.914  1   0.000557 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(accuracy.glm)
## 
## Call:
## glm(formula = accuracy ~ scalelogfreq, family = "binomial", data = data_wide_mono_et)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.43098    0.08552  16.732  < 2e-16 ***
## scalelogfreq  0.28048    0.08051   3.484 0.000494 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 892.34  on 899  degrees of freedom
## Residual deviance: 880.43  on 898  degrees of freedom
## AIC: 884.43
## 
## Number of Fisher Scoring iterations: 4
# hypernym frequency shows no effect on error-type agreement
accuracy.glm=glm(accuracy ~ scaleloghyper, data=data_wide_mono_et, family="binomial")
Anova(accuracy.glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: accuracy
##               LR Chisq Df Pr(>Chisq)
## scaleloghyper  0.24574  1     0.6201
summary(accuracy.glm)
## 
## Call:
## glm(formula = accuracy ~ scaleloghyper, family = "binomial", 
##     data = data_wide_mono_et)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.40777    0.08390  16.779   <2e-16 ***
## scaleloghyper  0.04116    0.08261   0.498    0.618    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 892.34  on 899  degrees of freedom
## Residual deviance: 892.10  on 898  degrees of freedom
## AIC: 896.1
## 
## Number of Fisher Scoring iterations: 4

IAA for polysemes

# set to TRUE if we only want to evaluate post-study data
use_poststudy_data <- TRUE
# when post-study data is used, additionally choose the adjudicated version
use_poststudy_data_use_adj <- TRUE

# Read one polyseme annotation file for an annotator pair ("et_rb",
# "ck_rb", "et_ck"). The post-study files take precedence over the
# main-study files; the adjudicated "_corrected" version is selected by
# the respective flag. Choosing the file name up front avoids the
# redundant double read of the original code (which first loaded the
# main-study file and then overwrote it with the post-study file).
read_poly_yn <- function(pair) {
  if (use_poststudy_data) {
    fname <- if (use_poststudy_data_use_adj) {
      paste0("polysemes_poststudy_", pair, "_corrected.csv")
    } else {
      paste0("polysemes_poststudy_", pair, ".csv")
    }
  } else if (use_adjudicated_data) {
    fname <- paste0("polysemes_", pair, "_corrected.csv")
  } else {
    fname <- paste0("polysemes_", pair, ".csv")
  }
  d <- read.table(paste0(path, fname), header = TRUE, sep = ";")
  message("Loaded ", fname)
  d$errorType <- NULL # yes/no case
  d_wide <- spread(d, annotator, value = goodSentence)
  # last two columns are the two annotators of the pair
  colnames(d_wide) <- c("id", "type", "numberWordSenses", "lemma",
                        "frequency", "hypernym", "frequencyHypernym",
                        "semanticField", "model", "sentenceLength",
                        "sentenceStringLength", "anno1", "anno2")
  d_wide
}

# first polyseme file
data1_wide <- read_poly_yn("et_rb")
## Loaded polysemes_poststudy_et_rb_corrected.csv
nrow(data1_wide)
## [1] 210

# second polyseme file
data2_wide <- read_poly_yn("ck_rb")
## Loaded polysemes_poststudy_ck_rb_corrected.csv
nrow(data2_wide)
## [1] 228

# third polyseme file
data3_wide <- read_poly_yn("et_ck")
## Loaded polysemes_poststudy_et_ck_corrected.csv
nrow(data3_wide)
## [1] 198
## [1] 198
# binding them together
data_wide_poly_yn = rbind(data1_wide, data2_wide, data3_wide)
nrow(data_wide_poly_yn)
## [1] 636
# adding accuracy: 1 if the annotators agree on the yes/no judgment
data_wide_poly_yn$accuracy = ifelse(data_wide_poly_yn$anno1 == data_wide_poly_yn$anno2, 1, 0)

# observed agreement
mean(data_wide_poly_yn$accuracy)
## [1] 0.8930818
# NOTE(review): despite its name, cm_errorTypes here holds yes/no counts
cm_errorTypes=as.data.frame(table(data_wide_poly_yn$anno1, data_wide_poly_yn$anno2))
colnames(cm_errorTypes) = c("anno1", "anno2", "N")

# confusion matrix plot of the polyseme yes/no judgments, without the
# row/column percentage annotations
p <- plot_confusion_matrix(
  cm_errorTypes,
  target_col = "anno1",
  prediction_col = "anno2",
  counts_col = "N",
  add_row_percentages = FALSE,
  add_col_percentages = FALSE
)
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'rsvg' is missing. Will not plot arrows and zero-shading.
# enlarge the count labels inside the tiles
for (i in seq_along(p$layers)) {
  if (inherits(p$layers[[i]]$geom, "GeomText")) {
    p$layers[[i]]$aes_params$size <- 12
  }
}

p_final <- p +
  xlab("Annotator 2") +
  ylab("Annotator 1") +
  theme_classic(base_size = 32)
p_final

# pass the plot object explicitly instead of relying on the last displayed
# plot, and drop the redundant "./" from the output path
ggsave(
  paste0(path, "results/cm_polysemes_yn.png"),
  plot = p_final,
  width = 15,
  height = 15,
  dpi = 300
)


# select the two annotator columns by name (more robust than the
# positional index 12:13 used previously)
refcols_poly_yn <- data_wide_poly_yn[, c("anno1", "anno2")]
kappa2(refcols_poly_yn)
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 636 
##    Raters = 2 
##     Kappa = 0.786 
## 
##         z = 19.8 
##   p-value = 0
rater.bias(refcols_poly_yn)
##  Rater bias coefficient
## 
##  Subjects = 636 
##    Raters = 2 
##     Ratio = 0.471 
## 
##  Chisq(1) = 0.235 
##   p-value = 0.628
# RAW: There is no statistically significant difference between the raters’ category usage. Even though the ratio is numerically below 1, that difference is well within random variation.

#ADJ: Across 2,700 subjects rated by two raters, the estimated bias ratio (0.48) suggests that one rater used the category somewhat less often than the other. However, this difference was not statistically significant (χ²(1) = 0.57, p = 0.45), indicating no evidence of systematic bias between the raters.

# do the EPI/PABAK statistics on error codes
# (PABAK = prevalence- and bias-adjusted kappa; epiR also returns
# prevalence/bias indices and a McNemar test for 2x2 tables)

rater1 <- data_wide_poly_yn$anno1
rater2 <- data_wide_poly_yn$anno2
# 2x2 contingency table of the two raters' yes/no codes; epi.kappa()
# requires a square table
confusion_matrix_poly_epi <- table(rater1, rater2) 

kappa_result_poly_epi <- epi.kappa(confusion_matrix_poly_epi, method = "cohen")
kappa_result_poly_epi
## $prop.agree
##         obs exp
## 1 0.8930818 0.5
## 
## $pindex
##           est         se       lower      upper
## 1 0.006289308 0.02787733 -0.04834925 0.06092787
## 
## $bindex
##            est        se      lower      upper
## 1 -0.006289308 0.0280375 -0.0612418 0.04866318
## 
## $pabak
##         est     lower     upper
## 1 0.7861635 0.7328583 0.8320258
## 
## $kappa
##         est         se     lower     upper
## 1 0.7861635 0.02450603 0.7381326 0.8341945
## 
## $z
##   test.statistic       p.value
## 1       32.08041 8.271332e-226
## 
## $mcnemar
##   test.statistic df   p.value
## 1      0.2352941  1 0.6276258
# point estimate of Cohen's kappa
cohen_kappa_poly_epi <- kappa_result_poly_epi$kappa$est
cohen_kappa_poly_epi
## [1] 0.7861635
# Extract Cohen's kappa values and CI
cohen_kappa_poly_epi_lower <- kappa_result_poly_epi$kappa$lower
cohen_kappa_poly_epi_upper <- kappa_result_poly_epi$kappa$upper

# Extract PABAK values and CI
pabak <- kappa_result_poly_epi$pabak$est
pabak_lower <- kappa_result_poly_epi$pabak$lower
pabak_upper <- kappa_result_poly_epi$pabak$upper

# summary table of both agreement coefficients with their 95% CIs
kappa_poly_epi_df <- data.frame(
  Metric = c("Cohen's Kappa", "PABAK"),
  Value = c(cohen_kappa_poly_epi, pabak),
  Lower_CI = c(cohen_kappa_poly_epi_lower, pabak_lower),
  Upper_CI = c(cohen_kappa_poly_epi_upper, pabak_upper)
)

# Display the data frame
print(kappa_poly_epi_df)
##          Metric     Value  Lower_CI  Upper_CI
## 1 Cohen's Kappa 0.7861635 0.7381326 0.8341945
## 2         PABAK 0.7861635 0.7328583 0.8320258
# all of the former can be abbreviated by this: per-stratum Cohen's kappa,
# grouped by the number of word senses of the lemma
kappa_by_numberWordSenses_polysemes_yn <- data_wide_poly_yn %>%
  group_by(numberWordSenses) %>%
  summarise(
    kappa = {
      refcols <- pick(anno1, anno2)
      # kappa2() is undefined for degenerate strata (fewer than two items,
      # or an annotator using only a single category), so return NA there
      if (nrow(refcols) > 1 &&
          n_distinct(refcols[[1]]) > 1 &&
          n_distinct(refcols[[2]]) > 1) {
        kappa2(refcols)$value
      } else {
        NA_real_
      }
    },
    .groups = "drop"
)

kappa_by_numberWordSenses_polysemes_yn
## # A tibble: 7 × 2
##   numberWordSenses kappa
##              <int> <dbl>
## 1                2 0.816
## 2                3 0.924
## 3                4 0.714
## 4                5 0.862
## 5                6 0.709
## 6                7 0.622
## 7                8 0.762
# View(kappa_by_numberWordSenses_polysemes_yn)
write.csv(kappa_by_numberWordSenses_polysemes_yn, paste0(path, "./results/iaa_polysemes_yn_kappa_by_numberWordSenses.csv"), row.names = FALSE)
# Log-transform and z-scale the lemma frequency for use as a GLM predictor.
# as.numeric() drops the 1-column-matrix wrapper (and its "scaled:center"/
# "scaled:scale" attributes) that scale() returns, so the data frame keeps a
# plain numeric column — storing matrix columns in data frames is a known
# footgun. Numerically identical to the previous code.
data_wide_poly_yn$logfreq <- log(data_wide_poly_yn$frequency)
data_wide_poly_yn$scalelogfreq <- as.numeric(scale(data_wide_poly_yn$logfreq))

# +1 because hypernym frequency can be 0 (log(0) is -Inf)
data_wide_poly_yn$logFrequencyHypernym <- log(data_wide_poly_yn$frequencyHypernym + 1)
data_wide_poly_yn$scaleloghyper <- as.numeric(scale(data_wide_poly_yn$logFrequencyHypernym))

# sanity checks on the scaled predictors and the outcome
range(data_wide_poly_yn$scalelogfreq)
## [1] -2.310349  1.397426
range(data_wide_poly_yn$scaleloghyper)
## [1] -1.348125  1.502341
mean(data_wide_poly_yn$accuracy)
## [1] 0.8930818
mean(data_wide_poly_yn$numberWordSenses)
## [1] 3.90566
range(data_wide_poly_yn$numberWordSenses)
## [1] 2 8
# Logistic regressions: does inter-annotator agreement (accuracy, 0/1)
# depend on item properties? One predictor per model.
# the higher the frequency of the lemma, the more agreement between coders (polysem)
accuracy.glm=glm(accuracy ~ scalelogfreq, data=data_wide_poly_yn, family="binomial")
# with a single predictor, the type III LR test equals the overall model test
Anova(accuracy.glm, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: accuracy
##              LR Chisq Df Pr(>Chisq)  
## scalelogfreq   5.5281  1    0.01871 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(accuracy.glm)
## 
## Call:
## glm(formula = accuracy ~ scalelogfreq, family = "binomial", data = data_wide_poly_yn)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    2.1560     0.1320  16.336   <2e-16 ***
## scalelogfreq   0.2863     0.1195   2.397   0.0165 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 432.51  on 635  degrees of freedom
## Residual deviance: 426.98  on 634  degrees of freedom
## AIC: 430.98
## 
## Number of Fisher Scoring iterations: 5
# no effect for frequency of hypernym
accuracy.glm=glm(accuracy ~ scaleloghyper, data=data_wide_poly_yn, family="binomial")
Anova(accuracy.glm, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: accuracy
##               LR Chisq Df Pr(>Chisq)
## scaleloghyper  0.99253  1     0.3191
# no effect for numberWordSenses
accuracy.glm=glm(accuracy ~ numberWordSenses, data=data_wide_poly_yn, family="binomial")
Anova(accuracy.glm, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: accuracy
##                  LR Chisq Df Pr(>Chisq)
## numberWordSenses   1.4942  1     0.2216
# IAA for polysemes only (spreading on error type)

# set to true if we only want to evaluate post-study data
use_poststudy_data <- TRUE
use_poststudy_data_use_adj <- TRUE


# first polyseme file (annotator pair et/rb)
# NOTE(review): the load/override/spread/rename logic below is triplicated
# for the three annotator pairs; a small helper function would remove the
# duplication. Left as-is because later parts of the file may rely on the
# intermediate globals data1/data2/data3.
if (use_adjudicated_data) {
  data1=read.table(paste0(path,"polysemes_et_rb_corrected.csv"), header=T, sep=";")
  message("Loaded polysemes_et_rb_corrected.csv")
} else {
    data1=read.table(paste0(path,"polysemes_et_rb.csv"), header=T, sep=";")
    message("Loaded polysemes_et_rb.csv")
}
## Loaded polysemes_et_rb.csv
# this may override the data (post-study files take precedence when enabled)
if (use_poststudy_data) {
  if (use_poststudy_data_use_adj) {
    data1=read.table(paste0(path,"polysemes_poststudy_et_rb_corrected.csv"), header=T, sep=";")
    message("Loaded polysemes_poststudy_et_rb_corrected.csv")
  } else {
    data1=read.table(paste0(path,"polysemes_poststudy_et_rb.csv"), header=T, sep=";")
    message("Loaded polysemes_poststudy_et_rb.csv")
  }
}
## Loaded polysemes_poststudy_et_rb_corrected.csv
# drop the yes/no judgment: this analysis is about error types only
data1$goodSentence <-NULL
# reshape long -> wide: one row per item, one errorType column per annotator
data1_wide=spread(data1, annotator, value =errorType)
colnames(data1_wide) <- c("id" , "type" ,"numberWordSenses" , "lemma","frequency" ,"hypernym" ,"frequencyHypernym", "semanticField" , "model" , "sentenceLength", "sentenceStringLength", "anno1"  ,"anno2")

# second polyseme file (annotator pair ck/rb)
if (use_adjudicated_data) {
  data2=read.table(paste0(path,"polysemes_ck_rb_corrected.csv"), header=T, sep=";")
  message("Loaded polysemes_ck_rb_corrected.csv")
} else {
    data2=read.table(paste0(path,"polysemes_ck_rb.csv"), header=T, sep=";")
    message("Loaded polysemes_ck_rb.csv")
}
## Loaded polysemes_ck_rb.csv
# this may override the data (post-study files take precedence when enabled)
if (use_poststudy_data) {
  if (use_poststudy_data_use_adj) {
    data2=read.table(paste0(path,"polysemes_poststudy_ck_rb_corrected.csv"), header=T, sep=";")
    message("Loaded polysemes_poststudy_ck_rb_corrected.csv")
  } else {
    data2=read.table(paste0(path,"polysemes_poststudy_ck_rb.csv"), header=T, sep=";")
    message("Loaded polysemes_poststudy_ck_rb.csv")
  }
}
## Loaded polysemes_poststudy_ck_rb_corrected.csv
data2$goodSentence <-NULL
data2_wide=spread(data2, annotator, value =errorType)
colnames(data2_wide) <- c("id" , "type" ,"numberWordSenses" , "lemma","frequency" ,"hypernym" ,"frequencyHypernym", "semanticField" , "model" , "sentenceLength", "sentenceStringLength", "anno1"  ,"anno2")

# third polyseme file (annotator pair et/ck)
if (use_adjudicated_data) {
  data3=read.table(paste0(path,"polysemes_et_ck_corrected.csv"), header=T, sep=";")
  message("Loaded polysemes_et_ck_corrected.csv")
} else {
    data3=read.table(paste0(path,"polysemes_et_ck.csv"), header=T, sep=";")
    message("Loaded polysemes_et_ck.csv")
}
## Loaded polysemes_et_ck.csv
# this may override the data (post-study files take precedence when enabled)
if (use_poststudy_data) {
  if (use_poststudy_data_use_adj) {
    data3=read.table(paste0(path,"polysemes_poststudy_et_ck_corrected.csv"), header=T, sep=";")
    message("Loaded polysemes_poststudy_et_ck_corrected.csv")
  } else {
    data3=read.table(paste0(path,"polysemes_poststudy_et_ck.csv"), header=T, sep=";")
    message("Loaded polysemes_poststudy_et_ck.csv")
  }
}
## Loaded polysemes_poststudy_et_ck_corrected.csv
data3$goodSentence <-NULL
data3_wide = spread(data3, annotator, value =errorType)
colnames(data3_wide) <- c("id" , "type" ,"numberWordSenses" , "lemma","frequency" ,"hypernym" ,"frequencyHypernym", "semanticField" , "model" , "sentenceLength", "sentenceStringLength", "anno1"  ,"anno2")

# binding them together: one combined table for all three annotator pairs
data_wide_poly_et = rbind(data1_wide, data2_wide, data3_wide)
nrow(data_wide_poly_et)
## [1] 636
# adding accuracy: 1 when both annotators assigned the same error type, else 0
data_wide_poly_et$accuracy = ifelse(data_wide_poly_et$anno1== data_wide_poly_et$anno2, 1, 0)

# observed agreement (raw proportion of items with identical error codes)
mean(data_wide_poly_et$accuracy)
## [1] 0.8742138
# cross-tabulate the error types in long format for plot_confusion_matrix()
cm_errorTypes=as.data.frame(table(data_wide_poly_et$anno1, data_wide_poly_et$anno2))
colnames(cm_errorTypes) = c("anno1", "anno2", "N")

plot_confusion_matrix(cm_errorTypes, target_col= "anno1", prediction_col="anno2", counts_col = "N" )+xlab("Annotator 2")+ylab("Annotator 1")+theme(text= element_text(size= 14))
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'rsvg' is missing. Will not plot arrows and zero-shading.

# ggsave() saves the last plot displayed
ggsave(paste0(path, "./results/cm_polysemes_errortypes.png"), width= 15, height=15)

# Chance-corrected agreement on the error types.
# Select the two annotator columns by NAME rather than by position: the old
# `[, 12:13]` indexing silently picks the wrong columns if any column is
# ever added, removed, or reordered upstream. The names are guaranteed by
# the explicit colnames() assignments above, so this is a safe, equivalent
# selection.
refcols_poly_et <- data_wide_poly_et[, c("anno1", "anno2")]
kappa2(refcols_poly_et)
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 636 
##    Raters = 2 
##     Kappa = 0.818 
## 
##         z = 38.2 
##   p-value = 0
rater.bias(refcols_poly_et)
##  Rater bias coefficient
## 
##  Subjects = 636 
##    Raters = 2 
##     Ratio = 0.525 
## 
##  Chisq(1) = 0.2 
##   p-value = 0.655
# NOTE(review): the numbers quoted in the RAW/ADJ comments below stem from
# earlier runs with different flag settings and do not match the output
# shown above (Ratio = 0.525, χ²(1) = 0.2, p = 0.655).
# RAW: “No significant systematic bias was detected between the two raters (Ratio = 0.502, χ²(1) = 0.0075, p = 0.931).”

# ADJ: Across 2,700 subjects rated by two raters, the estimated bias ratio (0.504) indicates that one rater used the category about as often as the other. This difference was not statistically significant (χ²(1) = 0.0212, p = 0.884), showing no evidence of systematic bias between the raters.

# do the EPI/PABAK statistics on error codes
rater1 <- data_wide_poly_et$anno1
rater2 <- data_wide_poly_et$anno2

# rater2 uses the error code "FO" but rater1 never does, so a plain
# table(rater1, rater2) would not be square and epi.kappa() would fail.
# Build the table over the union of all codes either rater used, instead of
# overwriting the first observation of both raters with "FO": the previous
# workaround corrupted one real data point and slightly inflated observed
# agreement (0.8758 reported below vs. the true 0.8742 computed earlier).
all_error_codes <- sort(union(rater1, rater2))
confusion_matrix_poly_epi <- table(
  rater1 = factor(rater1, levels = all_error_codes),
  rater2 = factor(rater2, levels = all_error_codes)
)

kappa_result_poly_epi <- epi.kappa(confusion_matrix_poly_epi, method = "cohen")
kappa_result_poly_epi
# NOTE(review): the recorded output below still reflects the old first-row
# overwrite; rerunning this chunk will change the values slightly.
## $prop.agree
##         obs       exp
## 1 0.8757862 0.3089153
## 
## $pabak
##         est     lower     upper
## 1 0.7515723 0.6952218 0.8008254
## 
## $kappa
##         est         se     lower     upper
## 1 0.8202625 0.01892449 0.7831712 0.8573538
## 
## $z
##   test.statistic p.value
## 1       43.34397       0
# point estimate of Cohen's kappa
cohen_kappa_poly_epi <- kappa_result_poly_epi$kappa$est

# Extract Cohen's kappa values and CI
cohen_kappa_poly_epi_lower <- kappa_result_poly_epi$kappa$lower
cohen_kappa_poly_epi_upper <- kappa_result_poly_epi$kappa$upper

# Extract PABAK values and CI
pabak <- kappa_result_poly_epi$pabak$est
pabak_lower <- kappa_result_poly_epi$pabak$lower
pabak_upper <- kappa_result_poly_epi$pabak$upper

# summary table of both agreement coefficients with their 95% CIs
kappa_poly_epi_df <- data.frame(
  Metric = c("Cohen's Kappa", "PABAK"),
  Value = c(cohen_kappa_poly_epi, pabak),

  Lower_CI = c(cohen_kappa_poly_epi_lower, pabak_lower),
  Upper_CI = c(cohen_kappa_poly_epi_upper, pabak_upper)
)

# Display the data frame
print(kappa_poly_epi_df)
##          Metric     Value  Lower_CI  Upper_CI
## 1 Cohen's Kappa 0.8202625 0.7831712 0.8573538
## 2         PABAK 0.7515723 0.6952218 0.8008254
# let us focus on the NO data, to make the confusion matrix more readable:
# keep only the items where BOTH annotators flagged an error (code != "0",
# i.e. drop all "good sentence" judgments)
data_wide_poly_et_no = data_wide_poly_et[data_wide_poly_et$anno1!= "0" & data_wide_poly_et$anno2 != "0",]
cm_no=as.data.frame(table(data_wide_poly_et_no$anno1, data_wide_poly_et_no$anno2))
colnames(cm_no) = c("anno1", "anno2", "N")

plot_confusion_matrix(cm_no, target_col= "anno1", prediction_col="anno2", counts_col = "N" )+xlab("Annotator 2")+ylab("Annotator 1")+theme(text= element_text(size= 14))
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in check_gg_image_packages(add_arrows = add_arrows, add_zero_shading =
## add_zero_shading): 'rsvg' is missing. Will not plot arrows and zero-shading.

# ggsave() saves the last plot displayed
ggsave(paste0(path, "./results/cm_polysemes_errortypes_only.png"), width= 15, height=15)
# Log-transform and z-scale the lemma frequency for use as a GLM predictor.
# as.numeric() drops the 1-column-matrix wrapper (and its "scaled:center"/
# "scaled:scale" attributes) that scale() returns, so the data frame keeps a
# plain numeric column — storing matrix columns in data frames is a known
# footgun. Numerically identical to the previous code.
data_wide_poly_et$logfreq <- log(data_wide_poly_et$frequency)
data_wide_poly_et$scalelogfreq <- as.numeric(scale(data_wide_poly_et$logfreq))

# +1 because hypernym frequency can be 0 (log(0) is -Inf)
data_wide_poly_et$logFrequencyHypernym <- log(data_wide_poly_et$frequencyHypernym + 1)
data_wide_poly_et$scaleloghyper <- as.numeric(scale(data_wide_poly_et$logFrequencyHypernym))

# sanity checks on the scaled predictors and the outcome
range(data_wide_poly_et$scalelogfreq)
## [1] -2.310349  1.397426
range(data_wide_poly_et$scaleloghyper)
## [1] -1.348125  1.502341
mean(data_wide_poly_et$accuracy)
## [1] 0.8742138
mean(data_wide_poly_et$numberWordSenses)
## [1] 3.90566
range(data_wide_poly_et$numberWordSenses)
## [1] 2 8
# Logistic regressions: does error-type agreement (accuracy, 0/1) depend on
# item properties? One predictor per model.
# the higher the frequency of the lemma, the more agreement between coders (polysem)
accuracy.glm=glm(accuracy ~ scalelogfreq, data=data_wide_poly_et, family="binomial")
# with a single predictor, the type III LR test equals the overall model test
Anova(accuracy.glm, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: accuracy
##              LR Chisq Df Pr(>Chisq)  
## scalelogfreq   5.6245  1    0.01771 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(accuracy.glm)
## 
## Call:
## glm(formula = accuracy ~ scalelogfreq, family = "binomial", data = data_wide_poly_et)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.9669     0.1224  16.066   <2e-16 ***
## scalelogfreq   0.2706     0.1121   2.413   0.0158 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 481.19  on 635  degrees of freedom
## Residual deviance: 475.57  on 634  degrees of freedom
## AIC: 479.57
## 
## Number of Fisher Scoring iterations: 4
# no effect for frequency of hypernym
accuracy.glm=glm(accuracy ~ scaleloghyper, data=data_wide_poly_et, family="binomial")
Anova(accuracy.glm, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: accuracy
##               LR Chisq Df Pr(>Chisq)
## scaleloghyper   1.3751  1     0.2409
# no effect for numberWordSenses
accuracy.glm=glm(accuracy ~ numberWordSenses, data=data_wide_poly_et, family="binomial")
Anova(accuracy.glm, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: accuracy
##                  LR Chisq Df Pr(>Chisq)
## numberWordSenses   1.0548  1     0.3044