library(ggplot2)
library(lme4)
## Loading required package: Matrix
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(car)
## Loading required package: carData
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(emmeans)
## Warning: package 'emmeans' was built under R version 4.4.3
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(lattice)
library(irr)
## Loading required package: lpSolve
library(cvms)
library(epiR)
## Loading required package: survival
## Package epiR 2.0.84 is loaded
## Type help(epi.about) for summary information
## Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
## 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: TH.data
## Warning: package 'TH.data' was built under R version 4.4.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
library(multcompView)
library(effects)
## Use the command
##     lattice::trellis.par.set(effectsTheme())
##   to customize lattice options for effects plots.
## See ?effectsTheme for details.
library(ggeffects) # install.packages("ggeffects")
library(summarytools)
library(report)
library(performance)
## Warning: package 'performance' was built under R version 4.4.3
## 
## Attaching package: 'performance'
## The following object is masked from 'package:irr':
## 
##     icc

# Polyseme case ----

# Toggle: TRUE loads the adjudicated ("corrected") post-study annotations,
# FALSE loads the raw post-study annotations.
use_poststudy_data_use_adj <- TRUE
#use_poststudy_data_use_adj <- FALSE


# Base directory holding the semicolon-separated annotation CSV files.
path <- "/Users/zinn/Projects/EKUT/GermaNet/GermaNet_LLM/R_analyses/annotations/"

if (use_poststudy_data_use_adj) {
  data <- read.table(paste0(path, "polysemes_poststudy_corrected.csv"),
                     header = TRUE, sep = ";")
  message("Loaded polysemes_poststudy_corrected.csv")
} else {
  data <- read.table(paste0(path, "polysemes_poststudy.csv"),
                     header = TRUE, sep = ";")
  message("Loaded polysemes_poststudy.csv")
}
## Loaded polysemes_poststudy_corrected.csv
nrow(data)
## [1] 1272
View(data)
# for Table 5. LLMs and their error types
# count() is shorthand for group_by() + summarise(n = n()) + ungroup();
# name = "count" keeps the column name used in the paper's table.
counts <- data %>%
  count(errorType, model, name = "count")

print(counts)
## # A tibble: 16 × 3
##    errorType model               count
##    <chr>     <chr>               <int>
##  1 0         deepseek-chat         268
##  2 0         gpt-5.2-chat-latest   364
##  3 FA        deepseek-chat         123
##  4 FA        gpt-5.2-chat-latest   121
##  5 FG        deepseek-chat          21
##  6 FG        gpt-5.2-chat-latest    10
##  7 FL        deepseek-chat         122
##  8 FL        gpt-5.2-chat-latest    50
##  9 ha        deepseek-chat          25
## 10 ha        gpt-5.2-chat-latest    32
## 11 ni        deepseek-chat           6
## 12 ni        gpt-5.2-chat-latest    13
## 13 nn        deepseek-chat          42
## 14 nn        gpt-5.2-chat-latest    28
## 15 nt        deepseek-chat          29
## 16 nt        gpt-5.2-chat-latest    18
# Derived predictors: log-transform corpus frequencies to reduce skew,
# then standardize. as.numeric() drops the 1-column matrix class that
# scale() returns, so the data.frame columns stay plain numeric vectors
# (matrix columns can trip up some downstream plotting/model helpers).
data$logfreq <- log(data$frequency)
data$scalelogfreq <- as.numeric(scale(data$logfreq))
# +1 guards against log(0) for hypernyms with zero corpus frequency
data$loghyper <- log(data$frequencyHypernym + 1)
data$scaleloghyper <- as.numeric(scale(data$loghyper))
# surface-form lengths in characters
data$lengthLemma <- nchar(data$lemma)
data$lengthHypernym <- nchar(data$hypernym)

# Descriptive statistics for sentence length, measured in tokens
# (sentenceLength) and in characters (sentenceStringLength),
# overall and separately per model.
mean(data$sentenceLength)
## [1] 10.66981
median(data$sentenceLength)
## [1] 10
mean(data$sentenceStringLength)
## [1] 71.94654
median(data$sentenceStringLength)
## [1] 71
chatdata <- subset(data, model == "gpt-5.2-chat-latest")
nrow(chatdata)
## [1] 636
mean(chatdata$sentenceStringLength)
## [1] 68.58176
mean(chatdata$sentenceLength)
## [1] 10.28931
deepseekdata <- subset(data, model == "deepseek-chat")
nrow(deepseekdata)
## [1] 636
mean(deepseekdata$sentenceStringLength)
## [1] 75.31132
mean(deepseekdata$sentenceLength)
## [1] 11.05031
# 5.2.1 Length of example sentences and sentence quality
# ---------------------------------

# LMM: does sentence length (in tokens) differ between the two models?
# Random intercept per lemma accounts for repeated sentences per word.
sl <- lmer(sentenceLength ~ model + (1 | lemma), data = data)

Anova(sl, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: sentenceLength
##              Chisq Df Pr(>Chisq)    
## (Intercept) 2742.3  1  < 2.2e-16 ***
## model         54.3  1  1.721e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(sl)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sentenceLength ~ model + (1 | lemma)
##    Data: data
## 
## REML criterion at convergence: 5257.6
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.74740 -0.62476 -0.01782  0.63930  3.10037 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  lemma    (Intercept) 1.410    1.187   
##  Residual             3.392    1.842   
## Number of obs: 1272, groups:  lemma, 34
## 
## Fixed effects:
##                           Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)                11.3901     0.2175   37.1577  52.367  < 2e-16 ***
## modelgpt-5.2-chat-latest   -0.7610     0.1033 1237.1351  -7.369 3.14e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## mdlgp-5.2-- -0.237
# sentenceLength (not significant)
# GLMM: does sentence length predict whether the sentence was rated good?
# Binomial logit link; bobyqa optimizer for convergence stability.
goodness.glmer <- glmer(as.factor(goodSentence) ~ sentenceLength + (1 | lemma),
                        data = data, family = "binomial",
                        control = glmerControl(optimizer = "bobyqa"))
Anova(goodness.glmer, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                 Chisq Df Pr(>Chisq)
## (Intercept)    0.1313  1     0.7171
## sentenceLength 0.1870  1     0.6654
summary(goodness.glmer)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ sentenceLength + (1 | lemma)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1674.2    1689.6    -834.1    1668.2      1269 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0020 -0.9646 -0.4193  0.8795  2.3690 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  lemma  (Intercept) 0.6519   0.8074  
## Number of obs: 1272, groups:  lemma, 34
## 
## Fixed effects:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -0.13598    0.37531  -0.362    0.717
## sentenceLength  0.01351    0.03123   0.432    0.665
## 
## Correlation of Fixed Effects:
##             (Intr)
## sentncLngth -0.913
# Visual sanity check of the standardized log-frequency distribution.
plot(density(data$scalelogfreq))

# 1. Create contingency table
tbl <- table(data$errorType, data$numberWordSenses)
# 2. Run chi-square test
# The asymptotic test warns because some expected cell counts are small,
# so the chi-square approximation may be unreliable here.
chisq.test(tbl)
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 234.38, df = 42, p-value < 2.2e-16
# Re-run with a Monte-Carlo (simulated) p-value, which does not rely on the
# asymptotic approximation; B = 10000 replicates.
chisq.test(tbl, simulate.p.value = TRUE, B = 10000)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 10000
##  replicates)
## 
## data:  tbl
## X-squared = 234.38, df = NA, p-value = 9.999e-05
# => the two categorical variables in your table are highly dependent.
# Open a PNG device
# NOTE(review): the "./results/" subdirectory must already exist under
# `path`, otherwise png() errors — confirm, or add dir.create() upstream.
png(paste0(path, "./results/polysemes_errorType_by_numSenses_poststudy_poststudy.png"), width = 1200, height = 800, res = 150)

# Mosaic plot of error type vs. number of word senses (the table built above).
mosaicplot(tbl, color = TRUE, las = 2,
           main = "Mosaic plot: Error Type by Number of Senses",
           xlab = "Error Type", ylab = "Number of Word Senses")

# Close the device to save the file
dev.off()
## quartz_off_screen 
##                 2
png(paste0(path, "./results/polysemes_wordSenseDistribution_poststudy_poststudy.png"), width = 1200, height = 800, res = 150)

# Histogram of the polysemy distribution across sentences.
hist(data$numberWordSenses,
     main = "Distribution of Number of Word Senses",
     xlab = "Number of Senses",
     ylab = "Number of Sentences",
     col = "skyblue", border = "white")

# Close the device to save the file.
dev.off()
## quartz_off_screen 
##                 2
# Reload the post-study data fresh for the polysemy analyses below.
use_poststudy_data_use_adj <- TRUE

if (use_poststudy_data_use_adj) {
  data <- read.table(paste0(path, "polysemes_poststudy_corrected.csv"),
                     header = TRUE, sep = ";")
  message("Loaded polysemes_poststudy_corrected.csv")
} else {
  data <- read.table(paste0(path, "polysemes_poststudy.csv"),
                     header = TRUE, sep = ";")
  message("Loaded polysemes_poststudy.csv")
}
## Loaded polysemes_poststudy_corrected.csv
# this is the long format, not the wide one (!)
nrow(data)
## [1] 1272
# Log frequency and its z-score; as.numeric() drops scale()'s matrix class.
data$logfreq <- log(data$frequency)
data$scalelogfreq <- as.numeric(scale(data$logfreq))

table(data$numberWordSenses)
## 
##   2   3   4   5   6   7   8 
## 456 108 336 120  72  84  96
range(data$numberWordSenses)
## [1] 2 8
# for polysemes, center for model fit (intercept at zero)
data$cNumberWordSenses <- data$numberWordSenses - 2
range(data$cNumberWordSenses)
## [1] 0 6
table(data$cNumberWordSenses)
## 
##   0   1   2   3   4   5   6 
## 456 108 336 120  72  84  96
# less skew: collapse everything above 2 (i.e. 5+ senses) into bin 3
data$numberWordSenses_fourBins <- ifelse(data$cNumberWordSenses > 2, 3, data$cNumberWordSenses)
table(data$numberWordSenses_fourBins)
## 
##   0   1   2   3 
## 456 108 336 372
# 5.2.2 No frequency data for word senses; what can be said about lemma frequency and its level of polysemy?
# ----------------------------------------------------------------------------------------------------------

# Rank correlations between lemma log-frequency and polysemy.
# cor.test()'s `method` takes a single string (it is match.arg'd), so the
# c() wrapper around the method name is unnecessary and has been dropped.
cor.test(data$logfreq, data$numberWordSenses, method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  data$logfreq and data$numberWordSenses
## z = 2.8105, p-value = 0.004947
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## 0.05881814
cor.test(data$logfreq, data$numberWordSenses_fourBins, method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  data$logfreq and data$numberWordSenses_fourBins
## z = 4.5041, p-value = 6.665e-06
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## 0.09714772
# just checking: we get identical values for scalelogfreq
# (expected: rank correlations are invariant under monotone transformations)
cor.test(data$scalelogfreq, data$numberWordSenses, method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  data$scalelogfreq and data$numberWordSenses
## z = 2.8105, p-value = 0.004947
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## 0.05881814
cor.test(data$scalelogfreq, data$numberWordSenses_fourBins, method = "kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  data$scalelogfreq and data$numberWordSenses_fourBins
## z = 4.5041, p-value = 6.665e-06
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## 0.09714772
# Spearman warns because ties prevent an exact p-value (approximation used).
cor.test(data$logfreq, data$numberWordSenses, method = "spearman")
## Warning in cor.test.default(data$logfreq, data$numberWordSenses, method =
## c("spearman")): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  data$logfreq and data$numberWordSenses
## S = 304744017, p-value = 6.676e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1115656
cor.test(data$logfreq, data$numberWordSenses_fourBins, method = "spearman")
## Warning in cor.test.default(data$logfreq, data$numberWordSenses_fourBins, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  data$logfreq and data$numberWordSenses_fourBins
## S = 292155304, p-value = 1.084e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.148266
cor.test(data$logfreq, data$numberWordSenses, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$logfreq and data$numberWordSenses
## t = 5.2008, df = 1270, p-value = 2.311e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09015976 0.19780239
## sample estimates:
##       cor 
## 0.1444083
cor.test(data$logfreq, data$numberWordSenses_fourBins, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  data$logfreq and data$numberWordSenses_fourBins
## t = 3.4811, df = 1270, p-value = 0.0005163
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0424827 0.1513751
## sample estimates:
##        cor 
## 0.09721982
# logfreq ~ numberWordSenses
# ---------------------------

# Simple linear models: lemma log-frequency as a function of polysemy,
# once with the raw sense count and once with the four-bin version.
logfreq_lm <- lm(logfreq ~ numberWordSenses, data = data)
logfreq_lm4 <- lm(logfreq ~ numberWordSenses_fourBins, data = data)
AIC(logfreq_lm, logfreq_lm4)
##             df      AIC
## logfreq_lm   3 5520.425
## logfreq_lm4  3 5535.152
# => Lower AIC indicates the better fit, so the raw numberWordSenses model
#    (AIC 5520.4) fits better than the fourBins model (AIC 5535.2).
#    NOTE(review): the original note claimed the opposite — the conclusion
#    drawn from this comparison in the write-up should be re-checked.

summary(logfreq_lm4)
## 
## Call:
## lm(formula = logfreq ~ numberWordSenses_fourBins, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8135 -1.6584  0.5452  1.6519  3.2366 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               12.33128    0.09305 132.522  < 2e-16 ***
## numberWordSenses_fourBins  0.16675    0.04790   3.481 0.000516 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.128 on 1270 degrees of freedom
## Multiple R-squared:  0.009452,   Adjusted R-squared:  0.008672 
## F-statistic: 12.12 on 1 and 1270 DF,  p-value: 0.0005163
report(logfreq_lm4)
## We fitted a linear model (estimated using OLS) to predict logfreq with
## numberWordSenses_fourBins (formula: logfreq ~ numberWordSenses_fourBins). The
## model explains a statistically significant and very weak proportion of variance
## (R2 = 9.45e-03, F(1, 1270) = 12.12, p < .001, adj. R2 = 8.67e-03). The model's
## intercept, corresponding to numberWordSenses_fourBins = 0, is at 12.33 (95% CI
## [12.15, 12.51], t(1270) = 132.52, p < .001). Within this model:
## 
##   - The effect of numberWordSenses fourBins is statistically significant and
## positive (beta = 0.17, 95% CI [0.07, 0.26], t(1270) = 3.48, p < .001; Std. beta
## = 0.10, 95% CI [0.04, 0.15])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
# Marginal-effects plot of the binned model, saved to results/.
plot_model(logfreq_lm4, type = "pred", terms = c("numberWordSenses_fourBins"))

ggsave(paste0(path, "./results/polysemes_logfreq_numberWordSenses_poststudy.png"), width = 7, height = 7)

# goodSentence ~ numberWordSenses
# -------------------------------

# GLMM: does polysemy (raw sense count) predict sentence quality?
# NOTE(review): the random intercept is grouped by sentenceLength here
# (15 levels), not by lemma as in the earlier quality model — confirm
# this grouping choice is intentional.
goodness.glmer_nws <- glmer(as.factor(goodSentence) ~ numberWordSenses + (1 | sentenceLength),
                            data = data, family = "binomial",
                            control = glmerControl(optimizer = "bobyqa"))
Anova(goodness.glmer_nws, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                   Chisq Df Pr(>Chisq)
## (Intercept)      0.0097  1     0.9216
## numberWordSenses 0.0413  1     0.8389
summary(goodness.glmer_nws)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ numberWordSenses + (1 | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1767.0    1782.4    -880.5    1761.0      1269 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.0764 -0.9612 -0.9246  0.9767  1.0816 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.02446  0.1564  
## Number of obs: 1272, groups:  sentenceLength, 15
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)       0.013909   0.141400   0.098    0.922
## numberWordSenses -0.006206   0.030526  -0.203    0.839
## 
## Correlation of Fixed Effects:
##             (Intr)
## nmbrWrdSnss -0.835
# Same quality model with the four-bin polysemy predictor.
goodness.glmer_nws4 <- glmer(as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 | sentenceLength),
                             data = data, family = "binomial",
                             control = glmerControl(optimizer = "bobyqa"))

# fourBins fits better (lower AIC / deviance).
# NOTE(review): both models have 3 parameters, so the LRT has Df = 0 and
# the chi-square p-value is blank/meaningless — the models are not nested;
# compare them via AIC/BIC only.
anova(goodness.glmer_nws, goodness.glmer_nws4)
## Data: data
## Models:
## goodness.glmer_nws: as.factor(goodSentence) ~ numberWordSenses + (1 | sentenceLength)
## goodness.glmer_nws4: as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 | sentenceLength)
##                     npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)
## goodness.glmer_nws     3 1767.0 1782.4 -880.50    1761.0                     
## goodness.glmer_nws4    3 1762.8 1778.2 -878.39    1756.8 4.2119  0
Anova(goodness.glmer_nws4, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                            Chisq Df Pr(>Chisq)  
## (Intercept)               1.6751  1    0.19558  
## numberWordSenses_fourBins 4.2720  1    0.03875 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_nws4)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1762.8    1778.2    -878.4    1756.8      1269 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1371 -0.9987 -0.8760  0.9888  1.1415 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.01973  0.1405  
## Number of obs: 1272, groups:  sentenceLength, 15
## 
## Fixed effects:
##                           Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                0.13069    0.10098   1.294   0.1956  
## numberWordSenses_fourBins -0.09577    0.04634  -2.067   0.0387 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## nmbrWrdSn_B -0.676
# but is there a better model? Add a random slope for the polysemy bins.
goodness.glmer_nws4_rs <- glmer(as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 + numberWordSenses_fourBins | sentenceLength),
                                data = data, family = "binomial",
                                control = glmerControl(optimizer = "bobyqa"))

# rs is better (LRT below; here the models ARE nested, so the test is valid)
anova(goodness.glmer_nws4, goodness.glmer_nws4_rs)
## Data: data
## Models:
## goodness.glmer_nws4: as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 | sentenceLength)
## goodness.glmer_nws4_rs: as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 + numberWordSenses_fourBins | sentenceLength)
##                        npar    AIC    BIC  logLik -2*log(L)  Chisq Df
## goodness.glmer_nws4       3 1762.8 1778.2 -878.39    1756.8          
## goodness.glmer_nws4_rs    5 1757.0 1782.8 -873.52    1747.0 9.7461  2
##                        Pr(>Chisq)   
## goodness.glmer_nws4                 
## goodness.glmer_nws4_rs    0.00765 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(goodness.glmer_nws4_rs, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                            Chisq Df Pr(>Chisq)
## (Intercept)               2.3919  1     0.1220
## numberWordSenses_fourBins 2.4444  1     0.1179
summary(goodness.glmer_nws4_rs)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 + numberWordSenses_fourBins |  
##     sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1757.0    1782.8    -873.5    1747.0      1267 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6997 -0.9797 -0.6296  0.9806  1.5883 
## 
## Random effects:
##  Groups         Name                      Variance Std.Dev. Corr 
##  sentenceLength (Intercept)               0.20058  0.4479        
##                 numberWordSenses_fourBins 0.07608  0.2758   -0.98
## Number of obs: 1272, groups:  sentenceLength, 15
## 
## Fixed effects:
##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                 0.2701     0.1746   1.547    0.122
## numberWordSenses_fourBins  -0.1569     0.1004  -1.563    0.118
## 
## Correlation of Fixed Effects:
##             (Intr)
## nmbrWrdSn_B -0.914
# NOTE(review): the random intercept/slope correlation of -0.98 suggests a
# near-degenerate random-effects structure; consider a zero-correlation
# parameterization ((1 + x || group)) or check isSingular().
plot_model(goodness.glmer_nws4_rs, type = "pred", terms = c("numberWordSenses_fourBins"))
## You are calculating adjusted predictions on the population-level (i.e.
##   `type = "fixed"`) for a *generalized* linear mixed model.
##   This may produce biased estimates due to Jensen's inequality. Consider
##   setting `bias_correction = TRUE` to correct for this bias.
##   See also the documentation of the `bias_correction` argument.

ggsave(paste0(path, "./results/polysemes_numberWordSenses_poststudy.png"), width = 7, height = 7)


# goodSentence ~ numberWordSenses*model
# -------------------------------------

# fixed effect: model, removed annotator as ranef because of singularFit
goodness.glmer_model <- glmer(as.factor(goodSentence) ~ model + (1 | sentenceLength),
                              data = data, family = "binomial",
                              control = glmerControl(optimizer = "bobyqa"))

# isSingular (random slope for model over sentenceLength is degenerate):
# goodness.glmer_model_rs=glmer(as.factor(goodSentence)  ~ model +(1+model|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))


Anova(goodness.glmer_model, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##              Chisq Df Pr(>Chisq)    
## (Intercept)  9.145  1   0.002494 ** 
## model       30.239  1  3.819e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ model + (1 | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1736.1    1751.6    -865.1    1730.1      1269 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2952 -0.9406 -0.7573  0.9455  1.3205 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.04453  0.211   
## Number of obs: 1272, groups:  sentenceLength, 15
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -0.3241     0.1072  -3.024  0.00249 ** 
## modelgpt-5.2-chat-latest   0.6397     0.1163   5.499 3.82e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## mdlgp-5.2-- -0.528
plot_model(goodness.glmer_model, type = "pred", terms = c("model"))

ggsave(paste0(path, "./results/polysemes_model_poststudy.png"), width = 7, height = 7)

# todo: but is this the best model?
# Inspect the estimated random intercepts per sentence length.
ranef(goodness.glmer_model)
## $sentenceLength
##      (Intercept)
## 5   0.0001779395
## 6  -0.0120006883
## 7  -0.2319249572
## 8  -0.2034673778
## 9   0.0433919942
## 10 -0.1521393018
## 11  0.2017245822
## 12  0.1088679478
## 13 -0.0836190650
## 14  0.1189805439
## 15  0.2080210745
## 16 -0.0271274655
## 17  0.0428513997
## 18 -0.0063673686
## 19 -0.0068195784
## 
## with conditional variances for "sentenceLength"
# goodSentence ~ numberWordSenses * model
# ---------------------------------------

# fixed effects: number of word senses and model, INCLUDING their interaction
# (the `*` operator expands to main effects + interaction; the original
# comment incorrectly said "no interaction").
goodness.glmer_poly_nws_model <- glmer(as.factor(goodSentence) ~ numberWordSenses_fourBins * model + (1 | sentenceLength),
                                       data = data, family = "binomial",
                                       control = glmerControl(optimizer = "bobyqa"))

# factoring in model also improves model fit (by AIC; note the models are
# not nested here — both have 5 parameters, hence the Df = 0 LRT row)
anova(goodness.glmer_nws4_rs, goodness.glmer_poly_nws_model)
## Data: data
## Models:
## goodness.glmer_nws4_rs: as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 + numberWordSenses_fourBins | sentenceLength)
## goodness.glmer_poly_nws_model: as.factor(goodSentence) ~ numberWordSenses_fourBins * model + (1 | sentenceLength)
##                               npar    AIC    BIC  logLik -2*log(L)  Chisq Df
## goodness.glmer_nws4_rs           5 1757.0 1782.8 -873.52    1747.0          
## goodness.glmer_poly_nws_model    5 1735.2 1760.9 -862.60    1725.2 21.832  0
##                               Pr(>Chisq)
## goodness.glmer_nws4_rs                  
## goodness.glmer_poly_nws_model
Anova(goodness.glmer_poly_nws_model, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                                   Chisq Df Pr(>Chisq)    
## (Intercept)                      3.6511  1    0.05603 .  
## numberWordSenses_fourBins        0.3072  1    0.57939    
## model                           19.3237  1  1.103e-05 ***
## numberWordSenses_fourBins:model  1.3824  1    0.23969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_poly_nws_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ numberWordSenses_fourBins * model +  
##     (1 | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1735.2    1760.9    -862.6    1725.2      1267 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4149 -0.9183 -0.7645  0.9634  1.3080 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.03054  0.1748  
## Number of obs: 1272, groups:  sentenceLength, 15
## 
## Fixed effects:
##                                                    Estimate Std. Error z value
## (Intercept)                                        -0.26699    0.13973  -1.911
## numberWordSenses_fourBins                          -0.03653    0.06590  -0.554
## modelgpt-5.2-chat-latest                            0.79143    0.18004   4.396
## numberWordSenses_fourBins:modelgpt-5.2-chat-latest -0.10815    0.09198  -1.176
##                                                    Pr(>|z|)    
## (Intercept)                                           0.056 .  
## numberWordSenses_fourBins                             0.579    
## modelgpt-5.2-chat-latest                            1.1e-05 ***
## numberWordSenses_fourBins:modelgpt-5.2-chat-latest    0.240    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) nmWS_B m-5.2-
## nmbrWrdSn_B -0.697              
## mdlgp-5.2-- -0.632  0.539       
## nWS_B:-5.2-  0.474 -0.689 -0.765
# NOTE(review): the '*' character in the PNG filenames below is illegal on
# Windows filesystems (works on macOS/Linux); consider renaming to "_x_".
plot_model(goodness.glmer_poly_nws_model, type = "pred", terms = c("numberWordSenses_fourBins", "model"))

ggsave(paste0(path, "./results/polysemes_numberWordSenses*model_poststudy.png"), width = 7, height = 7)
plot_model(goodness.glmer_poly_nws_model, type = "pred", terms = c("model", "numberWordSenses_fourBins"))

ggsave(paste0(path, "./results/polysemes_numberWordSenses*model_inv_poststudy.png"), width = 7, height = 7)
use_poststudy_data_use_adj <- TRUE

# Load both the full study data and the post-study data (same toggle:
# adjudicated/"corrected" vs. raw versions).
if (use_poststudy_data_use_adj) {
  data_full <- read.table(paste0(path, "polysemes_corrected_hyper_pol.csv"),
                          header = TRUE, sep = ";")
  data_post <- read.table(paste0(path, "polysemes_poststudy_corrected_hyper_pol.csv"),
                          header = TRUE, sep = ";")
  message("Loaded adjudicated data")
} else {
  data_full <- read.table(paste0(path, "polysemes_hyper_pol.csv"),
                          header = TRUE, sep = ";")
  data_post <- read.table(paste0(path, "polysemes_poststudy_hyper_pol.csv"),
                          header = TRUE, sep = ";")
  message("Loaded raw data")
}
## Loaded adjudicated data
# this is the long format, not the wide one (!)
nrow(data_full)
## [1] 5400
ncol(data_full)
## [1] 16
# in poststudy data, rename "deepseek-chat" to "deepseek-chat-3.2"
# (the post-study run used a newer DeepSeek version)
data_post$model[data_post$model == "deepseek-chat"] <- "deepseek-chat-3.2"
nrow(data_post)
## [1] 1272
ncol(data_post)
## [1] 16
View(data_post)
View(data_full)

# Lemmas covered by the post-study; restrict the full data to the same items.
pooritems <- unique(data_post$lemma)
sort(pooritems)
##  [1] "Abschluss"      "Absender"       "Alpaka"         "Anklage"       
##  [5] "August"         "Besuch"         "Bühne"          "Chance"        
##  [9] "Feuer"          "Foto"           "Grund"          "Hemmung"       
## [13] "Hoffnung"       "Höhe"           "Jäger"          "Kadenz"        
## [17] "Katalog"        "Kirche"         "Licht"          "Mogul"         
## [21] "Neutralisation" "Nonne"          "Öl"             "Olive"         
## [25] "Player"         "Schönheit"      "Schulter"       "Spielball"     
## [29] "System"         "Text"           "Turbulenz"      "Uhr"           
## [33] "Welle"          "Wohnung"
subs <- data_full[data_full$lemma %in% pooritems, ]
sort(unique(subs$lemma))
##  [1] "Abschluss"      "Absender"       "Alpaka"         "Anklage"       
##  [5] "August"         "Besuch"         "Bühne"          "Chance"        
##  [9] "Feuer"          "Foto"           "Grund"          "Hemmung"       
## [13] "Hoffnung"       "Höhe"           "Jäger"          "Kadenz"        
## [17] "Katalog"        "Kirche"         "Licht"          "Mogul"         
## [21] "Neutralisation" "Nonne"          "Öl"             "Olive"         
## [25] "Player"         "Schönheit"      "Schulter"       "Spielball"     
## [29] "System"         "Text"           "Turbulenz"      "Uhr"           
## [33] "Welle"          "Wohnung"
# Tag the study version and stack both data sets.
data_post$version <- "new"
subs$version <- "old"
both <- rbind(data_post, subs)

# Collapse model names to a 4-character family prefix (clau/deep/gpt-).
both$langmod <- substr(both$model, 1, 4)
table(both$langmod, both$version)
##       
##        new old
##   clau   0 636
##   deep 636 636
##   gpt- 636 636
both_ex <- both[both$langmod != "clau", ]
#exclude claude, because not repeated
table(both_ex$langmod, both_ex$version)
##       
##        new old
##   deep 636 636
##   gpt- 636 636
both_ex$version= factor(both_ex$version, levels=c("old", "new"))
# Logistic mixed model: sentence goodness as a function of the number of
# word senses plus the langmod x version interaction, with a random
# intercept for sentence length. BOBYQA optimizer for convergence.
goodness.glmer_nws_new=glmer(as.factor(goodSentence)  ~ numberWordSenses + langmod*version+(1|sentenceLength), data=both_ex,family="binomial",control=glmerControl(optimizer="bobyqa"))
# Type III Wald chi-square tests for the fixed effects.
Anova(goodness.glmer_nws_new, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                    Chisq Df Pr(>Chisq)    
## (Intercept)       0.1856  1  0.6665792    
## numberWordSenses 11.6339  1  0.0006476 ***
## langmod           3.3244  1  0.0682595 .  
## version          24.3548  1  8.013e-07 ***
## langmod:version  26.6337  1  2.459e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_nws_new)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ numberWordSenses + langmod * version +  
##     (1 | sentenceLength)
##    Data: both_ex
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    3482.4    3517.5   -1735.2    3470.4      2538 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.5469 -1.0151  0.7378  0.9457  1.4338 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.04871  0.2207  
## Number of obs: 2544, groups:  sentenceLength, 16
## 
## Fixed effects:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -0.05808    0.13480  -0.431 0.666579    
## numberWordSenses        0.07536    0.02209   3.411 0.000648 ***
## langmodgpt-            -0.21601    0.11848  -1.823 0.068259 .  
## versionnew             -0.56949    0.11540  -4.935 8.01e-07 ***
## langmodgpt-:versionnew  0.87054    0.16868   5.161 2.46e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) nmbrWS lngmd- vrsnnw
## nmbrWrdSnss -0.593                     
## langmodgpt- -0.420 -0.078              
## versionnew  -0.411 -0.045  0.512       
## lngmdgpt-:v  0.287  0.088 -0.729 -0.711
# Predicted probabilities for the version x langmod interaction (2x2 plot).
plot_model(goodness.glmer_nws_new, type="pred", terms=c("version", "langmod"))

# Save the most recent plot (the 2x2 interaction plot) under a path that
# depends on whether adjudicated post-study data are in use.
# NOTE(review): the file names look swapped relative to the flag name --
# when use_poststudy_data_use_adj is TRUE the *non*-"_adj" file is written,
# and the "_adj" file otherwise. Confirm this is intended.
if (use_poststudy_data_use_adj) {
  ggsave(paste0(path, "./results/polysemes_poststudy_2x2.png"), width=7, height=7)
} else {
  ggsave(paste0(path, "./results/polysemes_poststudy_adj_2x2.png"), width=7, height=7)
}

View(data_post)
# Derive log frequency and its z-score for the post-study data.
# BUG FIX: these three assignments previously read from `data` (a data
# frame left over from an earlier section) instead of `data_post`; if the
# two objects ever differ in row order or length, the new columns would be
# silently misaligned with data_post. All reads now use data_post.
data_post$logfreq=log(data_post$frequency)
data_post$scalelogfreq=scale(data_post$logfreq)

table(data_post$numberWordSenses)
## 
##   2   3   4   5   6   7   8 
## 456 108 336 120  72  84  96
range(data_post$numberWordSenses)
## [1] 2 8
# for polysemes, center for model fit (intercept at zero)
data_post$cNumberWordSenses = data_post$numberWordSenses - 2
range(data_post$cNumberWordSenses)
## [1] 0 6
table(data_post$cNumberWordSenses)
## 
##   0   1   2   3   4   5   6 
## 456 108 336 120  72  84  96
# less skew:
# Collapse the sparse right tail: centered counts above 2 (i.e. 5+ senses)
# all go into the top bin, yielding four bins 0-3.
data_post$numberWordSenses_fourBins <- ifelse(data_post$cNumberWordSenses > 2, 3, data_post$cNumberWordSenses)
table(data_post$numberWordSenses_fourBins)
## 
##   0   1   2   3 
## 456 108 336 372
# hypernyms that are monosemes
# ----------------------------

# we are only interested in data where hypernymPolysemy equals 1 and artifical equals no
# Also require a positive hypernym frequency so log() below is defined.
data_monosemous_hyps_only = data_post[data_post$hypernymPolysemy=="1" & data_post$artificial=="no" & data_post$frequencyHypernym>0,]
nrow(data_monosemous_hyps_only)
## [1] 456
# 456 out of 636 entries

View(data_monosemous_hyps_only)

# Log-transform and z-standardize hypernym frequency for modeling.
data_monosemous_hyps_only$loghyper=log(data_monosemous_hyps_only$frequencyHypernym)
data_monosemous_hyps_only$scaleloghyper=scale(data_monosemous_hyps_only$loghyper)

range(data_monosemous_hyps_only$scaleloghyper)
## [1] -3.471406  2.171320
# 5.2.3 The frequency of the hypernym
# ----------------------------------

# scaleloghyper only
# Logistic mixed model: goodness ~ scaled log hypernym frequency, random
# intercept for sentence length.
goodness.glmer_scaleloghyper=glmer(as.factor(goodSentence)  ~ scaleloghyper + (1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))

#goodness.glmer_scaleloghyper_rs=glmer(as.factor(goodSentence)  ~ scaleloghyper +(1+scaleloghyper|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))

# rs does not yield better model fit
#anova(goodness.glmer_scaleloghyper, goodness.glmer_scaleloghyper_rs)

Anova(goodness.glmer_scaleloghyper, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                Chisq Df Pr(>Chisq)   
## (Intercept)   9.7745  1   0.001769 **
## scaleloghyper 1.3594  1   0.243642   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_scaleloghyper)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scaleloghyper + (1 | sentenceLength)
##    Data: data_monosemous_hyps_only
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     614.5     626.9    -304.2     608.5       453 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.5515 -1.1301  0.6943  0.8020  1.2332 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.1183   0.3439  
## Number of obs: 456, groups:  sentenceLength, 15
## 
## Fixed effects:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    0.47932    0.15331   3.126  0.00177 **
## scaleloghyper  0.11417    0.09793   1.166  0.24364   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## scalelghypr 0.000
# NOTE(review): report() is from the `report` (easystats) package, which is
# not loaded in the visible library() header -- presumably attached
# elsewhere in the script; verify.
report(goodness.glmer_scaleloghyper)
## We fitted a logistic mixed model (estimated using ML and BOBYQA optimizer) to
## predict goodSentence with scaleloghyper (formula: as.factor(goodSentence) ~
## scaleloghyper). The model included sentenceLength as random effect (formula: ~1
## | sentenceLength). The model's total explanatory power is weak (conditional R2
## = 0.04) and the part related to the fixed effects alone (marginal R2) is of
## 3.81e-03. The model's intercept, corresponding to scaleloghyper = 0, is at 0.48
## (95% CI [0.18, 0.78], p = 0.002). Within this model:
## 
##   - The effect of scaleloghyper is statistically non-significant and positive
## (beta = 0.11, 95% CI [-0.08, 0.31], p = 0.244; Std. beta = 0.11, 95% CI [-0.08,
## 0.31])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
# scaleloghyper interacting with model
#-------------------------------------
# Same model as above, but letting the frequency effect differ by LLM.
goodness.glmer_scaleloghyper_model=glmer(as.factor(goodSentence)  ~ scaleloghyper*model +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer_scaleloghyper_model, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                       Chisq Df Pr(>Chisq)    
## (Intercept)          0.0253  1     0.8735    
## scaleloghyper        0.0419  1     0.8377    
## model               21.1174  1   4.32e-06 ***
## scaleloghyper:model  1.0782  1     0.2991    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_scaleloghyper_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scaleloghyper * model + (1 | sentenceLength)
##    Data: data_monosemous_hyps_only
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     595.6     616.2    -292.8     585.6       451 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8914 -1.0100  0.5642  0.8183  1.2977 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.131    0.3619  
## Number of obs: 456, groups:  sentenceLength, 15
## 
## Fixed effects:
##                                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                             0.02911    0.18286   0.159    0.874    
## scaleloghyper                           0.02808    0.13709   0.205    0.838    
## modelgpt-5.2-chat-latest                0.94644    0.20596   4.595 4.32e-06 ***
## scaleloghyper:modelgpt-5.2-chat-latest  0.20760    0.19993   1.038    0.299    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) scllgh m-5.2-
## scalelghypr -0.010              
## mdlgp-5.2-- -0.498  0.014       
## scll:-5.2--  0.001 -0.683  0.032
# Predicted goodness probability across hypernym frequency, by model.
plot_model(goodness.glmer_scaleloghyper_model, type="pred", terms=c("scaleloghyper", "model"))
## Data were 'prettified'. Consider using `terms="scaleloghyper [all]"` to
##   get smooth plots.

ggsave(paste0(path, "./results/polysemes_scaleloghyper_model_poststudy.png"), width=7, height=7)

# Readable model labels for plotting.
# BUG FIX: data_monosemous_hyps_only derives from data_post, whose model
# column was renamed from "deepseek-chat" to "deepseek-chat-3.2" earlier in
# this script (see also the fitted-model output, which shows the level
# "deepseek-chat-3.2"). The old level "deepseek-chat" no longer matches any
# row, so factor() turned every Deepseek row into NA. Use the renamed level.
data_monosemous_hyps_only$model_short <- factor(data_monosemous_hyps_only$model,
                       levels = c("deepseek-chat-3.2", "gpt-5.2-chat-latest"),
                       labels = c("Deepseek", "ChatGPT"))

# goodSentence ~ nws*scaleloghyper
# ---------------------------------

# Readable model labels for plotting.
# BUG FIX: data_post$model was renamed from "deepseek-chat" to
# "deepseek-chat-3.2" earlier in this script, so the old level
# "deepseek-chat" matched nothing and factor() produced NA for every
# Deepseek row. Use the renamed level.
data_post$model_short <- factor(data_post$model,
                       levels = c("deepseek-chat-3.2", "gpt-5.2-chat-latest"),
                       labels = c("Deepseek", "ChatGPT"))


# Interaction model: binned number of senses x hypernym frequency.
goodness.glmer_nws_scaleloghyper=glmer(as.factor(goodSentence)  ~ numberWordSenses_fourBins*scaleloghyper +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))

# Additive comparison model (no interaction), for a likelihood-ratio test.
goodness.glmer_nws_scaleloghyper_add=glmer(as.factor(goodSentence)  ~ numberWordSenses_fourBins+scaleloghyper +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_nws_scaleloghyper, goodness.glmer_nws_scaleloghyper_add)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_nws_scaleloghyper_add: as.factor(goodSentence) ~ numberWordSenses_fourBins + scaleloghyper + (1 | sentenceLength)
## goodness.glmer_nws_scaleloghyper: as.factor(goodSentence) ~ numberWordSenses_fourBins * scaleloghyper + (1 | sentenceLength)
##                                      npar    AIC    BIC  logLik -2*log(L)
## goodness.glmer_nws_scaleloghyper_add    4 616.48 632.97 -304.24    608.48
## goodness.glmer_nws_scaleloghyper        5 617.92 638.53 -303.96    607.92
##                                       Chisq Df Pr(>Chisq)
## goodness.glmer_nws_scaleloghyper_add                     
## goodness.glmer_nws_scaleloghyper     0.5646  1     0.4524
Anova(goodness.glmer_nws_scaleloghyper, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                                          Chisq Df Pr(>Chisq)   
## (Intercept)                             7.3923  1    0.00655 **
## numberWordSenses_fourBins               0.0005  1    0.98306   
## scaleloghyper                           1.9041  1    0.16762   
## numberWordSenses_fourBins:scaleloghyper 0.5699  1    0.45031   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_nws_scaleloghyper)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ numberWordSenses_fourBins * scaleloghyper +  
##     (1 | sentenceLength)
##    Data: data_monosemous_hyps_only
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     617.9     638.5    -304.0     607.9       451 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.5961 -1.1234  0.6973  0.7945  1.3211 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.118    0.3436  
## Number of obs: 456, groups:  sentenceLength, 15
## 
## Fixed effects:
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                              0.489049   0.179872   2.719  0.00655
## numberWordSenses_fourBins                0.001813   0.085412   0.021  0.98306
## scaleloghyper                            0.157322   0.114011   1.380  0.16762
## numberWordSenses_fourBins:scaleloghyper -0.067241   0.089073  -0.755  0.45031
##                                           
## (Intercept)                             **
## numberWordSenses_fourBins                 
## scaleloghyper                             
## numberWordSenses_fourBins:scaleloghyper   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) nmWS_B scllgh
## nmbrWrdSn_B -0.516              
## scalelghypr  0.073 -0.056       
## nmbrWrdS_B: -0.021 -0.133 -0.498
report(goodness.glmer_nws_scaleloghyper)
## We fitted a logistic mixed model (estimated using ML and BOBYQA optimizer) to
## predict goodSentence with numberWordSenses_fourBins and scaleloghyper (formula:
## as.factor(goodSentence) ~ numberWordSenses_fourBins * scaleloghyper). The model
## included sentenceLength as random effect (formula: ~1 | sentenceLength). The
## model's total explanatory power is weak (conditional R2 = 0.04) and the part
## related to the fixed effects alone (marginal R2) is of 5.35e-03. The model's
## intercept, corresponding to numberWordSenses_fourBins = 0 and scaleloghyper =
## 0, is at 0.49 (95% CI [0.14, 0.84], p = 0.007). Within this model:
## 
##   - The effect of numberWordSenses fourBins is statistically non-significant and
## positive (beta = 1.81e-03, 95% CI [-0.17, 0.17], p = 0.983; Std. beta =
## 2.20e-03, 95% CI [-0.20, 0.20])
##   - The effect of scaleloghyper is statistically non-significant and positive
## (beta = 0.16, 95% CI [-0.07, 0.38], p = 0.168; Std. beta = 0.08, 95% CI [-0.13,
## 0.29])
##   - The effect of numberWordSenses fourBins × scaleloghyper is statistically
## non-significant and negative (beta = -0.07, 95% CI [-0.24, 0.11], p = 0.450;
## Std. beta = -0.08, 95% CI [-0.29, 0.13])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
plot_model(goodness.glmer_nws_scaleloghyper, type="pred", terms=c("numberWordSenses_fourBins", "scaleloghyper"))

ggsave(paste0(path, "./results/polysemes_nws*scaleloghyper_poststudy.png"), width= 7, height=7)

# Collinearity check on the fixed effects.
max(vif(goodness.glmer_nws_scaleloghyper))
## [1] 1.376861
# 1.1
# NOTE(review): the "# 1.1" above disagrees with the printed max VIF of
# 1.38 -- looks like a stale note from a previous run; confirm/update.

# Full three-way interaction: frequency x senses x model.
goodness.glmer_poly_3way=glmer(as.factor(goodSentence)  ~ scaleloghyper*numberWordSenses_fourBins*model +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))


# Two-way variant: frequency additive, senses x model interaction.
goodness.glmer_poly_2way=glmer(as.factor(goodSentence)  ~ scaleloghyper + numberWordSenses_fourBins*model +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_poly_3way, goodness.glmer_poly_2way)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_poly_2way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins * model + (1 | sentenceLength)
## goodness.glmer_poly_3way: as.factor(goodSentence) ~ scaleloghyper * numberWordSenses_fourBins * model + (1 | sentenceLength)
##                          npar    AIC    BIC  logLik -2*log(L)  Chisq Df
## goodness.glmer_poly_2way    6 598.53 623.26 -293.26    586.53          
## goodness.glmer_poly_3way    9 602.53 639.63 -292.26    584.53 1.9997  3
##                          Pr(>Chisq)
## goodness.glmer_poly_2way           
## goodness.glmer_poly_3way     0.5725
# Alternative two-way variant: frequency x model interaction, senses additive.
goodness.glmer_poly_2way_alt=glmer(as.factor(goodSentence)  ~ scaleloghyper*model + numberWordSenses_fourBins +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))

# NOTE(review): this anova call repeats the 3way-vs-2way comparison from
# just above instead of involving the _alt model -- the intended alt
# comparison follows below; this one looks like an accidental duplicate.
anova(goodness.glmer_poly_3way, goodness.glmer_poly_2way)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_poly_2way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins * model + (1 | sentenceLength)
## goodness.glmer_poly_3way: as.factor(goodSentence) ~ scaleloghyper * numberWordSenses_fourBins * model + (1 | sentenceLength)
##                          npar    AIC    BIC  logLik -2*log(L)  Chisq Df
## goodness.glmer_poly_2way    6 598.53 623.26 -293.26    586.53          
## goodness.glmer_poly_3way    9 602.53 639.63 -292.26    584.53 1.9997  3
##                          Pr(>Chisq)
## goodness.glmer_poly_2way           
## goodness.glmer_poly_3way     0.5725
anova(goodness.glmer_poly_2way_alt, goodness.glmer_poly_2way)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_poly_2way_alt: as.factor(goodSentence) ~ scaleloghyper * model + numberWordSenses_fourBins + (1 | sentenceLength)
## goodness.glmer_poly_2way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins * model + (1 | sentenceLength)
##                              npar    AIC    BIC  logLik -2*log(L) Chisq Df
## goodness.glmer_poly_2way_alt    6 597.55 622.29 -292.78    585.55         
## goodness.glmer_poly_2way        6 598.53 623.26 -293.26    586.53     0  0
##                              Pr(>Chisq)
## goodness.glmer_poly_2way_alt           
## goodness.glmer_poly_2way
# => 2way is the best fit


# Purely additive variant, to test whether the senses x model interaction
# earns its keep.
goodness.glmer_poly_1way=glmer(as.factor(goodSentence)  ~ scaleloghyper + numberWordSenses_fourBins + model +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_poly_2way, goodness.glmer_poly_1way)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_poly_1way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins + model + (1 | sentenceLength)
## goodness.glmer_poly_2way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins * model + (1 | sentenceLength)
##                          npar    AIC    BIC  logLik -2*log(L) Chisq Df
## goodness.glmer_poly_1way    5 596.63 617.24 -293.31    586.63         
## goodness.glmer_poly_2way    6 598.53 623.26 -293.26    586.53   0.1  1
##                          Pr(>Chisq)
## goodness.glmer_poly_1way           
## goodness.glmer_poly_2way     0.7519
# Type III tests, coefficients, and effect plots for the retained 2-way model.
Anova(goodness.glmer_poly_2way, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                                   Chisq Df Pr(>Chisq)    
## (Intercept)                      0.0008  1  0.9779870    
## scaleloghyper                    1.5413  1  0.2144254    
## numberWordSenses_fourBins        0.0767  1  0.7818777    
## model                           13.2600  1  0.0002711 ***
## numberWordSenses_fourBins:model  0.1002  1  0.7516069    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_poly_2way)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins *  
##     model + (1 | sentenceLength)
##    Data: data_monosemous_hyps_only
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     598.5     623.3    -293.3     586.5       450 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8360 -1.0129  0.5788  0.8073  1.3247 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.1304   0.3612  
## Number of obs: 456, groups:  sentenceLength, 15
## 
## Fixed effects:
##                                                     Estimate Std. Error z value
## (Intercept)                                        -0.006152   0.222967  -0.028
## scaleloghyper                                       0.126001   0.101492   1.241
## numberWordSenses_fourBins                           0.031820   0.114928   0.277
## modelgpt-5.2-chat-latest                            0.999912   0.274594   3.641
## numberWordSenses_fourBins:modelgpt-5.2-chat-latest -0.053042   0.167579  -0.317
##                                                    Pr(>|z|)    
## (Intercept)                                        0.977987    
## scaleloghyper                                      0.214425    
## numberWordSenses_fourBins                          0.781878    
## modelgpt-5.2-chat-latest                           0.000271 ***
## numberWordSenses_fourBins:modelgpt-5.2-chat-latest 0.751607    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) scllgh nmWS_B m-5.2-
## scalelghypr  0.041                     
## nmbrWrdSn_B -0.572 -0.091              
## mdlgp-5.2-- -0.563  0.039  0.454       
## nWS_B:-5.2-  0.372 -0.018 -0.662 -0.662
report(goodness.glmer_poly_2way)
## We fitted a logistic mixed model (estimated using ML and BOBYQA optimizer) to
## predict goodSentence with scaleloghyper, numberWordSenses_fourBins and model
## (formula: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins *
## model). The model included sentenceLength as random effect (formula: ~1 |
## sentenceLength). The model's total explanatory power is weak (conditional R2 =
## 0.10) and the part related to the fixed effects alone (marginal R2) is of 0.07.
## The model's intercept, corresponding to scaleloghyper = 0,
## numberWordSenses_fourBins = 0 and model = deepseek-chat-3.2, is at -6.15e-03
## (95% CI [-0.44, 0.43], p = 0.978). Within this model:
## 
##   - The effect of scaleloghyper is statistically non-significant and positive
## (beta = 0.13, 95% CI [-0.07, 0.32], p = 0.214; Std. beta = 0.13, 95% CI [-0.07,
## 0.32])
##   - The effect of numberWordSenses fourBins is statistically non-significant and
## positive (beta = 0.03, 95% CI [-0.19, 0.26], p = 0.782; Std. beta = 0.04, 95%
## CI [-0.23, 0.31])
##   - The effect of model [gpt-5.2-chat-latest] is statistically significant and
## positive (beta = 1.00, 95% CI [0.46, 1.54], p < .001; Std. beta = 0.94, 95% CI
## [0.54, 1.34])
##   - The effect of numberWordSenses fourBins × model [gpt-5.2-chat-latest] is
## statistically non-significant and negative (beta = -0.05, 95% CI [-0.38, 0.28],
## p = 0.752; Std. beta = -0.06, 95% CI [-0.46, 0.33])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
plot_model(goodness.glmer_poly_2way, type="pred", terms=c("model", "scaleloghyper", "numberWordSenses_fourBins"))

plot_model(goodness.glmer_poly_2way, type="pred", ncol = 4, terms=c("scaleloghyper [all]", "model", "numberWordSenses_fourBins"))

ggsave(paste0(path, "./results/polysemes_model_2way_poststudy.png"), width= 7, height=7)


plot(allEffects(goodness.glmer_poly_2way))
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictor scaleloghyper is a one-column matrix that was converted to a vector
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictor scaleloghyper is a one-column matrix that was converted to a vector

# NOTE(review): ggpredict() is from the `ggeffects` package, which is not
# loaded in the visible library() header -- presumably attached elsewhere
# (sjPlot suggests it); verify.
eff <- ggpredict(goodness.glmer_poly_2way, terms = c("numberWordSenses_fourBins", "model", "scaleloghyper"))
plot(eff)

# Pairwise contrast between LLMs on the logit scale (from the 3-way model).
emmeans(goodness.glmer_poly_3way, pairwise ~ model)
## NOTE: Results may be misleading due to involvement in interactions
## $emmeans
##  model               emmean    SE  df asymp.LCL asymp.UCL
##  deepseek-chat-3.2   0.0335 0.183 Inf    -0.325     0.392
##  gpt-5.2-chat-latest 0.9957 0.196 Inf     0.611     1.381
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast                                    estimate   SE  df z.ratio p.value
##  (deepseek-chat-3.2) - (gpt-5.2-chat-latest)   -0.962 0.21 Inf  -4.591 <0.0001
## 
## Results are given on the log odds ratio (not the response) scale.
# Estimated marginal means per model on the response (probability) scale,
# from the retained 2-way model.
emm_model <- emmeans(goodness.glmer_poly_2way,
                     ~ model,
                     type = "response")
## NOTE: Results may be misleading due to involvement in interactions
emm_df <- as.data.frame(emm_model)
emm_df
##  model                response         SE  df asymp.LCL asymp.UCL
##  deepseek-chat-3.2   0.5072539 0.04572661 Inf 0.4183507 0.5957008
##  gpt-5.2-chat-latest 0.7251800 0.03893344 Inf 0.6427710 0.7946517
## 
## Unknown transformation "as.factor": no transformation done 
## Confidence level used: 0.95
# NOTE(review): the 'Unknown transformation "as.factor"' message comes from
# wrapping the response in as.factor() inside the glmer formula; converting
# goodSentence to a factor before fitting would avoid it -- verify the
# reported "response" values are on the intended scale.
# Open a PNG device
png(paste0(path,"./results/polysemes_predictedProbabilityGoodSentence_poststudy.png"), width = 1200, height = 800, res = 150)

# Plot estimated marginal means (predicted probability of a good sentence)
# per model with 95% CIs.
# BUG FIX: explicitly print() the ggplot object. A bare ggplot expression
# is only auto-printed at the interactive top level / by knitr; when this
# script is run via source(), nothing would be drawn to the open png()
# device and the saved file would be empty.
print(
  ggplot(emm_df, aes(x = model,
                     y = response,
                     ymin = asymp.LCL,
                     ymax = asymp.UCL)) +
    geom_pointrange(size = 0.8) +
    geom_line(aes(group = 1), linetype = "dashed", color = "grey50") +
    labs(
      x = "Model",
      y = "Predicted Probability of 'Good Sentence'",
      title = "Estimated Marginal Means by Model"
    ) +
    theme_minimal(base_size = 13) +
    theme(
      axis.text.x = element_text(angle = 25, hjust = 1),
      plot.title = element_text(face = "bold")
    )
)

# Close the device to save the file
dev.off()
## quartz_off_screen 
##                 2
# Predicted probabilities per model, with axis terms relabelled to
# human-readable names.
# NOTE(review): "deepseek-chat" below is the pre-rename value; the data in
# this section contain "deepseek-chat-3.2" (renamed earlier in the script),
# so this recode entry will not match -- confirm/update the key.
# NOTE(review): verify that plot_model() in the installed sjPlot version
# actually supports a `transform_terms` argument; unrecognized arguments
# may be silently absorbed via `...`, leaving labels unchanged.
plot_model(goodness.glmer_poly_3way,
           type = "pred",
           terms=c("model"),
           transform_terms = function(term) {
             dplyr::recode(term,
               "deepseek-chat"   = "Deepseek",
               "gpt-5.2-chat-latest"  = "ChatGPT"
             )
           })

# 3.10