library(ggplot2)
library(lme4)
## Loading required package: Matrix
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(car)
## Loading required package: carData
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(emmeans)
## Warning: package 'emmeans' was built under R version 4.4.3
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(lattice)
library(irr)
## Loading required package: lpSolve
library(cvms)
library(epiR)
## Loading required package: survival
## Package epiR 2.0.84 is loaded
## Type help(epi.about) for summary information
## Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
##
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: TH.data
## Warning: package 'TH.data' was built under R version 4.4.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
library(multcompView)
library(effects)
## Use the command
## lattice::trellis.par.set(effectsTheme())
## to customize lattice options for effects plots.
## See ?effectsTheme for details.
library(ggeffects) # install.packages("ggeffects")
library(summarytools)
library(report)
library(performance)
## Warning: package 'performance' was built under R version 4.4.3
##
## Attaching package: 'performance'
## The following object is masked from 'package:irr':
##
## icc
# Polyseme case:
# Data loading: TRUE selects the adjudicated (corrected) post-study
# annotations, FALSE the raw post-study annotations.
use_poststudy_data_use_adj <- TRUE
#use_poststudy_data_use_adj <- FALSE
path <- "/Users/zinn/Projects/EKUT/GermaNet/GermaNet_LLM/R_analyses/annotations/"
if (use_poststudy_data_use_adj) {
  data <- read.table(paste0(path, "polysemes_poststudy_corrected.csv"), header = TRUE, sep = ";")
  message("Loaded polysemes_poststudy_corrected.csv")
} else {
  data <- read.table(paste0(path, "polysemes_poststudy.csv"), header = TRUE, sep = ";")
  message("Loaded polysemes_poststudy.csv")
}
## Loaded polysemes_poststudy_corrected.csv
nrow(data)
## [1] 1272
View(data)
# for Table 5. LLMs and their error types
counts <- data %>%
  group_by(errorType, model) %>%
  summarise(count = n(), .groups = "drop")
print(counts)
## # A tibble: 16 × 3
## errorType model count
## <chr> <chr> <int>
## 1 0 deepseek-chat 268
## 2 0 gpt-5.2-chat-latest 364
## 3 FA deepseek-chat 123
## 4 FA gpt-5.2-chat-latest 121
## 5 FG deepseek-chat 21
## 6 FG gpt-5.2-chat-latest 10
## 7 FL deepseek-chat 122
## 8 FL gpt-5.2-chat-latest 50
## 9 ha deepseek-chat 25
## 10 ha gpt-5.2-chat-latest 32
## 11 ni deepseek-chat 6
## 12 ni gpt-5.2-chat-latest 13
## 13 nn deepseek-chat 42
## 14 nn gpt-5.2-chat-latest 28
## 15 nt deepseek-chat 29
## 16 nt gpt-5.2-chat-latest 18
# Derived predictors: log-transformed (and z-scaled) lemma frequency; the
# hypernym frequency gets +1 before the log to guard against log(0).
data$logfreq <- log(data$frequency)
data$scalelogfreq <- scale(data$logfreq)
data$loghyper <- log(data$frequencyHypernym + 1)
data$scaleloghyper <- scale(data$loghyper)
data$lengthLemma <- nchar(data$lemma)
data$lengthHypernym <- nchar(data$hypernym)
# Descriptives: sentence length in tokens (sentenceLength) and characters
# (sentenceStringLength), pooled and per model.
mean(data$sentenceLength)
## [1] 10.66981
median(data$sentenceLength)
## [1] 10
mean(data$sentenceStringLength)
## [1] 71.94654
median(data$sentenceStringLength)
## [1] 71
chatdata <- data[data$model == "gpt-5.2-chat-latest", ]
nrow(chatdata)
## [1] 636
mean(chatdata$sentenceStringLength)
## [1] 68.58176
mean(chatdata$sentenceLength)
## [1] 10.28931
deepseekdata <- data[data$model == "deepseek-chat", ]
nrow(deepseekdata)
## [1] 636
mean(deepseekdata$sentenceStringLength)
## [1] 75.31132
mean(deepseekdata$sentenceLength)
## [1] 11.05031
# 5.2.1 Length of example sentences and sentence quality
# ---------------------------------
# LMM: does token-based sentence length differ by model, with a random
# intercept per lemma (each lemma contributes multiple sentences)?
sl <- lmer(sentenceLength ~ model + (1 | lemma), data = data)
Anova(sl, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: sentenceLength
## Chisq Df Pr(>Chisq)
## (Intercept) 2742.3 1 < 2.2e-16 ***
## model 54.3 1 1.721e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(sl)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sentenceLength ~ model + (1 | lemma)
## Data: data
##
## REML criterion at convergence: 5257.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.74740 -0.62476 -0.01782 0.63930 3.10037
##
## Random effects:
## Groups Name Variance Std.Dev.
## lemma (Intercept) 1.410 1.187
## Residual 3.392 1.842
## Number of obs: 1272, groups: lemma, 34
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 11.3901 0.2175 37.1577 52.367 < 2e-16 ***
## modelgpt-5.2-chat-latest -0.7610 0.1033 1237.1351 -7.369 3.14e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## mdlgp-5.2-- -0.237
# sentenceLength (not significant)
# GLMM: does sentence length predict binary sentence quality (goodSentence),
# again with a per-lemma random intercept?
goodness.glmer <- glmer(as.factor(goodSentence) ~ sentenceLength + (1 | lemma),
                        data = data, family = "binomial",
                        control = glmerControl(optimizer = "bobyqa"))
Anova(goodness.glmer, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 0.1313 1 0.7171
## sentenceLength 0.1870 1 0.6654
summary(goodness.glmer)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ sentenceLength + (1 | lemma)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1674.2 1689.6 -834.1 1668.2 1269
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0020 -0.9646 -0.4193 0.8795 2.3690
##
## Random effects:
## Groups Name Variance Std.Dev.
## lemma (Intercept) 0.6519 0.8074
## Number of obs: 1272, groups: lemma, 34
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.13598 0.37531 -0.362 0.717
## sentenceLength 0.01351 0.03123 0.432 0.665
##
## Correlation of Fixed Effects:
## (Intr)
## sentncLngth -0.913
# Visual check of the scaled log lemma-frequency distribution.
plot(density(data$scalelogfreq))
# 1. Create contingency table
tbl <- table(data$errorType, data$numberWordSenses)
# 2. Run chi-square test
# The asymptotic test warns (small expected cell counts), so a Monte-Carlo
# p-value is computed below as a robustness check.
chisq.test(tbl)
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 234.38, df = 42, p-value < 2.2e-16
# Simulated p-value with 10000 replicates; same conclusion as above.
chisq.test(tbl, simulate.p.value = TRUE, B = 10000)
##
## Pearson's Chi-squared test with simulated p-value (based on 10000
## replicates)
##
## data: tbl
## X-squared = 234.38, df = NA, p-value = 9.999e-05
# => the two categorical variables in your table are highly dependent.
# Open a PNG device
png(paste0(path, "./results/polysemes_errorType_by_numSenses_poststudy_poststudy.png"), width = 1200, height = 800, res = 150)
mosaicplot(tbl, color = TRUE, las = 2,
main = "Mosaic plot: Error Type by Number of Senses",
xlab = "Error Type", ylab = "Number of Word Senses")
# Close the device to save the file
dev.off()
## quartz_off_screen
## 2
# Histogram of the polysemy degree (number of word senses per lemma).
png(paste0(path, "./results/polysemes_wordSenseDistribution_poststudy_poststudy.png"), width = 1200, height = 800, res = 150)
hist(data$numberWordSenses,
main = "Distribution of Number of Word Senses",
xlab = "Number of Senses",
ylab = "Number of Sentences",
col = "skyblue", border = "white")
dev.off()
## quartz_off_screen
## 2
# Reload the post-study data fresh for the polysemy analyses (same flag and
# files as above).
use_poststudy_data_use_adj <- TRUE
if (use_poststudy_data_use_adj) {
  data <- read.table(paste0(path, "polysemes_poststudy_corrected.csv"), header = TRUE, sep = ";")
  message("Loaded polysemes_poststudy_corrected.csv")
} else {
  data <- read.table(paste0(path, "polysemes_poststudy.csv"), header = TRUE, sep = ";")
  message("Loaded polysemes_poststudy.csv")
}
## Loaded polysemes_poststudy_corrected.csv
# this is the long format, not the wide one (!)
nrow(data)
## [1] 1272
data$logfreq <- log(data$frequency)
data$scalelogfreq <- scale(data$logfreq)
table(data$numberWordSenses)
##
## 2 3 4 5 6 7 8
## 456 108 336 120 72 84 96
range(data$numberWordSenses)
## [1] 2 8
# for polysemes, center for model fit (intercept at zero):
# the minimum polysemy degree is 2, so subtract 2.
data$cNumberWordSenses <- data$numberWordSenses - 2
range(data$cNumberWordSenses)
## [1] 0 6
table(data$cNumberWordSenses)
##
## 0 1 2 3 4 5 6
## 456 108 336 120 72 84 96
# less skew: collapse all centered degrees > 2 into a single top bin (3).
data$numberWordSenses_fourBins <- ifelse(data$cNumberWordSenses > 2, 3, data$cNumberWordSenses)
table(data$numberWordSenses_fourBins)
##
## 0 1 2 3
## 456 108 336 372
# 5.2.2 No frequency data for word senses; what can be said about lemma frequency and its level of polysemy?
# ----------------------------------------------------------------------------------------------------------
# `method` takes a single string; the original wrapped it in c(), which
# match.arg() tolerates but is misleading.
cor.test(data$logfreq, data$numberWordSenses, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data$logfreq and data$numberWordSenses
## z = 2.8105, p-value = 0.004947
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.05881814
cor.test(data$logfreq, data$numberWordSenses_fourBins, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data$logfreq and data$numberWordSenses_fourBins
## z = 4.5041, p-value = 6.665e-06
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.09714772
# just checking: we get identical values for scalelogfreq
# (rank correlations are invariant under monotone transforms like scale()).
cor.test(data$scalelogfreq, data$numberWordSenses, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data$scalelogfreq and data$numberWordSenses
## z = 2.8105, p-value = 0.004947
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.05881814
cor.test(data$scalelogfreq, data$numberWordSenses_fourBins, method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data$scalelogfreq and data$numberWordSenses_fourBins
## z = 4.5041, p-value = 6.665e-06
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.09714772
cor.test(data$logfreq, data$numberWordSenses, method = "spearman")
## Warning in cor.test.default(data$logfreq, data$numberWordSenses, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: data$logfreq and data$numberWordSenses
## S = 304744017, p-value = 6.676e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1115656
cor.test(data$logfreq, data$numberWordSenses_fourBins, method = "spearman")
## Warning in cor.test.default(data$logfreq, data$numberWordSenses_fourBins, :
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: data$logfreq and data$numberWordSenses_fourBins
## S = 292155304, p-value = 1.084e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.148266
cor.test(data$logfreq, data$numberWordSenses, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$logfreq and data$numberWordSenses
## t = 5.2008, df = 1270, p-value = 2.311e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.09015976 0.19780239
## sample estimates:
## cor
## 0.1444083
cor.test(data$logfreq, data$numberWordSenses_fourBins, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$logfreq and data$numberWordSenses_fourBins
## t = 3.4811, df = 1270, p-value = 0.0005163
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.0424827 0.1513751
## sample estimates:
## cor
## 0.09721982
# logfreq ~ numberWordSenses
# ---------------------------
logfreq_lm <- lm(logfreq ~ numberWordSenses, data = data)
logfreq_lm4 <- lm(logfreq ~ numberWordSenses_fourBins, data = data)
AIC(logfreq_lm, logfreq_lm4)
## df AIC
## logfreq_lm 3 5520.425
## logfreq_lm4 3 5535.152
# => Lower AIC wins: the continuous numberWordSenses model (AIC 5520.4) fits
#    better than the four-bin version (AIC 5535.2).  (The original comment had
#    this backwards.)
summary(logfreq_lm4)
##
## Call:
## lm(formula = logfreq ~ numberWordSenses_fourBins, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8135 -1.6584 0.5452 1.6519 3.2366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.33128 0.09305 132.522 < 2e-16 ***
## numberWordSenses_fourBins 0.16675 0.04790 3.481 0.000516 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.128 on 1270 degrees of freedom
## Multiple R-squared: 0.009452, Adjusted R-squared: 0.008672
## F-statistic: 12.12 on 1 and 1270 DF, p-value: 0.0005163
report(logfreq_lm4)
## We fitted a linear model (estimated using OLS) to predict logfreq with
## numberWordSenses_fourBins (formula: logfreq ~ numberWordSenses_fourBins). The
## model explains a statistically significant and very weak proportion of variance
## (R2 = 9.45e-03, F(1, 1270) = 12.12, p < .001, adj. R2 = 8.67e-03). The model's
## intercept, corresponding to numberWordSenses_fourBins = 0, is at 12.33 (95% CI
## [12.15, 12.51], t(1270) = 132.52, p < .001). Within this model:
##
## - The effect of numberWordSenses fourBins is statistically significant and
## positive (beta = 0.17, 95% CI [0.07, 0.26], t(1270) = 3.48, p < .001; Std. beta
## = 0.10, 95% CI [0.04, 0.15])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
plot_model(logfreq_lm4, type = "pred", terms = c("numberWordSenses_fourBins"))
ggsave(paste0(path, "./results/polysemes_logfreq_numberWordSenses_poststudy.png"), width = 7, height = 7)
# goodSentence ~ numberWordSenses
# -------------------------------
# GLMM: does polysemy degree predict sentence quality, with a random
# intercept per sentenceLength?
goodness.glmer_nws <- glmer(as.factor(goodSentence) ~ numberWordSenses + (1 | sentenceLength),
                            data = data, family = "binomial",
                            control = glmerControl(optimizer = "bobyqa"))
Anova(goodness.glmer_nws, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 0.0097 1 0.9216
## numberWordSenses 0.0413 1 0.8389
summary(goodness.glmer_nws)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ numberWordSenses + (1 | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1767.0 1782.4 -880.5 1761.0 1269
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.0764 -0.9612 -0.9246 0.9767 1.0816
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.02446 0.1564
## Number of obs: 1272, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.013909 0.141400 0.098 0.922
## numberWordSenses -0.006206 0.030526 -0.203 0.839
##
## Correlation of Fixed Effects:
## (Intr)
## nmbrWrdSnss -0.835
# Same model with the binned polysemy predictor.
goodness.glmer_nws4 <- glmer(as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 | sentenceLength),
                             data = data, family = "binomial",
                             control = glmerControl(optimizer = "bobyqa"))
# fourBins fits better (note: non-nested models, same npar — compare by AIC)
anova(goodness.glmer_nws, goodness.glmer_nws4)
## Data: data
## Models:
## goodness.glmer_nws: as.factor(goodSentence) ~ numberWordSenses + (1 | sentenceLength)
## goodness.glmer_nws4: as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## goodness.glmer_nws 3 1767.0 1782.4 -880.50 1761.0
## goodness.glmer_nws4 3 1762.8 1778.2 -878.39 1756.8 4.2119 0
Anova(goodness.glmer_nws4, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 1.6751 1 0.19558
## numberWordSenses_fourBins 4.2720 1 0.03875 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_nws4)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1762.8 1778.2 -878.4 1756.8 1269
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.1371 -0.9987 -0.8760 0.9888 1.1415
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.01973 0.1405
## Number of obs: 1272, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.13069 0.10098 1.294 0.1956
## numberWordSenses_fourBins -0.09577 0.04634 -2.067 0.0387 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## nmbrWrdSn_B -0.676
# but is there a better model? Add a random slope for the binned predictor.
goodness.glmer_nws4_rs <- glmer(as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 + numberWordSenses_fourBins | sentenceLength),
                                data = data, family = "binomial",
                                control = glmerControl(optimizer = "bobyqa"))
# rs is better
anova(goodness.glmer_nws4, goodness.glmer_nws4_rs)
## Data: data
## Models:
## goodness.glmer_nws4: as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 | sentenceLength)
## goodness.glmer_nws4_rs: as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 + numberWordSenses_fourBins | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer_nws4 3 1762.8 1778.2 -878.39 1756.8
## goodness.glmer_nws4_rs 5 1757.0 1782.8 -873.52 1747.0 9.7461 2
## Pr(>Chisq)
## goodness.glmer_nws4
## goodness.glmer_nws4_rs 0.00765 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(goodness.glmer_nws4_rs, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 2.3919 1 0.1220
## numberWordSenses_fourBins 2.4444 1 0.1179
summary(goodness.glmer_nws4_rs)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 + numberWordSenses_fourBins |
## sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1757.0 1782.8 -873.5 1747.0 1267
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6997 -0.9797 -0.6296 0.9806 1.5883
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## sentenceLength (Intercept) 0.20058 0.4479
## numberWordSenses_fourBins 0.07608 0.2758 -0.98
## Number of obs: 1272, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2701 0.1746 1.547 0.122
## numberWordSenses_fourBins -0.1569 0.1004 -1.563 0.118
##
## Correlation of Fixed Effects:
## (Intr)
## nmbrWrdSn_B -0.914
plot_model(goodness.glmer_nws4_rs, type = "pred", terms = c("numberWordSenses_fourBins"))
## You are calculating adjusted predictions on the population-level (i.e.
## `type = "fixed"`) for a *generalized* linear mixed model.
## This may produce biased estimates due to Jensen's inequality. Consider
## setting `bias_correction = TRUE` to correct for this bias.
## See also the documentation of the `bias_correction` argument.
ggsave(paste0(path, "./results/polysemes_numberWordSenses_poststudy.png"), width = 7, height = 7)
# goodSentence ~ numberWordSenses*model
# -------------------------------------
# fixed effect: model, removed annotator as ranef because of singularFit
goodness.glmer_model <- glmer(as.factor(goodSentence) ~ model + (1 | sentenceLength),
                              data = data, family = "binomial",
                              control = glmerControl(optimizer = "bobyqa"))
# isSingular
# goodness.glmer_model_rs=glmer(as.factor(goodSentence) ~ model +(1+model|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer_model, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 9.145 1 0.002494 **
## model 30.239 1 3.819e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ model + (1 | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1736.1 1751.6 -865.1 1730.1 1269
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.2952 -0.9406 -0.7573 0.9455 1.3205
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.04453 0.211
## Number of obs: 1272, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3241 0.1072 -3.024 0.00249 **
## modelgpt-5.2-chat-latest 0.6397 0.1163 5.499 3.82e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## mdlgp-5.2-- -0.528
plot_model(goodness.glmer_model, type = "pred", terms = c("model"))
ggsave(paste0(path, "./results/polysemes_model_poststudy.png"), width = 7, height = 7)
# todo: but is this the best model?
# Inspect the conditional modes of the sentenceLength random intercepts.
ranef(goodness.glmer_model)
## $sentenceLength
## (Intercept)
## 5 0.0001779395
## 6 -0.0120006883
## 7 -0.2319249572
## 8 -0.2034673778
## 9 0.0433919942
## 10 -0.1521393018
## 11 0.2017245822
## 12 0.1088679478
## 13 -0.0836190650
## 14 0.1189805439
## 15 0.2080210745
## 16 -0.0271274655
## 17 0.0428513997
## 18 -0.0063673686
## 19 -0.0068195784
##
## with conditional variances for "sentenceLength"
# goodSentence ~ numberWordSenses * model
# ---------------------------------------
# fixed effects: number word senses and model, plus their interaction
goodness.glmer_poly_nws_model <- glmer(as.factor(goodSentence) ~ numberWordSenses_fourBins * model + (1 | sentenceLength),
                                       data = data, family = "binomial",
                                       control = glmerControl(optimizer = "bobyqa"))
# factoring in model also improves model fit
anova(goodness.glmer_nws4_rs, goodness.glmer_poly_nws_model)
## Data: data
## Models:
## goodness.glmer_nws4_rs: as.factor(goodSentence) ~ numberWordSenses_fourBins + (1 + numberWordSenses_fourBins | sentenceLength)
## goodness.glmer_poly_nws_model: as.factor(goodSentence) ~ numberWordSenses_fourBins * model + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer_nws4_rs 5 1757.0 1782.8 -873.52 1747.0
## goodness.glmer_poly_nws_model 5 1735.2 1760.9 -862.60 1725.2 21.832 0
## Pr(>Chisq)
## goodness.glmer_nws4_rs
## goodness.glmer_poly_nws_model
Anova(goodness.glmer_poly_nws_model, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 3.6511 1 0.05603 .
## numberWordSenses_fourBins 0.3072 1 0.57939
## model 19.3237 1 1.103e-05 ***
## numberWordSenses_fourBins:model 1.3824 1 0.23969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_poly_nws_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ numberWordSenses_fourBins * model +
## (1 | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1735.2 1760.9 -862.6 1725.2 1267
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4149 -0.9183 -0.7645 0.9634 1.3080
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.03054 0.1748
## Number of obs: 1272, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value
## (Intercept) -0.26699 0.13973 -1.911
## numberWordSenses_fourBins -0.03653 0.06590 -0.554
## modelgpt-5.2-chat-latest 0.79143 0.18004 4.396
## numberWordSenses_fourBins:modelgpt-5.2-chat-latest -0.10815 0.09198 -1.176
## Pr(>|z|)
## (Intercept) 0.056 .
## numberWordSenses_fourBins 0.579
## modelgpt-5.2-chat-latest 1.1e-05 ***
## numberWordSenses_fourBins:modelgpt-5.2-chat-latest 0.240
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) nmWS_B m-5.2-
## nmbrWrdSn_B -0.697
## mdlgp-5.2-- -0.632 0.539
## nWS_B:-5.2- 0.474 -0.689 -0.765
# Prediction plots, once per facetting order.
plot_model(goodness.glmer_poly_nws_model, type = "pred", terms = c("numberWordSenses_fourBins", "model"))
ggsave(paste0(path, "./results/polysemes_numberWordSenses*model_poststudy.png"), width = 7, height = 7)
plot_model(goodness.glmer_poly_nws_model, type = "pred", terms = c("model", "numberWordSenses_fourBins"))
ggsave(paste0(path, "./results/polysemes_numberWordSenses*model_inv_poststudy.png"), width = 7, height = 7)
# Load the full (pre-study) and post-study data with hypernym-polysemy
# columns, for the old-vs-new comparison.
use_poststudy_data_use_adj <- TRUE
if (use_poststudy_data_use_adj) {
  data_full <- read.table(paste0(path, "polysemes_corrected_hyper_pol.csv"), header = TRUE, sep = ";")
  data_post <- read.table(paste0(path, "polysemes_poststudy_corrected_hyper_pol.csv"), header = TRUE, sep = ";")
  message("Loaded adjudicated data")
} else {
  data_full <- read.table(paste0(path, "polysemes_hyper_pol.csv"), header = TRUE, sep = ";")
  data_post <- read.table(paste0(path, "polysemes_poststudy_hyper_pol.csv"), header = TRUE, sep = ";")
  message("Loaded raw data")
}
## Loaded adjudicated data
# this is the long format, not the wide one (!)
nrow(data_full)
## [1] 5400
ncol(data_full)
## [1] 16
# in poststudy data, rename deepseek to deepseek-3.2
data_post$model[data_post$model == "deepseek-chat"] <- "deepseek-chat-3.2"
nrow(data_post)
## [1] 1272
ncol(data_post)
## [1] 16
View(data_post)
View(data_full)
# Lemmas re-generated in the post-study round.
pooritems <- unique(data_post$lemma)
sort(pooritems)
## [1] "Abschluss" "Absender" "Alpaka" "Anklage"
## [5] "August" "Besuch" "Bühne" "Chance"
## [9] "Feuer" "Foto" "Grund" "Hemmung"
## [13] "Hoffnung" "Höhe" "Jäger" "Kadenz"
## [17] "Katalog" "Kirche" "Licht" "Mogul"
## [21] "Neutralisation" "Nonne" "Öl" "Olive"
## [25] "Player" "Schönheit" "Schulter" "Spielball"
## [29] "System" "Text" "Turbulenz" "Uhr"
## [33] "Welle" "Wohnung"
# Their original (pre-study) counterparts.
subs <- data_full[data_full$lemma %in% pooritems, ]
sort(unique(subs$lemma))
## [1] "Abschluss" "Absender" "Alpaka" "Anklage"
## [5] "August" "Besuch" "Bühne" "Chance"
## [9] "Feuer" "Foto" "Grund" "Hemmung"
## [13] "Hoffnung" "Höhe" "Jäger" "Kadenz"
## [17] "Katalog" "Kirche" "Licht" "Mogul"
## [21] "Neutralisation" "Nonne" "Öl" "Olive"
## [25] "Player" "Schönheit" "Schulter" "Spielball"
## [29] "System" "Text" "Turbulenz" "Uhr"
## [33] "Welle" "Wohnung"
# Stack old and new rows; langmod is a 4-char model-family key
# ("clau", "deep", "gpt-").
data_post$version <- "new"
subs$version <- "old"
both <- rbind(data_post, subs)
both$langmod <- substr(both$model, 1, 4)
table(both$langmod, both$version)
##
## new old
## clau 0 636
## deep 636 636
## gpt- 636 636
both_ex <- both[both$langmod != "clau", ]
#exclude claude, because not repeated
table(both_ex$langmod, both_ex$version)
##
## new old
## deep 636 636
## gpt- 636 636
# "old" as reference level so versionnew expresses the change.
both_ex$version <- factor(both_ex$version, levels = c("old", "new"))
# GLMM: sentence quality by model family and study version (old vs new),
# controlling for polysemy degree.
goodness.glmer_nws_new <- glmer(as.factor(goodSentence) ~ numberWordSenses + langmod * version + (1 | sentenceLength),
                                data = both_ex, family = "binomial",
                                control = glmerControl(optimizer = "bobyqa"))
Anova(goodness.glmer_nws_new, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 0.1856 1 0.6665792
## numberWordSenses 11.6339 1 0.0006476 ***
## langmod 3.3244 1 0.0682595 .
## version 24.3548 1 8.013e-07 ***
## langmod:version 26.6337 1 2.459e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_nws_new)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ numberWordSenses + langmod * version +
## (1 | sentenceLength)
## Data: both_ex
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 3482.4 3517.5 -1735.2 3470.4 2538
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.5469 -1.0151 0.7378 0.9457 1.4338
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.04871 0.2207
## Number of obs: 2544, groups: sentenceLength, 16
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.05808 0.13480 -0.431 0.666579
## numberWordSenses 0.07536 0.02209 3.411 0.000648 ***
## langmodgpt- -0.21601 0.11848 -1.823 0.068259 .
## versionnew -0.56949 0.11540 -4.935 8.01e-07 ***
## langmodgpt-:versionnew 0.87054 0.16868 5.161 2.46e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) nmbrWS lngmd- vrsnnw
## nmbrWrdSnss -0.593
## langmodgpt- -0.420 -0.078
## versionnew -0.411 -0.045 0.512
## lngmdgpt-:v 0.287 0.088 -0.729 -0.711
plot_model(goodness.glmer_nws_new, type = "pred", terms = c("version", "langmod"))
# BUG FIX: the two file names were swapped in the original — the "_adj"
# suffix denotes the adjudicated (corrected) data, which is what the TRUE
# branch loads.
if (use_poststudy_data_use_adj) {
  ggsave(paste0(path, "./results/polysemes_poststudy_adj_2x2.png"), width = 7, height = 7)
} else {
  ggsave(paste0(path, "./results/polysemes_poststudy_2x2.png"), width = 7, height = 7)
}
View(data_post)
# BUG FIX: these transforms previously read from `data` (the earlier
# post-study frame) instead of `data_post`; the values only coincided because
# the two CSVs happen to share row order. Derive everything from data_post.
data_post$logfreq <- log(data_post$frequency)
data_post$scalelogfreq <- scale(data_post$logfreq)
table(data_post$numberWordSenses)
##
## 2 3 4 5 6 7 8
## 456 108 336 120 72 84 96
range(data_post$numberWordSenses)
## [1] 2 8
# for polysemes, center for model fit (intercept at zero)
data_post$cNumberWordSenses <- data_post$numberWordSenses - 2
range(data_post$cNumberWordSenses)
## [1] 0 6
table(data_post$cNumberWordSenses)
##
## 0 1 2 3 4 5 6
## 456 108 336 120 72 84 96
# less skew: collapse all centered degrees > 2 into one top bin (3).
data_post$numberWordSenses_fourBins <- ifelse(data_post$cNumberWordSenses > 2, 3, data_post$cNumberWordSenses)
table(data_post$numberWordSenses_fourBins)
##
## 0 1 2 3
## 456 108 336 372
# hypernyms that are monosemes
# ----------------------------
# we are only interested in data where hypernymPolysemy equals 1 and
# artificial equals no; frequencyHypernym > 0 so the plain log below is safe.
data_monosemous_hyps_only <- data_post[data_post$hypernymPolysemy == "1" &
                                         data_post$artificial == "no" &
                                         data_post$frequencyHypernym > 0, ]
nrow(data_monosemous_hyps_only)
## [1] 456
# 456 out of 636 entries
View(data_monosemous_hyps_only)
data_monosemous_hyps_only$loghyper <- log(data_monosemous_hyps_only$frequencyHypernym)
data_monosemous_hyps_only$scaleloghyper <- scale(data_monosemous_hyps_only$loghyper)
range(data_monosemous_hyps_only$scaleloghyper)
## [1] -3.471406 2.171320
# 5.2.3 The frequency of the hypernym
# ----------------------------------
# scaleloghyper only
goodness.glmer_scaleloghyper <- glmer(as.factor(goodSentence) ~ scaleloghyper + (1 | sentenceLength),
                                      data = data_monosemous_hyps_only, family = "binomial",
                                      control = glmerControl(optimizer = "bobyqa"))
#goodness.glmer_scaleloghyper_rs=glmer(as.factor(goodSentence) ~ scaleloghyper +(1+scaleloghyper|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
# rs does not yield better model fit
#anova(goodness.glmer_scaleloghyper, goodness.glmer_scaleloghyper_rs)
Anova(goodness.glmer_scaleloghyper, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 9.7745 1 0.001769 **
## scaleloghyper 1.3594 1 0.243642
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_scaleloghyper)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scaleloghyper + (1 | sentenceLength)
## Data: data_monosemous_hyps_only
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 614.5 626.9 -304.2 608.5 453
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.5515 -1.1301 0.6943 0.8020 1.2332
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.1183 0.3439
## Number of obs: 456, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.47932 0.15331 3.126 0.00177 **
## scaleloghyper 0.11417 0.09793 1.166 0.24364
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## scalelghypr 0.000
report(goodness.glmer_scaleloghyper)
## We fitted a logistic mixed model (estimated using ML and BOBYQA optimizer) to
## predict goodSentence with scaleloghyper (formula: as.factor(goodSentence) ~
## scaleloghyper). The model included sentenceLength as random effect (formula: ~1
## | sentenceLength). The model's total explanatory power is weak (conditional R2
## = 0.04) and the part related to the fixed effects alone (marginal R2) is of
## 3.81e-03. The model's intercept, corresponding to scaleloghyper = 0, is at 0.48
## (95% CI [0.18, 0.78], p = 0.002). Within this model:
##
## - The effect of scaleloghyper is statistically non-significant and positive
## (beta = 0.11, 95% CI [-0.08, 0.31], p = 0.244; Std. beta = 0.11, 95% CI [-0.08,
## 0.31])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
# scaleloghyper interacting with model
#-------------------------------------
# Logistic mixed model: hypernym frequency crossed with the generating LLM,
# random intercept for sentence length.
goodness.glmer_scaleloghyper_model=glmer(as.factor(goodSentence) ~ scaleloghyper*model +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer_scaleloghyper_model, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 0.0253 1 0.8735
## scaleloghyper 0.0419 1 0.8377
## model 21.1174 1 4.32e-06 ***
## scaleloghyper:model 1.0782 1 0.2991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_scaleloghyper_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scaleloghyper * model + (1 | sentenceLength)
## Data: data_monosemous_hyps_only
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 595.6 616.2 -292.8 585.6 451
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8914 -1.0100 0.5642 0.8183 1.2977
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.131 0.3619
## Number of obs: 456, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.02911 0.18286 0.159 0.874
## scaleloghyper 0.02808 0.13709 0.205 0.838
## modelgpt-5.2-chat-latest 0.94644 0.20596 4.595 4.32e-06 ***
## scaleloghyper:modelgpt-5.2-chat-latest 0.20760 0.19993 1.038 0.299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) scllgh m-5.2-
## scalelghypr -0.010
## mdlgp-5.2-- -0.498 0.014
## scll:-5.2-- 0.001 -0.683 0.032
# Predicted probabilities by hypernym frequency, split by model.
plot_model(goodness.glmer_scaleloghyper_model, type="pred", terms=c("scaleloghyper", "model"))
## Data were 'prettified'. Consider using `terms="scaleloghyper [all]"` to
## get smooth plots.
ggsave(paste0(path, "./results/polysemes_scaleloghyper_model_poststudy.png"), width=7, height=7)
# Short display labels for the two LLMs.
# NOTE(review): the emmeans output further below prints the level
# "deepseek-chat-3.2", not "deepseek-chat"; if the actual factor level does
# not match the `levels=` given here, those rows become NA -- confirm the
# exact level string against the data.
data_monosemous_hyps_only$model_short <- factor(data_monosemous_hyps_only$model,
levels = c("deepseek-chat", "gpt-5.2-chat-latest"),
labels = c("Deepseek", "ChatGPT"))
# goodSentence ~ nws*scaleloghyper
# ---------------------------------
# Short display labels on the full post-study data as well.
# NOTE(review): as with the subset above, the level "deepseek-chat" may not
# match the actual factor level (emmeans below prints "deepseek-chat-3.2");
# mismatched levels would become NA -- confirm.
data_post$model_short <- factor(data_post$model,
levels = c("deepseek-chat", "gpt-5.2-chat-latest"),
labels = c("Deepseek", "ChatGPT"))
# Interactive vs. additive model of sense count x hypernym frequency;
# compare by likelihood ratio test.
goodness.glmer_nws_scaleloghyper=glmer(as.factor(goodSentence) ~ numberWordSenses_fourBins*scaleloghyper +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
goodness.glmer_nws_scaleloghyper_add=glmer(as.factor(goodSentence) ~ numberWordSenses_fourBins+scaleloghyper +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_nws_scaleloghyper, goodness.glmer_nws_scaleloghyper_add)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_nws_scaleloghyper_add: as.factor(goodSentence) ~ numberWordSenses_fourBins + scaleloghyper + (1 | sentenceLength)
## goodness.glmer_nws_scaleloghyper: as.factor(goodSentence) ~ numberWordSenses_fourBins * scaleloghyper + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L)
## goodness.glmer_nws_scaleloghyper_add 4 616.48 632.97 -304.24 608.48
## goodness.glmer_nws_scaleloghyper 5 617.92 638.53 -303.96 607.92
## Chisq Df Pr(>Chisq)
## goodness.glmer_nws_scaleloghyper_add
## goodness.glmer_nws_scaleloghyper 0.5646 1 0.4524
Anova(goodness.glmer_nws_scaleloghyper, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 7.3923 1 0.00655 **
## numberWordSenses_fourBins 0.0005 1 0.98306
## scaleloghyper 1.9041 1 0.16762
## numberWordSenses_fourBins:scaleloghyper 0.5699 1 0.45031
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_nws_scaleloghyper)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ numberWordSenses_fourBins * scaleloghyper +
## (1 | sentenceLength)
## Data: data_monosemous_hyps_only
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 617.9 638.5 -304.0 607.9 451
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.5961 -1.1234 0.6973 0.7945 1.3211
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.118 0.3436
## Number of obs: 456, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.489049 0.179872 2.719 0.00655
## numberWordSenses_fourBins 0.001813 0.085412 0.021 0.98306
## scaleloghyper 0.157322 0.114011 1.380 0.16762
## numberWordSenses_fourBins:scaleloghyper -0.067241 0.089073 -0.755 0.45031
##
## (Intercept) **
## numberWordSenses_fourBins
## scaleloghyper
## numberWordSenses_fourBins:scaleloghyper
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) nmWS_B scllgh
## nmbrWrdSn_B -0.516
## scalelghypr 0.073 -0.056
## nmbrWrdS_B: -0.021 -0.133 -0.498
report(goodness.glmer_nws_scaleloghyper)
## We fitted a logistic mixed model (estimated using ML and BOBYQA optimizer) to
## predict goodSentence with numberWordSenses_fourBins and scaleloghyper (formula:
## as.factor(goodSentence) ~ numberWordSenses_fourBins * scaleloghyper). The model
## included sentenceLength as random effect (formula: ~1 | sentenceLength). The
## model's total explanatory power is weak (conditional R2 = 0.04) and the part
## related to the fixed effects alone (marginal R2) is of 5.35e-03. The model's
## intercept, corresponding to numberWordSenses_fourBins = 0 and scaleloghyper =
## 0, is at 0.49 (95% CI [0.14, 0.84], p = 0.007). Within this model:
##
## - The effect of numberWordSenses fourBins is statistically non-significant and
## positive (beta = 1.81e-03, 95% CI [-0.17, 0.17], p = 0.983; Std. beta =
## 2.20e-03, 95% CI [-0.20, 0.20])
## - The effect of scaleloghyper is statistically non-significant and positive
## (beta = 0.16, 95% CI [-0.07, 0.38], p = 0.168; Std. beta = 0.08, 95% CI [-0.13,
## 0.29])
## - The effect of numberWordSenses fourBins × scaleloghyper is statistically
## non-significant and negative (beta = -0.07, 95% CI [-0.24, 0.11], p = 0.450;
## Std. beta = -0.08, 95% CI [-0.29, 0.13])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
plot_model(goodness.glmer_nws_scaleloghyper, type="pred", terms=c("numberWordSenses_fourBins", "scaleloghyper"))
# NOTE(review): the "*" in this filename is invalid on Windows and awkward in
# shells -- consider renaming to e.g. "polysemes_nws_x_scaleloghyper_...".
ggsave(paste0(path, "./results/polysemes_nws*scaleloghyper_poststudy.png"), width= 7, height=7)
# Collinearity check: max variance inflation factor across predictors.
max(vif(goodness.glmer_nws_scaleloghyper))
## [1] 1.376861
# 1.1
# Three-way model (frequency x sense count x LLM) vs. two-way candidates,
# compared by likelihood ratio tests; random intercept for sentence length.
goodness.glmer_poly_3way <- glmer(as.factor(goodSentence) ~ scaleloghyper*numberWordSenses_fourBins*model +(1|sentenceLength), data=data_monosemous_hyps_only, family="binomial", control=glmerControl(optimizer="bobyqa"))
goodness.glmer_poly_2way <- glmer(as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins*model +(1|sentenceLength), data=data_monosemous_hyps_only, family="binomial", control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_poly_3way, goodness.glmer_poly_2way)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_poly_2way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins * model + (1 | sentenceLength)
## goodness.glmer_poly_3way: as.factor(goodSentence) ~ scaleloghyper * numberWordSenses_fourBins * model + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer_poly_2way 6 598.53 623.26 -293.26 586.53
## goodness.glmer_poly_3way 9 602.53 639.63 -292.26 584.53 1.9997 3
## Pr(>Chisq)
## goodness.glmer_poly_2way
## goodness.glmer_poly_3way 0.5725
goodness.glmer_poly_2way_alt <- glmer(as.factor(goodSentence) ~ scaleloghyper*model + numberWordSenses_fourBins +(1|sentenceLength), data=data_monosemous_hyps_only, family="binomial", control=glmerControl(optimizer="bobyqa"))
# BUG FIX: this call previously repeated anova(3way, 2way) instead of testing
# the freshly fitted `_alt` model (the cached output here was an exact
# duplicate of the table above, confirming the copy-paste slip). Rerun the
# script to regenerate the output for the corrected comparison.
anova(goodness.glmer_poly_3way, goodness.glmer_poly_2way_alt)
anova(goodness.glmer_poly_2way_alt, goodness.glmer_poly_2way)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_poly_2way_alt: as.factor(goodSentence) ~ scaleloghyper * model + numberWordSenses_fourBins + (1 | sentenceLength)
## goodness.glmer_poly_2way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins * model + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer_poly_2way_alt 6 597.55 622.29 -292.78 585.55
## goodness.glmer_poly_2way 6 598.53 623.26 -293.26 586.53 0 0
## Pr(>Chisq)
## goodness.glmer_poly_2way_alt
## goodness.glmer_poly_2way
# => 2way is the best fit
# Purely additive model (no interactions); test whether dropping the
# numberWordSenses_fourBins:model interaction hurts fit (it does not).
goodness.glmer_poly_1way=glmer(as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins + model +(1|sentenceLength), data=data_monosemous_hyps_only,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_poly_2way, goodness.glmer_poly_1way)
## Data: data_monosemous_hyps_only
## Models:
## goodness.glmer_poly_1way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins + model + (1 | sentenceLength)
## goodness.glmer_poly_2way: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins * model + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer_poly_1way 5 596.63 617.24 -293.31 586.63
## goodness.glmer_poly_2way 6 598.53 623.26 -293.26 586.53 0.1 1
## Pr(>Chisq)
## goodness.glmer_poly_1way
## goodness.glmer_poly_2way 0.7519
# Inference on the selected two-way model: only the model (LLM) effect is
# significant; hypernym frequency and sense count are not.
Anova(goodness.glmer_poly_2way, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 0.0008 1 0.9779870
## scaleloghyper 1.5413 1 0.2144254
## numberWordSenses_fourBins 0.0767 1 0.7818777
## model 13.2600 1 0.0002711 ***
## numberWordSenses_fourBins:model 0.1002 1 0.7516069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_poly_2way)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins *
## model + (1 | sentenceLength)
## Data: data_monosemous_hyps_only
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 598.5 623.3 -293.3 586.5 450
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8360 -1.0129 0.5788 0.8073 1.3247
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.1304 0.3612
## Number of obs: 456, groups: sentenceLength, 15
##
## Fixed effects:
## Estimate Std. Error z value
## (Intercept) -0.006152 0.222967 -0.028
## scaleloghyper 0.126001 0.101492 1.241
## numberWordSenses_fourBins 0.031820 0.114928 0.277
## modelgpt-5.2-chat-latest 0.999912 0.274594 3.641
## numberWordSenses_fourBins:modelgpt-5.2-chat-latest -0.053042 0.167579 -0.317
## Pr(>|z|)
## (Intercept) 0.977987
## scaleloghyper 0.214425
## numberWordSenses_fourBins 0.781878
## modelgpt-5.2-chat-latest 0.000271 ***
## numberWordSenses_fourBins:modelgpt-5.2-chat-latest 0.751607
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) scllgh nmWS_B m-5.2-
## scalelghypr 0.041
## nmbrWrdSn_B -0.572 -0.091
## mdlgp-5.2-- -0.563 0.039 0.454
## nWS_B:-5.2- 0.372 -0.018 -0.662 -0.662
# Natural-language summary of the model (report package).
report(goodness.glmer_poly_2way)
## We fitted a logistic mixed model (estimated using ML and BOBYQA optimizer) to
## predict goodSentence with scaleloghyper, numberWordSenses_fourBins and model
## (formula: as.factor(goodSentence) ~ scaleloghyper + numberWordSenses_fourBins *
## model). The model included sentenceLength as random effect (formula: ~1 |
## sentenceLength). The model's total explanatory power is weak (conditional R2 =
## 0.10) and the part related to the fixed effects alone (marginal R2) is of 0.07.
## The model's intercept, corresponding to scaleloghyper = 0,
## numberWordSenses_fourBins = 0 and model = deepseek-chat-3.2, is at -6.15e-03
## (95% CI [-0.44, 0.43], p = 0.978). Within this model:
##
## - The effect of scaleloghyper is statistically non-significant and positive
## (beta = 0.13, 95% CI [-0.07, 0.32], p = 0.214; Std. beta = 0.13, 95% CI [-0.07,
## 0.32])
## - The effect of numberWordSenses fourBins is statistically non-significant and
## positive (beta = 0.03, 95% CI [-0.19, 0.26], p = 0.782; Std. beta = 0.04, 95%
## CI [-0.23, 0.31])
## - The effect of model [gpt-5.2-chat-latest] is statistically significant and
## positive (beta = 1.00, 95% CI [0.46, 1.54], p < .001; Std. beta = 0.94, 95% CI
## [0.54, 1.34])
## - The effect of numberWordSenses fourBins × model [gpt-5.2-chat-latest] is
## statistically non-significant and negative (beta = -0.05, 95% CI [-0.38, 0.28],
## p = 0.752; Std. beta = -0.06, 95% CI [-0.46, 0.33])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
# Predicted-probability plots for the selected two-way model, faceted two
# different ways; the second (smooth scaleloghyper) is the one saved.
plot_model(goodness.glmer_poly_2way, type="pred", terms=c("model", "scaleloghyper", "numberWordSenses_fourBins"))
plot_model(goodness.glmer_poly_2way, type="pred", ncol = 4, terms=c("scaleloghyper [all]", "model", "numberWordSenses_fourBins"))
ggsave(paste0(path, "./results/polysemes_model_2way_poststudy.png"), width= 7, height=7)
# Effect displays for all fixed-effect terms (effects package).
plot(allEffects(goodness.glmer_poly_2way))
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictor scaleloghyper is a one-column matrix that was converted to a vector
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictor scaleloghyper is a one-column matrix that was converted to a vector
# NOTE(review): ggpredict() is from the ggeffects package, which is not
# attached in the visible library() calls -- confirm it is loaded earlier.
eff <- ggpredict(goodness.glmer_poly_2way, terms = c("numberWordSenses_fourBins", "model", "scaleloghyper"))
plot(eff)
# Pairwise comparison of the two LLMs on the logit scale (from the 3-way
# model); GPT produces significantly more "good" sentences.
emmeans(goodness.glmer_poly_3way, pairwise ~ model)
## NOTE: Results may be misleading due to involvement in interactions
## $emmeans
## model emmean SE df asymp.LCL asymp.UCL
## deepseek-chat-3.2 0.0335 0.183 Inf -0.325 0.392
## gpt-5.2-chat-latest 0.9957 0.196 Inf 0.611 1.381
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df z.ratio p.value
## (deepseek-chat-3.2) - (gpt-5.2-chat-latest) -0.962 0.21 Inf -4.591 <0.0001
##
## Results are given on the log odds ratio (not the response) scale.
# Estimated marginal means per model on the response (probability) scale.
# NOTE(review): the 'Unknown transformation "as.factor"' message below stems
# from wrapping the response in as.factor() inside the model formula;
# converting goodSentence to a factor before fitting would let emmeans
# back-transform without the warning.
emm_model <- emmeans(goodness.glmer_poly_2way,
~ model,
type = "response")
## NOTE: Results may be misleading due to involvement in interactions
emm_df <- as.data.frame(emm_model)
emm_df
## model response SE df asymp.LCL asymp.UCL
## deepseek-chat-3.2 0.5072539 0.04572661 Inf 0.4183507 0.5957008
## gpt-5.2-chat-latest 0.7251800 0.03893344 Inf 0.6427710 0.7946517
##
## Unknown transformation "as.factor": no transformation done
## Confidence level used: 0.95
# Plot the estimated marginal means (predicted probability of a good
# sentence) per model and write the figure to a PNG file.
# Open a PNG device
png(paste0(path,"./results/polysemes_predictedProbabilityGoodSentence_poststudy.png"), width = 1200, height = 800, res = 150)
# BUG FIX: the ggplot object must be print()ed explicitly -- top-level
# auto-printing does not happen when this script is run via source(), which
# would leave the PNG empty.
print(
  ggplot(emm_df, aes(x = model,
                     y = response,
                     ymin = asymp.LCL,
                     ymax = asymp.UCL)) +
    geom_pointrange(size = 0.8) +
    geom_line(aes(group = 1), linetype = "dashed", color = "grey50") +
    labs(
      x = "Model",
      y = "Predicted Probability of 'Good Sentence'",
      title = "Estimated Marginal Means by Model"
    ) +
    theme_minimal(base_size = 13) +
    theme(
      axis.text.x = element_text(angle = 25, hjust = 1),
      plot.title = element_text(face = "bold")
    )
)
# Close the device to save the file
dev.off()
## quartz_off_screen
## 2
# Predicted probabilities per model from the 3-way model, with relabelled
# model names for display.
# NOTE(review): `transform_terms` is not a documented sjPlot::plot_model
# argument; it is most likely swallowed by `...` and the recode below never
# applied -- verify, and consider relabelling the factor before plotting.
# Also confirm "deepseek-chat" matches the actual factor level (the emmeans
# output above prints "deepseek-chat-3.2").
plot_model(goodness.glmer_poly_3way,
type = "pred",
terms=c("model"),
transform_terms = function(term) {
dplyr::recode(term,
"deepseek-chat" = "Deepseek",
"gpt-5.2-chat-latest" = "ChatGPT"
)
})
# 3.10