library(ggplot2)
library(lme4)
## Loading required package: Matrix
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(car)
## Loading required package: carData
library(sjPlot)
## #refugeeswelcome
library(emmeans)
## Warning: package 'emmeans' was built under R version 4.4.3
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(lattice)
library(irr)
## Loading required package: lpSolve
library(cvms)
library(epiR)
## Loading required package: survival
## Package epiR 2.0.84 is loaded
## Type help(epi.about) for summary information
## Type browseVignettes(package = 'epiR') to learn how to use epiR for applied epidemiological analyses
## 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(effects)
## Use the command
##     lattice::trellis.par.set(effectsTheme())
##   to customize lattice options for effects plots.
## See ?effectsTheme for details.
library(ggeffects) # install.packages("ggeffects")

Using long format; this is the script for monosemes.

Using the CSV files generated/appended by Prolog

In the data-loading block below (controlled by the `use_poststudy_data_use_adj` flag), use *_corrected.csv for post-adjudication phase data.

To be read in conjunction with paper for submission to “Natural Language Processing”, CUP

C. Zinn, 2025

# Absolute path to the annotation CSV files (adjust per machine).
path <- "/Users/zinn/Projects/EKUT/GermaNet/GermaNet_LLM/R_analyses/annotations/"

# TRUE: load the post-adjudication ("corrected") post-study data;
# FALSE: load the raw post-study annotations.
use_poststudy_data_use_adj <- TRUE
#use_poststudy_data_use_adj <- FALSE

# Semicolon-separated CSVs with a header row, as generated/appended by Prolog.
if (use_poststudy_data_use_adj) {
  data <- read.table(paste0(path, "monosemes_poststudy_corrected.csv"), header = TRUE, sep = ";")
  message("Loaded monosemes_poststudy_corrected.csv")

} else {
  data <- read.table(paste0(path, "monosemes_poststudy.csv"), header = TRUE, sep = ";")
  message("Loaded monosemes_poststudy.csv")
}
## Loaded monosemes_poststudy_corrected.csv
# for Table 5. LLMs and their error types
# Cross-tabulate annotated error types by model; errorType "0" marks
# sentences with no error annotated (see output below).
counts <- data %>%
  group_by(errorType, model) %>%
  summarise(count = n(), .groups = "drop")

print(counts)
## # A tibble: 6 × 3
##   errorType model               count
##   <chr>     <chr>               <int>
## 1 0         deepseek-chat           6
## 2 0         gpt-5.2-chat-latest    46
## 3 FL        deepseek-chat          10
## 4 ha        gpt-5.2-chat-latest     2
## 5 nn        deepseek-chat           2
## 6 nn        gpt-5.2-chat-latest     6
# Same tabulation, additionally split by annotator (et / rb).
countsAnno <- data %>%
  group_by(errorType, model, annotator) %>%
  summarise(count = n(), .groups = "drop")

print(countsAnno)
## # A tibble: 11 × 4
##    errorType model               annotator count
##    <chr>     <chr>               <chr>     <int>
##  1 0         deepseek-chat       et            4
##  2 0         deepseek-chat       rb            2
##  3 0         gpt-5.2-chat-latest et           22
##  4 0         gpt-5.2-chat-latest rb           24
##  5 FL        deepseek-chat       et            5
##  6 FL        deepseek-chat       rb            5
##  7 ha        gpt-5.2-chat-latest et            1
##  8 ha        gpt-5.2-chat-latest rb            1
##  9 nn        deepseek-chat       rb            2
## 10 nn        gpt-5.2-chat-latest et            4
## 11 nn        gpt-5.2-chat-latest rb            2
# see how each sentence id is ranked by errorType per annotator (sorted by id)
countsId <- data %>%
  count(id, errorType, annotator) %>%
  arrange(id)

# print(countsId)

# Sanity check: 72 rows in the post-study monoseme data.
nrow(data)
## [1] 72
# 5.1.1 Length of example sentences
# ---------------------------------

# Overall sentence length, in tokens (sentenceLength) and in characters
# (sentenceStringLength).
mean(data$sentenceLength)
## [1] 12.16667
median(data$sentenceLength)
## [1] 11
mean(data$sentenceStringLength)
## [1] 82.97222
median(data$sentenceStringLength)
## [1] 80.5
# split by model
# NOTE(review): the counts output above shows this file only contains the
# models "deepseek-chat" and "gpt-5.2-chat-latest", so the claude and
# gpt-4o subsets below are empty and their means are NaN. These filters
# look like leftovers from another dataset -- confirm.
claude=data[data$model=="claude-3-5-haiku-20241022",]
mean(claude$sentenceLength)
## [1] NaN
mean(claude$sentenceStringLength)
## [1] NaN
deep=data[data$model=="deepseek-chat",]
mean(deep$sentenceLength)
## [1] 9.444444
mean(deep$sentenceStringLength)
## [1] 70.88889
chat=data[data$model=="gpt-4o",]
mean(chat$sentenceLength)
## [1] NaN
mean(chat$sentenceStringLength)
## [1] NaN
# lemma might influence sentence length, so this may be too simplistic.
# sl_simple=lm(sentenceLength ~ model, data=data)

# Mixed model: sentence length by model, random intercept per lemma to
# account for lemma-specific length differences.
sl = lmer(sentenceLength ~ model + (1 | lemma),  data = data)
Anova(sl, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: sentenceLength
##               Chisq Df Pr(>Chisq)    
## (Intercept) 135.282  1  < 2.2e-16 ***
## model        21.445  1  3.642e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# gpt-5.2-chat-latest sentences are ~2.8 tokens longer than deepseek-chat
# (the reference level), per the fixed-effects estimates below.
summary(sl)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sentenceLength ~ model + (1 | lemma)
##    Data: data
## 
## REML criterion at convergence: 253.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.40226 -0.66317 -0.07835  0.61341  2.37538 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  lemma    (Intercept) 5.521    2.350   
##  Residual             1.244    1.115   
## Number of obs: 72, groups:  lemma, 11
## 
## Fixed effects:
##                          Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)               10.0075     0.8604 17.8828  11.631 8.99e-10 ***
## modelgpt-5.2-chat-latest   2.8150     0.6079 69.3818   4.631 1.65e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## mdlgp-5.2-- -0.546
# Log-transformed character length (not used in the model below).
data$logSentenceStringLength=log(data$sentenceStringLength)

# goodSentence ~ sentenceLength, random intercept per lemma.
# NOTE(review): an earlier comment here said "(not significant)", but the
# captured output below shows sentenceLength IS significant (p ~ 0.0026) --
# the old note contradicted the output.
goodness.glmer=glmer(as.factor(goodSentence)  ~ sentenceLength +(1|lemma), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                 Chisq Df Pr(>Chisq)   
## (Intercept)    6.8001  1   0.009115 **
## sentenceLength 9.0446  1   0.002635 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#summary(goodness.glmer)
# 5.1.2.A lemma & hypernym frequencies and sentence quality
# -------------------------------------------------------
# Switch to the full (corrected) monoseme dataset: 1791 rows.
data=read.table( paste0(path, "monosemes_corrected.csv"), header=T, sep=";")
nrow(data)
## [1] 1791
# Log-transform and z-scale lemma frequency; mean of the scaled variable
# is numerically zero, as expected.
data$logfreq=log(data$frequency)
data$scalelogfreq=scale(log(data$frequency))
mean(data$scalelogfreq)
## [1] 2.729649e-14
# Save a density plot of the scaled log frequencies to disk.
png(paste0(path, "./results/lemmaFrequency.png"), width = 800, height = 600)
plot(density(scale(data$logfreq)), main = "Density Plot")
# Close the device to save
dev.off()
## quartz_off_screen 
##                 2
# Raw frequency histogram: highly right-skewed.
ggplot(data, aes(x = frequency)) +
  geom_histogram(bins = 50) +
  labs(title = "Word Frequency Distribution",
       x = "Word Frequency",
       y = "Number of Word Types") +
  theme_minimal()

# frequencies are highly skewed (they usually are!), log-transform:
ggplot(data, aes(x = frequency)) +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  labs(title = "Word Frequency Distribution (log scale)",
       x = "Word Frequency (log10)",
       y = "Number of Word Types") +
  theme_minimal()

# Prepare data
# Rank words by descending frequency for the Zipf plot.
df_zipf <- data %>%
  arrange(desc(frequency)) %>%
  mutate(rank = row_number())

# Plot (log–log)
ggplot(df_zipf, aes(x = rank, y = frequency)) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Zipf Plot (Rank–Frequency Distribution)",
    x = "Rank (log scale)",
    y = "Frequency (log scale)"
  ) +
  theme_minimal()

# goodSentence˜scalelogfreq
# .........................

# isSingular
# goodness.glmer.scalelogfreq=glmer(as.factor(goodSentence)  ~ scalelogfreq +(1|annotator)+(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# removing annotator helps with singularity
# goodSentence ~ scaled log lemma frequency, random intercept per
# sentence length.  Higher lemma frequency is associated with better
# sentences (p = 0.0384 below).
goodness.glmer.scalelogfreq=glmer(as.factor(goodSentence)  ~ scalelogfreq +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer.scalelogfreq, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                 Chisq Df Pr(>Chisq)    
## (Intercept)  223.4240  1     <2e-16 ***
## scalelogfreq   4.2871  1     0.0384 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Inspect the per-sentence-length random intercepts.
ranef(goodness.glmer.scalelogfreq)
## $sentenceLength
##     (Intercept)
## 7  -0.105788684
## 8  -0.081478364
## 9  -0.049633220
## 10 -0.376119440
## 11  0.284501864
## 12 -0.064219777
## 13 -0.066604158
## 14  0.093096135
## 15  0.024493814
## 16  0.139068935
## 17 -0.049147983
## 18 -0.164098872
## 19  0.166281629
## 20  0.337673906
## 21  0.007740963
## 22 -0.068946655
## 23 -0.061292063
## 24  0.021932917
## 25 -0.073512173
## 28  0.021763640
## 
## with conditional variances for "sentenceLength"
summary(goodness.glmer.scalelogfreq)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq + (1 | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1748.8    1765.3    -871.4    1742.8      1788 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.5631  0.4151  0.4680  0.5002  0.6579 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.06547  0.2559  
## Number of obs: 1791, groups:  sentenceLength, 20
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.43503    0.09601  14.947   <2e-16 ***
## scalelogfreq  0.12536    0.06054   2.071   0.0384 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## scalelogfrq 0.060
# Predicted P(good sentence) over the observed range of scalelogfreq;
# ggsave stores the most recent ggplot-based figure.
plot_model(goodness.glmer.scalelogfreq, type="pred", terms=c("scalelogfreq [all]"))
## You are calculating adjusted predictions on the population-level (i.e.
##   `type = "fixed"`) for a *generalized* linear mixed model.
##   This may produce biased estimates due to Jensen's inequality. Consider
##   setting `bias_correction = TRUE` to correct for this bias.
##   See also the documentation of the `bias_correction` argument.

ggsave(paste0(path, "./results/monosemes_scalelogfreq.png"), width= 7, height=7)
# 5.1.2.B hypernym frequency and sentence quality
# ------------------------------------------

# read the data anew
data=read.table( paste0(path, "monosemes_corrected.csv"), header=T, sep=";")
nrow(data)
## [1] 1791
# NOTE(review): View() opens an interactive viewer; it is a no-op / may
# warn in non-interactive (knitting) sessions.
View(data)
data$logfreq=log(data$frequency)
data$scalelogfreq=scale(log(data$frequency))
range(data$scalelogfreq)
## [1] -3.003094  1.422094
## [1] -3.003094  1.422094
# write NA in case there is no frequency information for the hypernym
# (a stored frequency of 0 encodes "no corpus data"; log(NA) stays NA).
data$frequencyHypernym[data$frequencyHypernym == 0] <- NA
data$loghyper <- log(data$frequencyHypernym)
data$scaleloghyper <- scale(data$loghyper)

# FIX: the original subset used `data$frequencyHypernym == 'NA'`, which
# compares against the *string* "NA".  For NA entries `==` yields logical
# NA, and NA subscripts return all-NA placeholder rows instead of the real
# data.  `is.na()` selects the intended 180 rows (the row count is the
# same, so the captured output below still holds).
# NOTE(review): despite its name, this variable holds the rows *without*
# hypernym frequency information -- confirm before renaming, as it may be
# referenced later in the file.
data_with_nonzero_freq_hyps <- data[is.na(data$frequencyHypernym), ]
nrow(data_with_nonzero_freq_hyps)
## [1] 180
# 180

# Visual check of lemma vs hypernym (log) frequencies.
plot(data$logfreq, data$loghyper)

# All three correlation measures agree on a small but significant
# *negative* association between lemma and hypernym log frequency.
cor.test(data$logfreq, data$loghyper, method = c("kendall")) 
## 
##  Kendall's rank correlation tau
## 
## data:  data$logfreq and data$loghyper
## z = -4.5984, p-value = 4.258e-06
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##         tau 
## -0.07727761
cor.test(data$logfreq, data$loghyper, method = c("spearman")) 
## Warning in cor.test.default(data$logfreq, data$loghyper, method =
## c("spearman")): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  data$logfreq and data$loghyper
## S = 769255633, p-value = 2.929e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1039146
cor.test(data$logfreq, data$loghyper, method = c("pearson")) 
## 
##  Pearson's product-moment correlation
## 
## data:  data$logfreq and data$loghyper
## t = -2.701, df = 1609, p-value = 0.006986
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.11564238 -0.01840586
## sample estimates:
##         cor 
## -0.06718364
# isSingular
# goodness.glmer.scaleloghyper=glmer(as.factor(goodSentence)  ~ scaleloghyper +(1|annotator)+(1|sentenceLength), data=data, family="binomial",control=glmerControl(optimizer="bobyqa"))

# removing annotator helps with singularity
# goodSentence ~ scaled log hypernym frequency, random intercept per
# sentence length.  N drops to 1611 because rows with NA hypernym
# frequency are excluded by glmer.
goodness.glmer.scaleloghyper=glmer(as.factor(goodSentence)  ~ scaleloghyper +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer.scaleloghyper, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                 Chisq Df Pr(>Chisq)    
## (Intercept)   140.072  1  < 2.2e-16 ***
## scaleloghyper  10.462  1   0.001219 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer.scaleloghyper)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scaleloghyper + (1 | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1560.9    1577.1    -777.5    1554.9      1608 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6607  0.3951  0.4484  0.5007  0.7751 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.1267   0.356   
## Number of obs: 1611, groups:  sentenceLength, 20
## 
## Fixed effects:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.4220     0.1202  11.835  < 2e-16 ***
## scaleloghyper   0.2034     0.0629   3.234  0.00122 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## scalelghypr 0.037
# Predicted P(good sentence) by hypernym frequency; save the figure.
plot_model(goodness.glmer.scaleloghyper, type="pred", terms=c("scaleloghyper"))
## Data were 'prettified'. Consider using `terms="scaleloghyper [all]"` to
##   get smooth plots.

ggsave(paste0(path, "./results/monosemes_scaleloghyper.png"), width= 7, height=7)

# 5.1.2.C combined frequencies
# --------------------------
# Additive model: both frequency predictors, random intercept per
# sentence length; _rs1 adds a random slope for scalelogfreq.
goodness.glmer.freqAdditiv=glmer(as.factor(goodSentence)  ~ scalelogfreq+scaleloghyper +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
goodness.glmer.freqAdditiv_rs1=glmer(as.factor(goodSentence)  ~ scalelogfreq+scaleloghyper +(1+scalelogfreq|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# Likelihood-ratio comparison: the random-slope model wins (p < 0.001).
anova(goodness.glmer.freqAdditiv, goodness.glmer.freqAdditiv_rs1)
## Data: data
## Models:
## goodness.glmer.freqAdditiv: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 | sentenceLength)
## goodness.glmer.freqAdditiv_rs1: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 + scalelogfreq | sentenceLength)
##                                npar    AIC    BIC  logLik -2*log(L)  Chisq Df
## goodness.glmer.freqAdditiv        4 1555.5 1577.1 -773.77    1547.5          
## goodness.glmer.freqAdditiv_rs1    6 1545.7 1578.0 -766.85    1533.7 13.822  2
##                                Pr(>Chisq)    
## goodness.glmer.freqAdditiv                   
## goodness.glmer.freqAdditiv_rs1  0.0009967 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# -> rs1 is better

Anova(goodness.glmer.freqAdditiv_rs1, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                  Chisq Df Pr(>Chisq)    
## (Intercept)   193.1568  1  < 2.2e-16 ***
## scalelogfreq    1.1157  1  0.2908448    
## scaleloghyper  11.3875  1  0.0007394 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): summary.merMod has no type argument -- "III" is ignored
# here, as the captured warning below confirms.
summary(goodness.glmer.freqAdditiv_rs1, type="III")
## Warning in summary.merMod(goodness.glmer.freqAdditiv_rs1, type = "III"):
## additional arguments ignored
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 +  
##     scalelogfreq | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1545.7    1578.0    -766.9    1533.7      1605 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0396  0.3598  0.4279  0.4951  1.0404 
## 
## Random effects:
##  Groups         Name         Variance Std.Dev. Corr
##  sentenceLength (Intercept)  0.08409  0.2900       
##                 scalelogfreq 0.15830  0.3979   0.83
## Number of obs: 1611, groups:  sentenceLength, 20
## 
## Fixed effects:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.53294    0.11030  13.898  < 2e-16 ***
## scalelogfreq   0.13925    0.13183   1.056 0.290845    
## scaleloghyper  0.22127    0.06557   3.375 0.000739 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) scllgf
## scalelogfrq 0.448        
## scalelghypr 0.028  0.052
plot_model(goodness.glmer.freqAdditiv_rs1, type="pred", terms=c("scalelogfreq [all]", "scaleloghyper"))

# isSingular
# goodness.glmer.freqAdditiv_rs2=glmer(as.factor(goodSentence)  ~ scalelogfreq+scaleloghyper +(1+scalelogfreq*scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# isSingular
# goodness.glmer.freqAdditiv_rs2=glmer(as.factor(goodSentence)  ~ scalelogfreq+scaleloghyper +(1+scalelogfreq+scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# isSingular
# goodness.glmer.freqAdditiv_rs2=glmer(as.factor(goodSentence)  ~ scalelogfreq+scaleloghyper + (1+scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# Marginal-effect plots (black/white versions) from the additive model.
p1 <- ggpredict(goodness.glmer.freqAdditiv, terms = "scalelogfreq [all]")
p2 <- ggpredict(goodness.glmer.freqAdditiv, terms = "scaleloghyper [all]")

plot(p1) + theme_minimal() + labs(y="Predicted P(good sentence)")

ggsave(paste0(path, "./results/monosemes_scalelogfreq_bw.png"), width= 7, height=7)
plot(p2) + theme_minimal() + labs(y="Predicted P(good sentence)")

ggsave(paste0(path, "./results/monosemes_scaleloghyper_bw.png"), width= 7, height=7)

# --- interaction between the combined frequencies?
# isSingular
# goodness.glmer.freqInteract
# removing annotator helps
# Interaction model: scalelogfreq * scaleloghyper, random intercept per
# sentence length.  Both main effects and the (negative) interaction are
# significant in the output below.
goodness.glmer.freqInteract=glmer(as.factor(goodSentence)  ~ scalelogfreq*scaleloghyper +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer.freqInteract, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                               Chisq Df Pr(>Chisq)    
## (Intercept)                148.4832  1  < 2.2e-16 ***
## scalelogfreq                 4.5031  1  0.0338330 *  
## scaleloghyper               13.8612  1  0.0001968 ***
## scalelogfreq:scaleloghyper   4.7810  1  0.0287754 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer.freqInteract)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 |  
##     sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1552.7    1579.7    -771.4    1542.7      1606 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7316  0.3849  0.4398  0.4892  1.0889 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.1207   0.3474  
## Number of obs: 1611, groups:  sentenceLength, 20
## 
## Fixed effects:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.43933    0.11812  12.185  < 2e-16 ***
## scalelogfreq                0.14150    0.06668   2.122 0.033833 *  
## scaleloghyper               0.24531    0.06589   3.723 0.000197 ***
## scalelogfreq:scaleloghyper -0.15810    0.07230  -2.187 0.028775 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) scllgf scllgh
## scalelogfrq  0.046              
## scalelghypr  0.050 -0.022       
## scllgfrq:sc -0.032  0.252 -0.193
# random slope for sentence length
# isSingular
# goodness.glmer.freqInteract_rs=glmer(as.factor(goodSentence)  ~ scalelogfreq*scaleloghyper +(1+scalelogfreq|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# isSingular
# goodness.glmer.freqInteract_rs=glmer(as.factor(goodSentence)  ~ scalelogfreq*scaleloghyper +(1+scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# isSingular
# goodness.glmer.freqInteract_rs=glmer(as.factor(goodSentence)  ~ scalelogfreq*scaleloghyper +(1+scalelogfreq*scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# The additive model is better than the scaleloghyper-only model.
anova(goodness.glmer.freqAdditiv, goodness.glmer.scaleloghyper)
## Data: data
## Models:
## goodness.glmer.scaleloghyper: as.factor(goodSentence) ~ scaleloghyper + (1 | sentenceLength)
## goodness.glmer.freqAdditiv: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 | sentenceLength)
##                              npar    AIC    BIC  logLik -2*log(L)  Chisq Df
## goodness.glmer.scaleloghyper    3 1560.9 1577.1 -777.47    1554.9          
## goodness.glmer.freqAdditiv      4 1555.5 1577.1 -773.77    1547.5 7.4054  1
##                              Pr(>Chisq)   
## goodness.glmer.scaleloghyper              
## goodness.glmer.freqAdditiv     0.006503 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# not fitted with same dataset (scalelogfreq model uses all 1791 rows,
# the additive model only the 1611 rows with hypernym frequency)
# anova(goodness.glmer.freqAdditiv, goodness.glmer.scalelogfreq)

# The interaction model is better than the additive model.
anova(goodness.glmer.freqInteract, goodness.glmer.freqAdditiv)
## Data: data
## Models:
## goodness.glmer.freqAdditiv: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 | sentenceLength)
## goodness.glmer.freqInteract: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 | sentenceLength)
##                             npar    AIC    BIC  logLik -2*log(L)  Chisq Df
## goodness.glmer.freqAdditiv     4 1555.5 1577.1 -773.77    1547.5          
## goodness.glmer.freqInteract    5 1552.7 1579.7 -771.36    1542.7 4.8008  1
##                             Pr(>Chisq)  
## goodness.glmer.freqAdditiv              
## goodness.glmer.freqInteract    0.02845 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(goodness.glmer.freqInteract, goodness.glmer.freqAdditiv_rs1)
## Data: data
## Models:
## goodness.glmer.freqInteract: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 | sentenceLength)
## goodness.glmer.freqAdditiv_rs1: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 + scalelogfreq | sentenceLength)
##                                npar    AIC    BIC  logLik -2*log(L)  Chisq Df
## goodness.glmer.freqInteract       5 1552.7 1579.7 -771.36    1542.7          
## goodness.glmer.freqAdditiv_rs1    6 1545.7 1578.0 -766.85    1533.7 9.0214  1
##                                Pr(>Chisq)   
## goodness.glmer.freqInteract                 
## goodness.glmer.freqAdditiv_rs1   0.002668 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_model(goodness.glmer.freqInteract, type="pred", terms=c("scalelogfreq", "scaleloghyper"), ci.lvl=NA, show.p=T)
## Data were 'prettified'. Consider using `terms="scalelogfreq [all]"` to
##   get smooth plots.

# ==> best model is the interaction model
Anova(goodness.glmer.freqInteract, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                               Chisq Df Pr(>Chisq)    
## (Intercept)                148.4832  1  < 2.2e-16 ***
## scalelogfreq                 4.5031  1  0.0338330 *  
## scaleloghyper               13.8612  1  0.0001968 ***
## scalelogfreq:scaleloghyper   4.7810  1  0.0287754 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer.freqInteract)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 |  
##     sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1552.7    1579.7    -771.4    1542.7      1606 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7316  0.3849  0.4398  0.4892  1.0889 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.1207   0.3474  
## Number of obs: 1611, groups:  sentenceLength, 20
## 
## Fixed effects:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.43933    0.11812  12.185  < 2e-16 ***
## scalelogfreq                0.14150    0.06668   2.122 0.033833 *  
## scaleloghyper               0.24531    0.06589   3.723 0.000197 ***
## scalelogfreq:scaleloghyper -0.15810    0.07230  -2.187 0.028775 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) scllgf scllgh
## scalelogfrq  0.046              
## scalelghypr  0.050 -0.022       
## scllgfrq:sc -0.032  0.252 -0.193
plot_model(goodness.glmer.freqAdditiv, type="pred", terms=c("scalelogfreq", "scaleloghyper"), ci.lvl=NA, show.p=T)
## Data were 'prettified'. Consider using `terms="scalelogfreq [all]"` to
##   get smooth plots.

# Final figure: predicted P(good sentence) by lemma frequency at three
# hypernym-frequency levels (-1, 0, +1 SD) from the interaction model.
pred <- ggpredict(
  goodness.glmer.freqInteract,
  terms = c("scalelogfreq [-2:2]", "scaleloghyper [-1,0,1]")
)

plot(pred) +
  labs(
    x = "Lemma frequency (scaled log)",
    y = "Predicted probability of good sentence",
    color = "Hypernym frequency"
  ) +
  theme_minimal()

ggsave(paste0(path, "./results/monosemes_frequencyInteraction.png"), width= 7, height=7)
# Reload the data: post-adjudication phase uses *_corrected.csv
# (switch to monosemes.csv for the pre-adjudication phase).
# data <- read.table(paste0(path, "monosemes.csv"), header = TRUE, sep = ";")
data <- read.table(paste0(path, "monosemes_corrected.csv"), header = TRUE, sep = ";")

# 5.1.3 goodSentence ~ model + frequencies
# ----------------------------------------
# Centred and scaled log frequencies as predictors.
nrow(data)
## [1] 1791
# NOTE(review): a zero lemma frequency would give log() = -Inf here; only
# the hypernym frequency is guarded against zeros below -- confirm that
# data$frequency is always > 0.
data$logfreq <- log(data$frequency)
data$scalelogfreq <- scale(data$logfreq)
mean(data$scalelogfreq)
## [1] 2.729649e-14
# A zero in frequencyHypernym encodes "no frequency information for the
# hypernym"; recode as NA so log()/scale() do not produce -Inf.
data$frequencyHypernym[data$frequencyHypernym == 0] <- NA
data$loghyper <- log(data$frequencyHypernym)
data$scaleloghyper <- scale(data$loghyper)


# A by-annotator random intercept made the fit singular, so it was dropped:
# goodness.glmer_modelOnly=glmer(as.factor(goodSentence)  ~ model +(1|sentenceLength)+(1|annotator), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# Model identity only (annotator had to go) ***
goodness.glmer_modelOnly <- glmer(
  as.factor(goodSentence) ~ model + (1 | sentenceLength),
  data = data, family = "binomial",
  control = glmerControl(optimizer = "bobyqa")
)
Anova(goodness.glmer_modelOnly, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##              Chisq Df Pr(>Chisq)    
## (Intercept) 74.638  1  < 2.2e-16 ***
## model       33.400  2  5.589e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_modelOnly)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ model + (1 | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1721.1    1743.1    -856.6    1713.1      1787 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9280  0.3751  0.4294  0.5209  0.6924 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.05293  0.2301  
## Number of obs: 1791, groups:  sentenceLength, 20
## 
## Fixed effects:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.0102     0.1169   8.639  < 2e-16 ***
## modeldeepseek-chat   0.8578     0.1528   5.615 1.97e-08 ***
## modelgpt-4o          0.5015     0.1442   3.478 0.000505 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) mdldp-
## mdldpsk-cht -0.489       
## modelgpt-4o -0.545  0.396
plot_model(goodness.glmer_modelOnly, type = "pred", terms = c("model"))

ggsave(paste0(path, "./results/monosemes_model.png"), width = 7, height = 7)

# test random slopes
# isSingular (the by-model random slope over sentenceLength gave a
# singular fit, hence the call stays commented out):
# goodness.glmer_modelOnly_rs=glmer(as.factor(goodSentence)  ~ model +(1+model|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

# pairwise comparison (Tukey-adjusted contrasts on the logit scale):
# deepseek is better than claude; gpt is better than claude, but deepseek not significantly better than gpt.
emmeans(goodness.glmer_modelOnly, list(pairwise~model))
## $`emmeans of model`
##  model                     emmean    SE  df asymp.LCL asymp.UCL
##  claude-3-5-haiku-20241022   1.01 0.117 Inf     0.781      1.24
##  deepseek-chat               1.87 0.140 Inf     1.594      2.14
##  gpt-4o                      1.51 0.127 Inf     1.263      1.76
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## 
## $`pairwise differences of model`
##  1                                             estimate    SE  df z.ratio
##  (claude-3-5-haiku-20241022) - (deepseek-chat)   -0.858 0.153 Inf  -5.615
##  (claude-3-5-haiku-20241022) - (gpt-4o)          -0.502 0.144 Inf  -3.478
##  (deepseek-chat) - (gpt-4o)                       0.356 0.163 Inf   2.180
##  p.value
##  <0.0001
##   0.0015
##   0.0746
## 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 3 estimates
# scalelogfreq plus model
# -----------------------

# Adding a by-annotator intercept again produced a singular fit:
# goodness.glmer_modelScalelogfreq=glmer(as.factor(goodSentence)  ~ scalelogfreq+model +(1|annotator)+(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))

goodness.glmer_modelScalelogfreq_add <- glmer(
  as.factor(goodSentence) ~ scalelogfreq + model + (1 | sentenceLength),
  data = data, family = "binomial",
  control = glmerControl(optimizer = "bobyqa")
)

# LRT: does scaled log frequency improve on the model-only fit?
anova(goodness.glmer_modelOnly, goodness.glmer_modelScalelogfreq_add)
## Data: data
## Models:
## goodness.glmer_modelOnly: as.factor(goodSentence) ~ model + (1 | sentenceLength)
## goodness.glmer_modelScalelogfreq_add: as.factor(goodSentence) ~ scalelogfreq + model + (1 | sentenceLength)
##                                      npar    AIC    BIC  logLik -2*log(L)
## goodness.glmer_modelOnly                4 1721.2 1743.1 -856.57    1713.2
## goodness.glmer_modelScalelogfreq_add    5 1719.0 1746.5 -854.52    1709.0
##                                       Chisq Df Pr(>Chisq)  
## goodness.glmer_modelOnly                                   
## goodness.glmer_modelScalelogfreq_add 4.1105  1    0.04262 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# "A likelihood-ratio test comparing a model with only LLM identity to a model additionally including scaled log frequency showed a significant improvement in fit (χ²(1) = 4.11, p = .043), indicating that lexical frequency contributes independently to sentence quality judgments."

# Allow the frequency effect to differ per LLM (scalelogfreq x model).
goodness.glmer_modelScalelogfreq_inter <- glmer(
  as.factor(goodSentence) ~ scalelogfreq * model + (1 | sentenceLength),
  data = data, family = "binomial",
  control = glmerControl(optimizer = "bobyqa")
)
anova(goodness.glmer_modelScalelogfreq_add, goodness.glmer_modelScalelogfreq_inter)
## Data: data
## Models:
## goodness.glmer_modelScalelogfreq_add: as.factor(goodSentence) ~ scalelogfreq + model + (1 | sentenceLength)
## goodness.glmer_modelScalelogfreq_inter: as.factor(goodSentence) ~ scalelogfreq * model + (1 | sentenceLength)
##                                        npar    AIC    BIC  logLik -2*log(L)
## goodness.glmer_modelScalelogfreq_add      5 1719.0 1746.5 -854.52    1709.0
## goodness.glmer_modelScalelogfreq_inter    7 1714.8 1753.2 -850.39    1700.8
##                                         Chisq Df Pr(>Chisq)  
## goodness.glmer_modelScalelogfreq_add                         
## goodness.glmer_modelScalelogfreq_inter 8.2641  2    0.01605 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# => the interaction model fits better

# Add a random slope of scalelogfreq over sentenceLength on top of the
# scalelogfreq x model interaction.
goodness.glmer_modelScalelogfreq_inter_rs <- glmer(
  as.factor(goodSentence) ~ scalelogfreq * model +
    (1 + scalelogfreq | sentenceLength),
  data = data, family = "binomial",
  control = glmerControl(optimizer = "bobyqa")
)

anova(goodness.glmer_modelScalelogfreq_inter, goodness.glmer_modelScalelogfreq_inter_rs)
## Data: data
## Models:
## goodness.glmer_modelScalelogfreq_inter: as.factor(goodSentence) ~ scalelogfreq * model + (1 | sentenceLength)
## goodness.glmer_modelScalelogfreq_inter_rs: as.factor(goodSentence) ~ scalelogfreq * model + (1 + scalelogfreq | sentenceLength)
##                                           npar    AIC    BIC  logLik -2*log(L)
## goodness.glmer_modelScalelogfreq_inter       7 1714.8 1753.2 -850.39    1700.8
## goodness.glmer_modelScalelogfreq_inter_rs    9 1706.5 1755.9 -844.24    1688.5
##                                            Chisq Df Pr(>Chisq)   
## goodness.glmer_modelScalelogfreq_inter                           
## goodness.glmer_modelScalelogfreq_inter_rs 12.292  2   0.002142 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# => the random-slope model fits better

# Take scaleloghyper on board.
goodness.glmer_modelFrequencies_add <- glmer(
  as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + model + (1 | sentenceLength),
  data = data, family = "binomial",
  control = glmerControl(optimizer = "bobyqa")
)

# Not comparable with the _rs model above: these models are fitted on a
# different (smaller) data set, presumably because rows with NA
# scaleloghyper are dropped (1611 vs 1791 observations).
# anova(goodness.glmer_modelScalelogfreq_inter_rs,goodness.glmer_modelFrequencies_add )

goodness.glmer_modelFrequencies_interact <- glmer(
  as.factor(goodSentence) ~ scalelogfreq * scaleloghyper * model + (1 | sentenceLength),
  data = data, family = "binomial",
  control = glmerControl(optimizer = "bobyqa")
)

anova(goodness.glmer_modelFrequencies_add, goodness.glmer_modelFrequencies_interact)
## Data: data
## Models:
## goodness.glmer_modelFrequencies_add: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + model + (1 | sentenceLength)
## goodness.glmer_modelFrequencies_interact: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper * model + (1 | sentenceLength)
##                                          npar    AIC    BIC  logLik -2*log(L)
## goodness.glmer_modelFrequencies_add         6 1527.1 1559.4 -757.54    1515.1
## goodness.glmer_modelFrequencies_interact   13 1510.4 1580.4 -742.22    1484.4
##                                           Chisq Df Pr(>Chisq)    
## goodness.glmer_modelFrequencies_add                              
## goodness.glmer_modelFrequencies_interact 30.643  7  7.235e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# => Comparing the additive model (frequency + hypernym frequency + model) to the fully interacted model (frequency × hypernym frequency × model) reveals a substantial and highly significant improvement in model fit (χ²(7) = 30.64, p < .001). The interaction model yields a notably lower AIC (ΔAIC = −16.7), indicating that the relationship between lexical frequency measures and sentence quality differs systematically across generation models.

# Type III Wald chi-square tests for the full three-way model: the
# three-way interaction scalelogfreq x scaleloghyper x model is
# significant, so lower-order effects must be interpreted conditionally.
Anova(goodness.glmer_modelFrequencies_interact, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: as.factor(goodSentence)
##                                    Chisq Df Pr(>Chisq)    
## (Intercept)                      48.2802  1  3.695e-12 ***
## scalelogfreq                      0.0060  1   0.938364    
## scaleloghyper                     6.7258  1   0.009503 ** 
## model                            35.2266  2  2.242e-08 ***
## scalelogfreq:scaleloghyper        0.0049  1   0.944343    
## scalelogfreq:model                4.6329  2   0.098621 .  
## scaleloghyper:model               5.0760  2   0.079026 .  
## scalelogfreq:scaleloghyper:model 11.8255  2   0.002705 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient table for the three-way model. The reference level of model
# is claude-3-5-haiku-20241022; the deepseek-specific three-way term
# (scalelogfreq:scaleloghyper:modeldeepseek-chat) is the significant
# higher-order coefficient.
summary(goodness.glmer_modelFrequencies_interact)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper * model +  
##     (1 | sentenceLength)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1510.4    1580.4    -742.2    1484.4      1598 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.7059  0.2913  0.3940  0.5258  1.3877 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  sentenceLength (Intercept) 0.1209   0.3478  
## Number of obs: 1611, groups:  sentenceLength, 20
## 
## Fixed effects:
##                                                Estimate Std. Error z value
## (Intercept)                                    0.993748   0.143018   6.948
## scalelogfreq                                   0.007933   0.102588   0.077
## scaleloghyper                                  0.264884   0.102137   2.593
## modeldeepseek-chat                             0.990484   0.175188   5.654
## modelgpt-4o                                    0.589279   0.158895   3.709
## scalelogfreq:scaleloghyper                    -0.007838   0.112269  -0.070
## scalelogfreq:modeldeepseek-chat                0.350926   0.163514   2.146
## scalelogfreq:modelgpt-4o                       0.156969   0.155702   1.008
## scaleloghyper:modeldeepseek-chat              -0.342076   0.186663  -1.833
## scaleloghyper:modelgpt-4o                      0.089030   0.156052   0.571
## scalelogfreq:scaleloghyper:modeldeepseek-chat -0.626448   0.192126  -3.261
## scalelogfreq:scaleloghyper:modelgpt-4o        -0.044516   0.172127  -0.259
##                                               Pr(>|z|)    
## (Intercept)                                   3.69e-12 ***
## scalelogfreq                                  0.938364    
## scaleloghyper                                 0.009503 ** 
## modeldeepseek-chat                            1.57e-08 ***
## modelgpt-4o                                   0.000208 ***
## scalelogfreq:scaleloghyper                    0.944343    
## scalelogfreq:modeldeepseek-chat               0.031861 *  
## scalelogfreq:modelgpt-4o                      0.313389    
## scaleloghyper:modeldeepseek-chat              0.066865 .  
## scaleloghyper:modelgpt-4o                     0.568329    
## scalelogfreq:scaleloghyper:modeldeepseek-chat 0.001112 ** 
## scalelogfreq:scaleloghyper:modelgpt-4o        0.795925    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) scllgf scllgh mdldp- mdlg-4 scllg: scllgf:- scllgf:-4
## scalelogfrq  0.019                                                      
## scalelghypr  0.041  0.038                                               
## mdldpsk-cht -0.402 -0.010 -0.044                                        
## modelgpt-4o -0.475 -0.042 -0.061  0.358                                 
## scllgfrq:sc  0.040  0.194 -0.279 -0.041 -0.039                          
## scllgfrq:m- -0.014 -0.604 -0.012  0.160  0.005 -0.130                   
## scllgfrq:-4  0.010 -0.646 -0.018  0.007  0.078 -0.136  0.400            
## scllghypr:- -0.029 -0.024 -0.550 -0.120  0.035  0.156 -0.185    0.011   
## scllghyp:-4 -0.028 -0.024 -0.650  0.032  0.138  0.182  0.012    0.015   
## scllgfrq::- -0.038 -0.101  0.179 -0.168  0.013 -0.586  0.126    0.076   
## scllgfr::-4 -0.051 -0.120  0.193  0.028  0.057 -0.649  0.087    0.254   
##             scllgh:- scllgh:-4 scl::-
## scalelogfrq                          
## scalelghypr                          
## mdldpsk-cht                          
## modelgpt-4o                          
## scllgfrq:sc                          
## scllgfrq:m-                          
## scllgfrq:-4                          
## scllghypr:-                          
## scllghyp:-4  0.357                   
## scllgfrq::- -0.007   -0.112          
## scllgfr::-4 -0.110   -0.245     0.388
# Predicted probabilities, faceted two ways round for readability.
plot_model(goodness.glmer_modelFrequencies_interact, type = "pred",
           terms = c("scalelogfreq [all]", "scaleloghyper", "model"))

ggsave(paste0(path, "./results/monosemes_model_scalelogfreq_scaleloghyper_interact.png"),
       width = 7, height = 7)

plot_model(goodness.glmer_modelFrequencies_interact, type = "pred",
           terms = c("scalelogfreq [all]", "model", "scaleloghyper"))

ggsave(paste0(path, "./results/monosemes_model_scalelogfreq_scaleloghyper_interact_2.png"),
       width = 7, height = 7)
# Multicollinearity check: largest variance inflation factor.
max(vif(goodness.glmer_modelFrequencies_interact))
## [1] 2.922366
# 2.92

# LRT: does adding model (and its interactions) improve on the
# frequency-only interaction model?
# NOTE(review): goodness.glmer.freqInteract was fitted before the data
# reload above, and anova() does not check that both models use the same
# rows (cf. the "not comparable, fitted on different data set" caveat
# earlier). Both summaries report 1611 observations, but confirm the two
# fits really share the same data before relying on this test.
anova(goodness.glmer.freqInteract, goodness.glmer_modelFrequencies_interact)
## Data: data
## Models:
## goodness.glmer.freqInteract: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 | sentenceLength)
## goodness.glmer_modelFrequencies_interact: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper * model + (1 | sentenceLength)
##                                          npar    AIC    BIC  logLik -2*log(L)
## goodness.glmer.freqInteract                 5 1552.7 1579.7 -771.36    1542.7
## goodness.glmer_modelFrequencies_interact   13 1510.4 1580.4 -742.22    1484.4
##                                           Chisq Df Pr(>Chisq)    
## goodness.glmer.freqInteract                                      
## goodness.glmer_modelFrequencies_interact 58.297  8  1.005e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Marginal means of model, conditioned on scalelogfreq (evaluated at its
# mean, -0.00437); emmeans itself warns that these may be misleading
# because model participates in interactions.
emmeans(goodness.glmer_modelFrequencies_interact, ~ model | scalelogfreq)
## NOTE: Results may be misleading due to involvement in interactions
## scalelogfreq = -0.00437:
##  model                     emmean    SE  df asymp.LCL asymp.UCL
##  claude-3-5-haiku-20241022  0.994 0.143 Inf     0.713      1.27
##  deepseek-chat              1.983 0.176 Inf     1.638      2.33
##  gpt-4o                     1.582 0.155 Inf     1.278      1.89
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95
# Same estimates, conditioned the other way round.
emmeans(goodness.glmer_modelFrequencies_interact, ~ scalelogfreq | model)
## NOTE: Results may be misleading due to involvement in interactions
## model = claude-3-5-haiku-20241022:
##  scalelogfreq emmean    SE  df asymp.LCL asymp.UCL
##      -0.00437  0.994 0.143 Inf     0.713      1.27
## 
## model = deepseek-chat:
##  scalelogfreq emmean    SE  df asymp.LCL asymp.UCL
##      -0.00437  1.983 0.176 Inf     1.638      2.33
## 
## model = gpt-4o:
##  scalelogfreq emmean    SE  df asymp.LCL asymp.UCL
##      -0.00437  1.582 0.155 Inf     1.278      1.89
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95
# Tukey-adjusted pairwise model contrasts under the three-way model.
emmeans(goodness.glmer_modelFrequencies_interact, list(pairwise~model))
## NOTE: Results may be misleading due to involvement in interactions
## $`emmeans of model`
##  model                     emmean    SE  df asymp.LCL asymp.UCL
##  claude-3-5-haiku-20241022  0.994 0.143 Inf     0.713      1.27
##  deepseek-chat              1.983 0.176 Inf     1.638      2.33
##  gpt-4o                     1.582 0.155 Inf     1.278      1.89
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## 
## $`pairwise differences of model`
##  1                                             estimate    SE  df z.ratio
##  (claude-3-5-haiku-20241022) - (deepseek-chat)   -0.989 0.175 Inf  -5.649
##  (claude-3-5-haiku-20241022) - (gpt-4o)          -0.589 0.159 Inf  -3.705
##  (deepseek-chat) - (gpt-4o)                       0.400 0.190 Inf   2.111
##  p.value
##  <0.0001
##   0.0006
##   0.0876
## 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 3 estimates
# NOTE(review): the original comment claimed deepseek is significantly
# better than gpt at p < 0.05, but the Tukey-adjusted contrast above has
# p = 0.0876, i.e. NOT significant at the .05 level; emmeans also warns
# that these results may be misleading due to involvement in interactions.

# variance inflation for regression methods (repeat of the check above)
max(vif(goodness.glmer_modelFrequencies_interact))
## [1] 2.922366
# NOTE(review): the previous "2.1994" note was stale (likely from an
# earlier fit); the current maximum VIF is 2.92, which is still
# unproblematic (below the usual threshold of 5).

# Per-model marginal means back-transformed to the response (probability)
# scale, collected into a data frame for plotting below.
# NOTE(review): the 'Unknown transformation "as.factor"' message arises
# because the response enters the model formula as as.factor(goodSentence);
# converting goodSentence to a factor in `data` once and dropping
# as.factor() from the formulas should silence it -- verify the estimates
# are unchanged if you do so.
emm_model <- emmeans(goodness.glmer_modelFrequencies_interact,
                     ~ model,
                     type = "response")
## NOTE: Results may be misleading due to involvement in interactions
emm_df <- as.data.frame(emm_model)
emm_df
##  model                      response         SE  df asymp.LCL asymp.UCL
##  claude-3-5-haiku-20241022 0.7298208 0.02819920 Inf 0.6711559 0.7814283
##  deepseek-chat             0.8789649 0.01871642 Inf 0.8372419 0.9111276
##  gpt-4o                    0.8295310 0.02194321 Inf 0.7821380 0.8683496
## 
## Unknown transformation "as.factor": no transformation done 
## Confidence level used: 0.95
# Save the per-model estimated marginal means as a standalone PNG figure.
# Open a PNG device
png(paste0(path, "./results/monosemes_predictedProbabilityGoodSentence.png"),
    width = 1200, height = 800, res = 150)

emm_plot <- ggplot(emm_df, aes(x = model,
                               y = response,
                               ymin = asymp.LCL,
                               ymax = asymp.UCL)) +
  geom_pointrange(size = 0.8) +
  geom_line(aes(group = 1), linetype = "dashed", color = "grey50") +
  labs(
    x = "Model",
    y = "Predicted Probability of 'Good Sentence'",
    title = "Estimated Marginal Means by Model"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1),
    plot.title = element_text(face = "bold")
  )
# Explicit print(): a bare top-level ggplot object is auto-printed in
# knitr/interactive use, but NOT when the script is run via source(),
# which would leave the PNG empty.
print(emm_plot)

# Close the device to save the file
dev.off()
## quartz_off_screen 
##                 2