# C. Zinn, 2025
# Analysis of LLM-generated example sentences for GermaNet monosemes.
# BUG FIX: the header line above was bare text without a leading '#',
# which is a parse error when this script is sourced.
# NOTE(review): hard-coded absolute path -- adjust for your machine.
path <- "/Users/zinn/Projects/EKUT/GermaNet/GermaNet_LLM/R_analyses/annotations/"

# Toggle between the corrected and the raw post-study annotation file.
use_poststudy_data_use_adj <- TRUE
#use_poststudy_data_use_adj <- FALSE
if (use_poststudy_data_use_adj) {
  data <- read.table(paste0(path, "monosemes_poststudy_corrected.csv"),
                     header = TRUE, sep = ";")
  message("Loaded monosemes_poststudy_corrected.csv")
} else {
  data <- read.table(paste0(path, "monosemes_poststudy.csv"),
                     header = TRUE, sep = ";")
  message("Loaded monosemes_poststudy.csv")
}
## Loaded monosemes_poststudy_corrected.csv
# for Table 5. LLMs and their error types
# Count annotations per (errorType, model) cell.
counts <- data %>%
group_by(errorType, model) %>%
summarise(count = n(), .groups = "drop")
print(counts)
## # A tibble: 6 × 3
## errorType model count
## <chr> <chr> <int>
## 1 0 deepseek-chat 6
## 2 0 gpt-5.2-chat-latest 46
## 3 FL deepseek-chat 10
## 4 ha gpt-5.2-chat-latest 2
## 5 nn deepseek-chat 2
## 6 nn gpt-5.2-chat-latest 6
# Same counts, additionally broken down by annotator.
countsAnno <- data %>%
group_by(errorType, model, annotator) %>%
summarise(count = n(), .groups = "drop")
print(countsAnno)
## # A tibble: 11 × 4
## errorType model annotator count
## <chr> <chr> <chr> <int>
## 1 0 deepseek-chat et 4
## 2 0 deepseek-chat rb 2
## 3 0 gpt-5.2-chat-latest et 22
## 4 0 gpt-5.2-chat-latest rb 24
## 5 FL deepseek-chat et 5
## 6 FL deepseek-chat rb 5
## 7 ha gpt-5.2-chat-latest et 1
## 8 ha gpt-5.2-chat-latest rb 1
## 9 nn deepseek-chat rb 2
## 10 nn gpt-5.2-chat-latest et 4
## 11 nn gpt-5.2-chat-latest rb 2
# see how each sentence id is ranked by errorType per annotator (sorted by id)
countsId <- data %>%
count(id, errorType, annotator) %>%
arrange(id)
# print(countsId)
# Total number of post-study annotation rows.
nrow(data)
## [1] 72
# 5.1.1 Length of example sentences
# ---------------------------------
# Overall sentence length in tokens and in characters.
mean(data$sentenceLength)
## [1] 12.16667
median(data$sentenceLength)
## [1] 11
mean(data$sentenceStringLength)
## [1] 82.97222
median(data$sentenceStringLength)
## [1] 80.5

# Split by model.
# BUG FIX: the post-study data contains only the models "deepseek-chat" and
# "gpt-5.2-chat-latest" (see the counts table above). The original subsets on
# "claude-3-5-haiku-20241022" and "gpt-4o" were empty, so their means were NaN.
deep <- data[data$model == "deepseek-chat", ]
mean(deep$sentenceLength)
## [1] 9.444444
mean(deep$sentenceStringLength)
## [1] 70.88889
chat <- data[data$model == "gpt-5.2-chat-latest", ]
# NOTE(review): the recorded NaN outputs below stem from the old, wrong model
# names; re-knit to obtain the actual means for gpt-5.2-chat-latest.
mean(chat$sentenceLength)
mean(chat$sentenceStringLength)
# lemma might influence sentence length, so this may be too simplistic.
# sl_simple=lm(sentenceLength ~ model, data=data)
# Mixed model: sentence length by model, random intercept per lemma
# (lemmas differ in how long a typical example sentence is).
sl = lmer(sentenceLength ~ model + (1 | lemma), data = data)
Anova(sl, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: sentenceLength
## Chisq Df Pr(>Chisq)
## (Intercept) 135.282 1 < 2.2e-16 ***
## model 21.445 1 3.642e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(sl)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sentenceLength ~ model + (1 | lemma)
## Data: data
##
## REML criterion at convergence: 253.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.40226 -0.66317 -0.07835 0.61341 2.37538
##
## Random effects:
## Groups Name Variance Std.Dev.
## lemma (Intercept) 5.521 2.350
## Residual 1.244 1.115
## Number of obs: 72, groups: lemma, 11
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 10.0075 0.8604 17.8828 11.631 8.99e-10 ***
## modelgpt-5.2-chat-latest 2.8150 0.6079 69.3818 4.631 1.65e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## mdlgp-5.2-- -0.546
data$logSentenceStringLength=log(data$sentenceStringLength)
# Does sentence length predict sentence quality (goodSentence)?
# NOTE(review): an earlier comment here said "not significant", but the
# recorded output below shows sentenceLength IS significant (p ~ .0026).
goodness.glmer=glmer(as.factor(goodSentence) ~ sentenceLength +(1|lemma), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 6.8001 1 0.009115 **
## sentenceLength 9.0446 1 0.002635 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#summary(goodness.glmer)
# 5.1.2.A lemma & hypernym frequencies and sentence quality
# -------------------------------------------------------
# Switch to the full corrected data set (one row per annotated sentence).
data=read.table( paste0(path, "monosemes_corrected.csv"), header=T, sep=";")
nrow(data)
## [1] 1791
# Log-transform and z-scale the lemma frequency.
data$logfreq=log(data$frequency)
data$scalelogfreq=scale(log(data$frequency))
# Sanity check: scaled values are (numerically) centred on zero.
mean(data$scalelogfreq)
## [1] 2.729649e-14
png(paste0(path, "./results/lemmaFrequency.png"), width = 800, height = 600)
plot(density(scale(data$logfreq)), main = "Density Plot")
# Close the device to save
dev.off()
## quartz_off_screen
## 2
# Raw frequency histogram (heavily right-skewed).
ggplot(data, aes(x = frequency)) +
geom_histogram(bins = 50) +
labs(title = "Word Frequency Distribution",
x = "Word Frequency",
y = "Number of Word Types") +
theme_minimal()

# frequencies are highly skewed (they usually are!), log-transform:
ggplot(data, aes(x = frequency)) +
geom_histogram(bins = 50) +
scale_x_log10() +
labs(title = "Word Frequency Distribution (log scale)",
x = "Word Frequency (log10)",
y = "Number of Word Types") +
theme_minimal()

# Rank word types by descending frequency for a Zipf (rank-frequency) plot.
df_zipf <- data %>%
  arrange(desc(frequency)) %>%
  mutate(rank = seq_len(n()))

# Draw the rank-frequency curve on log-log axes.
ggplot(df_zipf, aes(x = rank, y = frequency)) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  labs(title = "Zipf Plot (Rank–Frequency Distribution)",
       x = "Rank (log scale)",
       y = "Frequency (log scale)") +
  theme_minimal()

# goodSentence ~ scalelogfreq
# .........................
# isSingular
# goodness.glmer.scalelogfreq=glmer(as.factor(goodSentence) ~ scalelogfreq +(1|annotator)+(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# removing annotator helps with singularity
# Logistic mixed model: sentence quality by scaled log lemma frequency,
# random intercept per sentence length.
goodness.glmer.scalelogfreq=glmer(as.factor(goodSentence) ~ scalelogfreq +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer.scalelogfreq, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 223.4240 1 <2e-16 ***
## scalelogfreq 4.2871 1 0.0384 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Inspect the per-sentence-length random intercepts.
ranef(goodness.glmer.scalelogfreq)
## $sentenceLength
## (Intercept)
## 7 -0.105788684
## 8 -0.081478364
## 9 -0.049633220
## 10 -0.376119440
## 11 0.284501864
## 12 -0.064219777
## 13 -0.066604158
## 14 0.093096135
## 15 0.024493814
## 16 0.139068935
## 17 -0.049147983
## 18 -0.164098872
## 19 0.166281629
## 20 0.337673906
## 21 0.007740963
## 22 -0.068946655
## 23 -0.061292063
## 24 0.021932917
## 25 -0.073512173
## 28 0.021763640
##
## with conditional variances for "sentenceLength"
summary(goodness.glmer.scalelogfreq)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq + (1 | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1748.8 1765.3 -871.4 1742.8 1788
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5631 0.4151 0.4680 0.5002 0.6579
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.06547 0.2559
## Number of obs: 1791, groups: sentenceLength, 20
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.43503 0.09601 14.947 <2e-16 ***
## scalelogfreq 0.12536 0.06054 2.071 0.0384 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## scalelogfrq 0.060
# Plot predicted probability of a good sentence over the frequency range.
plot_model(goodness.glmer.scalelogfreq, type="pred", terms=c("scalelogfreq [all]"))
## You are calculating adjusted predictions on the population-level (i.e.
## `type = "fixed"`) for a *generalized* linear mixed model.
## This may produce biased estimates due to Jensen's inequality. Consider
## setting `bias_correction = TRUE` to correct for this bias.
## See also the documentation of the `bias_correction` argument.

ggsave(paste0(path, "./results/monosemes_scalelogfreq.png"), width= 7, height=7)
# 5.1.2.B hypernym frequency and sentence quality
# ------------------------------------------
# Re-read the data so earlier transformations do not carry over.
data <- read.table(paste0(path, "monosemes_corrected.csv"), header = TRUE, sep = ";")
nrow(data)
## [1] 1791
# View(data)  # interactive inspection only -- do not run when sourcing
data$logfreq <- log(data$frequency)
data$scalelogfreq <- scale(log(data$frequency))
range(data$scalelogfreq)
## [1] -3.003094 1.422094
# A hypernym frequency of 0 means "no frequency information"; recode as NA so
# log() does not produce -Inf and scale() stays meaningful.
data$frequencyHypernym[data$frequencyHypernym == 0] <- NA
data$loghyper <- log(data$frequencyHypernym)
data$scaleloghyper <- scale(data$loghyper)
# BUG FIX: the original compared the numeric column against the *string* 'NA'
# (frequencyHypernym == 'NA'), which only yielded 180 rows by accident (NA
# comparisons select all-NA rows). Use is.na() to select rows without hypernym
# frequency information; the name now reflects what is selected.
data_missing_freq_hyps <- data[is.na(data$frequencyHypernym), ]
nrow(data_missing_freq_hyps)
## [1] 180
# 180
# Lemma vs. hypernym log frequency: weak negative association by all three
# correlation measures below.
plot(data$logfreq, data$loghyper)

cor.test(data$logfreq, data$loghyper, method = c("kendall"))
##
## Kendall's rank correlation tau
##
## data: data$logfreq and data$loghyper
## z = -4.5984, p-value = 4.258e-06
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## -0.07727761
cor.test(data$logfreq, data$loghyper, method = c("spearman"))
## Warning in cor.test.default(data$logfreq, data$loghyper, method =
## c("spearman")): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: data$logfreq and data$loghyper
## S = 769255633, p-value = 2.929e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1039146
cor.test(data$logfreq, data$loghyper, method = c("pearson"))
##
## Pearson's product-moment correlation
##
## data: data$logfreq and data$loghyper
## t = -2.701, df = 1609, p-value = 0.006986
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.11564238 -0.01840586
## sample estimates:
## cor
## -0.06718364
# isSingular
# goodness.glmer.scaleloghyper=glmer(as.factor(goodSentence) ~ scaleloghyper +(1|annotator)+(1|sentenceLength), data=data, family="binomial",control=glmerControl(optimizer="bobyqa"))
# removing annotator helps with singularity
# Sentence quality by scaled log hypernym frequency (note: rows with missing
# hypernym frequency are dropped by glmer -- 1611 obs below vs 1791 total).
goodness.glmer.scaleloghyper=glmer(as.factor(goodSentence) ~ scaleloghyper +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer.scaleloghyper, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 140.072 1 < 2.2e-16 ***
## scaleloghyper 10.462 1 0.001219 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer.scaleloghyper)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scaleloghyper + (1 | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1560.9 1577.1 -777.5 1554.9 1608
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6607 0.3951 0.4484 0.5007 0.7751
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.1267 0.356
## Number of obs: 1611, groups: sentenceLength, 20
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4220 0.1202 11.835 < 2e-16 ***
## scaleloghyper 0.2034 0.0629 3.234 0.00122 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## scalelghypr 0.037
plot_model(goodness.glmer.scaleloghyper, type="pred", terms=c("scaleloghyper"))
## Data were 'prettified'. Consider using `terms="scaleloghyper [all]"` to
## get smooth plots.

ggsave(paste0(path, "./results/monosemes_scaleloghyper.png"), width= 7, height=7)
# 5.1.2.C combined frequencies
# --------------------------
# Additive model: lemma + hypernym frequency; then add a random slope for
# scalelogfreq over sentence length and compare fits.
goodness.glmer.freqAdditiv=glmer(as.factor(goodSentence) ~ scalelogfreq+scaleloghyper +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
goodness.glmer.freqAdditiv_rs1=glmer(as.factor(goodSentence) ~ scalelogfreq+scaleloghyper +(1+scalelogfreq|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer.freqAdditiv, goodness.glmer.freqAdditiv_rs1)
## Data: data
## Models:
## goodness.glmer.freqAdditiv: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 | sentenceLength)
## goodness.glmer.freqAdditiv_rs1: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 + scalelogfreq | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer.freqAdditiv 4 1555.5 1577.1 -773.77 1547.5
## goodness.glmer.freqAdditiv_rs1 6 1545.7 1578.0 -766.85 1533.7 13.822 2
## Pr(>Chisq)
## goodness.glmer.freqAdditiv
## goodness.glmer.freqAdditiv_rs1 0.0009967 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# -> rs1 is better
Anova(goodness.glmer.freqAdditiv_rs1, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 193.1568 1 < 2.2e-16 ***
## scalelogfreq 1.1157 1 0.2908448
## scaleloghyper 11.3875 1 0.0007394 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): summary.merMod has no type= argument; it is ignored (see the
# warning recorded below).
summary(goodness.glmer.freqAdditiv_rs1, type="III")
## Warning in summary.merMod(goodness.glmer.freqAdditiv_rs1, type = "III"):
## additional arguments ignored
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 +
## scalelogfreq | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1545.7 1578.0 -766.9 1533.7 1605
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0396 0.3598 0.4279 0.4951 1.0404
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## sentenceLength (Intercept) 0.08409 0.2900
## scalelogfreq 0.15830 0.3979 0.83
## Number of obs: 1611, groups: sentenceLength, 20
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.53294 0.11030 13.898 < 2e-16 ***
## scalelogfreq 0.13925 0.13183 1.056 0.290845
## scaleloghyper 0.22127 0.06557 3.375 0.000739 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) scllgf
## scalelogfrq 0.448
## scalelghypr 0.028 0.052
plot_model(goodness.glmer.freqAdditiv_rs1, type="pred", terms=c("scalelogfreq [all]", "scaleloghyper"))

# isSingular
# goodness.glmer.freqAdditiv_rs2=glmer(as.factor(goodSentence) ~ scalelogfreq+scaleloghyper +(1+scalelogfreq*scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# isSingular
# goodness.glmer.freqAdditiv_rs2=glmer(as.factor(goodSentence) ~ scalelogfreq+scaleloghyper +(1+scalelogfreq+scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# isSingular
# goodness.glmer.freqAdditiv_rs2=glmer(as.factor(goodSentence) ~ scalelogfreq+scaleloghyper + (1+scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# Marginal-effect plots for each frequency predictor (additive model).
p1 <- ggpredict(goodness.glmer.freqAdditiv, terms = "scalelogfreq [all]")
p2 <- ggpredict(goodness.glmer.freqAdditiv, terms = "scaleloghyper [all]")
plot(p1) + theme_minimal() + labs(y="Predicted P(good sentence)")

ggsave(paste0(path, "./results/monosemes_scalelogfreq_bw.png"), width= 7, height=7)
plot(p2) + theme_minimal() + labs(y="Predicted P(good sentence)")

ggsave(paste0(path, "./results/monosemes_scaleloghyper_bw.png"), width= 7, height=7)
# --- interaction between the combined frequencies?
# isSingular
# goodness.glmer.freqInteract
# removing annotator helps
# Interaction model: lemma frequency x hypernym frequency.
goodness.glmer.freqInteract=glmer(as.factor(goodSentence) ~ scalelogfreq*scaleloghyper +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer.freqInteract, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 148.4832 1 < 2.2e-16 ***
## scalelogfreq 4.5031 1 0.0338330 *
## scaleloghyper 13.8612 1 0.0001968 ***
## scalelogfreq:scaleloghyper 4.7810 1 0.0287754 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer.freqInteract)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 |
## sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1552.7 1579.7 -771.4 1542.7 1606
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7316 0.3849 0.4398 0.4892 1.0889
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.1207 0.3474
## Number of obs: 1611, groups: sentenceLength, 20
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.43933 0.11812 12.185 < 2e-16 ***
## scalelogfreq 0.14150 0.06668 2.122 0.033833 *
## scaleloghyper 0.24531 0.06589 3.723 0.000197 ***
## scalelogfreq:scaleloghyper -0.15810 0.07230 -2.187 0.028775 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) scllgf scllgh
## scalelogfrq 0.046
## scalelghypr 0.050 -0.022
## scllgfrq:sc -0.032 0.252 -0.193
# random slope for sentence length
# isSingular
# goodness.glmer.freqInteract_rs=glmer(as.factor(goodSentence) ~ scalelogfreq*scaleloghyper +(1+scalelogfreq|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# isSingular
# goodness.glmer.freqInteract_rs=glmer(as.factor(goodSentence) ~ scalelogfreq*scaleloghyper +(1+scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# isSingular
# goodness.glmer.freqInteract_rs=glmer(as.factor(goodSentence) ~ scalelogfreq*scaleloghyper +(1+scalelogfreq*scaleloghyper|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# the additive model is better than the scaleloghyper-only model
anova(goodness.glmer.freqAdditiv, goodness.glmer.scaleloghyper)
## Data: data
## Models:
## goodness.glmer.scaleloghyper: as.factor(goodSentence) ~ scaleloghyper + (1 | sentenceLength)
## goodness.glmer.freqAdditiv: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer.scaleloghyper 3 1560.9 1577.1 -777.47 1554.9
## goodness.glmer.freqAdditiv 4 1555.5 1577.1 -773.77 1547.5 7.4054 1
## Pr(>Chisq)
## goodness.glmer.scaleloghyper
## goodness.glmer.freqAdditiv 0.006503 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# not fitted with same dataset
# anova(goodness.glmer.freqAdditiv, goodness.glmer.scalelogfreq)
# interaction model is better than the additive model
anova(goodness.glmer.freqInteract, goodness.glmer.freqAdditiv)
## Data: data
## Models:
## goodness.glmer.freqAdditiv: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 | sentenceLength)
## goodness.glmer.freqInteract: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer.freqAdditiv 4 1555.5 1577.1 -773.77 1547.5
## goodness.glmer.freqInteract 5 1552.7 1579.7 -771.36 1542.7 4.8008 1
## Pr(>Chisq)
## goodness.glmer.freqAdditiv
## goodness.glmer.freqInteract 0.02845 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(goodness.glmer.freqInteract, goodness.glmer.freqAdditiv_rs1)
## Data: data
## Models:
## goodness.glmer.freqInteract: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 | sentenceLength)
## goodness.glmer.freqAdditiv_rs1: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + (1 + scalelogfreq | sentenceLength)
## npar AIC BIC logLik -2*log(L) Chisq Df
## goodness.glmer.freqInteract 5 1552.7 1579.7 -771.36 1542.7
## goodness.glmer.freqAdditiv_rs1 6 1545.7 1578.0 -766.85 1533.7 9.0214 1
## Pr(>Chisq)
## goodness.glmer.freqInteract
## goodness.glmer.freqAdditiv_rs1 0.002668 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_model(goodness.glmer.freqInteract, type="pred", terms=c("scalelogfreq", "scaleloghyper"), ci.lvl=NA, show.p=T)
## Data were 'prettified'. Consider using `terms="scalelogfreq [all]"` to
## get smooth plots.

# ==> best model is the interaction model
Anova(goodness.glmer.freqInteract, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 148.4832 1 < 2.2e-16 ***
## scalelogfreq 4.5031 1 0.0338330 *
## scaleloghyper 13.8612 1 0.0001968 ***
## scalelogfreq:scaleloghyper 4.7810 1 0.0287754 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer.freqInteract)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 |
## sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1552.7 1579.7 -771.4 1542.7 1606
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7316 0.3849 0.4398 0.4892 1.0889
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.1207 0.3474
## Number of obs: 1611, groups: sentenceLength, 20
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.43933 0.11812 12.185 < 2e-16 ***
## scalelogfreq 0.14150 0.06668 2.122 0.033833 *
## scaleloghyper 0.24531 0.06589 3.723 0.000197 ***
## scalelogfreq:scaleloghyper -0.15810 0.07230 -2.187 0.028775 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) scllgf scllgh
## scalelogfrq 0.046
## scalelghypr 0.050 -0.022
## scllgfrq:sc -0.032 0.252 -0.193
# Comparison plot for the additive model (same terms as above).
plot_model(goodness.glmer.freqAdditiv, type="pred", terms=c("scalelogfreq", "scaleloghyper"), ci.lvl=NA, show.p=T)
## Data were 'prettified'. Consider using `terms="scalelogfreq [all]"` to
## get smooth plots.

# Final interaction plot: predicted quality over lemma frequency, one line
# per hypernym-frequency level (-1, 0, +1 SD).
pred <- ggpredict(
goodness.glmer.freqInteract,
terms = c("scalelogfreq [-2:2]", "scaleloghyper [-1,0,1]")
)
plot(pred) +
labs(
x = "Lemma frequency (scaled log)",
y = "Predicted probability of good sentence",
color = "Hypernym frequency"
) +
theme_minimal()

ggsave(paste0(path, "./results/monosemes_frequencyInteraction.png"), width= 7, height=7)
#data=read.table( paste0(path, "monosemes.csv"), header=T, sep=";")
# Re-read the corrected data for the model-effect analyses.
data=read.table( paste0(path, "monosemes_corrected.csv"), header=T, sep=";")
# 5.1.3 goodSentence ~ model + frequencies
# ----------------------------------------
# use scalelogfreq ensuring that log is not zero
nrow(data)
## [1] 1791
data$logfreq=log(data$frequency)
data$scalelogfreq=scale(log(data$frequency))
mean(data$scalelogfreq)
## [1] 2.729649e-14
# write NA in case there is no frequency information for the hypernym
data$frequencyHypernym[data$frequencyHypernym == 0] <- NA
data$loghyper=log(data$frequencyHypernym)
data$scaleloghyper=scale(data$loghyper)
# isSingular
# goodness.glmer_modelOnly=glmer(as.factor(goodSentence) ~ model +(1|sentenceLength)+(1|annotator), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# model only, (annotator had to go) ***
goodness.glmer_modelOnly=glmer(as.factor(goodSentence) ~ model +(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
Anova(goodness.glmer_modelOnly, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 74.638 1 < 2.2e-16 ***
## model 33.400 2 5.589e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(goodness.glmer_modelOnly)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ model + (1 | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1721.1 1743.1 -856.6 1713.1 1787
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9280 0.3751 0.4294 0.5209 0.6924
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.05293 0.2301
## Number of obs: 1791, groups: sentenceLength, 20
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0102 0.1169 8.639 < 2e-16 ***
## modeldeepseek-chat 0.8578 0.1528 5.615 1.97e-08 ***
## modelgpt-4o 0.5015 0.1442 3.478 0.000505 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) mdldp-
## mdldpsk-cht -0.489
## modelgpt-4o -0.545 0.396
plot_model(goodness.glmer_modelOnly, type="pred", terms=c("model"))

ggsave(paste0(path, "./results/monosemes_model.png"), width= 7, height=7)
# test random slopes
# isSingular
# goodness.glmer_modelOnly_rs=glmer(as.factor(goodSentence) ~ model +(1+model|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# pairwise comparison
# deepseek is better than claude; gpt is better than claude, but deepseek not significantly better than gpt.
# Tukey-adjusted pairwise contrasts between the three models.
emmeans(goodness.glmer_modelOnly, list(pairwise~model))
## $`emmeans of model`
## model emmean SE df asymp.LCL asymp.UCL
## claude-3-5-haiku-20241022 1.01 0.117 Inf 0.781 1.24
## deepseek-chat 1.87 0.140 Inf 1.594 2.14
## gpt-4o 1.51 0.127 Inf 1.263 1.76
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
##
## $`pairwise differences of model`
## 1 estimate SE df z.ratio
## (claude-3-5-haiku-20241022) - (deepseek-chat) -0.858 0.153 Inf -5.615
## (claude-3-5-haiku-20241022) - (gpt-4o) -0.502 0.144 Inf -3.478
## (deepseek-chat) - (gpt-4o) 0.356 0.163 Inf 2.180
## p.value
## <0.0001
## 0.0015
## 0.0746
##
## Results are given on the log odds ratio (not the response) scale.
## P value adjustment: tukey method for comparing a family of 3 estimates
# scalelogfreq plus model
# -----------------------
# isSingular
# goodness.glmer_modelScalelogfreq=glmer(as.factor(goodSentence) ~ scalelogfreq+model +(1|annotator)+(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
goodness.glmer_modelScalelogfreq_add=glmer(as.factor(goodSentence) ~ scalelogfreq+model+(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_modelOnly, goodness.glmer_modelScalelogfreq_add)
## Data: data
## Models:
## goodness.glmer_modelOnly: as.factor(goodSentence) ~ model + (1 | sentenceLength)
## goodness.glmer_modelScalelogfreq_add: as.factor(goodSentence) ~ scalelogfreq + model + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L)
## goodness.glmer_modelOnly 4 1721.2 1743.1 -856.57 1713.2
## goodness.glmer_modelScalelogfreq_add 5 1719.0 1746.5 -854.52 1709.0
## Chisq Df Pr(>Chisq)
## goodness.glmer_modelOnly
## goodness.glmer_modelScalelogfreq_add 4.1105 1 0.04262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# "A likelihood-ratio test comparing a model with only LLM identity to a model additionally including scaled log frequency showed a significant improvement in fit (χ²(1) = 4.11, p = .043), indicating that lexical frequency contributes independently to sentence quality judgments."
goodness.glmer_modelScalelogfreq_inter=glmer(as.factor(goodSentence) ~ scalelogfreq*model+(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_modelScalelogfreq_add, goodness.glmer_modelScalelogfreq_inter)
## Data: data
## Models:
## goodness.glmer_modelScalelogfreq_add: as.factor(goodSentence) ~ scalelogfreq + model + (1 | sentenceLength)
## goodness.glmer_modelScalelogfreq_inter: as.factor(goodSentence) ~ scalelogfreq * model + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L)
## goodness.glmer_modelScalelogfreq_add 5 1719.0 1746.5 -854.52 1709.0
## goodness.glmer_modelScalelogfreq_inter 7 1714.8 1753.2 -850.39 1700.8
## Chisq Df Pr(>Chisq)
## goodness.glmer_modelScalelogfreq_add
## goodness.glmer_modelScalelogfreq_inter 8.2641 2 0.01605 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# => inter is better
goodness.glmer_modelScalelogfreq_inter_rs=glmer(as.factor(goodSentence) ~ scalelogfreq*model+(1+scalelogfreq|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_modelScalelogfreq_inter, goodness.glmer_modelScalelogfreq_inter_rs)
## Data: data
## Models:
## goodness.glmer_modelScalelogfreq_inter: as.factor(goodSentence) ~ scalelogfreq * model + (1 | sentenceLength)
## goodness.glmer_modelScalelogfreq_inter_rs: as.factor(goodSentence) ~ scalelogfreq * model + (1 + scalelogfreq | sentenceLength)
## npar AIC BIC logLik -2*log(L)
## goodness.glmer_modelScalelogfreq_inter 7 1714.8 1753.2 -850.39 1700.8
## goodness.glmer_modelScalelogfreq_inter_rs 9 1706.5 1755.9 -844.24 1688.5
## Chisq Df Pr(>Chisq)
## goodness.glmer_modelScalelogfreq_inter
## goodness.glmer_modelScalelogfreq_inter_rs 12.292 2 0.002142 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# = rs is better
# take sclaloghyper on board
goodness.glmer_modelFrequencies_add=glmer(as.factor(goodSentence) ~ scalelogfreq+scaleloghyper+model+(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
# not comparable, fitted on different data set.
# anova(goodness.glmer_modelScalelogfreq_inter_rs,goodness.glmer_modelFrequencies_add )
goodness.glmer_modelFrequencies_interact=glmer(as.factor(goodSentence) ~ scalelogfreq*scaleloghyper*model+(1|sentenceLength), data=data,family="binomial",control=glmerControl(optimizer="bobyqa"))
anova(goodness.glmer_modelFrequencies_add,goodness.glmer_modelFrequencies_interact)
## Data: data
## Models:
## goodness.glmer_modelFrequencies_add: as.factor(goodSentence) ~ scalelogfreq + scaleloghyper + model + (1 | sentenceLength)
## goodness.glmer_modelFrequencies_interact: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper * model + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L)
## goodness.glmer_modelFrequencies_add 6 1527.1 1559.4 -757.54 1515.1
## goodness.glmer_modelFrequencies_interact 13 1510.4 1580.4 -742.22 1484.4
## Chisq Df Pr(>Chisq)
## goodness.glmer_modelFrequencies_add
## goodness.glmer_modelFrequencies_interact 30.643 7 7.235e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# => Comparing the additive model (frequency + hypernym frequency + model) to the fully interacted model (frequency × hypernym frequency × model) reveals a substantial and highly significant improvement in model fit (χ²(7) = 30.64, p < .001). The interaction model yields a notably lower AIC (ΔAIC = −16.7), indicating that the relationship between lexical frequency measures and sentence quality differs systematically across generation models.
Anova(goodness.glmer_modelFrequencies_interact, type="III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: as.factor(goodSentence)
## Chisq Df Pr(>Chisq)
## (Intercept) 48.2802 1 3.695e-12 ***
## scalelogfreq 0.0060 1 0.938364
## scaleloghyper 6.7258 1 0.009503 **
## model 35.2266 2 2.242e-08 ***
## scalelogfreq:scaleloghyper 0.0049 1 0.944343
## scalelogfreq:model 4.6329 2 0.098621 .
## scaleloghyper:model 5.0760 2 0.079026 .
## scalelogfreq:scaleloghyper:model 11.8255 2 0.002705 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full coefficient table of the interaction model (logit scale; reference
# level for `model` is claude-3-5-haiku-20241022).
summary(goodness.glmer_modelFrequencies_interact)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper * model +
## (1 | sentenceLength)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1510.4 1580.4 -742.2 1484.4 1598
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.7059 0.2913 0.3940 0.5258 1.3877
##
## Random effects:
## Groups Name Variance Std.Dev.
## sentenceLength (Intercept) 0.1209 0.3478
## Number of obs: 1611, groups: sentenceLength, 20
##
## Fixed effects:
## Estimate Std. Error z value
## (Intercept) 0.993748 0.143018 6.948
## scalelogfreq 0.007933 0.102588 0.077
## scaleloghyper 0.264884 0.102137 2.593
## modeldeepseek-chat 0.990484 0.175188 5.654
## modelgpt-4o 0.589279 0.158895 3.709
## scalelogfreq:scaleloghyper -0.007838 0.112269 -0.070
## scalelogfreq:modeldeepseek-chat 0.350926 0.163514 2.146
## scalelogfreq:modelgpt-4o 0.156969 0.155702 1.008
## scaleloghyper:modeldeepseek-chat -0.342076 0.186663 -1.833
## scaleloghyper:modelgpt-4o 0.089030 0.156052 0.571
## scalelogfreq:scaleloghyper:modeldeepseek-chat -0.626448 0.192126 -3.261
## scalelogfreq:scaleloghyper:modelgpt-4o -0.044516 0.172127 -0.259
## Pr(>|z|)
## (Intercept) 3.69e-12 ***
## scalelogfreq 0.938364
## scaleloghyper 0.009503 **
## modeldeepseek-chat 1.57e-08 ***
## modelgpt-4o 0.000208 ***
## scalelogfreq:scaleloghyper 0.944343
## scalelogfreq:modeldeepseek-chat 0.031861 *
## scalelogfreq:modelgpt-4o 0.313389
## scaleloghyper:modeldeepseek-chat 0.066865 .
## scaleloghyper:modelgpt-4o 0.568329
## scalelogfreq:scaleloghyper:modeldeepseek-chat 0.001112 **
## scalelogfreq:scaleloghyper:modelgpt-4o 0.795925
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) scllgf scllgh mdldp- mdlg-4 scllg: scllgf:- scllgf:-4
## scalelogfrq 0.019
## scalelghypr 0.041 0.038
## mdldpsk-cht -0.402 -0.010 -0.044
## modelgpt-4o -0.475 -0.042 -0.061 0.358
## scllgfrq:sc 0.040 0.194 -0.279 -0.041 -0.039
## scllgfrq:m- -0.014 -0.604 -0.012 0.160 0.005 -0.130
## scllgfrq:-4 0.010 -0.646 -0.018 0.007 0.078 -0.136 0.400
## scllghypr:- -0.029 -0.024 -0.550 -0.120 0.035 0.156 -0.185 0.011
## scllghyp:-4 -0.028 -0.024 -0.650 0.032 0.138 0.182 0.012 0.015
## scllgfrq::- -0.038 -0.101 0.179 -0.168 0.013 -0.586 0.126 0.076
## scllgfr::-4 -0.051 -0.120 0.193 0.028 0.057 -0.649 0.087 0.254
## scllgh:- scllgh:-4 scl::-
## scalelogfrq
## scalelghypr
## mdldpsk-cht
## modelgpt-4o
## scllgfrq:sc
## scllgfrq:m-
## scllgfrq:-4
## scllghypr:-
## scllghyp:-4 0.357
## scllgfrq::- -0.007 -0.112
## scllgfr::-4 -0.110 -0.245 0.388
# Predicted-probability plots for the three-way interaction (sjPlot::plot_model).
# Assign each plot and pass it to ggsave() explicitly: ggsave() defaults to
# last_plot(), which is set only when a plot is *printed* — in a non-echoing
# source() run a bare top-level plot_model() call is not printed, and ggsave()
# could silently write a stale plot.
p_interact <- plot_model(goodness.glmer_modelFrequencies_interact,
                         type = "pred",
                         terms = c("scalelogfreq [all]", "scaleloghyper", "model"))
print(p_interact)
# file.path() avoids the redundant "/./" that paste0(path, "./results/...")
# produced (both resolve to the same file, but the clean path is clearer)
ggsave(file.path(path, "results",
                 "monosemes_model_scalelogfreq_scaleloghyper_interact.png"),
       plot = p_interact, width = 7, height = 7)

# Same predictions, with model and hypernym frequency swapped as grouping
# vs. panel variable
p_interact2 <- plot_model(goodness.glmer_modelFrequencies_interact,
                          type = "pred",
                          terms = c("scalelogfreq [all]", "model", "scaleloghyper"))
print(p_interact2)
ggsave(file.path(path, "results",
                 "monosemes_model_scalelogfreq_scaleloghyper_interact_2.png"),
       plot = p_interact2, width = 7, height = 7)

# Multicollinearity check: largest generalized VIF of the fixed effects
max(vif(goodness.glmer_modelFrequencies_interact))
## [1] 2.922366
# 2.92 — well below the common cutoff of 5, so collinearity is unproblematic
# Likelihood-ratio test: does adding `model` (and its interactions) improve
# on the frequency-only interaction model?
anova(goodness.glmer.freqInteract, goodness.glmer_modelFrequencies_interact)
## Data: data
## Models:
## goodness.glmer.freqInteract: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper + (1 | sentenceLength)
## goodness.glmer_modelFrequencies_interact: as.factor(goodSentence) ~ scalelogfreq * scaleloghyper * model + (1 | sentenceLength)
## npar AIC BIC logLik -2*log(L)
## goodness.glmer.freqInteract 5 1552.7 1579.7 -771.36 1542.7
## goodness.glmer_modelFrequencies_interact 13 1510.4 1580.4 -742.22 1484.4
## Chisq Df Pr(>Chisq)
## goodness.glmer.freqInteract
## goodness.glmer_modelFrequencies_interact 58.297 8 1.005e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# => including the generation model substantially improves fit
# (χ²(8) = 58.30, p < .001).
# Estimated marginal means per model, evaluated at the mean of scalelogfreq
# (≈ 0 after scaling); logit scale.
emmeans(goodness.glmer_modelFrequencies_interact, ~ model | scalelogfreq)
## NOTE: Results may be misleading due to involvement in interactions
## scalelogfreq = -0.00437:
## model emmean SE df asymp.LCL asymp.UCL
## claude-3-5-haiku-20241022 0.994 0.143 Inf 0.713 1.27
## deepseek-chat 1.983 0.176 Inf 1.638 2.33
## gpt-4o 1.582 0.155 Inf 1.278 1.89
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
# Same estimates, grouped the other way round (one panel per model).
emmeans(goodness.glmer_modelFrequencies_interact, ~ scalelogfreq | model)
## NOTE: Results may be misleading due to involvement in interactions
## model = claude-3-5-haiku-20241022:
## scalelogfreq emmean SE df asymp.LCL asymp.UCL
## -0.00437 0.994 0.143 Inf 0.713 1.27
##
## model = deepseek-chat:
## scalelogfreq emmean SE df asymp.LCL asymp.UCL
## -0.00437 1.983 0.176 Inf 1.638 2.33
##
## model = gpt-4o:
## scalelogfreq emmean SE df asymp.LCL asymp.UCL
## -0.00437 1.582 0.155 Inf 1.278 1.89
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
# Pairwise model comparisons on the log-odds-ratio scale (Tukey-adjusted).
emmeans(goodness.glmer_modelFrequencies_interact, list(pairwise~model))
## NOTE: Results may be misleading due to involvement in interactions
## $`emmeans of model`
## model emmean SE df asymp.LCL asymp.UCL
## claude-3-5-haiku-20241022 0.994 0.143 Inf 0.713 1.27
## deepseek-chat 1.983 0.176 Inf 1.638 2.33
## gpt-4o 1.582 0.155 Inf 1.278 1.89
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
##
## $`pairwise differences of model`
## 1 estimate SE df z.ratio
## (claude-3-5-haiku-20241022) - (deepseek-chat) -0.989 0.175 Inf -5.649
## (claude-3-5-haiku-20241022) - (gpt-4o) -0.589 0.159 Inf -3.705
## (deepseek-chat) - (gpt-4o) 0.400 0.190 Inf 2.111
## p.value
## <0.0001
## 0.0006
## 0.0876
##
## Results are given on the log odds ratio (not the response) scale.
## P value adjustment: tukey method for comparing a family of 3 estimates
# => deepseek-chat and gpt-4o each beat claude (p < .001); deepseek vs gpt-4o
# is not significant after Tukey adjustment (p = .0876). Caveat from emmeans:
# results may be misleading due to involvement in interactions.
# variance inflation for regression methods
# NOTE(review): duplicates the vif() call above — kept for the transcript.
max(vif(goodness.glmer_modelFrequencies_interact))
## [1] 2.922366
# 2.92 — well below the common cutoff of 5, so collinearity is unproblematic
# Marginal means per model on the response (probability) scale, kept as a
# data frame for plotting below.
emm_model <- emmeans(goodness.glmer_modelFrequencies_interact,
~ model,
type = "response")
## NOTE: Results may be misleading due to involvement in interactions
emm_df <- as.data.frame(emm_model)
emm_df
## model response SE df asymp.LCL asymp.UCL
## claude-3-5-haiku-20241022 0.7298208 0.02819920 Inf 0.6711559 0.7814283
## deepseek-chat 0.8789649 0.01871642 Inf 0.8372419 0.9111276
## gpt-4o 0.8295310 0.02194321 Inf 0.7821380 0.8683496
##
## Unknown transformation "as.factor": no transformation done
## Confidence level used: 0.95
# NOTE(review): the "Unknown transformation" message stems from the
# as.factor(goodSentence) response in the formula; the response column still
# appears back-transformed to probabilities — presumably via the binomial
# link. TODO: confirm, or refit with goodSentence pre-converted to a factor.
# Save the marginal-means plot to PNG. The ggplot object is wrapped in
# print(): inside a source()d (non-echoing) script, top-level ggplot objects
# are NOT auto-printed, so without print() the device would close on an
# empty page. file.path() replaces the redundant paste0(path, "./results/...").
png(file.path(path, "results", "monosemes_predictedProbabilityGoodSentence.png"),
    width = 1200, height = 800, res = 150)
print(
  ggplot(emm_df, aes(x = model,
                     y = response,
                     ymin = asymp.LCL,
                     ymax = asymp.UCL)) +
    geom_pointrange(size = 0.8) +
    # dashed connector to make the model-to-model trend easy to follow
    geom_line(aes(group = 1), linetype = "dashed", color = "grey50") +
    labs(
      x = "Model",
      y = "Predicted Probability of 'Good Sentence'",
      title = "Estimated Marginal Means by Model"
    ) +
    theme_minimal(base_size = 13) +
    theme(
      axis.text.x = element_text(angle = 25, hjust = 1),
      plot.title = element_text(face = "bold")
    )
)
# Close the device to flush the plot to disk
dev.off()
## quartz_off_screen
## 2